# Decision Tree

## module import

In [6]:
import numpy as np
import pandas as pd

## 데이터 불러오기

In [7]:
# Location of the tab-separated diagnosis data set, relative to the notebook.
datapath = './data/diagnosis2.data'

# First attempt: no column names are supplied, so pandas uses the first
# line of the file as the header row.
original_data = pd.read_csv(filepath_or_buffer=datapath, sep='\t')

# Data layout check: 119 rows x 8 columns.
original_data.shape

(119, 8)

## columns 이름

In [8]:
# Show the column labels pandas picked up. Because the file was read without
# explicit names, the values of the first record ended up as the labels.
original_data.columns

Index(['35,5', 'no', 'yes', 'no.1', 'no.2', 'no.3', 'no.4', 'no.5'], dtype='object')

## columns 이름을 추가해서 불러오기

In [9]:
# Same file as the first attempt; this time the column names are passed in
# explicitly so the first record is kept as data (120 rows instead of 119).
datapath = './data/diagnosis2.data'

# Names for the 8 columns, in file order.
column_names = [
    'temperatuer',
    'nausea',
    'lumbar_pain',
    'urine_pushing',
    'micturition_pains',
    'burning_of_urethra',
    'inflamation',
    'nephritis',
]

original_data = pd.read_csv(datapath, sep='\t', names=column_names)

original_data.shape

(120, 8)

### 데이터 확인하기

In [10]:
# Preview the first five records to confirm the named columns line up with the data.
original_data.head()

Unnamed: 0,temperatuer,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,355,no,yes,no,no,no,no,no
1,359,no,no,yes,yes,yes,yes,no
2,359,no,yes,no,no,no,no,no
3,360,no,no,yes,yes,yes,yes,no
4,360,no,yes,no,no,no,no,no


### temperatuer 컬럼: 소수점 구분자를 쉼표(`,`)에서 마침표(`.`)로 변경

In [11]:
# The raw file writes temperatures with a decimal comma (e.g. "35,5").
# Swap the comma for a decimal point and convert the column to float so it is
# a real numeric feature instead of text.
# `.astype(str)` keeps this cell safe to re-run once the column is already float.
original_data['temperatuer'] = (
    original_data['temperatuer']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)
original_data.head()

Unnamed: 0,temperatuer,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,35.5,no,yes,no,no,no,no,no
1,35.9,no,no,yes,yes,yes,yes,no
2,35.9,no,yes,no,no,no,no,no
3,36.0,no,no,yes,yes,yes,yes,no
4,36.0,no,yes,no,no,no,no,no


# 모델 사용 sklearn

sklearn 모델을 사용하려면 명목형(nominal, categorical) 값인 attribute 를 모두 숫자로 변경해줘야 한다.

쉽게 말해서 string → int 로 변경한다.

In [12]:
from sklearn import preprocessing

# Build a label encoder for the 'nausea' column and learn its categories.
# Each distinct label gets an integer code (here 'no' -> 0, 'yes' -> 1).
le_nausea = preprocessing.LabelEncoder().fit(original_data['nausea'])

# Categories the encoder learned.
print(le_nausea.classes_)
# Labels -> integer codes.
print(le_nausea.transform(['no', 'yes', 'yes']))
# Integer codes -> original labels.
print(le_nausea.inverse_transform([0, 0, 1]))

['no' 'yes']
[0 1 1]
['no' 'no' 'yes']


## 데이터 범주를 숫자형으로 변경

In [13]:
# Work on a separate frame so original_data keeps its string labels;
# only the 'nausea' column is replaced by its integer codes here.
dicted_data = original_data.assign(
    nausea=le_nausea.transform(original_data['nausea'])
)

dicted_data.head()

Unnamed: 0,temperatuer,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,35.5,0,yes,no,no,no,no,no
1,35.9,0,no,yes,yes,yes,yes,no
2,35.9,0,yes,no,no,no,no,no
3,36.0,0,no,yes,yes,yes,yes,no
4,36.0,0,yes,no,no,no,no,no


## 다른 칼럼도 적용

In [14]:
# Remaining categorical columns that still hold string labels.
tmp = [
    'lumbar_pain',
    'urine_pushing',
    'micturition_pains',
    'burning_of_urethra',
    'inflamation',
    'nephritis',
]

# One encoder per column, keyed by column name; the 'nausea' encoder was
# already fitted in an earlier cell and is reused as-is.
les = {'nausea': le_nausea}
les.update({column: preprocessing.LabelEncoder() for column in tmp})

# Fit each encoder on the untouched source column and write the integer
# codes into the working copy.
for column in tmp:
    dicted_data[column] = les[column].fit_transform(original_data[column])

dicted_data.head()

Unnamed: 0,temperatuer,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflamation,nephritis
0,35.5,0,1,0,0,0,0,0
1,35.9,0,0,1,1,1,1,0
2,35.9,0,1,0,0,0,0,0
3,36.0,0,0,1,1,1,1,0
4,36.0,0,1,0,0,0,0,0


In [15]:
# List the fitted encoders: one LabelEncoder per categorical column name.
print(les)

{'nausea': LabelEncoder(), 'lumbar_pain': LabelEncoder(), 'urine_pushing': LabelEncoder(), 'micturition_pains': LabelEncoder(), 'burning_of_urethra': LabelEncoder(), 'inflamation': LabelEncoder(), 'nephritis': LabelEncoder()}


## x, y 모양 확인

In [16]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

# Feature matrix: the temperature column only (kept as a one-column DataFrame).
features = ['temperatuer']
x = dicted_data[features]

# Target vector: the label-encoded 'inflamation' column.
y = dicted_data['inflamation']

# Unfitted classifier configured with min_samples_split=20 and a fixed random_state.
model = DecisionTreeClassifier(
    random_state=99,
    min_samples_split=20,
)

y 데이터 확인

In [17]:
y.head() # target: the label-encoded 'inflamation' column

0    0
1    1
2    0
3    1
4    0
Name: inflamation, dtype: int32

x 데이터 확인

In [18]:
x.head() # features: the single 'temperatuer' column

Unnamed: 0,temperatuer
0,35.5
1,35.9
2,35.9
3,36.0
4,36.0


### 학습 적용

In [19]:
# Train the decision tree: temperature feature (x) against the inflammation label (y).
model.fit(x,y)

DecisionTreeClassifier(min_samples_split=20, random_state=99)

## 시각화 하기

In [16]:
# Imports for rendering the fitted tree; none of them are used in the lines shown here.
# NOTE(review): this cell's execution count (16) is out of sequence with the
# cells above (17-19) — re-run the notebook top to bottom before sharing.
from graphviz import Source
from sklearn.tree import export_graphviz
from IPython.display import SVG

# NOTE(review): despite its name, this holds the feature column name used for x,
# not inflammation class labels — presumably meant as the feature label for the
# tree plot; confirm and consider renaming.
inflamtion_labels = 'temperatuer'
print(inflamtion_labels)


temperatuer
