In [42]:
import numpy as np
import pandas as pd 

In [43]:
# Read the raw CSV: the first 9 lines are skipped and no header row is parsed,
# so pandas labels the columns with the integers 0..8.
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    header=None,
    skiprows=9,
)
df.head()  # first five rows (the default)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [44]:
# Replace the integer column labels with readable names.
# NOTE(review): the names presumably follow the dataset's attribute order —
# confirm against the header lines that were skipped when the CSV was read.
df.columns = [
    'P', 'plasma', 'bp', 'thick',
    'insuline', 'BMI', 'pedigree', 'age',
    'class',
]
df.head()

Unnamed: 0,P,plasma,bp,thick,insuline,BMI,pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [45]:
# Feature matrix: every column except the last, converted to a plain NumPy array.
# Using an array (rather than a DataFrame) means the rows of X_test can be
# addressed by position after the train/test split.
X = df.iloc[:, :-1].to_numpy()
X.shape

(768, 8)

In [46]:
# Target vector: the last column, converted to a 1-D NumPy array.
y = df.iloc[:, -1].to_numpy()
y.shape

(768,)

- 학습, 테스트 데이터 분할하기

In [47]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions alike in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2021,
    stratify=y,
)
    

In [48]:
# Shapes of the four arrays produced by the split, in the order they were unpacked.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((614, 8), (154, 8), (614,), (154,))

- 모델 생성 및 학습 

In [49]:
from sklearn.tree import DecisionTreeClassifier

# Baseline tree with default hyper-parameters; the seed keeps the fit repeatable.
# fit() returns the estimator itself, so it can be chained onto the constructor.
dtc = DecisionTreeClassifier(random_state=2021).fit(X_train, y_train)
dtc

DecisionTreeClassifier(random_state=2021)

In [50]:
# Accuracy of the baseline tree on the held-out test split.
dtc.score(X_test, y_test)

0.7077922077922078

In [51]:
from sklearn.metrics import accuracy_score

# Same accuracy figure, computed explicitly from the predicted labels.
pred_dt = dtc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred_dt)

0.7077922077922078

- GridSearchCV : 하이퍼파라미터 튜닝 + 교차검증

In [52]:
# Coarse search grid for the decision tree: 3 x 3 = 9 candidate settings.
params = dict(
    max_depth=[2, 4, 6],
    min_samples_split=[2, 4, 6],
)

In [53]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params` with 3-fold cross-validation, ranking the
# candidate settings by accuracy.
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, cv=3, scoring='accuracy')
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [54]:
# Parameter combination that scored best in the cross-validated search.
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [55]:
# NOTE(review): this repeats the previous search unchanged — `params` has not
# been modified since that fit, so the outcome is the same.
grid_dt = GridSearchCV(estimator=dtc, param_grid=params, cv=3, scoring='accuracy')
grid_dt.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 4, 6]},
             scoring='accuracy')

In [56]:
# Finer grid around the previous best (max_depth=2, min_samples_split=2), in
# case the optimum lies between the coarse 2/4/6 steps.
params = {
    'max_depth': [2, 3, 4],
    'min_samples_split': [2, 3, 4],
}

# The search has to run after the grid is redefined; otherwise best_params_ and
# best_estimator_ read further down would still reflect the coarse grid.
# fit() returns the fitted search object, so the assignment keeps the cell
# free of output.
grid_dt = GridSearchCV(dtc, param_grid=params, scoring='accuracy', cv=3).fit(X_train, y_train)

In [57]:
# Best parameter combination found by the most recent grid search.
grid_dt.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [58]:
# best_estimator_ is the tree that GridSearchCV refit with the winning
# parameters (refit is left at its default here).
best_dt = grid_dt.best_estimator_
# Accuracy of the tuned tree on the held-out test split.
best_dt.score(X_test, y_test)

0.7337662337662337

- 실제값 하나가 주어졌을 때 당뇨병 여부를 확인하는 방법

In [59]:
y_test      # plain NumPy array; compare with what this looks like when X/y are built from df.iloc[...] without `.values`

array([1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
      dtype=int64)

In [65]:
# True label of the test sample at position 33.
y_test[33]

0

In [66]:
# Feature values of the test sample at position 33.
X_test[33]

array([  0.   , 126.   ,  86.   ,  27.   , 120.   ,  27.4  ,   0.515,
        21.   ])

In [67]:
# Keep one test sample aside to demonstrate predicting for a single record.
test_data = X_test[33]

In [69]:
# predict() expects a 2-D array of shape (n_samples, n_features), so the single
# sample is reshaped into one row; -1 lets NumPy infer the feature count
# instead of hard-coding it.
result = best_dt.predict(test_data.reshape(1, -1))[0]
# '음성' = negative (predicted class 0), '양성' = positive (any other class).
print('음성' if result == 0 else '양성')
    

음성
