### Cross Validation Task

### 약물 A,B,C,X,Y

다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집한다.
- 치료 과정 동안 각 환자는 5가지 약물 A, B, C, X, Y 중 하나에 반응했다.
- 미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

feature
- Age: 환자의 나이
- Sex: 환자의 성별
- BP: 혈압
- Cholesterol: 콜레스테롤 수치
- Na_to_K: 나트륨 대 칼륨 비율

target
- Drug: 의약품, 환자에게 효과가 있었던 약

In [88]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score



In [89]:
# Load the raw patient records from disk and display them.
csv_path = './datasets/drugs.csv'
drug_df = pd.read_csv(csv_path)
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [90]:
# Pull the Na_to_K column into its own frame; reset_index() keeps the original
# row labels in an 'index' column next to the values.
na_to_k_df = drug_df['Na_to_K'].reset_index()
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,25.355
1,1,13.093
2,2,10.114
3,3,7.798
4,4,18.043
...,...,...
195,195,11.567
196,196,12.006
197,197,9.894
198,198,14.020


In [91]:
# StandardScaler is only imported in a later cell of this notebook, so a
# top-to-bottom run would raise NameError here; import it where it is first used.
from sklearn.preprocessing import StandardScaler

# Standardize Na_to_K (zero mean, unit variance) so each value can be read as a z-score.
# Double brackets keep the input 2-D, which is the shape the scaler expects.
std = StandardScaler()
std_na_to_k = std.fit_transform(na_to_k_df[['Na_to_K']])
std_na_to_k

array([[ 1.28652212],
       [-0.4151454 ],
       [-0.82855818],
       [-1.14996267],
       [ 0.27179427],
       [-1.03769314],
       [ 0.02643885],
       [-0.70046821],
       [-0.12676951],
       [ 0.45567206],
       [-0.59916196],
       [ 0.43221897],
       [-0.09832049],
       [ 0.674105  ],
       [-0.46926791],
       [-0.0788919 ],
       [-0.64245998],
       [-0.29316156],
       [-1.21935052],
       [ 1.37242427],
       [ 0.42236589],
       [ 1.36451406],
       [ 2.00995979],
       [-0.14550423],
       [ 2.41490725],
       [ 0.37809645],
       [ 1.9819271 ],
       [-0.93028076],
       [ 0.91765633],
       [ 0.25902691],
       [-1.01784822],
       [-0.90446848],
       [-0.70366006],
       [ 2.19147839],
       [-0.27081868],
       [-1.2211546 ],
       [-0.92139911],
       [-0.29787994],
       [-0.88476233],
       [-0.97149714],
       [ 0.43527203],
       [-0.25610845],
       [-0.04086736],
       [-0.53074555],
       [-0.5258884 ],
       [-1

In [92]:
# Replace the raw Na_to_K values with their standardized (z-score) counterparts.
na_to_k_df['Na_to_K'] = std_na_to_k
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,1.286522
1,1,-0.415145
2,2,-0.828558
3,3,-1.149963
4,4,0.271794
...,...,...
195,195,-0.626917
196,196,-0.565995
197,197,-0.859089
198,198,-0.286500


In [93]:
# Outlier mask: keep rows with |z| <= 1.96 (the central ~95% under a normal distribution).
z_scores = na_to_k_df['Na_to_K']
con1 = z_scores.ge(-1.96)
con2 = z_scores.le(1.96)
con = con1 & con2

In [94]:
# Preview the rows that pass the z-score filter.
na_to_k_df[con]

Unnamed: 0,index,Na_to_K
0,0,1.286522
1,1,-0.415145
2,2,-0.828558
3,3,-1.149963
4,4,0.271794
...,...,...
195,195,-0.626917
196,196,-0.565995
197,197,-0.859089
198,198,-0.286500


In [95]:
# Drop the Na_to_K outliers from the full dataset. na_to_k_df has the same row
# order as drug_df, so the boolean mask can be applied positionally. The index
# is then renumbered from 0.
inlier_mask = con.values
drug_df = drug_df[inlier_mask].reset_index(drop=True)
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
183,56,F,LOW,HIGH,11.567,drugC
184,16,M,LOW,HIGH,12.006,drugC
185,52,M,NORMAL,HIGH,9.894,drugX
186,23,M,NORMAL,NORMAL,14.020,drugX


전처리

In [96]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder


In [97]:
# Work on a copy so the encoded columns do not overwrite the original string labels.
drug_df1 = drug_df.copy()

In [98]:
# One LabelEncoder per categorical column, kept under its own name so the
# integer codes can be mapped back to the original labels later.
drug_encoder = LabelEncoder()
sex_encoder = LabelEncoder()
BP_encoder = LabelEncoder()
CH_encoder = LabelEncoder()

column_encoders = {
    'Drug': drug_encoder,
    'Sex': sex_encoder,
    'BP': BP_encoder,
    'Cholesterol': CH_encoder,
}

# Fit each encoder on the string column from drug_df and store the codes in drug_df1.
for column, encoder in column_encoders.items():
    targets = encoder.fit_transform(drug_df[column].tolist())
    drug_df1[column] = targets

drug_df1

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4
...,...,...,...,...,...,...
183,56,0,1,0,11.567,2
184,16,1,1,0,12.006,2
185,52,1,2,0,9.894,3
186,23,1,2,1,14.020,3


In [99]:
# Class labels in encoded order: position i is the label mapped to integer i.
drug_encoder.classes_

array(['drugA', 'drugB', 'drugC', 'drugX', 'drugY'], dtype='<U5')

In [100]:
# Map the integer codes back to the original drug names.
drug_encoder.inverse_transform(drug_df1['Drug'])

array(['drugY', 'drugC', 'drugC', 'drugX', 'drugY', 'drugX', 'drugY',
       'drugC', 'drugY', 'drugY', 'drugC', 'drugY', 'drugY', 'drugY',
       'drugX', 'drugY', 'drugX', 'drugA', 'drugC', 'drugY', 'drugY',
       'drugY', 'drugY', 'drugY', 'drugX', 'drugY', 'drugY', 'drugX',
       'drugB', 'drugX', 'drugX', 'drugX', 'drugA', 'drugX', 'drugX',
       'drugX', 'drugY', 'drugB', 'drugY', 'drugX', 'drugX', 'drugX',
       'drugA', 'drugC', 'drugY', 'drugY', 'drugX', 'drugY', 'drugY',
       'drugB', 'drugC', 'drugB', 'drugY', 'drugX', 'drugY', 'drugY',
       'drugA', 'drugY', 'drugX', 'drugB', 'drugY', 'drugA', 'drugX',
       'drugY', 'drugY', 'drugB', 'drugY', 'drugX', 'drugY', 'drugY',
       'drugY', 'drugA', 'drugY', 'drugA', 'drugX', 'drugB', 'drugX',
       'drugC', 'drugA', 'drugC', 'drugB', 'drugX', 'drugY', 'drugY',
       'drugY', 'drugY', 'drugY', 'drugY', 'drugY', 'drugY', 'drugX',
       'drugY', 'drugY', 'drugA', 'drugA', 'drugC', 'drugX', 'drugY',
       'drugX', 'dru

In [101]:
# Base estimator for the grid search. The seed is fixed so that ties between
# equally good splits are broken the same way on every run, which makes the
# search results reproducible (and matches the seeded tree used later on).
dtc = DecisionTreeClassifier(random_state=124)
# Every column except the last is a feature; the last column (Drug) is the target.
features, targets = drug_df1.iloc[:, :-1], drug_df1.iloc[:, -1]

# Hold out 20% of the rows as the final test set.
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=124)
# Hyper-parameter grid: 3 depths x 3 split thresholds = 9 candidates.
parameters = {'max_depth': [3,4,5],'min_samples_split': [15,16,17]}

In [102]:
from sklearn.model_selection import GridSearchCV

# 5-fold grid search over `parameters`. refit=True retrains the best candidate
# once the search is done; train scores are recorded alongside the test scores.
search_options = {
    'param_grid': parameters,
    'cv': 5,
    'refit': True,
    'return_train_score': True,
    'n_jobs': -1,
}
g_dtc = GridSearchCV(dtc, **search_options)

In [103]:
# Run the cross-validated grid search on the training split only.
g_dtc.fit(X_train, y_train)

In [104]:
# Raw per-candidate cross-validation results (timings, parameters, scores).
g_dtc.cv_results_

{'mean_fit_time': array([0.00464392, 0.00439978, 0.00504351, 0.005444  , 0.00484328,
        0.00480051, 0.00344315, 0.00199986, 0.00180001]),
 'std_fit_time': array([7.71766398e-04, 4.89882014e-04, 5.66450072e-04, 1.00633096e-03,
        7.05575967e-04, 9.79802776e-04, 1.04896461e-03, 1.50789149e-07,
        4.00042545e-04]),
 'mean_score_time': array([0.0018446 , 0.00188584, 0.00180078, 0.00224442, 0.0020009 ,
        0.00208797, 0.00200086, 0.0012001 , 0.00140033]),
 'std_score_time': array([4.29120682e-04, 4.72564038e-04, 7.47692129e-04, 3.87002089e-04,
        1.51990675e-06, 1.07924651e-04, 1.40969710e-06, 3.99994861e-04,
        4.89512666e-04]),
 'param_max_depth': masked_array(data=[3, 3, 3, 4, 4, 4, 5, 5, 5],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[15, 16, 17, 15, 16, 17, 15, 16, 17],
              mask=[False, Fals

In [105]:
# Tabulate the search results: one row per parameter combination.
result_df = pd.DataFrame(g_dtc.cv_results_)
result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.004644,0.0007717664,0.001845,0.000429,3,15,"{'max_depth': 3, 'min_samples_split': 15}",0.8,0.9,0.9,...,0.873333,0.04899,7,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
1,0.0044,0.000489882,0.001886,0.000473,3,16,"{'max_depth': 3, 'min_samples_split': 16}",0.8,0.9,0.9,...,0.873333,0.04899,7,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
2,0.005044,0.0005664501,0.001801,0.000748,3,17,"{'max_depth': 3, 'min_samples_split': 17}",0.8,0.9,0.9,...,0.873333,0.04899,7,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
3,0.005444,0.001006331,0.002244,0.000387,4,15,"{'max_depth': 4, 'min_samples_split': 15}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.004843,0.000705576,0.002001,2e-06,4,16,"{'max_depth': 4, 'min_samples_split': 16}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.004801,0.0009798028,0.002088,0.000108,4,17,"{'max_depth': 4, 'min_samples_split': 17}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
6,0.003443,0.001048965,0.002001,1e-06,5,15,"{'max_depth': 5, 'min_samples_split': 15}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
7,0.002,1.507891e-07,0.0012,0.0004,5,16,"{'max_depth': 5, 'min_samples_split': 16}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
8,0.0018,0.0004000425,0.0014,0.00049,5,17,"{'max_depth': 5, 'min_samples_split': 17}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [106]:
# Best parameter combination and its mean cross-validated score, one per line.
print(g_dtc.best_params_, g_dtc.best_score_, sep='\n')

{'max_depth': 4, 'min_samples_split': 15}
0.9933333333333334


In [107]:
# The estimator refit with the best parameters (available because refit=True).
g_dtc.best_estimator_

In [108]:
from sklearn.metrics import accuracy_score

# Score the refit best model on the held-out test split.
dtc = g_dtc.best_estimator_
prediction = dtc.predict(X_test)
accuracy_score(y_true=y_test, y_pred=prediction)

0.9736842105263158

---------------

KFold

In [109]:
from sklearn.model_selection import KFold

# 5-way splitter that cuts the rows into consecutive folds (no shuffling), plus a
# fresh seeded tree for the manual cross-validation runs.
kfold = KFold(n_splits=5, shuffle=False)
dtc = DecisionTreeClassifier(min_samples_leaf=6, random_state=124)

In [110]:
# (rows, feature columns) of the feature matrix.
features.shape

(188, 5)

In [111]:
# Encoded dataset from which `features` and `targets` were taken.
drug_df1

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,0,0,0,25.355,4
1,47,1,1,0,13.093,2
2,47,1,1,0,10.114,2
3,28,0,2,0,7.798,3
4,61,0,1,0,18.043,4
...,...,...,...,...,...,...
183,56,0,1,0,11.567,2
184,16,1,1,0,12.006,2
185,52,1,2,0,9.894,3
186,23,1,2,1,14.020,3


In [112]:
# First feature column (Age).
features.iloc[:,0]

0      23
1      47
2      47
3      28
4      61
       ..
183    56
184    16
185    52
186    23
187    40
Name: Age, Length: 188, dtype: int64

In [113]:
# Encoded Drug labels.
targets

0      4
1      2
2      2
3      3
4      4
      ..
183    2
184    2
185    3
186    3
187    3
Name: Drug, Length: 188, dtype: int64

In [114]:
# Manual K-fold cross-validation: fit on each training fold, score on the
# matching validation fold, and report the class distribution of both.
features_array = features.values
targets_array = targets.values
accuracy_list = []

for count, (train_index, test_index) in enumerate(kfold.split(features_array), start=1):
    X_train, y_train = features_array[train_index], targets_array[train_index]
    X_test, y_test = features_array[test_index], targets_array[test_index]

    # fit() returns the estimator itself, so predict() can be chained onto it.
    prediction = dtc.fit(X_train, y_train).predict(X_test)

    accuracy = np.round(accuracy_score(y_test, prediction), 4)
    accuracy_list.append(accuracy)

    # Wrapped in DataFrames only to get value_counts() for the printout.
    train_targets = pd.DataFrame(y_train)
    test_targets = pd.DataFrame(y_test)

    print(f'{count} 회차')
    print(f'학습 타겟 데이터 분포: {train_targets.value_counts()}')
    print(f'검증 타겟 데이터 분포: {test_targets.value_counts()}')
    print(f'정확도: {accuracy}')

# Mean of the per-fold (already rounded) accuracies.
print(f'평균 정확도: {np.mean(accuracy_list)}')

1 회차
학습 타겟 데이터 분포: 4    62
3    42
0    21
1    14
2    11
Name: count, dtype: int64
검증 타겟 데이터 분포: 4    17
3    12
2     5
0     2
1     2
Name: count, dtype: int64
정확도: 1.0
2 회차
학습 타겟 데이터 분포: 4    62
3    45
0    18
2    14
1    11
Name: count, dtype: int64
검증 타겟 데이터 분포: 4    17
3     9
0     5
1     5
2     2
Name: count, dtype: int64
정확도: 1.0
3 회차
학습 타겟 데이터 분포: 4    63
3    42
0    18
1    14
2    13
Name: count, dtype: int64
검증 타겟 데이터 분포: 4    16
3    12
0     5
2     3
1     2
Name: count, dtype: int64
정확도: 1.0
4 회차
학습 타겟 데이터 분포: 4    67
3    43
0    17
2    13
1    11
Name: count, dtype: int64
검증 타겟 데이터 분포: 4    12
3    11
0     6
1     5
2     3
Name: count, dtype: int64
정확도: 0.9189
5 회차
학습 타겟 데이터 분포: 4    62
3    44
0    18
1    14
2    13
Name: count, dtype: int64
검증 타겟 데이터 분포: 4    17
3    10
0     5
2     3
1     2
Name: count, dtype: int64
정확도: 1.0
평균 정확도: 0.98378


------------
Stratified K Fold

In [115]:
from sklearn.model_selection import StratifiedKFold

# 5-fold splitter that keeps each fold's class proportions close to the full data's.
s_kfold = StratifiedKFold(n_splits=5, shuffle=False)

In [116]:
# Same evaluation as the plain K-fold run, but the splitter also receives the
# targets so it can stratify the folds by class.
accuracy_list = []

fold_splits = s_kfold.split(features_array, targets_array)
for count, (train_index, test_index) in enumerate(fold_splits, start=1):
    X_train, y_train = features_array[train_index], targets_array[train_index]
    X_test, y_test = features_array[test_index], targets_array[test_index]

    dtc.fit(X_train, y_train)
    prediction = dtc.predict(X_test)

    accuracy = np.round(accuracy_score(y_test, prediction), 4)
    accuracy_list.append(accuracy)

    # Wrapped in DataFrames only to get value_counts() for the printout.
    train_targets = pd.DataFrame(y_train)
    test_targets = pd.DataFrame(y_test)

    print(f'{count} 회차')
    print(f' 학습 타겟 데이터 분포: {train_targets.value_counts()}')
    print(f' 검증 타겟 데이터 분포: {test_targets.value_counts()}')
    print(f' 정확도: {accuracy}')

# Mean of the per-fold (already rounded) accuracies.
print(f'평균 정확도: {np.mean(accuracy_list)}')

1 회차
 학습 타겟 데이터 분포: 4    63
3    43
0    18
1    13
2    13
Name: count, dtype: int64
 검증 타겟 데이터 분포: 4    16
3    11
0     5
1     3
2     3
Name: count, dtype: int64
 정확도: 1.0
2 회차
 학습 타겟 데이터 분포: 4    63
3    43
0    18
1    13
2    13
Name: count, dtype: int64
 검증 타겟 데이터 분포: 4    16
3    11
0     5
1     3
2     3
Name: count, dtype: int64
 정확도: 1.0
3 회차
 학습 타겟 데이터 분포: 4    63
3    43
0    19
2    13
1    12
Name: count, dtype: int64
 검증 타겟 데이터 분포: 4    16
3    11
0     4
1     4
2     3
Name: count, dtype: int64
 정확도: 1.0
4 회차
 학습 타겟 데이터 분포: 4    63
3    43
0    19
1    13
2    13
Name: count, dtype: int64
 검증 타겟 데이터 분포: 4    16
3    11
0     4
1     3
2     3
Name: count, dtype: int64
 정확도: 0.9459
5 회차
 학습 타겟 데이터 분포: 4    64
3    44
0    18
1    13
2    12
Name: count, dtype: int64
 검증 타겟 데이터 분포: 4    15
3    10
0     5
2     4
1     3
Name: count, dtype: int64
 정확도: 1.0
평균 정확도: 0.98918
