### Cross Validation Task

### 약물 A, B, C, X, Y
##### 다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집했다.
- 치료 과정 동안 각 환자는 5가지 약물, 즉 약물 A, 약물 B, 약물 c, 약물 x 및 y 중 하나에 반응했다.
-  미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

결측치
중복값
이상치(-1.96 ~ 1.96) 넘어가면 이상치

##### feature
- Age: 환자의 나이
- Sex: 환자의 성별
- BP: 혈압
- Cholesterol: 콜레스테롤 수치
- Na_to_K: 나트륨-칼륨

##### target
- Drug: 의약품, 환자에게 효과가 있었던 약

In [1]:
# 데이터 소환
import pandas as pd
import numpy as np

drug_df = pd.read_csv('./datasets/drugs.csv')
drug_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [2]:
# 이상치 제거

na_to_k_df = drug_df.loc[:, 'Na_to_K'].reset_index()
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,25.355
1,1,13.093
2,2,10.114
3,3,7.798
4,4,18.043
...,...,...
195,195,11.567
196,196,12.006
197,197,9.894
198,198,14.020


In [3]:
# Na_to_K 컬럼을 fit_tranform(데이터를 학습시켜서 실제로 학습시킨것을 적용하는 것)을 쓴다.
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
std_na_to_k = std.fit_transform(na_to_k_df[['Na_to_K']])
std_na_to_k

array([[ 1.28652212],
       [-0.4151454 ],
       [-0.82855818],
       [-1.14996267],
       [ 0.27179427],
       [-1.03769314],
       [ 0.02643885],
       [-0.70046821],
       [-0.12676951],
       [ 0.45567206],
       [-0.59916196],
       [ 0.43221897],
       [-0.09832049],
       [ 0.674105  ],
       [-0.46926791],
       [-0.0788919 ],
       [-0.64245998],
       [-0.29316156],
       [-1.21935052],
       [ 1.37242427],
       [ 0.42236589],
       [ 1.36451406],
       [ 2.00995979],
       [-0.14550423],
       [ 2.41490725],
       [ 0.37809645],
       [ 1.9819271 ],
       [-0.93028076],
       [ 0.91765633],
       [ 0.25902691],
       [-1.01784822],
       [-0.90446848],
       [-0.70366006],
       [ 2.19147839],
       [-0.27081868],
       [-1.2211546 ],
       [-0.92139911],
       [-0.29787994],
       [-0.88476233],
       [-0.97149714],
       [ 0.43527203],
       [-0.25610845],
       [-0.04086736],
       [-0.53074555],
       [-0.5258884 ],
       [-1

In [4]:
# std_na_to_k에 na_to_k_df['Na_to_K']을 대입시킨다.
na_to_k_df['Na_to_K'] = std_na_to_k
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,1.286522
1,1,-0.415145
2,2,-0.828558
3,3,-1.149963
4,4,0.271794
...,...,...
195,195,-0.626917
196,196,-0.565995
197,197,-0.859089
198,198,-0.286500


In [5]:
# 오차범위 -1.96 ~ 1.96의 범위를 cond로 대입해 놓는다.

cond1 = na_to_k_df['Na_to_K'] >= -1.96
cond2 = na_to_k_df['Na_to_K'] <= 1.96

cond = cond1 & cond2

# 그 안에 있는 values를 reset_index로 하여 다시 재정의 한다.
drug_df = drug_df.iloc[na_to_k_df[cond].index].reset_index()
drug_df

Unnamed: 0,index,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,0,23,F,HIGH,HIGH,25.355,drugY
1,1,47,M,LOW,HIGH,13.093,drugC
2,2,47,M,LOW,HIGH,10.114,drugC
3,3,28,F,NORMAL,HIGH,7.798,drugX
4,4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...,...
183,195,56,F,LOW,HIGH,11.567,drugC
184,196,16,M,LOW,HIGH,12.006,drugC
185,197,52,M,NORMAL,HIGH,9.894,drugX
186,198,23,M,NORMAL,NORMAL,14.020,drugX


In [6]:
# copy
drug_enc_df = drug_df.copy()

In [7]:
from sklearn.preprocessing import LabelEncoder

drug_encoder = LabelEncoder()

targets = drug_encoder.fit_transform(drug_df['Drug'].tolist())
drug_enc_df['Drug'] = targets

gender_encoder = LabelEncoder()
genders = gender_encoder.fit_transform(drug_df['Sex'].tolist())
drug_enc_df['Sex'] = genders

blood_pressure_encoder = LabelEncoder()
blood_pressures = blood_pressure_encoder.fit_transform(drug_df['BP'].tolist())
drug_enc_df['BP'] = blood_pressures

cholesterol_encoder = LabelEncoder()
blood_pressures = cholesterol_encoder.fit_transform(drug_df['Cholesterol'].tolist())
drug_enc_df['Cholesterol'] = blood_pressures

In [8]:
drug_encoder.classes_

array(['drugA', 'drugB', 'drugC', 'drugX', 'drugY'], dtype='<U5')

In [9]:
drug_enc_df.iloc[:, -1]

0      4
1      2
2      2
3      3
4      4
      ..
183    2
184    2
185    3
186    3
187    3
Name: Drug, Length: 188, dtype: int64

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

dtc = DecisionTreeClassifier()

features, targets = drug_enc_df.iloc[:, :-1], drug_enc_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

dtc.fit(X_train.values, y_train.values)

In [11]:
features.shape

(188, 6)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

dtc = DecisionTreeClassifier(random_state=124, min_samples_leaf=6)
kfold = KFold(n_splits=5)

In [17]:
X_train = X_train.values
X_test = X_test.values

In [18]:
features = features.values

In [19]:
count = 0

for train_index, test_index in kfold.split(features):
    # 분리
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    # 학습
    dtc.fit(X_train, y_train)
    prediction =  dtc.predict(X_test)


    # 평가
    accuracy = np.round(accuracy_score(y_test, prediction),4)

    # 검증
    train_targets = pd.DataFrame(y_train)
    test_targets = pd.DataFrame(y_test)

    count += 1
    
    print(f'{count} 회차')
    print(f'학습 타겟 데이터 분포: \n{train_targets.value_counts()}')
    print(f'검증 타겟 데이터 분포: \n{test_targets.value_counts()}')
    print(f'정확도 {accuracy}')

1 회차
학습 타겟 데이터 분포: 
Drug
4       62
3       42
0       21
1       14
2       11
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       17
3       12
2        5
0        2
1        2
Name: count, dtype: int64
정확도 1.0
2 회차
학습 타겟 데이터 분포: 
Drug
4       62
3       45
0       18
2       14
1       11
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       17
3        9
0        5
1        5
2        2
Name: count, dtype: int64
정확도 1.0
3 회차
학습 타겟 데이터 분포: 
Drug
4       63
3       42
0       18
1       14
2       13
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       16
3       12
0        5
2        3
1        2
Name: count, dtype: int64
정확도 1.0
4 회차
학습 타겟 데이터 분포: 
Drug
4       67
3       43
0       17
2       13
1       11
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       12
3       11
0        6
1        5
2        3
Name: count, dtype: int64
정확도 0.9189
5 회차
학습 타겟 데이터 분포: 
Drug
4       62
3       44
0       18
1       14
2       13
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       17


In [20]:
from sklearn.model_selection import StratifiedKFold

s_fold = StratifiedKFold(n_splits=5)

In [21]:
count = 0
accuracy_list = []

for train_index, test_index in s_fold.split(features, targets):
    # 분리
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = targets[train_index], targets[test_index]

    # 학습
    dtc.fit(X_train, y_train)
    prediction =  dtc.predict(X_test)


    # 평가
    accuracy = np.round(accuracy_score(y_test, prediction),4)
    accuracy_list.append(accuracy)

    # 검증
    train_targets = pd.DataFrame(y_train)
    test_targets = pd.DataFrame(y_test)

    count += 1
    
    print(f'{count} 회차')
    print(f'학습 타겟 데이터 분포: \n{train_targets.value_counts()}')
    print(f'검증 타겟 데이터 분포: \n{test_targets.value_counts()}')
    print(f'정확도 {accuracy}')

print(f'평균 정확도: {np.mean(accuracy_list)}')

1 회차
학습 타겟 데이터 분포: 
Drug
4       63
3       43
0       18
1       13
2       13
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       16
3       11
0        5
1        3
2        3
Name: count, dtype: int64
정확도 1.0
2 회차
학습 타겟 데이터 분포: 
Drug
4       63
3       43
0       18
1       13
2       13
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       16
3       11
0        5
1        3
2        3
Name: count, dtype: int64
정확도 1.0
3 회차
학습 타겟 데이터 분포: 
Drug
4       63
3       43
0       19
2       13
1       12
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       16
3       11
0        4
1        4
2        3
Name: count, dtype: int64
정확도 1.0
4 회차
학습 타겟 데이터 분포: 
Drug
4       63
3       43
0       19
1       13
2       13
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       16
3       11
0        4
1        3
2        3
Name: count, dtype: int64
정확도 0.9459
5 회차
학습 타겟 데이터 분포: 
Drug
4       64
3       44
0       18
1       13
2       12
Name: count, dtype: int64
검증 타겟 데이터 분포: 
Drug
4       15


In [22]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

In [24]:
dtc = DecisionTreeClassifier()

features, targets = drug_enc_df.iloc[:, :-1], drug_enc_df.iloc[:, -1]

X_train, X_test, y_train, y_test = \
train_test_split(features, targets, test_size=0.2, random_state=124)

dtc.fit(X_train.values, y_train.values)
parameters = {'max_depth': [2, 3, 4], 'min_samples_split': [6, 7]}

In [25]:
g_dtc = GridSearchCV(dtc, 
                     param_grid=parameters, 
                     cv=5, 
                     refit=True, 
                     return_train_score=True, 
                     # 코어 개수: -1를 쓰면 최대 개수를 쓰는 것이다.
                     n_jobs=-1)

In [26]:
g_dtc.fit(X_train, y_train)

In [27]:
g_dtc.cv_results_

{'mean_fit_time': array([0.00220742, 0.0016809 , 0.00483875, 0.00091782, 0.00087061,
        0.00091496]),
 'std_fit_time': array([9.45204755e-04, 1.37065047e-04, 5.34833174e-03, 1.72464268e-04,
        6.13498028e-05, 1.31768546e-04]),
 'mean_score_time': array([0.00106997, 0.00072317, 0.00060244, 0.00055041, 0.00051875,
        0.00053024]),
 'std_score_time': array([6.33708840e-04, 5.12497692e-05, 6.45906513e-05, 6.63421878e-05,
        3.43035869e-05, 4.50824253e-05]),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 4, 4],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[6, 7, 6, 7, 6, 7],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2, 'min_samples_split': 6},
  {'max_depth': 2, 'min_samples_split': 7},
  {'max_depth': 3, 'min_samples_split': 6},
  {'max_depth': 3, 'min_sa

In [28]:
drug_df = pd.DataFrame(g_dtc.cv_results_)
drug_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.002207,0.000945,0.00107,0.000634,2,6,"{'max_depth': 2, 'min_samples_split': 6}",0.866667,0.8,0.8,...,0.82,0.026667,5,0.816667,0.833333,0.833333,0.825,0.825,0.826667,0.006236
1,0.001681,0.000137,0.000723,5.1e-05,2,7,"{'max_depth': 2, 'min_samples_split': 7}",0.866667,0.8,0.8,...,0.82,0.026667,5,0.816667,0.833333,0.833333,0.825,0.825,0.826667,0.006236
2,0.004839,0.005348,0.000602,6.5e-05,3,6,"{'max_depth': 3, 'min_samples_split': 6}",0.8,0.9,0.9,...,0.873333,0.04899,3,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
3,0.000918,0.000172,0.00055,6.6e-05,3,7,"{'max_depth': 3, 'min_samples_split': 7}",0.8,0.9,0.9,...,0.873333,0.04899,3,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
4,0.000871,6.1e-05,0.000519,3.4e-05,4,6,"{'max_depth': 4, 'min_samples_split': 6}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.000915,0.000132,0.00053,4.5e-05,4,7,"{'max_depth': 4, 'min_samples_split': 7}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [29]:
print(g_dtc.best_params_, g_dtc.best_score_, sep="\n")

{'max_depth': 4, 'min_samples_split': 6}
0.9933333333333334


In [30]:
g_dtc.best_estimator_

In [43]:
dtc = g_dtc.best_estimator_
prediction = dtc.predict(X_test)
accuracy = round(accuracy_score(y_test, prediction),4)

In [44]:
accuracy

0.9737