### Cross Validation Task

### 약물 A, B, C, X, Y
##### 다중 분류(Multiclass Classification)
- 의학 연구원으로서 동일한 질병을 앓고 있는 일련의 환자에 대한 데이터를 수집했다.
- 치료 과정 동안 각 환자는 5가지 약물, 즉 약물 A, 약물 B, 약물 c, 약물 x 및 y 중 하나에 반응했다.
-  미래에 동일한 질병을 앓는 환자에게 어떤 약물이 적합할 수 있는지 알아보기 위한 모델을 구축한다.

##### feature
- Age: 환자의 나이
- Sex: 환자의 성별
- BP: 혈압
- Cholesterol: 콜레스테롤 수치
- Na_to_K: 나트륨-칼륨

##### target
- Drug: 의약품, 환자에게 효과가 있었던 약

In [1]:
import pandas as pd

d_df = pd.read_csv('./datasets/drugs.csv')
d_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
195,56,F,LOW,HIGH,11.567,drugC
196,16,M,LOW,HIGH,12.006,drugC
197,52,M,NORMAL,HIGH,9.894,drugX
198,23,M,NORMAL,NORMAL,14.020,drugX


In [2]:
d_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          200 non-null    int64  
 1   Sex          200 non-null    object 
 2   BP           200 non-null    object 
 3   Cholesterol  200 non-null    object 
 4   Na_to_K      200 non-null    float64
 5   Drug         200 non-null    object 
dtypes: float64(1), int64(1), object(4)
memory usage: 9.5+ KB


In [3]:
# 결측치 확인
d_df.isna().sum()

Age            0
Sex            0
BP             0
Cholesterol    0
Na_to_K        0
Drug           0
dtype: int64

In [5]:
# 중복 행 확인
d_df.duplicated().sum()

0

#### 표준화 이용하여 이상치 제거

In [8]:
# 이상치 있을만한 feature와 index번호 함께 가져와서(drop=False) 새로운 데이터프레임 생성
na_to_k_df = d_df.loc[:, 'Na_to_K'].reset_index()
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,25.355
1,1,13.093
2,2,10.114
3,3,7.798
4,4,18.043
...,...,...
195,195,11.567
196,196,12.006
197,197,9.894
198,198,14.020


In [9]:
# 표준화
from sklearn.preprocessing import StandardScaler

std = StandardScaler()

std_na_to_k = std.fit_transform(na_to_k_df[['Na_to_K']])

In [10]:
# 표준화 된 값으로 대체
na_to_k_df['Na_to_K'] = std_na_to_k
na_to_k_df

Unnamed: 0,index,Na_to_K
0,0,1.286522
1,1,-0.415145
2,2,-0.828558
3,3,-1.149963
4,4,0.271794
...,...,...
195,195,-0.626917
196,196,-0.565995
197,197,-0.859089
198,198,-0.286500


In [19]:
# ±1.96 범위에 속하는 값의 인덱스만 가져온 뒤, 원본 데이터프레임에서 필터링
# 방법 1
# con1 = na_to_k_df['Na_to_K'] >= -1.96
# con2 = na_to_k_df['Na_to_K'] <= 1.96
# condition = con1 & con2
# 
# d_df = d_df.iloc[na_to_k_df[condition].index].reset_index(drop=True)

# 방법2
na_to_k_df = na_to_k_df[na_to_k_df['Na_to_K'].between(-1.96, 1.96)]
d_df = d_df.iloc[na_to_k_df.index].reset_index(drop=True)

d_df

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K,Drug
0,23,F,HIGH,HIGH,25.355,drugY
1,47,M,LOW,HIGH,13.093,drugC
2,47,M,LOW,HIGH,10.114,drugC
3,28,F,NORMAL,HIGH,7.798,drugX
4,61,F,LOW,HIGH,18.043,drugY
...,...,...,...,...,...,...
183,56,F,LOW,HIGH,11.567,drugC
184,16,M,LOW,HIGH,12.006,drugC
185,52,M,NORMAL,HIGH,9.894,drugX
186,23,M,NORMAL,NORMAL,14.020,drugX


In [27]:
# 원본 데이터 건들지 않기 위해 복사
d_enc_df = d_df.copy()

In [28]:
# 데이터 프레임 내의 문자열 값을 정수로 변환 (LabelEncoder 사용)
from sklearn.preprocessing import LabelEncoder

drug_encoder = LabelEncoder()
targets = drug_encoder.fit_transform(d_enc_df['Drug'].tolist())
d_enc_df['Drug'] = targets

gender_encoder = LabelEncoder()
genders = gender_encoder.fit_transform(d_enc_df['Sex'].tolist())
d_enc_df['Sex'] = genders

bp_encoder = LabelEncoder()
BPs = bp_encoder.fit_transform(d_enc_df['BP'].tolist())
d_enc_df['BP'] = BPs

cholesterol_encoder = LabelEncoder()
cholesterols = cholesterol_encoder.fit_transform(d_enc_df['Cholesterol'].tolist())
d_enc_df['Cholesterol'] = cholesterols

In [39]:
# cross_val_score(estimator, x, y, cv, scoring) 메소드 사용하여 편하게 교차검증하기
# validation 데이터 세트들의 평균 evaluation 결과값만 간단하게 받아 볼 필요가 있을 때만 활용
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

dtc = DecisionTreeClassifier(random_state=124, min_samples_leaf=6)

features = d_enc_df.iloc[:, :-1]
targets = d_enc_df.iloc[:, -1]

score = cross_val_score(dtc, features, targets, cv=5, scoring='accuracy')
print(np.round(np.mean(score), 4))

0.9892


In [41]:
# GridSearchCV(estimator, param_grid, cv, refit, return_train_score) 사용하여 교차검증하기
# GridSearchCV는 다양한 하이퍼 파라미터 값을 입력해서 최적의 학습 모델을 찾아 준다.
# 최적의 하이퍼 파라미터 뿐만 아니라 최적 학습 모델도 반환 할 수 있다.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score

features = d_enc_df.iloc[:, :-1]
targets = d_enc_df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=124)

dtc = DecisionTreeClassifier()
parameters = {'max_depth': [3, 4, 5], 'min_samples_split': [15, 16, 17]}

In [32]:
g_dtc = GridSearchCV(dtc, 
                     param_grid=parameters, 
                     cv=5, 
                     refit=True, 
                     return_train_score=True, 
                     n_jobs=-1)  # 컴퓨터 코어의 개수 (-1: 최대 성능)

In [33]:
g_dtc.fit(X_train, y_train)

In [34]:
g_dtc.cv_results_

{'mean_fit_time': array([0.0045774 , 0.00551009, 0.00370312, 0.0062181 , 0.00250173,
        0.00200062]),
 'std_fit_time': array([9.20919407e-04, 2.03114691e-03, 7.49643293e-04, 5.25874675e-03,
        6.35278337e-04, 9.81867804e-07]),
 'mean_score_time': array([0.00170159, 0.00486312, 0.00140028, 0.00210228, 0.00180244,
        0.00160246]),
 'std_score_time': array([0.00060153, 0.00622434, 0.00048975, 0.00066486, 0.00051079,
        0.00037397]),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 4, 4],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[6, 7, 6, 7, 6, 7],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'max_depth': 2, 'min_samples_split': 6},
  {'max_depth': 2, 'min_samples_split': 7},
  {'max_depth': 3, 'min_samples_split': 6},
  {'max_depth': 3, 'min_samples_split': 7},
  {'ma

In [35]:
result_df = pd.DataFrame(g_dtc.cv_results_)
result_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.004577,0.0009209194,0.001702,0.000602,2,6,"{'max_depth': 2, 'min_samples_split': 6}",0.866667,0.8,0.8,...,0.82,0.026667,5,0.816667,0.833333,0.833333,0.825,0.825,0.826667,0.006236
1,0.00551,0.002031147,0.004863,0.006224,2,7,"{'max_depth': 2, 'min_samples_split': 7}",0.866667,0.8,0.8,...,0.82,0.026667,5,0.816667,0.833333,0.833333,0.825,0.825,0.826667,0.006236
2,0.003703,0.0007496433,0.0014,0.00049,3,6,"{'max_depth': 3, 'min_samples_split': 6}",0.8,0.9,0.9,...,0.873333,0.04899,3,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
3,0.006218,0.005258747,0.002102,0.000665,3,7,"{'max_depth': 3, 'min_samples_split': 7}",0.8,0.9,0.9,...,0.873333,0.04899,3,0.916667,0.916667,0.908333,0.933333,0.908333,0.916667,0.009129
4,0.002502,0.0006352783,0.001802,0.000511,4,6,"{'max_depth': 4, 'min_samples_split': 6}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0
5,0.002001,9.818678e-07,0.001602,0.000374,4,7,"{'max_depth': 4, 'min_samples_split': 7}",1.0,1.0,1.0,...,0.993333,0.013333,1,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [38]:
# 최적의 학습 모델 이용하여 평가(정확도)하기
dtc = g_dtc.best_estimator_
# 테스트 정답과 예측 정답을 전달하여 정확도 계산
np.round(accuracy_score(y_test, dtc.predict(X_test)), 4)

0.9737