<a href="https://colab.research.google.com/github/PolyGon-13/DataScience_Study/blob/main/Optimization/Hyperparameter_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Jupyter Notebook용 : 그래프가 별도의 창에서 나타나지 않도록 함
%matplotlib inline

In [5]:
df=pd.read_csv('./data/diabetes_feature.csv')
df.shape

(768, 16)

In [6]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Pregnancies_high,Age_low,Age_middle,Age_high,Insulin_nan,Insulin_log,low_glu_insulin
0,6,148,72,35,0,33.6,0.627,50,1,False,False,True,False,169.5,5.138735,False
1,1,85,66,29,0,26.6,0.351,31,0,False,False,True,False,102.5,4.639572,True
2,8,183,64,0,0,23.3,0.672,32,1,True,False,True,False,169.5,5.138735,False
3,1,89,66,23,94,28.1,0.167,21,0,False,True,False,False,94.0,4.553877,True
4,0,137,40,35,168,43.1,2.288,33,1,False,False,True,False,168.0,5.129899,False


In [7]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome', 'Pregnancies_high',
       'Age_low', 'Age_middle', 'Age_high', 'Insulin_nan', 'Insulin_log',
       'low_glu_insulin'],
      dtype='object')

In [8]:
X = df[['Glucose', 'BloodPressure', 'SkinThickness',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Pregnancies_high',
       'Insulin_nan', 'low_glu_insulin']]
X.shape

(768, 9)

In [9]:
y=df['Outcome']
y.shape

(768,)

In [10]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,y,
                                               test_size=0.2,
                                               random_state=42)

In [11]:
X_train.shape,y_train.shape

((614, 9), (614,))

In [13]:
X_test.shape,y_test.shape

((154, 9), (154,))

# 알고리즘 여러 개 사용 (각각 적용)

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

estimators=[DecisionTreeClassifier(random_state=42), # 결정트리
            RandomForestClassifier(random_state=42), # 여러 개의 결정 트리로 구성된 앙상블 모델
            GradientBoostingClassifier(random_state=42)] # 약한 학습기(주로 결정 트리)를 순차적으로 학습시키는 앙상블 모델
estimators

[DecisionTreeClassifier(random_state=42),
 RandomForestClassifier(random_state=42),
 GradientBoostingClassifier(random_state=42)]

In [17]:
max_depth=np.random.randint(2,20,10)
max_depth

array([15,  6, 10,  3,  8, 14, 13, 10,  5, 14])

In [18]:
max_features=np.random.uniform(0.3,1.0,10)
max_features

array([0.80537005, 0.68706592, 0.89814049, 0.96274492, 0.91155634,
       0.34093333, 0.45974664, 0.74964936, 0.99582986, 0.40764302])

In [19]:
results=[]
for estimator in estimators:
    result=[]
    result.append(estimator.__class__.__name__)
    results.append(result)
results

[['DecisionTreeClassifier'],
 ['RandomForestClassifier'],
 ['GradientBoostingClassifier']]

In [21]:
from sklearn.model_selection import RandomizedSearchCV

max_depth=np.random.randint(2,20,10)
max_features=np.random.uniform(0.3,1.0,10)
param_distributions={'max_depth':max_depth,
                     'max_features':max_features}

results=[]
for estimator in estimators:
    result=[]
    # estimator.__class__.__name__ : 현재 모델의 클래스를 문자열 형태로 얻음
    if estimator.__class__.__name__!='DecisionTreeClassifier': # 결정트리가 아닌 경우에만 실행
        param_distributions['n_estimators']=np.random.randint(100,200,10)

    clf=RandomizedSearchCV(estimator, # 현재 머신러닝 모델
                           param_distributions, # 조정할 파라미터의 분포
                           n_iter=100, # 랜덤 샘플링을 통해 100회의 파라미터 조합을 시도
                           scoring='accuracy', # 모델의 성능을 정확도로 평가
                           n_jobs=-1, # 모든 CPU 코어를 사용하여 병렬처리
                           cv=5, # 5-fold 교차 검증을 사용하여 모델 평가
                           verbose=2) # 진행상황을 더 자세히 출력

    clf.fit(X_train,y_train)
    result.append(estimator.__class__.__name__)
    result.append(clf.best_params_)
    result.append(clf.best_score_)
    result.append(clf.score(X_test,y_test))
    result.append(clf.cv_results_)
    result.append(clf.cv_results_)
    results.append(result)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [32]:
df=pd.DataFrame(results,
                columns=['estimator','best_params','train_score','test_score','cv_result','etc'])
df=df.drop(['etc'],axis=1)
df

Unnamed: 0,estimator,best_params,train_score,test_score,cv_result
0,DecisionTreeClassifier,"{'max_features': 0.5888890618870978, 'max_dept...",0.86648,0.837662,"{'mean_fit_time': [0.009562397003173828, 0.010..."
1,RandomForestClassifier,"{'n_estimators': 183, 'max_features': 0.588889...",0.905611,0.863636,"{'mean_fit_time': [0.6274001121520996, 0.64040..."
2,GradientBoostingClassifier,"{'n_estimators': 102, 'max_features': 0.538755...",0.910476,0.87013,"{'mean_fit_time': [2.2286092758178713, 1.51231..."


In [33]:
pd.DataFrame(df.loc[1,'cv_result']).sort_values(by='rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.627400,0.011561,0.019803,0.001272,183,0.588889,15,"{'n_estimators': 183, 'max_features': 0.588889...",0.878049,0.934959,0.861789,0.902439,0.950820,0.905611,0.033452,1
47,1.653608,0.387954,0.039830,0.025370,183,0.588889,16,"{'n_estimators': 183, 'max_features': 0.588889...",0.878049,0.934959,0.861789,0.902439,0.950820,0.905611,0.033452,1
89,0.413237,0.007414,0.014697,0.000725,125,0.588889,11,"{'n_estimators': 125, 'max_features': 0.588889...",0.886179,0.934959,0.853659,0.894309,0.950820,0.903985,0.034918,3
50,0.497657,0.132699,0.024469,0.009571,119,0.637715,19,"{'n_estimators': 119, 'max_features': 0.637714...",0.886179,0.934959,0.853659,0.894309,0.950820,0.903985,0.034918,3
17,1.156449,0.092724,0.041421,0.006847,196,0.715221,11,"{'n_estimators': 196, 'max_features': 0.715221...",0.869919,0.934959,0.861789,0.910569,0.942623,0.903972,0.032975,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,0.432226,0.011473,0.015127,0.001385,138,0.948146,3,"{'n_estimators': 138, 'max_features': 0.948145...",0.845528,0.918699,0.821138,0.853659,0.934426,0.874690,0.043968,96
40,0.350623,0.012919,0.013533,0.000140,125,0.450456,3,"{'n_estimators': 125, 'max_features': 0.450456...",0.813008,0.894309,0.845528,0.861789,0.950820,0.873091,0.046881,97
77,0.358581,0.007698,0.013829,0.000137,129,0.538755,3,"{'n_estimators': 129, 'max_features': 0.538755...",0.813008,0.894309,0.845528,0.861789,0.950820,0.873091,0.046881,97
21,0.976087,0.105316,0.044056,0.016922,183,0.948146,3,"{'n_estimators': 183, 'max_features': 0.948145...",0.837398,0.918699,0.804878,0.861789,0.934426,0.871438,0.048751,99
