## Optuna 로 하이퍼파라미터 튜닝

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, auc, roc_auc_score
from dmba import classificationSummary

from imblearn.over_sampling import SMOTE

import optuna
from optuna import Trial
from optuna.samplers import TPESampler

In [2]:
train_df = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Train.csv')
test_df = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Test.csv')

train_df['ownerChange'] = round(train_df['ownerChange'])
train_df['bedCount_class'] = round(train_df['bedCount_class'])

test_df['ownerChange'] = round(train_df['ownerChange'])
test_df['bedCount_class'] = round(train_df['bedCount_class'])

train_df.drop('instkind_nan', axis=1, inplace=True)
test_df.drop('instkind_nan', axis=1, inplace=True)

train_df.set_index('inst_id', inplace=True)
test_df.set_index('inst_id', inplace=True)

In [3]:
train_lr = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Train_lr.csv')

train_lr['ownerChange'] = round(train_lr['ownerChange'])
train_lr['bedCount_class'] = round(train_lr['bedCount_class'])

train_lr.head()

Unnamed: 0,inst_id,sido_choongchung,sido_gyeongsang,sido_jeonra,instkind_nursing_hospital,instkind_traditional_hospital,revenue1,sga1,salary1,noe1,...,profit2,inventoryAsset2,OnonCAsset2,shortLoan2,NCLiabilities2,longLoan2,surplus2,ownerChange,bedCount_class,OC
0,1,1.0,0.0,0.0,1.0,0.0,22.162515,22.099796,21.433189,16.539187,...,19.152229,16.425505,18.798422,0.0,20.099863,19.781657,20.963246,0.0,3.0,1
1,3,0.0,1.0,0.0,0.0,0.0,23.98028,23.481393,19.521179,16.823799,...,17.025473,16.044684,15.372413,9.180719,18.812726,17.568559,12.258481,0.0,4.0,1
2,4,0.0,0.0,0.0,1.0,0.0,20.727778,19.918561,19.507228,10.308986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1
3,7,0.0,0.0,0.0,0.0,0.0,25.006954,24.981392,24.182293,20.954034,...,20.546991,20.934483,22.088313,22.944579,23.755157,23.437961,22.769775,0.0,4.0,1
4,9,0.0,1.0,0.0,0.0,0.0,24.615974,24.587275,23.920337,21.073281,...,17.301128,19.908536,20.486709,23.59081,23.784786,23.37001,22.923325,0.0,3.0,1


In [4]:
train_dt = pd.read_csv('D:/숙탯 2기/2022-여름-캐글/최종데이터/Train_dt.csv')
train_dt.drop('instkind_nan', axis=1, inplace=True)

train_dt.head()

Unnamed: 0,inst_id,instkind_hospital,instkind_traditional_clinic,revenue1,salescost1,sga1,salary1,liquidAsset1,quickAsset1,receivableS1,inventoryAsset2,receivableL2,employee2,OC
0,1,0.0,0.0,22.162515,0.0,22.099796,21.433189,20.735886,20.720935,19.968362,16.425505,0.0,4.174387,1
1,3,0.0,0.0,23.98028,9.659782,23.481393,19.521179,19.074269,19.03727,10.917985,16.044684,0.0,6.70196,1
2,4,0.0,0.0,20.727778,20.060616,19.918561,19.507228,19.422937,19.351593,15.906875,0.0,0.0,0.693147,1
3,7,0.0,0.0,25.006954,0.0,24.981392,24.182293,23.291406,23.16863,0.0,20.934483,0.0,6.498282,1
4,9,0.0,0.0,24.615974,0.0,24.587275,23.920337,22.566524,22.493677,22.134087,19.908536,0.0,5.288267,1


In [5]:
train_dt.shape

(301, 14)

In [6]:
dtree = DecisionTreeClassifier(max_depth = 1)
logreg = LogisticRegression()

# 1. 전체 column

### 1) accuracy 기준으로 튜닝

In [7]:
X_train = train_df.drop('OC', axis=1)
y_train = train_df['OC']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

smote = SMOTE(random_state=42)
X_tr_over, y_tr_over = smote.fit_resample(X_train, y_train)

In [8]:
def objective1(trial):
    param = {
        'base_estimator' : trial.suggest_categorical('base_estimator', [dtree, logreg]),
        'n_estimators':trial.suggest_int('n_estimators',50,80,step=2),
        'algorithm' : trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
        'learning_rate': trial.suggest_float('learning_rate', 0.1,2.5,step = 0.0000005),
        }
        
    ada = AdaBoostClassifier(**param)
    ada.fit(X_tr_over, y_tr_over)
    pred = ada.predict(X_val)
    accuracy = accuracy_score(y_val, pred)
    
    return accuracy

In [9]:
sampler = TPESampler(seed=42)

study = optuna.create_study(
    study_name = 'ada_parameter_opt',
    direction = 'maximize',
    sampler = sampler)

study.optimize(objective1, n_trials=10)

print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2022-09-02 16:58:27,993][0m A new study created in memory with name: ada_parameter_opt[0m
[32m[I 2022-09-02 16:58:30,913][0m Trial 0 finished with value: 0.9333333333333333 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}. Best is trial 0 with value: 0.9333333333333333.[0m
[32m[I 2022-09-02 16:58:31,246][0m Trial 1 finished with value: 0.85 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 68, 'algorithm': 'SAMME', 'learning_rate': 2.427784}. Best is trial 0 with value: 0.9333333333333333.[0m
[32m[I 2022-09-02 16:58:31,547][0m Trial 2 finished with value: 0.9 and parameters: {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 54, 'algorithm': 'SAMME.R', 'learning_rate': 1.3594155}. Best is trial 0 with value: 0.9333333333333333.[0m
[32m[I 2022-09-02 16:58:31,830][0m Trial 3 finished with value: 0.95 and parameters: {'base_estimator': De

Best Score: 0.95
Best trial {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 68, 'algorithm': 'SAMME.R', 'learning_rate': 0.9792685}


In [10]:
def clf_eval(y_test, pred):
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)
    print('오차행렬')
    classificationSummary(y_test, pred)
    print()
    print('정확도 : {:.4f}\n정밀도 : {:.4f}\n재현율 : {:.4f}'.format(accuracy, precision, recall))
    print('\nAUC : {:.4f}'.format(roc_auc_score(y_test, pred)))

In [11]:
ada = AdaBoostClassifier(**study.best_params)
ada.fit(X_tr_over, y_tr_over)

print()
classificationSummary(y_val, ada.predict(X_val))
print('\nAUC:', roc_auc_score(y_val, ada.predict(X_val)))


Confusion Matrix (Accuracy 0.9500)

       Prediction
Actual  0  1
     0  0  1
     1  2 57

AUC: 0.4830508474576271


In [13]:
clf_eval(y_test, ada.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.9016)

       Prediction
Actual  0  1
     0  0  5
     1  1 55

정확도 : 0.9016
정밀도 : 0.9167
재현율 : 0.9821

AUC : 0.4911


### 2) AUC 기준

In [14]:
def objective2(trial):
    param = {
        'base_estimator' : trial.suggest_categorical('base_estimator', [dtree, logreg]),
        'n_estimators':trial.suggest_int('n_estimators',50,80,step=2),
        'algorithm' : trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
        'learning_rate': trial.suggest_float('learning_rate', 0.1,2.5,step = 0.0000005),
        }
        
    ada = AdaBoostClassifier(**param)
    ada.fit(X_tr_over, y_tr_over)
    pred = ada.predict(X_val)
    AUC = roc_auc_score(y_val, ada.predict(X_val))
    return AUC

In [15]:
sampler = TPESampler(seed=42)

study = optuna.create_study(
    study_name = 'ada_parameter_opt',
    direction = 'maximize',
    sampler = sampler)

study.optimize(objective2, n_trials=10)

print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2022-09-02 16:59:42,951][0m A new study created in memory with name: ada_parameter_opt[0m
[32m[I 2022-09-02 16:59:45,676][0m Trial 0 finished with value: 0.9661016949152542 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}. Best is trial 0 with value: 0.9661016949152542.[0m
[32m[I 2022-09-02 16:59:46,010][0m Trial 1 finished with value: 0.4322033898305085 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 68, 'algorithm': 'SAMME', 'learning_rate': 2.427784}. Best is trial 0 with value: 0.9661016949152542.[0m
[32m[I 2022-09-02 16:59:46,316][0m Trial 2 finished with value: 0.4576271186440678 and parameters: {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 54, 'algorithm': 'SAMME.R', 'learning_rate': 1.3594155}. Best is trial 0 with value: 0.9661016949152542.[0m
[32m[I 2022-09-02 16:59:46,906][0m Trial 3 finished with value: 0.48305084745

Best Score: 0.9661016949152542
Best trial {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}


In [16]:
ada = AdaBoostClassifier(**study.best_params)
ada.fit(X_tr_over, y_tr_over)

print()
classificationSummary(y_val, ada.predict(X_val))
print('\nAUC:', roc_auc_score(y_val, ada.predict(X_val)))


Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  1  0
     1  4 55

AUC: 0.9661016949152542


In [17]:
clf_eval(y_test, ada.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8852)

       Prediction
Actual  0  1
     0  0  5
     1  2 54

정확도 : 0.8852
정밀도 : 0.9153
재현율 : 0.9643

AUC : 0.4821


# 2. Train_lr

In [18]:
X_train = train_lr.drop('OC', axis=1)
y_train = train_lr['OC']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

smote = SMOTE(random_state=42)
X_tr_over, y_tr_over = smote.fit_resample(X_train, y_train)

### 1) accuracy 기준으로 튜닝

In [19]:
sampler = TPESampler(seed=42)

study = optuna.create_study(
    study_name = 'ada_parameter_opt',
    direction = 'maximize',
    sampler = sampler)

study.optimize(objective1, n_trials=10)

print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2022-09-02 17:00:45,106][0m A new study created in memory with name: ada_parameter_opt[0m
[32m[I 2022-09-02 17:00:47,497][0m Trial 0 finished with value: 0.95 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}. Best is trial 0 with value: 0.95.[0m
[32m[I 2022-09-02 17:00:47,934][0m Trial 1 finished with value: 0.016666666666666666 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 68, 'algorithm': 'SAMME', 'learning_rate': 2.427784}. Best is trial 0 with value: 0.95.[0m
[32m[I 2022-09-02 17:00:48,154][0m Trial 2 finished with value: 0.9333333333333333 and parameters: {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 54, 'algorithm': 'SAMME.R', 'learning_rate': 1.3594155}. Best is trial 0 with value: 0.95.[0m
[32m[I 2022-09-02 17:00:48,376][0m Trial 3 finished with value: 0.9166666666666666 and parameters: {'base_estimator': DecisionTreeC

Best Score: 0.95
Best trial {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}


In [20]:
ada = AdaBoostClassifier(**study.best_params)
ada.fit(X_tr_over, y_tr_over)

print()
classificationSummary(y_val, ada.predict(X_val))
print('\nAUC:', roc_auc_score(y_val, ada.predict(X_val)))


Confusion Matrix (Accuracy 0.9500)

       Prediction
Actual  0  1
     0  1  0
     1  3 56

AUC: 0.9745762711864407


In [21]:
clf_eval(y_test, ada.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8689)

       Prediction
Actual  0  1
     0  0  5
     1  3 53

정확도 : 0.8689
정밀도 : 0.9138
재현율 : 0.9464

AUC : 0.4732


### 2) AUC 기준

In [22]:
sampler = TPESampler(seed=42)

study = optuna.create_study(
    study_name = 'ada_parameter_opt',
    direction = 'maximize',
    sampler = sampler)

study.optimize(objective2, n_trials=10)

print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2022-09-02 17:01:06,021][0m A new study created in memory with name: ada_parameter_opt[0m
[32m[I 2022-09-02 17:01:07,852][0m Trial 0 finished with value: 0.9745762711864407 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}. Best is trial 0 with value: 0.9745762711864407.[0m
[32m[I 2022-09-02 17:01:08,333][0m Trial 1 finished with value: 0.5 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 68, 'algorithm': 'SAMME', 'learning_rate': 2.427784}. Best is trial 0 with value: 0.9745762711864407.[0m
[32m[I 2022-09-02 17:01:08,553][0m Trial 2 finished with value: 0.4745762711864407 and parameters: {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 54, 'algorithm': 'SAMME.R', 'learning_rate': 1.3594155}. Best is trial 0 with value: 0.9745762711864407.[0m
[32m[I 2022-09-02 17:01:08,811][0m Trial 3 finished with value: 0.4661016949152542 and param

Best Score: 0.9745762711864407
Best trial {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}


In [23]:
ada = AdaBoostClassifier(**study.best_params)
ada.fit(X_tr_over, y_tr_over)

print()
classificationSummary(y_val, ada.predict(X_val))
print('\nAUC:', roc_auc_score(y_val, ada.predict(X_val)))


Confusion Matrix (Accuracy 0.9500)

       Prediction
Actual  0  1
     0  1  0
     1  3 56

AUC: 0.9745762711864407


In [24]:
clf_eval(y_test, ada.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8689)

       Prediction
Actual  0  1
     0  0  5
     1  3 53

정확도 : 0.8689
정밀도 : 0.9138
재현율 : 0.9464

AUC : 0.4732


# 3. Train_dt

### 1) Accuracy 기준

In [25]:
X_train = train_dt.drop('OC', axis=1)
y_train = train_dt['OC']

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

smote = SMOTE(random_state=42)
X_tr_over, y_tr_over = smote.fit_resample(X_train, y_train)

In [26]:
sampler = TPESampler(seed=42)

study = optuna.create_study(
    study_name = 'ada_parameter_opt',
    direction = 'maximize',
    sampler = sampler)

study.optimize(objective1, n_trials=10)

print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2022-09-02 17:01:35,701][0m A new study created in memory with name: ada_parameter_opt[0m
[32m[I 2022-09-02 17:01:37,593][0m Trial 0 finished with value: 0.7666666666666667 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}. Best is trial 0 with value: 0.7666666666666667.[0m
[32m[I 2022-09-02 17:01:38,058][0m Trial 1 finished with value: 0.9166666666666666 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 68, 'algorithm': 'SAMME', 'learning_rate': 2.427784}. Best is trial 1 with value: 0.9166666666666666.[0m
[32m[I 2022-09-02 17:01:38,218][0m Trial 2 finished with value: 0.9166666666666666 and parameters: {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 54, 'algorithm': 'SAMME.R', 'learning_rate': 1.3594155}. Best is trial 1 with value: 0.9166666666666666.[0m
[32m[I 2022-09-02 17:01:38,418][0m Trial 3 finished with value: 0.91666666666

Best Score: 0.9333333333333333
Best trial {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 70, 'algorithm': 'SAMME', 'learning_rate': 1.2884245}


In [27]:
ada = AdaBoostClassifier(**study.best_params)
ada.fit(X_tr_over, y_tr_over)

print()
classificationSummary(y_val, ada.predict(X_val))
print('\nAUC:', roc_auc_score(y_val, ada.predict(X_val)))


Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  0  1
     1  3 56

AUC: 0.4745762711864407


In [28]:
clf_eval(y_test, ada.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.8689)

       Prediction
Actual  0  1
     0  0  5
     1  3 53

정확도 : 0.8689
정밀도 : 0.9138
재현율 : 0.9464

AUC : 0.4732


### 2) AUC 기준

In [29]:
sampler = TPESampler(seed=42)

study = optuna.create_study(
    study_name = 'ada_parameter_opt',
    direction = 'maximize',
    sampler = sampler)

study.optimize(objective2, n_trials=10)

print("Best Score:",study.best_value)
print("Best trial",study.best_trial.params)

[32m[I 2022-09-02 17:02:04,413][0m A new study created in memory with name: ada_parameter_opt[0m
[32m[I 2022-09-02 17:02:06,458][0m Trial 0 finished with value: 0.3898305084745763 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 72, 'algorithm': 'SAMME', 'learning_rate': 0.47438649999999993}. Best is trial 0 with value: 0.3898305084745763.[0m
[32m[I 2022-09-02 17:02:07,054][0m Trial 1 finished with value: 0.4661016949152542 and parameters: {'base_estimator': LogisticRegression(), 'n_estimators': 68, 'algorithm': 'SAMME', 'learning_rate': 2.427784}. Best is trial 1 with value: 0.4661016949152542.[0m
[32m[I 2022-09-02 17:02:07,220][0m Trial 2 finished with value: 0.4661016949152542 and parameters: {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 54, 'algorithm': 'SAMME.R', 'learning_rate': 1.3594155}. Best is trial 1 with value: 0.4661016949152542.[0m
[32m[I 2022-09-02 17:02:07,447][0m Trial 3 finished with value: 0.46610169491

Best Score: 0.711864406779661
Best trial {'base_estimator': DecisionTreeClassifier(max_depth=1), 'n_estimators': 52, 'algorithm': 'SAMME.R', 'learning_rate': 2.040154}


In [30]:
ada = AdaBoostClassifier(**study.best_params)
ada.fit(X_tr_over, y_tr_over)

print()
classificationSummary(y_val, ada.predict(X_val))
print('\nAUC:', roc_auc_score(y_val, ada.predict(X_val)))


Confusion Matrix (Accuracy 0.4500)

       Prediction
Actual  0  1
     0  1  0
     1 33 26

AUC: 0.7203389830508474


In [31]:
clf_eval(y_test, ada.predict(X_test))

오차행렬
Confusion Matrix (Accuracy 0.4918)

       Prediction
Actual  0  1
     0  3  2
     1 29 27

정확도 : 0.4918
정밀도 : 0.9310
재현율 : 0.4821

AUC : 0.5411
