# Parameter Tuning

In [1]:
%matplotlib inline
import os
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix, parallel_coordinates
import seaborn as sns
import matplotlib.pylab as plt
import warnings

In [2]:
# Install a blanket "ignore" filter: every warning category is silenced from here on.
# NOTE(review): this also hides library deprecation warnings raised by later cells.
warnings.simplefilter("ignore")

In [3]:
# Folder that holds the preprocessed CSV files.
# Set the HOSPITAL_DATA_DIR environment variable to point somewhere else; when it
# is unset the original author's local folder is used, so existing setups behave
# exactly as before.
DATA_DIR = Path(
    os.environ.get(
        "HOSPITAL_DATA_DIR",
        "C:/Users/kimch/Desktop/hospital_preprocessed_dataset",
    )
)

# Load the two preprocessed tables.
Train = pd.read_csv(DATA_DIR / "Train.csv")
Test = pd.read_csv(DATA_DIR / "Test.csv")

In [4]:
# Round both columns of Train to zero decimals (Series.round with its default precision).
# ownerChange
Train["ownerChange"] = Train["ownerChange"].round()
# bedCount_class
Train["bedCount_class"] = Train["bedCount_class"].round()

In [5]:
# Same zero-decimal rounding for the Test frame.
Test["ownerChange"] = Test["ownerChange"].round()
Test["bedCount_class"] = Test["bedCount_class"].round()

In [6]:
# Remove the "instkind_nan" column from both frames.
Train = Train.drop("instkind_nan", axis=1)
Test = Test.drop("instkind_nan", axis=1)

In [7]:
# Make "inst_id" the row index of both frames.
Train, Test = (frame.set_index("inst_id") for frame in (Train, Test))

## Selected Features

In [8]:
# Columns kept for modelling. "OC" (last entry) is later split off as the target;
# every other name is used as a predictor.
fs = [
    "sga1",
    "salary1",
    "revenue1",
    "profit2",
    "interest2",
    "interest1",
    "receivableS1",
    "receivableL1",
    "quickAsset1",
    "liquidAsset1",
    "employee2",
    "debt1",
    "instkind_nursing_hospital",
    "OC",
]

In [9]:
# Shorter variant of the feature list (nine names, "OC" last).
# NOTE(review): no other cell visible here reads fs_ — confirm whether it is still needed.
fs_ = [
    "sga1",
    "salary1",
    "revenue1",
    "profit2",
    "interest2",
    "interest1",
    "receivableS1",
    "instkind_nursing_hospital",
    "OC",
]

In [10]:
# Keep only the selected columns (in the order given by fs) in each frame.
train = Train.loc[:, fs]
test = Test.loc[:, fs]

## Train -> Train, Valid, Test

In [11]:
from sklearn.model_selection import train_test_split

In [192]:
# Target is the "OC" column; predictors are all remaining selected columns.
y = train["OC"]
X = train.drop("OC", axis=1)

# Candidate train:valid:test ratios were 60:20:20 / 70:15:15 / 80:10:10.
# 60:20:20 was the first choice because "close" cases are scarce, but 70:15:15
# seemed to score better on average, so that is what is used: 30% is held out
# and then halved into validation and test.
# NOTE(review): neither split passes stratify=, so the rare class can end up
# almost absent from the validation/test parts — consider stratifying on y.
train_X, resid_X, train_y, resid_y = train_test_split(
    X, y, test_size=0.3, shuffle=True, random_state=21
)
valid_X, test_X, valid_y, test_y = train_test_split(
    resid_X, resid_y, test_size=0.5, shuffle=True, random_state=21
)

In [193]:
# Size of the training target after the split.
train_y.shape

(210,)

In [194]:
# Size of the validation target after the split.
valid_y.shape

(45,)

In [195]:
# Size of the held-out test target after the split.
test_y.shape

(46,)

## Oversampling

In [196]:
# Class counts of the training target before any oversampling.
train_y.value_counts()

1    199
0     11
Name: OC, dtype: int64

In [197]:
from imblearn.over_sampling import SMOTE

In [198]:
# SMOTE oversampler with a fixed random_state.
smote = SMOTE(random_state=11)

In [199]:
# Oversample the training split only; valid_* and test_* are not passed through SMOTE.
train_X_smote, train_y_smote = smote.fit_resample(train_X, train_y)

In [200]:
# Class counts of the training target after SMOTE resampling.
train_y_smote.value_counts()

1    199
0    199
Name: OC, dtype: int64

In [201]:
# Class counts of the (non-resampled) validation target.
valid_y.value_counts()

1    42
0     3
Name: OC, dtype: int64

In [202]:
# Class counts of the (non-resampled) test target.
test_y.value_counts()

1    45
0     1
Name: OC, dtype: int64

## Random Forest Parameter Tuning  
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [106]:
from sklearn.ensemble import RandomForestClassifier

In [107]:
# GridSearchCV: fits the estimator for each parameter combination in turn, scores it, and reports the best-performing parameters
from sklearn.model_selection import GridSearchCV

In [108]:
# Coarse random-forest grid.
# bootstrap is left at its default (True, i.e. sampling with replacement);
# max_leaf_nodes was also tried but seemed to lower accuracy.
# NOTE(review): the CV folds are cut from the SMOTE-resampled data, so synthetic
# points can land in the scoring folds and the reported accuracy is likely optimistic.
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[5, 10, 15],
    min_samples_leaf=[2, 5, 10],
    min_samples_split=[6, 8, 10],
    min_impurity_decrease=[0, 0.001, 0.01],
)

rf = RandomForestClassifier(n_jobs=-1, random_state=0)
gridSearch = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(train_X_smote, train_y_smote)

# Report the best mean CV accuracy and the combination that achieved it.
print("Best Score :", gridSearch.best_score_)
print("Best Parameters :", gridSearch.best_params_)

Best Score : 0.969873417721519
Best Parameters : {'max_depth': 10, 'min_impurity_decrease': 0, 'min_samples_leaf': 2, 'min_samples_split': 6, 'n_estimators': 100}


In [109]:
# Finer random-forest grid centred on the previous search's winner.
# Author's remark: the result of this search looks too overfitted.
param_grid = dict(
    max_depth=[8, 10, 12],
    min_samples_leaf=[1, 2, 3, 4],
    min_samples_split=[2, 4, 6],
    n_estimators=[75, 100, 125],
    min_impurity_decrease=[0, 0.0005],
)

rf = RandomForestClassifier(n_jobs=-1, random_state=0)
gridSearch = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(train_X_smote, train_y_smote)

# Report the best mean CV accuracy and the combination that achieved it.
print("Best Score :", gridSearch.best_score_)
print("Best Parameters :", gridSearch.best_params_)

Best Score : 0.9724050632911393
Best Parameters : {'max_depth': 10, 'min_impurity_decrease': 0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 75}


**Random Forest 최종 파라미터 :  
n_estimators=100, min_impurity_decrease=0, min_samples_leaf=2, min_samples_split=6, max_depth=10**

In [203]:
# Final random forest, built with the hyper-parameters written out explicitly
# and trained on the SMOTE-resampled training data.
best_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=6,
    min_samples_leaf=2,
    min_impurity_decrease=0,
    n_jobs=-1,
    random_state=0,
)
best_rf.fit(train_X_smote, train_y_smote)

In [204]:
from dmba import classificationSummary

In [205]:
# Random forest on the data it was fitted on (SMOTE-resampled training set).
classificationSummary(train_y_smote, best_rf.predict(train_X_smote))

Confusion Matrix (Accuracy 0.9975)

       Prediction
Actual   0   1
     0 199   0
     1   1 198


In [206]:
# Random forest on the validation split (not resampled).
classificationSummary(valid_y, best_rf.predict(valid_X))

Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  2  1
     1  2 40


In [207]:
# Random forest on the held-out test split (not resampled).
classificationSummary(test_y, best_rf.predict(test_X))

Confusion Matrix (Accuracy 0.9565)

       Prediction
Actual  0  1
     0  1  0
     1  2 43


## GBM Parameter Tuning  
https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html

In [116]:
from sklearn.ensemble import GradientBoostingClassifier

In [117]:
# Coarse gradient-boosting grid.
# Author's note on the loss options: "log_loss" appears to work on the same
# principle as logistic regression, and "exponential" on the same principle as AdaBoost.
# "deviance" is deliberately not listed: it is only the deprecated alias of
# "log_loss", so it refits identical models (one third of the grid) and is
# rejected outright by scikit-learn >= 1.3.
param_grid = {
    "max_depth": [3, 5, 10],
    "min_samples_split": [6, 8, 10],
    "min_samples_leaf": [2, 5, 10],
    "n_estimators": [50, 100, 150],
    "learning_rate": [0.1, 1, 2],
    "subsample": [0.5, 1],
    "loss": ["log_loss", "exponential"],
}

gbm = GradientBoostingClassifier(random_state=0)
# scoring is spelled out to match the random-forest searches; accuracy is also
# what a classifier's default scorer computes, so the ranking is unchanged.
gridSearch = GridSearchCV(gbm, param_grid, cv=5, scoring="accuracy", n_jobs=-1)

gridSearch.fit(train_X_smote, train_y_smote)
print("Best Score :", gridSearch.best_score_)
print("Best Parameters :", gridSearch.best_params_)

Best Score : 0.9824367088607595
Best Parameters : {'learning_rate': 0.1, 'loss': 'exponential', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 8, 'n_estimators': 100, 'subsample': 0.5}


In [126]:
# Finer gradient-boosting grid with loss left at its default.
# Author's reasoning: loss="exponential" may simply be AdaBoost, so only the
# other parameters are tuned here. Author's remark on the outcome: as expected,
# the overfitted result does not look that good.
param_grid = dict(
    max_depth=[8, 10, 12],
    min_samples_split=[6, 8, 10],
    min_samples_leaf=[2, 5, 10],
    n_estimators=[75, 100, 125],
    learning_rate=[0.1, 0.5, 1],
    subsample=[0.5, 0.8, 1],
)

gbm = GradientBoostingClassifier(random_state=0)
gridSearch = GridSearchCV(
    estimator=gbm,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
gridSearch.fit(train_X_smote, train_y_smote)

# Report the best mean CV score and the combination that achieved it.
print("Best Score :", gridSearch.best_score_)
print("Best Parameters :", gridSearch.best_params_)

Best Score : 0.9799683544303797
Best Parameters : {'learning_rate': 0.1, 'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100, 'subsample': 0.5}


**Gradient Boosting Classifier 최종 파라미터:  
learning_rate=0.1, max_depth=10, min_samples_leaf=2, min_samples_split=8, n_estimators=100, subsample=0.5**

In [212]:
# Final gradient-boosting model, trained on the SMOTE-resampled training data.
# random_state is fixed (same seed as the grid searches): with subsample=0.5 each
# tree is fitted on a random half of the rows, so without a seed every run would
# produce a different model and different confusion matrices.
# NOTE(review): loss is left at its default here, while the search that produced
# these values selected loss="exponential" — confirm this is intended.
best_gbm = GradientBoostingClassifier(
    learning_rate=0.1,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=8,
    n_estimators=100,
    subsample=0.5,
    random_state=0,
)
best_gbm.fit(train_X_smote, train_y_smote)

In [213]:
# Gradient boosting on the data it was fitted on (SMOTE-resampled training set).
classificationSummary(train_y_smote, best_gbm.predict(train_X_smote))

Confusion Matrix (Accuracy 1.0000)

       Prediction
Actual   0   1
     0 199   0
     1   0 199


In [214]:
# Gradient boosting on the validation split (not resampled).
classificationSummary(valid_y, best_gbm.predict(valid_X))

Confusion Matrix (Accuracy 0.9333)

       Prediction
Actual  0  1
     0  2  1
     1  2 40


In [215]:
# Gradient boosting on the held-out test split (not resampled).
classificationSummary(test_y, best_gbm.predict(test_X))

Confusion Matrix (Accuracy 0.9565)

       Prediction
Actual  0  1
     0  1  0
     1  2 43
