테스트 데이터는 표본이 크지 않기에 한 번 나눈 테스트 데이터만으로 평가한 모델 성능은 신뢰도가 높지 않다.\
테스트 데이터를 변경해가며 과적합을 방지하고 신뢰도를 올리기 위해 교차검증을 진행해본다.

KFold는 표본 데이터를 여러 개의 폴드로 나누고, 각 폴드를 번갈아 검증(validation) 데이터로 사용하여 모델 평가를 진행한다.\
validation이 끝난 후 테스트 데이터로 최종 평가하여 최적의 케이스를 알아본다.

hyperparameter optimization은 모델 학습 전에 지정하는 하이퍼파라미터 값을 다양하게 바꿔가며 최적의 성능을 내는 설정을 찾는 방법이다.\
여기서는 decision tree에서 max_depth를 바꿔가며 테스트하고 최고 성능의 경우를 알아본다.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve


In [None]:
# Source CSVs for the red and white wine-quality datasets.
red_url = 'https://raw.githubusercontent.com/PinkWink/forML_study_data/main/data/winequality-red.csv'
white_url = 'https://raw.githubusercontent.com/PinkWink/forML_study_data/main/data/winequality-white.csv'

# Load each file and tag its rows with a numeric color flag (red=1, white=0).
red_wine = pd.read_csv(red_url, sep=';').assign(color=1.)
white_wine = pd.read_csv(white_url, sep=';').assign(color=0.)

# Stack red rows on top of white rows; each file's own row labels are kept.
wine = pd.concat([red_wine, white_wine])

# Binary target: 1.0 when quality is above 5, otherwise 0.0.
wine['taste'] = np.where(wine['quality'] > 5, 1., 0.)

# Features are every column except the target and the column it derives from.
y = wine['taste']
x = wine.drop(['taste', 'quality'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=5
)

# Baseline: a depth-2 tree scored on both the training and hold-out splits.
wine_tree = DecisionTreeClassifier(random_state=5, max_depth=2)
wine_tree.fit(x_train, y_train)
y_pred_tr = wine_tree.predict(x_train)
y_pred_test = wine_tree.predict(x_test)
train_acc = accuracy_score(y_train, y_pred_tr)
test_acc = accuracy_score(y_test, y_pred_test)
print('Train Acc    : ', train_acc)
print('Test Acc     : ', test_acc)

In [None]:
# The combined frame lists every red wine before every white wine, so
# contiguous (unshuffled) folds would hold out blocks dominated by one color.
# Shuffle with a fixed seed to get representative, reproducible folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=5)

# Estimator reused by the manual cross-validation loops that follow.
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=5)

# Sanity check: report the train/validation row counts of every fold.
for train_idx, test_idx in kfold.split(x):
    print(len(train_idx), len(test_idx))

In [None]:
# Manual K-fold evaluation using the `kfold` splitter and `wine_tree_cv`
# estimator created in the preceding cell.
cv_accuracy = []

for train_idx, test_idx in kfold.split(x):
    # Fold-local names: reusing x_train/x_test/y_train/y_test here would
    # overwrite the hold-out split produced by train_test_split.
    x_fold_train, x_fold_val = x.iloc[train_idx], x.iloc[test_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(x_fold_train, y_fold_train)
    pred = wine_tree_cv.predict(x_fold_val)
    cv_accuracy.append(accuracy_score(y_fold_val, pred))

# Mean accuracy over the folds (displayed as the cell output).
np.mean(cv_accuracy)

In [None]:
# Stratified folds keep the taste-label ratio equal across folds. The rows are
# still ordered red-then-white, so shuffle (with a fixed seed) to avoid folds
# dominated by a single wine color.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=5)
wine_tree_cv = DecisionTreeClassifier(max_depth=2, random_state=5)

cv_accuracy = []

for train_idx, test_idx in skfold.split(x, y):
    # Fold-local names: reusing x_train/x_test/y_train/y_test here would
    # overwrite the hold-out split produced by train_test_split.
    x_fold_train, x_fold_val = x.iloc[train_idx], x.iloc[test_idx]
    y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[test_idx]
    wine_tree_cv.fit(x_fold_train, y_fold_train)
    pred = wine_tree_cv.predict(x_fold_val)
    cv_accuracy.append(accuracy_score(y_fold_val, pred))

# Mean accuracy over the stratified folds (displayed as the cell output).
np.mean(cv_accuracy)

In [None]:
from sklearn.model_selection import cross_validate

# Same stratified evaluation as the manual loop, done in a single call;
# training-fold scores are requested alongside the test-fold scores.
cross_validate(
    estimator=wine_tree_cv,
    X=x,
    y=y,
    scoring=None,
    cv=skfold,
    return_train_score=True,
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to compare with 5-fold cross-validation.
params = {'max_depth': [2, 4, 7, 10]}
wine_tree = DecisionTreeClassifier(random_state=5, max_depth=2)
gridsearch = GridSearchCV(wine_tree, params, cv=5)
gridsearch.fit(x, y)

In [None]:
import pprint

# Shared pretty-printer for the nested result dictionaries.
pp = pprint.PrettyPrinter(indent=4)

# Dump every recorded result of the max_depth search on the plain tree.
tree_search_results = gridsearch.cv_results_
pp.pprint(tree_search_results)

In [None]:
# Display the best estimator selected by the max_depth grid search.
gridsearch.best_estimator_

In [None]:
# Display the cross-validated score achieved by that best estimator.
gridsearch.best_score_

In [None]:
# Display the parameter setting (here, max_depth) that scored best.
gridsearch.best_params_

In [None]:
# Two-step pipeline: standardize the features, then fit a decision tree.
estimators = [
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier(random_state=5)),
]
pipe = Pipeline(steps=estimators)

# Try tree depths 1 through 9; the `clf__` prefix targets the 'clf' step.
param_grid = [{'clf__max_depth': list(range(1, 10))}]
GridSearch = GridSearchCV(pipe, param_grid, cv=5)
GridSearch.fit(x, y)

In [None]:
# Show the results of the pipeline search (`GridSearch`) that was just fitted,
# not those of the earlier plain-tree search (`gridsearch`).
pp.pprint(GridSearch.cv_results_)

In [None]:
# Plot the decision tree from the best pipeline found by the grid search.
fig = plt.figure(figsize=(25, 10))
best_tree = GridSearch.best_estimator_['clf']
_ = tree.plot_tree(
    best_tree,
    # Take the names from the full feature frame as a plain list; x_train is
    # reassigned by the cross-validation cells.
    feature_names=list(x.columns),
    # The target is `taste` (0.0 when quality <= 5, 1.0 when quality > 5),
    # not wine color; names are listed in ascending label order.
    class_names=['quality<=5', 'quality>5'],
    filled=True,
)
# NOTE: the tree sits behind StandardScaler in the pipeline, so the split
# thresholds in the plot are in standardized units, not raw feature units.

In [None]:
# Tabulate the pipeline search results, best-ranked configuration first.
score_df = pd.DataFrame(GridSearch.cv_results_)
# sort_values returns a new frame, so keep the result; `rank_test_score` is 1
# for the best-scoring parameter setting.
score_df = score_df.sort_values('rank_test_score')
score_df[['params', 'rank_test_score', 'mean_test_score', 'std_test_score']]