In [1]:
# 2023 OCT 17

In [2]:
import sklearn
import numpy as np
import pandas as pd

In [3]:
# import modules
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

# evaluation
from sklearn.metrics import accuracy_score

# dataset
from sklearn.datasets import load_iris

In [4]:
# load the iris dataset and separate the feature matrix from the target labels
iris = load_iris()
iris_features, iris_labels = iris.data, iris.target

# report how many samples were loaded
print("dataset size:", len(iris_features))

# decision tree estimator; random_state is fixed so results are repeatable
classifier = DecisionTreeClassifier(random_state=156)

dataset size: 150


**cross_val_score()**

*For a classifier and an integer cv, cross_val_score() uses **Stratified K-Fold** by default*

In [7]:
# run 3-fold cross validation and collect one accuracy value per fold
batch_accuracies = cross_val_score(
    classifier,
    iris_features,
    iris_labels,
    scoring="accuracy",
    cv=3,  # number of folds (k_fold_n_splits)
)

print(batch_accuracies)

# evaluation: average of the per-fold accuracies
print("\n<EVALUATION RESULT>")
print("accuracy score:", batch_accuracies.mean())

[0.98 0.94 0.98]

<EVALUATION RESULT>
accuracy score: 0.9666666666666667


**GridSearchCV**

*Searches over a grid of candidate values to find the optimal hyperparameters*

In [11]:
# split into train / test sets: 20% is held out for testing,
# and random_state is fixed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    iris_features,
    iris_labels,
    test_size=0.2,
    random_state=121,
)

In [13]:
# candidate hyper parameter values for the estimator, keyed by parameter name
params_roster = dict(
    max_depth=[1, 2, 3],
    min_samples_split=[2, 3],
)  # -> 3 x 2 grid (6 combinations)

In [14]:
# GridSearchCV: try every combination in params_roster with cross validation
#   3 folds x (3 x 2) parameter combinations => 18 cross-validation fits
classifier_grid = GridSearchCV(
    classifier,
    param_grid=params_roster,
    cv=3,                     # 3-fold cross validation (stratified for a classifier)
    refit=True,               # retrain on the data passed to fit() with the best parameters
    return_train_score=True,  # also record training scores in cv_results_
)

# run the search on the training split
classifier_grid.fit(X_train, y_train)

In [15]:
# tabulate the search results and show the test-score columns
scores_df = pd.DataFrame(classifier_grid.cv_results_)

# summary columns first, then the score of each of the 3 folds
report_columns = ["params", "mean_test_score", "rank_test_score"]
report_columns += [f"split{fold}_test_score" for fold in range(3)]

display(scores_df[report_columns])

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [18]:
# report the best parameter combination and its mean cross-validated score
best_params = classifier_grid.best_params_
best_score = classifier_grid.best_score_

print("optimal hyper parameters:", f"{best_params}")
print("best score (accuracy score):", f"{best_score: .4f}")

optimal hyper parameters: {'max_depth': 3, 'min_samples_split': 2}
best score (accuracy score):  0.9750


In [21]:
# predict on the held-out test set in two equivalent ways

# <M1> call predict() on the GridSearchCV object itself
#      (possible because it was fitted with refit=True)
predictions = classifier_grid.predict(X_test)
grid_accuracy = accuracy_score(y_test, predictions)
print(f"accuracy score: {grid_accuracy: .4f}")

# <M2> take the refitted best estimator out of the GridSearchCV object and use it
best_estimator = classifier_grid.best_estimator_
predictions = best_estimator.predict(X_test)
best_estimator_accuracy = accuracy_score(y_test, predictions)
print(f"accuracy score: {best_estimator_accuracy: .4f}")

accuracy score:  0.9667
accuracy score:  0.9667
