In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [2]:
# Load the breast-cancer dataset that ships with scikit-learn.
dataset = load_breast_cancer()
# x holds dataset.data, t holds dataset.target.
x, t = dataset.data, dataset.target

# Show both shapes as the cell output.
x.shape, t.shape

((569, 30), (569,))

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the final test set; the remaining 80% is
# kept together as the combined train+validation pool.
x_train_val, x_test, t_train_val, t_test = train_test_split(
    x,
    t,
    test_size=0.2,
    random_state=1,
)
x_train_val.shape

(455, 30)

In [4]:
# Split the train+validation pool again: 30% of it becomes the validation set.
x_train, x_val, t_train, t_val = train_test_split(
    x_train_val,
    t_train_val,
    test_size=0.3,
    random_state=1,
)

In [5]:
# Shapes of the train / validation / test feature arrays, in that order.
tuple(split.shape for split in (x_train, x_val, x_test))

((318, 30), (137, 30), (114, 30))

In [13]:
# Baseline: before any search, fit one decision tree whose hyperparameters
# are picked by hand, and report its accuracy on every split.
from sklearn.tree import DecisionTreeClassifier

# Hand-picked hyperparameters (not the library defaults).
dtree = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=30,
    random_state=0,
)
dtree.fit(x_train, t_train)

print(f'train score: {dtree.score(x_train, t_train)}')
print(f'val score: {dtree.score(x_val, t_val)}')
print(f'test score: {dtree.score(x_test, t_test)}')

train score: 0.9308176100628931
val score: 0.9562043795620438
test score: 0.9298245614035088


In [18]:
# Grid search: a procedure for choosing hyperparameters by scoring every
# combination in a grid with cross-validation.
from sklearn.model_selection import GridSearchCV

# Base model whose hyperparameters are searched.
estimator = DecisionTreeClassifier(random_state=0)

# Coarse search range (3 x 3 = 9 combinations).
param_grid = [{
    'max_depth': [3, 20, 50],
    'min_samples_split': [3, 20, 30],
}]

# Number of cross-validation folds.
cv = 5

# Build the search object; nothing is fitted in this cell.
tuned_model = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=cv,
    return_train_score=False,
)

In [19]:
# Run the cross-validated grid search on the combined train+validation split;
# the value returned by fit() is displayed as the cell output.
tuned_model.fit(x_train_val, t_train_val)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid=[{'max_depth': [3, 20, 50],
                          'min_samples_split': [3, 20, 30]}])

In [21]:
# Transposed so that each column is one parameter combination and each row is
# one result field; compare the 'mean_test_score' row across columns.
pd.DataFrame(tuned_model.cv_results_).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.013585,0.013755,0.013254,0.013744,0.01227,0.012499,0.012168,0.003777,0.006109
std_fit_time,0.003881,0.003996,0.001667,0.003978,0.002132,0.00255,0.004465,0.002286,0.002338
mean_score_time,0.002588,0.001906,0.000206,0.0,0.0,0.001511,0.00163,0.001146,0.000005
std_score_time,0.002923,0.002538,0.000413,0.0,0.0,0.00255,0.003261,0.001821,0.00001
param_max_depth,3,3,3,20,20,20,50,50,50
param_min_samples_split,3,20,30,3,20,30,3,20,30
params,"{'max_depth': 3, 'min_samples_split': 3}","{'max_depth': 3, 'min_samples_split': 20}","{'max_depth': 3, 'min_samples_split': 30}","{'max_depth': 20, 'min_samples_split': 3}","{'max_depth': 20, 'min_samples_split': 20}","{'max_depth': 20, 'min_samples_split': 30}","{'max_depth': 50, 'min_samples_split': 3}","{'max_depth': 50, 'min_samples_split': 20}","{'max_depth': 50, 'min_samples_split': 30}"
split0_test_score,0.923077,0.912088,0.912088,0.956044,0.912088,0.912088,0.956044,0.912088,0.912088
split1_test_score,0.901099,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.934066,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [22]:
# Second pass: repeat the grid search over a narrower, finer-grained range.

# Base model for the second search.
estimator2 = DecisionTreeClassifier(random_state=0)

# Refined search range (3 x 3 = 9 combinations).
param_grid2 = [{
    'max_depth': [5, 10, 15],
    'min_samples_split': [10, 12, 15],
}]

# Number of cross-validation folds.
cv2 = 5

# Build the second search object.
tuned_model2 = GridSearchCV(
    estimator=estimator2,
    param_grid=param_grid2,
    cv=cv2,
    return_train_score=False,
)

# Fit on the combined train+validation split.
tuned_model2.fit(x_train_val, t_train_val)

# One column per parameter combination, one row per result field.
pd.DataFrame(tuned_model2.cv_results_).transpose()

Unnamed: 0,0,1,2,3,4,5,6,7,8
mean_fit_time,0.01995,0.02066,0.018933,0.019807,0.016734,0.016043,0.017869,0.018257,0.008854
std_fit_time,0.001996,0.003441,0.001714,0.002053,0.000527,0.00385,0.001293,0.003834,0.003209
mean_score_time,0.001492,0.001291,0.000945,0.001262,0.001223,0.001072,0.00088,0.001212,0.000477
std_score_time,0.001375,0.001062,0.000501,0.000457,0.000999,0.001339,0.001318,0.001949,0.000598
param_max_depth,5,5,5,10,10,10,15,15,15
param_min_samples_split,10,12,15,10,12,15,10,12,15
params,"{'max_depth': 5, 'min_samples_split': 10}","{'max_depth': 5, 'min_samples_split': 12}","{'max_depth': 5, 'min_samples_split': 15}","{'max_depth': 10, 'min_samples_split': 10}","{'max_depth': 10, 'min_samples_split': 12}","{'max_depth': 10, 'min_samples_split': 15}","{'max_depth': 15, 'min_samples_split': 10}","{'max_depth': 15, 'min_samples_split': 12}","{'max_depth': 15, 'min_samples_split': 15}"
split0_test_score,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088,0.967033,0.923077,0.912088
split1_test_score,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099,0.912088,0.901099,0.901099
split2_test_score,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066,0.923077,0.934066,0.934066


In [23]:
# Hyperparameter combination that the second grid search reports as best.
tuned_model2.best_params_

{'max_depth': 5, 'min_samples_split': 10}

In [24]:
# Keep a handle on the estimator that the second grid search exposes as its
# best one, so it can be scored directly below.
best_model = tuned_model2.best_estimator_

In [25]:
# Accuracy of the selected model. "train" here is the combined
# train+validation split that the grid search was fitted on; "test" is the
# held-out test split.
print(
    f'train score: {best_model.score(x_train_val, t_train_val)}',
    f'test score: {best_model.score(x_test, t_test)}',
    sep='\n',
)

train score: 0.9934065934065934
test score: 0.956140350877193
