In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.experimental import enable_halving_search_cv, enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import HalvingRandomSearchCV

from sklearn.ensemble import RandomForestClassifier

df = pd.read_csv('https://mirror.coggle.club/dataset/heart.csv')
X = df.drop(columns=['output'])
y = df['output']

x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [27]:
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


# 原始模型

In [None]:
%%time

clf = RandomForestClassifier(random_state=0)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

CPU times: user 110 ms, sys: 194 µs, total: 110 ms
Wall time: 108 ms


0.8026315789473685

# GridSearchCV

In [16]:
%%time

parameters = {
    'max_depth': [2,4,5,6,7],
    'min_samples_leaf': [1,2,3],
    'min_weight_fraction_leaf': [0, 0.1],
    'min_impurity_decrease': [0, 0.1, 0.2]
}

clf = GridSearchCV(
    RandomForestClassifier(random_state=0),
    parameters, refit=True, verbose=1,
)
clf.fit(x_train, y_train)
clf.best_estimator_.score(x_test, y_test)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
CPU times: user 40.5 s, sys: 92.7 ms, total: 40.6 s
Wall time: 40.6 s


0.8157894736842105

# RandomizedSearchCV

In [17]:
%%time

parameters = {
    'max_depth': [2,4,5,6,7],
    'min_samples_leaf': [1,2,3],
    'min_weight_fraction_leaf': [0, 0.1],
    'min_impurity_decrease': [0, 0.1, 0.2]
}

clf = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    parameters, refit=True, verbose=1, n_iter=10,
)
clf.fit(x_train, y_train)
clf.best_estimator_.score(x_test, y_test)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: user 4.68 s, sys: 11.8 ms, total: 4.69 s
Wall time: 4.69 s


0.8157894736842105

In [22]:
%%time

clf = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    parameters, refit=True, verbose=1, n_iter=40,
)
clf.fit(x_train, y_train)
clf.best_estimator_.score(x_test, y_test)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
CPU times: user 18.3 s, sys: 43.8 ms, total: 18.3 s
Wall time: 18.3 s


0.8157894736842105

In [23]:
%%time

clf = RandomizedSearchCV(
    RandomForestClassifier(random_state=0),
    parameters, refit=True, verbose=1, n_iter=60,
)
clf.fit(x_train, y_train)
clf.best_estimator_.score(x_test, y_test)

Fitting 5 folds for each of 60 candidates, totalling 300 fits
CPU times: user 27.3 s, sys: 67.9 ms, total: 27.4 s
Wall time: 27.4 s


0.8157894736842105

# HalvingGridSearchCV

In [24]:
%%time

parameters = {
    'max_depth': [2,4,5,6,7],
    'min_samples_leaf': [1,2,3],
    'min_weight_fraction_leaf': [0, 0.1],
    'min_impurity_decrease': [0, 0.1, 0.2]
}

clf = HalvingGridSearchCV(
    RandomForestClassifier(random_state=0),
    parameters, refit=True, verbose=1,
)
clf.fit(x_train, y_train)
clf.best_estimator_.score(x_test, y_test)

n_iterations: 3
n_required_iterations: 5
n_possible_iterations: 3
min_resources_: 20
max_resources_: 227
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 90
n_resources: 20
Fitting 5 folds for each of 90 candidates, totalling 450 fits
----------
iter: 1
n_candidates: 30
n_resources: 60
Fitting 5 folds for each of 30 candidates, totalling 150 fits
----------
iter: 2
n_candidates: 10
n_resources: 180
Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: user 1min 1s, sys: 75.9 ms, total: 1min 1s
Wall time: 1min 1s


0.8552631578947368

# HalvingRandomSearchCV

In [25]:
%%time

parameters = {
    'max_depth': [2,4,5,6,7],
    'min_samples_leaf': [1,2,3],
    'min_weight_fraction_leaf': [0, 0.1],
    'min_impurity_decrease': [0, 0.1, 0.2]
}

clf = HalvingRandomSearchCV(
    RandomForestClassifier(random_state=0),
    parameters, refit=True, verbose=1,
)
clf.fit(x_train, y_train)
clf.best_estimator_.score(x_test, y_test)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 20
max_resources_: 227
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 11
n_resources: 20
Fitting 5 folds for each of 11 candidates, totalling 55 fits
----------
iter: 1
n_candidates: 4
n_resources: 60
Fitting 5 folds for each of 4 candidates, totalling 20 fits
----------
iter: 2
n_candidates: 2
n_resources: 180
Fitting 5 folds for each of 2 candidates, totalling 10 fits
CPU times: user 8.19 s, sys: 16 ms, total: 8.21 s
Wall time: 8.2 s


0.8289473684210527