In [1]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y= make_classification(
    n_features= 10,
    n_samples= 1000,
    n_informative= 8,
    n_redundant= 2,
    n_repeated=0,
    n_classes= 2,
    random_state= 42
)

X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.25, random_state= 42)

<h3>Method 1: Evaluate the model using train, test split and tune parameters by trial and error</h3>

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

model= DecisionTreeClassifier(criterion= "gini", max_depth= 10)
model.fit(X_train, y_train)

y_pred= model.predict(X_test)
report= classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.83      0.83       130
           1       0.82      0.82      0.82       120

    accuracy                           0.83       250
   macro avg       0.83      0.83      0.83       250
weighted avg       0.83      0.83      0.83       250



<h3>Cross Validation Score</h3>

In [5]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion= "gini", max_depth= 10), X, y, cv=5) 
# cv=5, will create StratifiedKFold object

array([0.765, 0.73 , 0.795, 0.79 , 0.815])

In [10]:
criterion= ["gini", "entropy"]
max_depth= [5, 10, 15]

avg_scores = {}

for c in criterion:
    for d in max_depth:
        dt= DecisionTreeClassifier(criterion= c, max_depth= d)
        score_list= cross_val_score(dt, X, y, cv= 5)
        avg_scores[c + "_"+ str(d)]= np.average(score_list)

avg_scores

{'gini_5': np.float64(0.781),
 'gini_10': np.float64(0.7929999999999999),
 'gini_15': np.float64(0.8),
 'entropy_5': np.float64(0.779),
 'entropy_10': np.float64(0.7929999999999999),
 'entropy_15': np.float64(0.8150000000000001)}

<h3>GridsearchCV</h3>

In [11]:
from sklearn.model_selection import GridSearchCV

gsc= GridSearchCV(
    DecisionTreeClassifier(), 
    {
        "criterion": ["gini", "entropy"],
        "max_depth": [5, 10, 15]
    },
    cv= 5,
    return_train_score= False
)
gsc.fit(X, y)
gsc.cv_results_

{'mean_fit_time': array([0.00680332, 0.01596928, 0.01736097, 0.01617198, 0.02315993,
        0.02503905]),
 'std_fit_time': array([0.00140116, 0.00111933, 0.00173041, 0.00036146, 0.00090909,
        0.00145517]),
 'mean_score_time': array([0.00083952, 0.00125604, 0.00137706, 0.00129862, 0.00131717,
        0.00138764]),
 'std_score_time': array([2.88138195e-04, 8.45805359e-05, 8.63381326e-05, 4.86484683e-05,
        8.03244330e-05, 9.74866019e-05]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': '

In [13]:
import pandas as pd

df= pd.DataFrame(gsc.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006803,0.001401,0.00084,0.000288,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.78,0.81,0.75,0.805,0.77,0.783,0.022271,5
1,0.015969,0.001119,0.001256,8.5e-05,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.78,0.75,0.79,0.79,0.81,0.784,0.019596,4
2,0.017361,0.00173,0.001377,8.6e-05,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.805,0.7,0.8,0.81,0.82,0.787,0.044,3
3,0.016172,0.000361,0.001299,4.9e-05,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.775,0.765,0.815,0.775,0.779,0.018547,6
4,0.02316,0.000909,0.001317,8e-05,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.775,0.8,0.825,0.78,0.775,0.791,0.019339,2
5,0.025039,0.001455,0.001388,9.7e-05,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.755,0.81,0.84,0.795,0.86,0.812,0.036414,1


In [14]:
df[["param_criterion", "param_max_depth", "mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.783
1,gini,10,0.784
2,gini,15,0.787
3,entropy,5,0.779
4,entropy,10,0.791
5,entropy,15,0.812


In [15]:
gsc.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [16]:
gsc.best_estimator_

0,1,2
,criterion,'entropy'
,splitter,'best'
,max_depth,15
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [18]:
from sklearn.svm import SVC

model_params= {
    'dt': {
        'model': DecisionTreeClassifier(), 
    'params': {
        "criterion": ["gini", "entropy"],
        "max_depth": [5, 10, 15]
    }
    },
    'svm': {
        'model': SVC(), 
    'params': {
        "C": [1, 10, 20],
        "kernel": ['rbf', 'linear']
    }
    }
}

scores= []

for key, val in model_params.items():
    clf = GridSearchCV(val['model'], val['params'], cv= 5, return_train_score= False)
    clf.fit(X, y)
    scores.append({
        'model': key,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
})

scores

[{'model': 'dt',
  'best_score': np.float64(0.817),
  'best_params': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': np.float64(0.93),
  'best_params': {'C': 20, 'kernel': 'rbf'}}]

In [19]:
pd.DataFrame(scores)

Unnamed: 0,model,best_score,best_params
0,dt,0.817,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.93,"{'C': 20, 'kernel': 'rbf'}"


<h3>RandomizedSearchCV</h3>

In [22]:
from sklearn.model_selection import RandomizedSearchCV

rsc = RandomizedSearchCV(
    DecisionTreeClassifier(), 
    {
        "criterion": ["gini", "entropy"],
        "max_depth": [5, 10, 15]
    },
    cv= 5,
    return_train_score= False,
    n_iter= 3
)
rsc.fit(X, y)

df= pd.DataFrame(rsc.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01572,0.005854,0.001326,0.000409,15,gini,"{'max_depth': 15, 'criterion': 'gini'}",0.785,0.73,0.83,0.815,0.81,0.794,0.035128,2
1,0.011372,0.000225,0.00129,0.00017,5,gini,"{'max_depth': 5, 'criterion': 'gini'}",0.775,0.795,0.76,0.805,0.775,0.782,0.016,3
2,0.025062,0.003141,0.002015,0.001219,10,entropy,"{'max_depth': 10, 'criterion': 'entropy'}",0.78,0.79,0.84,0.78,0.795,0.797,0.022271,1
