In [9]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [10]:
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    n_features=10,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    random_state=42
)

#### Method 1 : model using train , test , split and tune parameters by trial and error 

In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train,y_test = train_test_split(X,y, random_state=42, test_size=0.25)

model = DecisionTreeClassifier(criterion="gini", max_depth=15) # gini or entropy and deoth can be 5 or 10 
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.82      0.82       130
           1       0.80      0.82      0.81       120

    accuracy                           0.82       250
   macro avg       0.82      0.82      0.82       250
weighted avg       0.82      0.82      0.82       250



#### Method 2 : cross val score 

In [15]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='gini', max_depth = 10), X, y , cv = 5)

array([0.775, 0.75 , 0.785, 0.775, 0.805])

In [16]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion='entropy', max_depth = 10), X, y , cv = 5)

array([0.77 , 0.8  , 0.825, 0.76 , 0.78 ])

### GRID SEARCH CV

In [23]:
criterion = ['gini', 'entropy']
max_depth = [5,10,15]

avg_scores = {}

for c in criterion:
    for d in max_depth:
        clf = DecisionTreeClassifier(criterion= c, max_depth = d)
        scores_list = cross_val_score(clf, X , y , cv = 5)
        avg_scores[c + "_" + str(d)] = np.average(scores_list)
avg_scores

{'gini_5': np.float64(0.778),
 'gini_10': np.float64(0.786),
 'gini_15': np.float64(0.7870000000000001),
 'entropy_5': np.float64(0.7809999999999999),
 'entropy_10': np.float64(0.7949999999999999),
 'entropy_15': np.float64(0.806)}

In [25]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    DecisionTreeClassifier(),
    {
        'criterion' : ['gini', 'entropy'],
        'max_depth' : [5,10,15],
    },
    cv = 5,
return_train_score = False 

)
clf.fit(X,y)
clf.cv_results_

{'mean_fit_time': array([0.01730733, 0.02501445, 0.02603641, 0.02316871, 0.03393898,
        0.03433948]),
 'std_fit_time': array([0.00087845, 0.00200328, 0.0022274 , 0.00117575, 0.00338809,
        0.00439312]),
 'mean_score_time': array([0.00360403, 0.0024025 , 0.00238762, 0.00258412, 0.00299501,
        0.0021935 ]),
 'std_score_time': array([0.00101562, 0.00049192, 0.00049471, 0.00047788, 0.00063287,
        0.00076585]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': 'entropy', 'max_depth': 5

In [27]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.017307,0.000878,0.003604,0.001016,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.775,0.785,0.76,0.795,0.77,0.777,0.012083,6
1,0.025014,0.002003,0.002402,0.000492,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.775,0.75,0.785,0.785,0.81,0.781,0.019339,5
2,0.026036,0.002227,0.002388,0.000495,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.785,0.73,0.805,0.8,0.825,0.789,0.032156,3
3,0.023169,0.001176,0.002584,0.000478,entropy,5,"{'criterion': 'entropy', 'max_depth': 5}",0.765,0.785,0.76,0.81,0.79,0.782,0.018055,4
4,0.033939,0.003388,0.002995,0.000633,entropy,10,"{'criterion': 'entropy', 'max_depth': 10}",0.78,0.8,0.84,0.78,0.785,0.797,0.022716,2
5,0.034339,0.004393,0.002193,0.000766,entropy,15,"{'criterion': 'entropy', 'max_depth': 15}",0.755,0.8,0.835,0.81,0.845,0.809,0.031528,1


In [28]:
df[["param_criterion","param_max_depth", "mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.777
1,gini,10,0.781
2,gini,15,0.789
3,entropy,5,0.782
4,entropy,10,0.797
5,entropy,15,0.809


In [29]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [30]:
clf.best_estimator_

### Randomized search cv

In [35]:
from sklearn.model_selection import RandomizedSearchCV

clf = RandomizedSearchCV(
    DecisionTreeClassifier(),
    {
        'criterion' : ['gini', 'entropy'],
        'max_depth' : [5,10,15,20],
    },
    cv = 5,
    return_train_score = False,
    n_iter=3

)
clf.fit(X,y)
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.027199,0.001918,0.002799,0.000399,5,entropy,"{'max_depth': 5, 'criterion': 'entropy'}",0.765,0.78,0.755,0.815,0.78,0.779,0.020347,3
1,0.02471,0.001766,0.002203,0.000402,20,gini,"{'max_depth': 20, 'criterion': 'gini'}",0.815,0.725,0.805,0.8,0.815,0.792,0.034,1
2,0.016616,0.001347,0.002202,0.000399,5,gini,"{'max_depth': 5, 'criterion': 'gini'}",0.775,0.81,0.75,0.79,0.775,0.78,0.019748,2
