In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(
    n_features=10,
    n_samples=1000,
    n_informative=8,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    random_state=42,
)

In [2]:
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

model = DecisionTreeClassifier(
    criterion="entropy", max_depth=10
)  # criteria: "gini" or "entropy", max_depth=5 or 10
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      0.75      0.79       130
           1       0.75      0.83      0.79       120

    accuracy                           0.79       250
   macro avg       0.79      0.79      0.79       250
weighted avg       0.79      0.79      0.79       250



In [3]:
from sklearn.model_selection import cross_val_score

cross_val_score(DecisionTreeClassifier(criterion="gini", max_depth=5), X, y, cv=5)

array([0.775, 0.81 , 0.75 , 0.805, 0.775])

In [4]:
cross_val_score(DecisionTreeClassifier(criterion="entropy", max_depth=5), X, y, cv=5)

array([0.765, 0.785, 0.75 , 0.815, 0.79 ])

In [5]:
criterion = ["gini", "entropy"]
max_depth = [5, 10, 15]

avg_scores = {}

for c in criterion:
    for d in max_depth:
        clf = DecisionTreeClassifier(criterion=c, max_depth=d)
        score_list = cross_val_score(clf, X, y, cv=5)
        avg_scores[c + "_" + str(d)] = np.average(score_list)

avg_scores

{'gini_5': np.float64(0.777),
 'gini_10': np.float64(0.785),
 'gini_15': np.float64(0.7949999999999999),
 'entropy_5': np.float64(0.7779999999999999),
 'entropy_10': np.float64(0.786),
 'entropy_15': np.float64(0.8150000000000001)}

In [6]:
from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(
    DecisionTreeClassifier(),
    {"criterion": ["gini", "entropy"], "max_depth": [5, 10, 15]},
    cv=5,
    return_train_score=False,
)
clf.fit(X, y)
clf.cv_results_

  _data = np.array(data, dtype=dtype, copy=copy,


{'mean_fit_time': array([0.00679626, 0.0079073 , 0.00740004, 0.00731821, 0.01119933,
        0.01112657]),
 'std_fit_time': array([0.00039897, 0.00110468, 0.00048843, 0.00039802, 0.00098062,
        0.00113691]),
 'mean_score_time': array([0.00120153, 0.00120811, 0.00060072, 0.00040007, 0.00100069,
        0.00080104]),
 'std_score_time': array([3.98483504e-04, 4.14468641e-04, 4.90490022e-04, 4.89979265e-04,
        1.21943587e-06, 4.00519893e-04]),
 'param_criterion': masked_array(data=['gini', 'gini', 'gini', 'entropy', 'entropy',
                    'entropy'],
              mask=[False, False, False, False, False, False],
        fill_value=np.str_('?'),
             dtype=object),
 'param_max_depth': masked_array(data=[5, 10, 15, 5, 10, 15],
              mask=[False, False, False, False, False, False],
        fill_value=999999),
 'params': [{'criterion': 'gini', 'max_depth': 5},
  {'criterion': 'gini', 'max_depth': 10},
  {'criterion': 'gini', 'max_depth': 15},
  {'criterion': '

In [7]:
df = pd.DataFrame(clf.cv_results_)
df.head(3)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006796,0.000399,0.001202,0.000398,gini,5,"{'criterion': 'gini', 'max_depth': 5}",0.78,0.79,0.745,0.79,0.77,0.775,0.016733,5
1,0.007907,0.001105,0.001208,0.000414,gini,10,"{'criterion': 'gini', 'max_depth': 10}",0.785,0.71,0.795,0.765,0.81,0.773,0.034728,6
2,0.0074,0.000488,0.000601,0.00049,gini,15,"{'criterion': 'gini', 'max_depth': 15}",0.81,0.73,0.815,0.8,0.81,0.793,0.031875,3


In [8]:
df[["param_criterion", "param_max_depth", "mean_test_score"]]

Unnamed: 0,param_criterion,param_max_depth,mean_test_score
0,gini,5,0.775
1,gini,10,0.773
2,gini,15,0.793
3,entropy,5,0.776
4,entropy,10,0.797
5,entropy,15,0.81


In [9]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 15}

In [10]:
model = clf.best_estimator_
model

In [11]:
from sklearn import svm

model_params = {
    "decision_tree": {
        "model": DecisionTreeClassifier(),
        "params": {"criterion": ["gini", "entropy"], "max_depth": [5, 10, 15]},
    },
    "svm": {
        "model": svm.SVC(gamma="auto"),
        "params": {"C": [1, 10, 20], "kernel": ["rbf", "linear"]},
    },
}

scores = []

for key, val in model_params.items():
    clf = GridSearchCV(val["model"], val["params"], cv=5, return_train_score=False)
    clf.fit(X, y)
    scores.append(
        {"model": key, "best_score": clf.best_score_, "best_params": clf.best_params_}
    )

scores

  _data = np.array(data, dtype=dtype, copy=copy,


[{'model': 'decision_tree',
  'best_score': np.float64(0.8059999999999998),
  'best_params': {'criterion': 'entropy', 'max_depth': 15}},
 {'model': 'svm',
  'best_score': np.float64(0.9260000000000002),
  'best_params': {'C': 1, 'kernel': 'rbf'}}]

In [12]:
df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
df

Unnamed: 0,model,best_score,best_params
0,decision_tree,0.806,"{'criterion': 'entropy', 'max_depth': 15}"
1,svm,0.926,"{'C': 1, 'kernel': 'rbf'}"
