In [1]:
# Classification Toy Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Generate dataset
from sklearn.datasets import make_classification

In [3]:
# without coefficient of underline model
X, y = make_classification(n_samples=1000, n_features=5, n_clusters_per_class=1, n_classes=2, random_state=2529)

In [4]:
# First five rows of target variable y and features X
X[0:5]

array([[ 1.54701705,  0.84770596, -0.41725021, -0.62356778, -0.19388577],
       [ 0.80633556,  0.40985594, -0.45641095, -0.3052022 ,  0.50935923],
       [ 0.94390268,  0.70041038,  1.11385452, -0.49394417,  1.42305455],
       [ 1.92091517,  0.95815739, -1.2235022 , -0.71578154,  0.66588981],
       [ 1.45270369,  0.69035375, -1.18119669, -0.52009219, -0.22745417]])

In [5]:
y[0:5]

array([0, 0, 1, 0, 0])

In [6]:
X.shape, y.shape

((1000, 5), (1000,))

In [7]:
# Train test split
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2529)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((700, 5), (300, 5), (700,), (300,))

In [10]:
# Decision tree classification model train
from sklearn.tree import DecisionTreeClassifier

In [11]:
model = DecisionTreeClassifier()

In [12]:
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [14]:
# Model prediction
y_pred = model.predict(X_test)

In [15]:
y_pred.shape

(300,)

In [16]:
y_pred

array([1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1])

In [17]:
# Model evaluation
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [18]:
accuracy_score(y_test, y_pred)

0.9833333333333333

In [19]:
confusion_matrix(y_test, y_pred)

array([[155,   2],
       [  3, 140]])

In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98       157
           1       0.99      0.98      0.98       143

    accuracy                           0.98       300
   macro avg       0.98      0.98      0.98       300
weighted avg       0.98      0.98      0.98       300



In [21]:
# Hyperparameter tunning grid search
from sklearn.model_selection import GridSearchCV
parameters = {'criterion': ['gini', 'entropy'], 'max_depth':[2,3,4,5,6,7,8,9,10,11,12,15,20,30,40,50,70,90,120,150]}
gridsearch = GridSearchCV(DecisionTreeClassifier(), parameters)
gridsearch.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15,
                                       20, 30, 40, 50, 70, 90, 120, 150]})

In [22]:
gridsearch.best_params_

{'criterion': 'gini', 'max_depth': 3}

In [23]:
gridsearch.best_score_

0.9885714285714287

In [24]:
gridsearch.best_estimator_

DecisionTreeClassifier(max_depth=3)

In [25]:
gridsearch.best_index_

1

In [26]:
y_pred_grid = gridsearch.predict(X_test)

In [27]:
confusion_matrix(y_test, y_pred_grid)

array([[155,   2],
       [  1, 142]])

In [28]:
print(classification_report(y_test, y_pred_grid))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       157
           1       0.99      0.99      0.99       143

    accuracy                           0.99       300
   macro avg       0.99      0.99      0.99       300
weighted avg       0.99      0.99      0.99       300

