In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import mglearn.plots

In [21]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,...,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,...,33.6,0.63,50,1
1,1,85,66,29,...,26.6,0.35,31,0
2,8,183,64,0,...,23.3,0.67,32,1
3,1,89,66,23,...,28.1,0.17,21,0
4,0,137,40,35,...,43.1,2.29,33,1


In [22]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [23]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,...,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,...,768.0,768.0,768.0,768.0
mean,3.85,120.89,69.11,20.54,...,31.99,0.47,33.24,0.35
std,3.37,31.97,19.36,15.95,...,7.88,0.33,11.76,0.48
min,0.0,0.0,0.0,0.0,...,0.0,0.08,21.0,0.0
25%,1.0,99.0,62.0,0.0,...,27.3,0.24,24.0,0.0
50%,3.0,117.0,72.0,23.0,...,32.0,0.37,29.0,0.0
75%,6.0,140.25,80.0,32.0,...,36.6,0.63,41.0,1.0
max,17.0,199.0,122.0,99.0,...,67.1,2.42,81.0,1.0


# Split the data

In [24]:
from sklearn.model_selection import train_test_split

X_trainval, X_test, y_trainval, y_test = train_test_split(data.drop('Outcome', axis=1), data['Outcome'], random_state=0)

X_train, X_valid, y_train, y_valid = train_test_split(X_trainval, y_trainval, random_state=0)

print("Size of training set: {}   size of validation set: {}   size of test set:"
      " {}\n".format(X_train.shape[0], X_valid.shape[0], X_test.shape[0]))

Size of training set: 432   size of validation set: 144   size of test set: 192



In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': range(1, 10),
              'min_samples_split': range(2, 10),
              'min_samples_leaf': range(1, 10)}

grid_search = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid, cv=5, return_train_score=True, scoring='recall', verbose=1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


# Rebuild the model with the best parameters

In [29]:
tree = DecisionTreeClassifier(max_depth=grid_search.best_params_['max_depth'],
                              min_samples_split=grid_search.best_params_['min_samples_split'],
                              min_samples_leaf=grid_search.best_params_['min_samples_leaf'],
                              random_state=0)
tree.fit(X_trainval, y_trainval)

# Evaluate the model

In [34]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, tree.predict(X_test), target_names=['No Diabetes', 'Diabetes']))
print(confusion_matrix(y_test, tree.predict(X_test)))

              precision    recall  f1-score   support

 No Diabetes       0.81      0.82      0.82       130
    Diabetes       0.62      0.60      0.61        62

    accuracy                           0.75       192
   macro avg       0.71      0.71      0.71       192
weighted avg       0.75      0.75      0.75       192

[[107  23]
 [ 25  37]]
