In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Column labels passed to read_csv via `names=`: the eight predictor
# columns used later as X, followed by the `outcome` target column.
colnames = [
    'preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age',
    'outcome',
]
prima_df = pd.read_csv(
    "pima-indians-diabetes-1.data",
    names=colnames,
)
# Preview the first rows of the loaded frame.
prima_df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
prima_df.shape

(768, 9)

In [4]:
# Class balance check: how many rows carry each outcome value.
prima_df.outcome.value_counts()

0    500
1    268
Name: outcome, dtype: int64

In [9]:
# Swap the numeric outcome codes for readable class labels, then display
# the frame so the relabelled column can be inspected.
prima_df['outcome'] = prima_df.outcome.replace(
    {
        0: 'Healthy',
        1: 'Diabetic',
    }
)
prima_df


Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,Diabetic
1,1,85,66,29,0,26.6,0.351,31,Healthy
2,8,183,64,0,0,23.3,0.672,32,Diabetic
3,1,89,66,23,94,28.1,0.167,21,Healthy
4,0,137,40,35,168,43.1,2.288,33,Diabetic
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,Healthy
764,2,122,70,27,0,36.8,0.340,27,Healthy
765,5,121,72,23,112,26.2,0.245,30,Healthy
766,1,126,60,0,0,30.1,0.349,47,Diabetic


In [85]:
# Store the outcome labels with pandas' categorical dtype and show the result.
prima_df['outcome'] = prima_df['outcome'].astype('category')
prima_df

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,Diabetic
1,1,85,66,29,0,26.6,0.351,31,Healthy
2,8,183,64,0,0,23.3,0.672,32,Diabetic
3,1,89,66,23,94,28.1,0.167,21,Healthy
4,0,137,40,35,168,43.1,2.288,33,Diabetic
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,Healthy
764,2,122,70,27,0,36.8,0.340,27,Healthy
765,5,121,72,23,112,26.2,0.245,30,Healthy
766,1,126,60,0,0,30.1,0.349,47,Diabetic


In [86]:
# Feature matrix: the eight measurement columns. Target: the outcome label.
X = prima_df.loc[:, ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age']]
Y = prima_df.outcome

In [87]:
# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [88]:
# Plain NumPy copies of the training features and labels.
Xtrain, Ytrain = np.array(xtrain), np.array(ytrain)

In [89]:
# Plain NumPy copies of the held-out features and labels.
Xtest, Ytest = np.array(xtest), np.array(ytest)

In [281]:
# Entropy-criterion decision tree, constrained by maximum depth, maximum
# number of leaves and minimum samples per leaf.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; when several splits are equally good the chosen one (and so
# the fitted tree and every metric below) can otherwise change between runs.
# The seed matches the one already used for train_test_split.
model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_leaf_nodes=20,
    min_samples_leaf=5,
    random_state=0,
)
# Fit on the NumPy training arrays; the fitted estimator is the cell output.
model.fit(Xtrain, Ytrain)

DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=20,
                       min_samples_leaf=5)

In [282]:
# Predicted class labels for the held-out rows.
ypred = model.predict(X=Xtest)

In [283]:
# Fraction of held-out rows whose predicted label matches the true label.
acc = metrics.accuracy_score(y_true=Ytest, y_pred=ypred)
print(acc)

0.7835497835497836


In [284]:
# Confusion matrix of true labels (first argument) against predictions.
cm = metrics.confusion_matrix(y_true=Ytest, y_pred=ypred)
print(cm)

[[ 50  24]
 [ 26 131]]


In [285]:
# Per-class precision / recall / f1 summary for the first model.
cr = metrics.classification_report(y_true=Ytest, y_pred=ypred)
print(cr)

              precision    recall  f1-score   support

    Diabetic       0.66      0.68      0.67        74
     Healthy       0.85      0.83      0.84       157

    accuracy                           0.78       231
   macro avg       0.75      0.76      0.75       231
weighted avg       0.79      0.78      0.78       231



In [298]:
# Tabulate the fitted tree's feature importances against the column names
# of the training frame.
print(
    pd.DataFrame(
        {"Imp": model.feature_importances_},
        index=xtrain.columns,
    )
)

           Imp
preg  0.048644
glu   0.448606
bp    0.027446
sft   0.034408
ins   0.000000
bmi   0.178720
dpf   0.081560
age   0.180617


In [308]:
# Second tree: same criterion, depth and leaf-count limits as `model`, but
# without the min_samples_leaf constraint.
# random_state is pinned because scikit-learn permutes the candidate features
# at every split; without a fixed seed, ties between equally good splits can
# yield a different tree (and different metrics) on each run.
model2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_leaf_nodes=20,
    random_state=0,
)
# Fit on the NumPy training arrays; the fitted estimator is the cell output.
model2.fit(Xtrain, Ytrain)

DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=20)

In [309]:
# Predicted class labels from the second model for the held-out rows.
ypred2 = model2.predict(X=Xtest)

In [310]:
# Held-out accuracy of the second model.
acc2 = metrics.accuracy_score(y_true=Ytest, y_pred=ypred2)
print(acc2)

0.7705627705627706


In [311]:
# Confusion matrix of true labels against the second model's predictions.
cm2 = metrics.confusion_matrix(y_true=Ytest, y_pred=ypred2)
print(cm2)

[[ 49  25]
 [ 28 129]]


In [312]:
# Per-class precision / recall / f1 summary for the second model.
cr2 = metrics.classification_report(y_true=Ytest, y_pred=ypred2)
print(cr2)

              precision    recall  f1-score   support

    Diabetic       0.64      0.66      0.65        74
     Healthy       0.84      0.82      0.83       157

    accuracy                           0.77       231
   macro avg       0.74      0.74      0.74       231
weighted avg       0.77      0.77      0.77       231



In [106]:
from IPython.display import Image  
from sklearn import tree
from os import system

# Export the first fitted tree to a Graphviz DOT file.
# The context manager closes the file even if the export raises.
with open('diabetes_tree.dot', 'w') as Diabetic_Tree_File:
    # class_names is indexed by class position, so it must list each class
    # exactly once in the classifier's own order (model.classes_). Passing the
    # raw training labels would label nodes with whatever the first two
    # training rows happened to be.
    # export_graphviz returns None when out_file is given; the dot_data name
    # is kept only so the notebook namespace is unchanged.
    dot_data = tree.export_graphviz(
        model,
        out_file=Diabetic_Tree_File,
        feature_names=list(xtrain.columns),
        class_names=[str(label) for label in model.classes_],
    )

# http://graphviz.it/#/gallery/longflat.gv
# Importance of features in the tree building: the importance of a feature is computed as the
# (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance.
