# Importing essential Libraries

In [14]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
import matplotlib.pyplot as plt

# Loading DATA

In [66]:
columns=['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age', 'outcome']
df=pd.read_csv("C:\\Users\\pujas\\Downloads\\prima-indians-diabetes.csv",names=columns)

In [67]:
df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [68]:
df.shape

(768, 9)

In [69]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   preg     768 non-null    int64  
 1   glu      768 non-null    int64  
 2   bp       768 non-null    int64  
 3   sft      768 non-null    int64  
 4   ins      768 non-null    int64  
 5   bmi      768 non-null    float64
 6   dpf      768 non-null    float64
 7   age      768 non-null    int64  
 8   outcome  768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [70]:
df['outcome'].value_counts() # 0 referes to healthy and 1 Diabetic

0    500
1    268
Name: outcome, dtype: int64

# Let's Separate X and Y variables

In [71]:
X=df.drop('outcome',axis=1)
Y=df['outcome']

# Splitting the data into training and testing

In [72]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=.3,random_state=0)

In [73]:
DT=DecisionTreeClassifier(random_state=0) #Fully grown DecisionTree, craeting an instance/object of DT

# Let's Train/Fit the model

In [74]:
DT.fit(xtrain,ytrain) #Model is learning using the training data

DecisionTreeClassifier(random_state=0)

In [75]:
ypred=DT.predict(xtest) # To see after learing from the training data, how model is going to predict for our test records

# Let's check the performance of our model using multiple metrics

In [76]:
accuracy=metrics.accuracy_score(ytest,ypred)
print(accuracy) #Our overall accuracy is 68%, though for a binary classifier we prefer to see ROC_AUC score as overall accuracy may mislead

0.7229437229437229


In [77]:
cm=metrics.confusion_matrix(ytest,ypred)
print(cm)

[[123  34]
 [ 30  44]]


# Let's Manually Find out the coorniates for ROC curve from cunfusion matrix

In [81]:
alpha=34/(123+34)
print(alpha)  # alpha: X-axis of ROC curve

0.21656050955414013


In [84]:
one_min_Beta=44/(44+30)
print(one_min_Beta) #1-beta: Y-axis of ROC curve

0.5945945945945946


# The above performance was for a fully grown DT model, Now let's do Pruning to tune the Hyperparameters like criterion and max_depth using GridSearchCV

In [91]:
from sklearn.model_selection import GridSearchCV
param={'max_depth':np.arange(1,15),'criterion':['gini','entropy']}
GS=GridSearchCV(DT,param,cv=5,scoring='roc_auc')
GS.fit(X,Y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])},
             scoring='roc_auc')

In [92]:
GS.best_params_

{'criterion': 'gini', 'max_depth': 5}

# Now, let's see the performance of Regularized DT with {'criterion': 'gini', 'max_depth': 5}

In [94]:
DT_REG=DecisionTreeClassifier(max_depth=5,criterion='gini',random_state=0)

In [96]:
DT_REG.fit(xtrain,ytrain)

DecisionTreeClassifier(max_depth=5, random_state=0)

In [98]:
ypred=DT_REG.predict(xtest)

In [100]:
accuracy=metrics.accuracy_score(ytest,ypred)
print(accuracy)

0.7619047619047619


In [101]:
cm=metrics.confusion_matrix(ytest,ypred)
print(cm)

[[132  25]
 [ 30  44]]


In [102]:
#Here, we can see the difference in performance of Fully Grown DT compared to Regularised DT
76.19-72.29

3.8999999999999915

# Now, Let's check cross check the performance of Fully grown DT and Regularized DT through cross validation score with Bias and Variance Error:

In [104]:
from sklearn.model_selection import cross_val_score,KFold

# First check the performance of fully grown DT

In [110]:
kf=KFold(n_splits=5,shuffle=True,random_state=0)#using KFold to decide number of folds to gain the benefit of shuffle and random-state
auc=cross_val_score(DT,X,Y,cv=kf,scoring='roc_auc')#roc_auc cz it's a binary problem 
print('Bias_Error',np.mean(1-auc))
print('Var_Error',np.std(1-auc,ddof=1))#ddof=1 because we are using samples of the population

Bias_Error 0.3241909395277806
Var_Error 0.05510734487374048


# Let's check the performance of Regularized DT

In [111]:
kf=KFold(n_splits=5,shuffle=True,random_state=0)
auc=cross_val_score(DT_REG,X,Y,cv=kf,scoring='roc_auc')
print('Bias_Error',np.mean(1-auc))
print('var_Error',np.std(1-auc,ddof=1))

Bias_Error 0.23015694825504326
var_Error 0.031321351672405844


# Conclusion:

1.we can see that the regularized DT model has reduced Bias Error and Variance Error

2.Reduction in the Variance Error is because our fully grown DT is overfitted model, which we reduced by setting max_depth to 5

3.Though reduction in the Bias_Error is completely at random, as here we don't have any control on Bias Error, it may increase or decrease.

4.Reduction in the Variance Error is also till some extent.

5.All the drawbacks of DecisonTree, we are going to overcome using RandomForest.
