In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [2]:
df=pd.read_csv("Diabetes data.csv")

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35,0,33.6,0.627,50.0,1
1,1,85.0,66.0,29,0,26.6,0.351,31.0,0
2,8,183.0,64.0,0,0,23.3,0.672,32.0,1
3,1,89.0,66.0,23,94,28.1,0.167,21.0,0
4,0,137.0,40.0,35,168,43.1,2.288,33.0,1


In [4]:
## Finding Missing Values
df.isnull().sum()

Pregnancies                 0
Glucose                     2
BloodPressure               1
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    1
Age                         1
Outcome                     0
dtype: int64

### Missing entries of Healthcare data should be replaced with median

In [5]:
df['Glucose']=df.fillna(df['Glucose'].median())

In [6]:
df['BloodPressure']=df.fillna(df['BloodPressure'].median())

In [7]:
df['DiabetesPedigreeFunction']=df.fillna(df['DiabetesPedigreeFunction'].median())

In [8]:
df['Age']=df.fillna(df['Age'].median())

In [9]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [10]:
y=df['Outcome']
del df['Outcome']
x=df

In [11]:
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.80,random_state=123)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(614, 8)
(154, 8)
(614L,)
(154L,)


In [12]:
clf1=DecisionTreeClassifier()
dt_model=clf1.fit(x_train,y_train)
print(dt_model)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')


In [13]:
dt_prd=dt_model.predict(x_test)
print(accuracy_score(y_test,dt_prd))
print(confusion_matrix(y_test,dt_prd))
print(classification_report(y_test,dt_prd))


0.616883116883
[[70 26]
 [33 25]]
             precision    recall  f1-score   support

          0       0.68      0.73      0.70        96
          1       0.49      0.43      0.46        58

avg / total       0.61      0.62      0.61       154



In [14]:
clf2=RandomForestClassifier()
rf_model=clf2.fit(x_train,y_train)
print(rf_model)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [15]:
rf_prd=rf_model.predict(x_test)
print(accuracy_score(y_test,rf_prd))
print(confusion_matrix(y_test,rf_prd))
print(classification_report(y_test,rf_prd))

0.701298701299
[[83 13]
 [33 25]]
             precision    recall  f1-score   support

          0       0.72      0.86      0.78        96
          1       0.66      0.43      0.52        58

avg / total       0.69      0.70      0.68       154



In [16]:
## Standardization of data before applying the algos further
from sklearn.preprocessing import  StandardScaler
ss=StandardScaler()
x_train1=ss.fit_transform(x_train)

In [17]:
x_test1=ss.fit_transform(x_test)

In [18]:
dt_model1=clf1.fit(x_train1,y_train)
dt_model1

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [19]:
dt_prd1=dt_model1.predict(x_test1)
print(accuracy_score(y_test,dt_prd1))
print(confusion_matrix(y_test,dt_prd1))
print(classification_report(y_test,dt_prd1))

0.642857142857
[[74 22]
 [33 25]]
             precision    recall  f1-score   support

          0       0.69      0.77      0.73        96
          1       0.53      0.43      0.48        58

avg / total       0.63      0.64      0.63       154



In [23]:
rf_model1=clf2.fit(x_train1,y_train)
print(rf_model1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


In [24]:
rf_prd1=rf_model1.predict(x_test1)
print(accuracy_score(y_test,rf_prd1))
print(confusion_matrix(y_test,rf_prd1))
print(classification_report(y_test,rf_prd1))

0.681818181818
[[81 15]
 [34 24]]
             precision    recall  f1-score   support

          0       0.70      0.84      0.77        96
          1       0.62      0.41      0.49        58

avg / total       0.67      0.68      0.66       154



In [25]:
rf_params={"criterion":['gini','entropy'],"n_estimators":[10,20,30,40,50,70,100,120,150],"max_depth":[4,6,7,8,9],"max_leaf_nodes":[10,15,20,25,30,40]}

In [26]:
rf_grid=GridSearchCV(clf2,rf_params,cv=5)
rf_grid_model=rf_grid.fit(x_train1,y_train)

In [49]:
rf_grid.best_params_

{'criterion': 'entropy',
 'max_depth': 9,
 'max_leaf_nodes': 15,
 'n_estimators': 30}

In [52]:
## GridSearchCv on non normalized data to build model with Random Forest
rf_grid_prd=rf_grid_model.predict(x_test1)
print(accuracy_score(rf_grid_prd,y_test))
print(confusion_matrix(rf_grid_prd,y_test))
print(classification_report(y_test,rf_grid_prd))

0.6948051948051948
[[89 40]
 [ 7 18]]
             precision    recall  f1-score   support

          0       0.69      0.93      0.79        96
          1       0.72      0.31      0.43        58

avg / total       0.70      0.69      0.66       154



In [54]:
rf_grid_model1=rf_grid.fit(x_train,y_train)

In [55]:
rf_grid_prd1=rf_grid_model1.predict(x_test)
print(accuracy_score(y_test,rf_grid_prd1))
print(confusion_matrix(y_test,rf_grid_prd1))
print(classification_report(y_test,rf_grid_prd1))

0.6883116883116883
[[92  4]
 [44 14]]
             precision    recall  f1-score   support

          0       0.68      0.96      0.79        96
          1       0.78      0.24      0.37        58

avg / total       0.71      0.69      0.63       154



### Please note that we can apply GridSearchCv on DecisionTree algorithm in the same way