In [59]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')

In [38]:
d1=pd.read_csv('../DATASET/Diabetes.csv')
d1.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [29]:
d1.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [39]:
d=d1.loc[(d1['Glucose']!=0) & (d1['BloodPressure']!=0) & 
           (d1['SkinThickness']!=0) & (d1['Insulin']!=0) & (d1['BMI']!=0)]

In [50]:
columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for each in columns:
    d1[each].replace(0,d[each].mean(),inplace=True)
d1.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,121.692888,72.3258,29.151052,155.79556,32.466469,0.471876,33.240885,0.348958
std,3.369578,30.436043,12.101807,8.790943,85.021487,6.875558,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.145408,156.056122,32.4,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,156.056122,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [54]:
#Extracting Independent and dependent Variable  
x= d1.iloc[:, 0:8].values  
y= d1.iloc[:, 8].values 

In [55]:
#feature Scaling  
from sklearn.preprocessing import StandardScaler    
st_x= StandardScaler()    
x= st_x.fit_transform(x)

In [57]:
models=[]   # model[] is empty
scores=[]   #scores[] is empty

# GrdidSearchCV- LogisticRegression

In [60]:
parameters = {'penalty':['l1', 'l2']}        
c1 = LogisticRegression()
grid = GridSearchCV(c1, parameters, cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

models.append(c1)   # models[c1]
scores.append(grid.best_score_)   #scores[0.7682501708817498]

{'penalty': 'l2'}
LogisticRegression()
0.7682501708817498


# GrdidSearchCV- DecisionTreeClassifier

In [61]:
parameters = {'criterion':['gini', 'entropy'],
              'splitter':['best', 'random']
              }        
c2 = DecisionTreeClassifier()
grid = GridSearchCV(c2, parameters, cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

models.append(c2)  #models[c1,c2]
scores.append(grid.best_score_)  #scores[0.7682501708817498,0.712218045112782]

{'criterion': 'entropy', 'splitter': 'best'}
DecisionTreeClassifier(criterion='entropy')
0.7147983595352015


# GrdidSearchCV- Support Vector Classifier(SVC)

In [14]:
parameters = {'C': [0.01, 0.05],
              'degree': [2, 3],
              'gamma':[0.001, 0.01],
              'kernel': ['rbf']
              }        
c3 = SVC()
grid = GridSearchCV(c3, parameters, cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

models.append(c3)   #models[c1,c2,c3]
scores.append(grid.best_score_)     #scores[0.7682501708817498,0.712218045112782,0.6510594668489406]

{'C': 0.01, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}
SVC(C=0.01, degree=2, gamma=0.001)
0.6510594668489406


# GrdidSearchCV- KNeighborsClassifier(KNN)

In [62]:
parameters = {'n_neighbors': range(30),
              'metric':['manhattan','euclidean']}        
c4 = KNeighborsClassifier()  
grid = GridSearchCV(c4, parameters, cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)

models.append(c4)     #models[c1,c2,c3,c4]
scores.append(grid.best_score_)     #scores[0.7682501708817498,0.712218045112782,0.6510594668489406,0.7773752563226247]

{'metric': 'euclidean', 'n_neighbors': 27}
KNeighborsClassifier(metric='euclidean', n_neighbors=27)
0.7773752563226247


# GrdidSearchCV- RandomForestClassifier

In [63]:
parameters = {'n_estimators':range(20),
              'criterion':['gini','entropy']}        
c5 = RandomForestClassifier()  
grid = GridSearchCV(c5, parameters, cv=10)
grid.fit(x,y)
print(grid.best_params_)
print(grid.best_estimator_)
print(grid.best_score_)


models.append(c5)      #models[c1,c2,c3,c4,c5]
scores.append(grid.best_score_)     #scores[0.7682501708817498,0.712218045112782,0.6510594668489406,0.7773752563226247,0.7669514695830485]

{'criterion': 'gini', 'n_estimators': 15}
RandomForestClassifier(n_estimators=15)
0.7695317840054683


In [17]:
result_data=pd.DataFrame({"Models":models,"Score":scores})

In [18]:
print(result_data)

                     Models     Score
0      LogisticRegression()  0.768250
1  DecisionTreeClassifier()  0.710988
2                     SVC()  0.651059
3    KNeighborsClassifier()  0.777375
4  RandomForestClassifier()  0.769481
