# Import Libraries

In [76]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

## Load dataset

In [26]:
df=pd.read_csv("diabetes.csv")

In [27]:
df.head() # check first rows of data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Shape of data

In [28]:
df.shape

(768, 9)

**Observation**
- There ar 768 rows and 9 columns in dataset

In [29]:
df.describe().T # overview of data

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


## Checking the Nans Value 

In [30]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

**Observation**
- dataset is clean there is no null value

In [31]:
x=df[["Age","BMI","BloodPressure","DiabetesPedigreeFunction","Glucose","Insulin","Pregnancies","SkinThickness"]]
y=df.Outcome

### Split the data into training and testing

In [61]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [62]:
lr=LogisticRegressionCV()
lr.fit(X_train,y_train)

LogisticRegressionCV()

In [63]:
y_pred=lr.predict(X_test)

In [64]:
print("Accuracy is:",accuracy_score(y_test,y_pred))
print("precision_score is:",precision_score(y_test,y_pred))
print("recall_score is:",recall_score(y_test,y_pred))
print("f1_score is:",f1_score(y_test,y_pred))

Accuracy is: 0.7662337662337663
precision_score is: 0.7878787878787878
recall_score is: 0.4727272727272727
f1_score is: 0.5909090909090909


In [70]:
dt=DecisionTreeClassifier()
dt.fit(X_train,y_train)
y_pred=dt.predict(X_test)
print("Accuracy is:",accuracy_score(y_test,y_pred))
print("precision_score is:",precision_score(y_test,y_pred))
print("recall_score is:",recall_score(y_test,y_pred))
print("f1_score is:",f1_score(y_test,y_pred))

Accuracy is: 0.6753246753246753
precision_score is: 0.5531914893617021
recall_score is: 0.4727272727272727
f1_score is: 0.5098039215686275


In [74]:
svc=SVC()
svc.fit(X_train,y_train)
y_pred=svc.predict(X_test)
print("Accuracy is:",accuracy_score(y_test,y_pred))
print("precision_score is:",precision_score(y_test,y_pred))
print("recall_score is:",recall_score(y_test,y_pred))
#print("f1_score is:",f1_score(y_test,y_pred)

Accuracy is: 0.7467532467532467
precision_score is: 0.8076923076923077
recall_score is: 0.38181818181818183


In [75]:
rf=RandomForestClassifier()
rf.fit(X_train,y_train)
y_pred=rf.predict(X_test)
print("Accuracy is:",accuracy_score(y_test,y_pred))
print("precision_score is:",precision_score(y_test,y_pred))
print("recall_score is:",recall_score(y_test,y_pred))
print("f1_score is:",f1_score(y_test,y_pred))

Accuracy is: 0.7662337662337663
precision_score is: 0.7878787878787878
recall_score is: 0.4727272727272727
f1_score is: 0.5909090909090909


### Cross Validation

In [None]:
lr=LogisticRegressionCV()
model=cross_val_score(lr,x,y,cv=10)
print("LogisticRegressionCV Accuracy:",np.mean(model)*100)

In [96]:
dt=DecisionTreeClassifier()
model=cross_val_score(dt,x,y,cv=10)
print("DecisionTreeClassifier Accuracy:",np.mean(model)*100)

DecisionTreeClassifier Accuracy: 70.69548872180451


In [97]:
svm=SVC()
model=cross_val_score(svm,x,y,cv=10)
print("SVC Accuracy:",np.mean(model)*100)

LogisticRegressionCV Accuracy: 75.78605604921395


In [98]:
rf=RandomForestClassifier()
model=cross_val_score(rf,x,y,cv=10)
print("RandomForestClassifier Accuracy:",np.mean(model)*100)

RandomForestClassifier Accuracy: 76.42686261107313
