In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import numpy as np

In [3]:
df = pd.read_csv("diabetes.csv")

### Data Exploration

In [4]:
df.shape

(2000, 9)

In [5]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [6]:
df.isnull().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               2000 non-null   int64  
 1   Glucose                   2000 non-null   int64  
 2   BloodPressure             2000 non-null   int64  
 3   SkinThickness             2000 non-null   int64  
 4   Insulin                   2000 non-null   int64  
 5   BMI                       2000 non-null   float64
 6   DiabetesPedigreeFunction  2000 non-null   float64
 7   Age                       2000 non-null   int64  
 8   Outcome                   2000 non-null   int64  
dtypes: float64(2), int64(7)
memory usage: 140.8 KB


In [8]:
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [9]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,2000.0,3.7035,3.306063,0.0,1.0,3.0,6.0,17.0
Glucose,2000.0,121.1825,32.068636,0.0,99.0,117.0,141.0,199.0
BloodPressure,2000.0,69.1455,19.188315,0.0,63.5,72.0,80.0,122.0
SkinThickness,2000.0,20.935,16.103243,0.0,0.0,23.0,32.0,110.0
Insulin,2000.0,80.254,111.180534,0.0,0.0,40.0,130.0,744.0
BMI,2000.0,32.193,8.149901,0.0,27.375,32.3,36.8,80.6
DiabetesPedigreeFunction,2000.0,0.47093,0.323553,0.078,0.244,0.376,0.624,2.42
Age,2000.0,33.0905,11.786423,21.0,24.0,29.0,40.0,81.0
Outcome,2000.0,0.342,0.474498,0.0,0.0,0.0,1.0,1.0


In [10]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [11]:
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
1995,2,75,64,24,55,29.7,0.37,33,0
1996,8,179,72,42,130,32.7,0.719,36,1
1997,6,85,78,0,0,31.2,0.382,42,0
1998,0,129,110,46,130,67.1,0.319,26,1
1999,2,81,72,15,76,30.1,0.547,25,0


In [14]:
# Replacing the null(0) values from ['Glucose','BloodPressure','SkinThickness','Insulin','BMI'] by NaN
df_copy = df.copy(deep=True)
df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = df_copy[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)
df_copy.isnull().sum()

Pregnancies                   0
Glucose                      13
BloodPressure                90
SkinThickness               573
Insulin                     956
BMI                          28
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [15]:
# Replacing NaN value by mean, median depending upon distribution
df_copy['Glucose'].fillna(df_copy['Glucose'].mean(), inplace=True)
df_copy['BloodPressure'].fillna(df_copy['BloodPressure'].mean(), inplace=True)
df_copy['SkinThickness'].fillna(df_copy['SkinThickness'].median(), inplace=True)
df_copy['Insulin'].fillna(df_copy['Insulin'].median(), inplace=True)
df_copy['BMI'].fillna(df_copy['BMI'].median(), inplace=True)

In [16]:
df_copy.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [17]:
# splitting dataset into dependent and independent 
from sklearn.model_selection import train_test_split

x= df.drop(columns='Outcome')
y = df['Outcome']

In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1600, 8)
(400, 8)
(1600,)
(400,)


In [21]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression( random_state=0)

In [23]:
lr.fit(x_train , y_train)

LogisticRegression(random_state=0)

In [24]:
y_pred_lr=lr.predict(x_test)

In [25]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confu_lr=confusion_matrix(y_test,y_pred_lr)
print("THe confusion matrix is ", print(confu_lr))

print("-"*50)
from sklearn.metrics import accuracy_score
ac_lr=accuracy_score(y_test,y_pred_lr)
ac_lr

[[243  29]
 [ 59  69]]
THe confusion matrix is  None
--------------------------------------------------


0.78

### Naive Bayes

In [26]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
nb.fit(x_train,y_train)

GaussianNB()

In [27]:
y_pred_nb=nb.predict(x_test)

In [28]:
confu_nb=confusion_matrix(y_test,y_pred_nb)
print("The confusion matrix is ", confu_nb)

print("-"*50)
from sklearn.metrics import accuracy_score
ac_nb=accuracy_score(y_test,y_pred_nb)
ac_nb

The confusion matrix is  [[230  42]
 [ 55  73]]
--------------------------------------------------


0.7575

### Support vecotr classification

In [29]:
from sklearn.svm import SVC
svc=SVC(kernel='linear',random_state=0)
svc.fit(x_train,y_train)

SVC(kernel='linear', random_state=0)

In [30]:
y_pred_svc=svc.predict(x_test)

In [31]:
confu_svc=confusion_matrix(y_test,y_pred_svc)
print("The confusion matrix is ", confu_svc)

print("-"*50)
from sklearn.metrics import accuracy_score
ac_svc=accuracy_score(y_test,y_pred_svc)
ac_svc

The confusion matrix is  [[239  33]
 [ 57  71]]
--------------------------------------------------


0.775

### Random Forest Classification

In [32]:
from sklearn.ensemble import RandomForestClassifier
rfc=RandomForestClassifier(criterion='entropy',n_estimators=25,random_state=27)
rfc.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=25, random_state=27)

In [33]:
y_pred_rfc=rfc.predict(x_test)

In [34]:
y_pred_rfc=rfc.predict(x_test)
confu_rfc=confusion_matrix(y_test,y_pred_rfc)
print(" the confusion matrix is ", confu_rfc)

print("-"*50)
from sklearn.metrics import accuracy_score
ac_rfc=accuracy_score(y_test,y_pred_rfc)
ac_rfc


 the confusion matrix is  [[272   0]
 [  7 121]]
--------------------------------------------------


0.9825