# Diabetes Prediction Model

## Importing require library

In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

## Importing data

In [2]:
diabetes = pd.read_csv('diabetes.csv')

diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
diabetes.shape  # rows and columns in dataset 

(768, 9)

In [4]:
diabetes.describe()  # Statistical insight of data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
diabetes['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

## Looking for missing values and cleaning data
- Looking for missing values
- Dealing with outliers

In [6]:
diabetes.isna().sum()  # looking for null values

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
# column vise looking missing data or outliers

# diabetes['Pregnancies'].value_counts()
# diabetes['Glucose'].value_counts()         # contain 0 -
# diabetes['BloodPressure'].value_counts()   # contain 0 -
# diabetes['SkinThickness'].value_counts()   # contain 0 -
# diabetes['Insulin'].value_counts()         # contain 0 -
# diabetes['BMI'].value_counts()             # contain 0 -
# diabetes['DiabetesPedigreeFunction'].value_counts()
# diabetes['Age'].value_counts()

- Here 0 is an outlier
- Looking for value 0 column vise

In [8]:
print(diabetes[diabetes.Glucose == 0].shape[0])
diabetes[diabetes.Glucose == 0].groupby('Outcome').count()

5


Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3,3,3,3,3,3,3,3
1,2,2,2,2,2,2,2,2


In [9]:
print(diabetes[diabetes.BloodPressure == 0].shape[0])
diabetes[diabetes.BloodPressure == 0].groupby('Outcome').count()

35


Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,19,19,19,19,19,19,19,19
1,16,16,16,16,16,16,16,16


In [10]:
print(diabetes[diabetes.SkinThickness == 0].shape[0])
diabetes[diabetes.SkinThickness == 0].groupby('Outcome').count()

227


Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,139,139,139,139,139,139,139,139
1,88,88,88,88,88,88,88,88


In [11]:
print(diabetes[diabetes.Insulin == 0].shape[0])
diabetes[diabetes.Insulin == 0].groupby('Outcome').count()

374


Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,236,236,236,236,236,236,236,236
1,138,138,138,138,138,138,138,138


In [12]:
print(diabetes[diabetes.BMI == 0].shape[0])
diabetes[diabetes.BMI == 0].groupby('Outcome').count()

11


Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,9,9,9,9,9,9,9,9
1,2,2,2,2,2,2,2,2


### Dealing with outlier (by cleaning entire row)

In [13]:
diabetes_mod = diabetes[(diabetes.BloodPressure != 0) & (diabetes.BMI != 0) & (diabetes.Glucose != 0)]
diabetes_mod.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [14]:
diabetes_mod.shape

(724, 9)

In [15]:
diabetes_mod.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [16]:
diabetes_mod['SkinThickness'].mean()

21.443370165745858

### Dealing with outlier (by replacing 0 value to mean of column)

In [17]:
diabetes_mod['SkinThickness'].replace(to_replace=0, value=diabetes_mod['SkinThickness'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_mod['SkinThickness'].replace(to_replace=0, value=diabetes_mod['SkinThickness'].mean(), inplace=True)


In [18]:
diabetes_mod.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,3.866022,121.882597,72.400552,27.13001,84.494475,32.467127,0.474765,33.350829,0.343923
std,3.362803,30.75003,12.37987,9.645083,117.016513,6.888941,0.332315,11.765393,0.475344
min,0.0,44.0,24.0,7.0,0.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,21.44337,0.0,27.5,0.245,24.0,0.0
50%,3.0,117.0,72.0,24.0,48.0,32.4,0.379,29.0,0.0
75%,6.0,142.0,80.0,33.0,130.5,36.6,0.6275,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [19]:
diabetes_mod['SkinThickness'].value_counts()

21.44337    192
32.00000     30
30.00000     26
27.00000     23
28.00000     20
18.00000     20
33.00000     20
23.00000     19
31.00000     19
39.00000     18
19.00000     18
29.00000     17
37.00000     16
26.00000     16
25.00000     16
40.00000     16
22.00000     16
35.00000     14
36.00000     14
15.00000     14
41.00000     14
17.00000     14
24.00000     12
42.00000     11
13.00000     11
20.00000     11
21.00000     10
46.00000      8
34.00000      8
12.00000      7
38.00000      7
11.00000      6
43.00000      6
16.00000      6
45.00000      6
14.00000      6
44.00000      5
10.00000      5
48.00000      4
47.00000      4
49.00000      3
50.00000      3
8.00000       2
7.00000       2
52.00000      2
54.00000      2
63.00000      1
60.00000      1
56.00000      1
51.00000      1
99.00000      1
Name: SkinThickness, dtype: int64

In [20]:
diabetes_mod['Insulin'].mean()

84.49447513812154

In [21]:
diabetes_mod['Insulin'].replace(to_replace=0, value=diabetes_mod['Insulin'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  diabetes_mod['Insulin'].replace(to_replace=0, value=diabetes_mod['Insulin'].mean(), inplace=True)


In [22]:
diabetes_mod.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,3.866022,121.882597,72.400552,27.13001,123.24056,32.467127,0.474765,33.350829,0.343923
std,3.362803,30.75003,12.37987,9.645083,94.398997,6.888941,0.332315,11.765393,0.475344
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.75,64.0,21.44337,84.494475,27.5,0.245,24.0,0.0
50%,3.0,117.0,72.0,24.0,84.494475,32.4,0.379,29.0,0.0
75%,6.0,142.0,80.0,33.0,130.5,36.6,0.6275,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [23]:
diabetes_mod['Insulin'].value_counts()

84.494475     332
105.000000     11
130.000000      9
140.000000      9
120.000000      8
             ... 
73.000000       1
171.000000      1
255.000000      1
52.000000       1
112.000000      1
Name: Insulin, Length: 185, dtype: int64

In [24]:
diabetes_mod.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35.0,84.494475,33.6,0.627,50,1
1,1,85,66,29.0,84.494475,26.6,0.351,31,0
2,8,183,64,21.44337,84.494475,23.3,0.672,32,1
3,1,89,66,23.0,94.0,28.1,0.167,21,0
4,0,137,40,35.0,168.0,43.1,2.288,33,1


In [25]:
diabetes_mod.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.32,111.016842,70.907368,25.813062,110.065944,30.974947,0.430297,31.258947
1,4.907631,142.610442,75.248996,29.64226,148.372862,35.313655,0.559594,37.341365


### Slicing of data

In [26]:
x = diabetes_mod.drop('Outcome', axis=1)  # x = features
y = diabetes_mod['Outcome']               # y = target

In [27]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35.0,84.494475,33.6,0.627,50
1,1,85,66,29.0,84.494475,26.6,0.351,31
2,8,183,64,21.44337,84.494475,23.3,0.672,32
3,1,89,66,23.0,94.0,28.1,0.167,21
4,0,137,40,35.0,168.0,43.1,2.288,33


In [28]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

## Standardization of data

In [29]:
scaler = StandardScaler()
scaler.fit(x)
x_trans = scaler.transform(x)

x_trans

array([[ 0.63502182,  0.84993279, -0.03237751, ...,  0.16456174,
         0.45842062,  1.41607523],
       [-0.85286102, -1.20026207, -0.51737031, ..., -0.85226209,
        -0.37269083, -0.19994691],
       [ 1.23017496,  1.98892994, -0.67903458, ..., -1.33162189,
         0.59392792, -0.11489311],
       ...,
       [ 0.33744525, -0.02872215, -0.03237751, ..., -0.9103663 ,
        -0.69188581, -0.28500071],
       [-0.85286102,  0.13399173, -1.00236311, ..., -0.34385017,
        -0.37871338,  1.16091384],
       [-0.85286102, -0.93991986, -0.19404178, ..., -0.30027201,
        -0.48109667, -0.88037728]])

## Test train splite

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x_trans, y, test_size=0.2, stratify=y, random_state=1)

## Support Vector Model

In [31]:
# Model prediction on train data

clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(x_train, y_train)
x_train_pred = clf_svm.predict(x_train)
svm_acc = accuracy_score(x_train_pred, y_train)

print('Accuracy score of training data: {}'.format(svm_acc))

Accuracy score of training data: 0.7875647668393783


In [32]:
# Model Prediction on test data

x_test_pred = clf_svm.predict(x_test)
svm_acc = accuracy_score(x_test_pred, y_test)

print('Accuracy score of testing data: {}'.format(svm_acc))

Accuracy score of testing data: 0.7103448275862069


## K Nearest Neighborhood Model

In [33]:
# Model prediction on train data

clf_knn = KNeighborsClassifier()
clf_knn.fit(x_train, y_train)
x_train_pred_knn = clf_knn.predict(x_train)
knn_acc = accuracy_score(x_train_pred_knn, y_train)

print('Accuracy score of training data: {}'.format(knn_acc))

Accuracy score of training data: 0.8134715025906736


In [34]:
# Model prediction on test data

x_test_pred_knn = clf_knn.predict(x_test)
knn_acc = accuracy_score(x_test_pred_knn, y_test)

print('Accuracy score of testing data: {}'.format(knn_acc))

Accuracy score of testing data: 0.7103448275862069


## Logestic Regression Model

In [35]:
# Model prediction on train data

clf_lr = LogisticRegression()
clf_lr.fit(x_train, y_train)
x_train_pred_lr = clf_lr.predict(x_train)
lr_acc = accuracy_score(x_train_pred_lr, y_train)

print('Accuracy score of training data: {}'.format(lr_acc))

Accuracy score of training data: 0.7823834196891192


In [36]:
# Model prediction on test data

x_test_pred_lr = clf_lr.predict(x_test)
lr_acc = accuracy_score(x_test_pred_lr, y_test)

print('Accuracy score of testing data: {}'.format(lr_acc))

Accuracy score of testing data: 0.7310344827586207


## Decision Tree Model

In [37]:
# Model prediction on train data

clf_dt = DecisionTreeClassifier()
clf_dt.fit(x_train, y_train)
x_train_pred_dt = clf_dt.predict(x_train)
dt_acc = accuracy_score(x_train_pred_dt, y_train)

print('Accuracy score of training data: {}'.format(dt_acc))

Accuracy score of training data: 1.0


In [38]:
# Model prediction on test data

x_test_pred_dt = clf_dt.predict(x_test)
dt_acc = accuracy_score(x_test_pred_dt, y_test)

print('Accuracy score of testing data: {}'.format(dt_acc))

Accuracy score of testing data: 0.6689655172413793


## Random Forest Model

In [39]:
# Model prediction on train data

clf_rf = DecisionTreeClassifier()
clf_rf.fit(x_train, y_train)
x_train_pred_rf = clf_rf.predict(x_train)
rf_acc = accuracy_score(x_train_pred_rf, y_train)

print('Accuracy score of training data: {}'.format(rf_acc))

Accuracy score of training data: 1.0


In [40]:
# Model prediction on test data

x_test_pred_rf = clf_rf.predict(x_test)
rf_acc = accuracy_score(x_test_pred_rf, y_test)

print('Accuracy score of testing data: {}'.format(rf_acc))

Accuracy score of testing data: 0.696551724137931


## Cross Validation
- Kfold cross validation

### Cross Validation on SVM Model

In [41]:
scores = cross_val_score(clf_svm, x_trans, y,cv=10, scoring='accuracy').mean()
print(scores)

0.7693683409436833


### Cross Validation on K Nearest neighbourhood Model

In [42]:
scores = cross_val_score(clf_knn, x_trans, y,cv=10, scoring='accuracy').mean()
print(scores)

0.7430745814307458


### Cross Validation on Logestic Regression Model

In [43]:
scores = cross_val_score(clf_lr, x_trans, y,cv=10, scoring='accuracy').mean()
print(scores)

0.769406392694064


### Cross Validation on Decision Tree Model

In [44]:
scores = cross_val_score(clf_dt, x_trans, y,cv=10, scoring='accuracy').mean()
print(scores)

0.6881088280060883


### Cross Validation on Random Forest Model

In [45]:
scores = cross_val_score(clf_rf, x_trans, y,cv=10, scoring='accuracy').mean()
print(scores)

0.6880898021308981


# Model usage

In [58]:
input_data = (1,189,60,23,846,30.1,0.398,59)
input_data_np = np.asarray(input_data)
input_data_reshape = input_data_np.reshape(1,-1)
std_data = scaler.transform(input_data_reshape)
prediction = clf_lr.predict(std_data)
print(prediction)

if (prediction == [0]):
    print('Person is not diabetic.')
else:
    print('Person is diabetic.')

[1]
Person is diabetic.




# Model file
- using joblib

In [47]:
from joblib import dump, load
dump(clf_lr, 'Diabetes_prediction.joblib')

['Diabetes_prediction.joblib']