#### Problem Statement
* Build a model to diagnose diabetes in a patient
* Logistic Regression will be used since it is a classification problem

* Import Libraries

In [125]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [126]:
# Load the diabetes dataset from CSV and preview its first ten rows
df = pd.read_csv("../DATASET/Diabetes.csv")
df.head(n=10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


* Perform Exploratory Data Analysis

In [127]:
# Shape of dataset as a (rows, columns) tuple
df.shape

(768, 9)

In [128]:
# Show the column labels of the dataframe
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [129]:
# Print a summary of the dataframe: per-column non-null counts and dtypes, plus memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [130]:
# Summary statistics per column, transposed so each feature is one row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [131]:
# Flag every column that contains at least one missing (NaN) value
df.isnull().any()

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [132]:
# Show rows that are exact duplicates of an earlier row.
# The dataframe has exactly these nine columns, so comparing on all columns
# (the default) is the same as listing each of them in `subset`.
duplicate_rows = df.duplicated()
df[duplicate_rows]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [133]:
# Preview rows where at least one of the measurement columns holds a zero
zero_check_cols = ['BloodPressure', 'BMI', 'Glucose', 'SkinThickness', 'Insulin']
has_zero = df[zero_check_cols].eq(0).any(axis=1)
df[has_zero].head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
5,5,116,74,0,0,25.6,0.201,30,0
7,10,115,0,0,0,35.3,0.134,29,0
9,8,125,96,0,0,0.0,0.232,54,1
10,4,110,92,0,0,37.6,0.191,30,0
11,10,168,74,0,0,38.0,0.537,34,1
12,10,139,80,0,0,27.1,1.441,57,0
15,7,100,0,0,0,30.0,0.484,32,1


In [134]:
# Replace zero values in the selected columns with that column's mean.
# Presumably a zero in these measurements stands for a missing reading — confirm
# against the dataset documentation.
#
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: an in-place call on a column selection is
# chained assignment, which raises a FutureWarning in pandas 2.x and stops
# updating `df` once Copy-on-Write is enabled.
#
# NOTE(review): the mean is computed over all rows, including the zero
# placeholders and the rows that later end up in the test split. This matches the
# original behavior; consider a mean over non-zero training rows only.
zero_as_missing_cols = ['BloodPressure', 'BMI', 'Glucose', 'SkinThickness', 'Insulin']
for col in zero_as_missing_cols:
    # The mean is evaluated before the replacement, exactly as in the per-column version.
    df[col] = df[col].replace(0, df[col].mean())

In [135]:
# Confirm that no zero values remain in the imputed measurement columns
imputed_cols = ['BloodPressure', 'BMI', 'Glucose', 'SkinThickness', 'Insulin']
still_zero = df[imputed_cols].eq(0).any(axis=1)
df[still_zero].head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [136]:
# Summary statistics again, to compare with the statistics before the zero replacement
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,121.681605,30.436016,44.0,99.75,117.0,140.25,199.0
BloodPressure,768.0,72.254807,12.115932,24.0,64.0,72.0,80.0,122.0
SkinThickness,768.0,26.606479,9.631241,7.0,20.536458,23.0,32.0,99.0
Insulin,768.0,118.660163,93.080358,14.0,79.799479,79.799479,127.25,846.0
BMI,768.0,32.450805,6.875374,18.2,27.5,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [137]:
# Check the pairwise correlation between columns
df.corr().transpose()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.127964,0.208984,0.013376,-0.018082,0.021546,-0.033523,0.544341,0.221898
Glucose,0.127964,1.0,0.219666,0.160766,0.396597,0.231478,0.137106,0.2666,0.492908
BloodPressure,0.208984,0.219666,1.0,0.134155,0.010926,0.281231,0.000371,0.32674,0.162986
SkinThickness,0.013376,0.160766,0.134155,1.0,0.240361,0.535703,0.154961,0.026423,0.175026
Insulin,-0.018082,0.396597,0.010926,0.240361,1.0,0.189856,0.157806,0.038652,0.179185
BMI,0.021546,0.231478,0.281231,0.535703,0.189856,1.0,0.153508,0.025748,0.312254
DiabetesPedigreeFunction,-0.033523,0.137106,0.000371,0.154961,0.157806,0.153508,1.0,0.033561,0.173844
Age,0.544341,0.2666,0.32674,0.026423,0.038652,0.025748,0.033561,1.0,0.238356
Outcome,0.221898,0.492908,0.162986,0.175026,0.179185,0.312254,0.173844,0.238356,1.0


In [138]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


##### Train model and predict

In [139]:
# separate data into x & y
x= df.iloc[:,:8]
y = df.iloc[:,-1]

In [140]:
# split x & y into training and testing data
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [141]:
# Instantiate model and train
logisticreg = LogisticRegression()
logisticreg.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [142]:
# Predict with trained model based on test data
# Use the fitted model to predict labels for the held-out test features
y_predict = logisticreg.predict(X=x_test)

In [143]:
# compare the value of y_predict and y_test in a dataframe
# Line the predictions up against the true test labels (indexed like y_test)
pred_vs_test = (
    y_test
    .to_frame(name="Actual")
    .assign(Predicted=y_predict)
    [["Predicted", "Actual"]]
)
pred_vs_test.head()

Unnamed: 0,Predicted,Actual
661,1,1
122,0,0
113,0,0
14,1,1
529,0,0


##### Logistic Regression Model Metrics

In [144]:
# Confusion Matrix
# Confusion matrix for the test split (rows: actual class, columns: predicted class)
cf = confusion_matrix(y_true=y_test, y_pred=y_predict)
print("Confusion Matrix \n", cf)

Confusion Matrix 
 [[98  9]
 [17 30]]


In [145]:
# Classification Report
# Precision, recall and F1 per class on the test split
cr = classification_report(y_true=y_test, y_pred=y_predict)
print("Classification Report \n", cr)

Classification Report 
               precision    recall  f1-score   support

           0       0.85      0.92      0.88       107
           1       0.77      0.64      0.70        47

    accuracy                           0.83       154
   macro avg       0.81      0.78      0.79       154
weighted avg       0.83      0.83      0.83       154



In [146]:
# Predict based on training data
# Predict on the training features as well, to compare against the test-split metrics
y_predict_1 = logisticreg.predict(X=x_train)

In [148]:
# compare values with y_train on a dataframe
# Line the true training labels up against the model's predictions for the same rows
pd_y_train_vs_y_pred = (
    y_train
    .to_frame(name="Y_TRAIN")
    .assign(X_TRAIN_PRED=y_predict_1)
)
pd_y_train_vs_y_pred.head()

Unnamed: 0,Y_TRAIN,X_TRAIN_PRED
603,1,1
118,0,0
247,0,1
157,0,0
468,1,0


In [149]:
# confusion matrix of training data
# Confusion matrix for the training split (rows: actual class, columns: predicted class)
cf_train = confusion_matrix(y_true=y_train, y_pred=y_predict_1)
print("Confusion Matrix Train \n", cf_train)

Confusion Matrix Train 
 [[339  54]
 [ 94 127]]


In [150]:
# Classification Report on Training Data
# Precision, recall and F1 per class on the training split
cr_train = classification_report(y_true=y_train, y_pred=y_predict_1)
print("Classification Report on Training Data \n", cr_train)

Classification Report on Training Data 
               precision    recall  f1-score   support

           0       0.78      0.86      0.82       393
           1       0.70      0.57      0.63       221

    accuracy                           0.76       614
   macro avg       0.74      0.72      0.73       614
weighted avg       0.75      0.76      0.75       614

