# 1. Problem statement >> predict the person is diabetic are not base on given parameters

# Diabetes
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. The objective is to predict based on diagnostic measurements whether a patient has diabetes.

Content
Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
 

Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test 

BloodPressure: Diastolic blood pressure (mm Hg)  

SkinThickness: Triceps skin fold thickness (mm) 

Insulin: 2-Hour serum insulin (mu U/ml) 

BMI: Body mass index (weight in kg/(height in m)^2) 

DiabetesPedigreeFunction: Diabetes pedigree function 

Age: Age (years) 

Outcome: Class variable (0 or 1)

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.impute import KNNImputer

import pickle 
import json

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.metrics import multilabel_confusion_matrix

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 

%config InlineBackend.figure_format = 'retina'

# 2. Data Gathering 

In [2]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148,50,35,0,33.6,0.627,50,1
1,85,66,29,0,26.6,0.351,31,0
2,183,64,0,0,23.3,0.672,52,1
3,150,66,23,94,28.1,0.167,21,0
4,150,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...
763,101,76,48,180,32.9,0.171,63,0
764,122,70,27,0,36.8,0.340,27,0
765,121,72,23,112,26.2,0.245,30,0
766,126,60,0,0,30.1,0.349,47,1



# 3. EDA

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 8 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Glucose                   768 non-null    int64  
 1   BloodPressure             768 non-null    int64  
 2   SkinThickness             768 non-null    int64  
 3   Insulin                   768 non-null    int64  
 4   BMI                       768 non-null    float64
 5   DiabetesPedigreeFunction  768 non-null    float64
 6   Age                       768 non-null    int64  
 7   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(6)
memory usage: 48.1 KB


In [26]:
df.describe()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,121.910878,72.375171,29.15342,155.548223,32.457464,0.471876,33.24349,0.348958
std,30.353169,12.409774,10.476982,118.775855,6.924988,0.331329,11.758182,0.476951
min,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,100.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,118.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,142.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
df["Outcome"].value_counts()*100/len(df)

Outcome
0    65.104167
1    34.895833
Name: count, dtype: float64

plt.figure(figsize=(9,7))
sns.heatmap(df.corr(),annot=True)

In [6]:
df.isna().sum()

Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
df[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = df[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0, np.NaN)

In [8]:
df.isna().sum()

Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [9]:
knn_imputer = KNNImputer(n_neighbors=1) # n_neighbors= k = 5, default
array1 = knn_imputer.fit_transform(df)
df1 = pd.DataFrame(array1,columns=df.columns)
df1

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148.0,50.0,35.0,190.0,33.6,0.627,50.0,1.0
1,85.0,66.0,29.0,71.0,26.6,0.351,31.0,0.0
2,183.0,64.0,36.0,495.0,23.3,0.672,52.0,1.0
3,150.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,150.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0
...,...,...,...,...,...,...,...,...
763,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0.0
764,122.0,70.0,27.0,205.0,36.8,0.340,27.0,0.0
765,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0.0
766,126.0,60.0,18.0,122.0,30.1,0.349,47.0,1.0


In [10]:
df1.isna().sum()

Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

# Outliers Analysis

In [11]:
for feature in df1:
    
    Q1 = df[feature].quantile(0.05)
    Q3 = df[feature].quantile(0.95)
    IQR = Q3 - Q1
    lower = Q1 - 1.5*IQR
    upper = Q3 + 1.5*IQR
    
    if df[(df[feature] > upper)].any(axis=None):
        print(feature,"yes")
    else:
        print(feature, "no")

Glucose no
BloodPressure no
SkinThickness yes
Insulin no
BMI no
DiabetesPedigreeFunction no
Age no
Outcome no


In [12]:
df1.columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [13]:
df1.head()

Unnamed: 0,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,148.0,50.0,35.0,190.0,33.6,0.627,50.0,1.0
1,85.0,66.0,29.0,71.0,26.6,0.351,31.0,0.0
2,183.0,64.0,36.0,495.0,23.3,0.672,52.0,1.0
3,150.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,150.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0


# 5. Feature Selection

In [14]:
X= df1.drop("Outcome",axis=1)
y = df1["Outcome"]

# 6. Model training

In [15]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2,stratify=y)

In [16]:
lr = LogisticRegression(solver='sag', max_iter=1000)
lr.fit(x_train,y_train)

# 7. Model Evalution 

In [17]:
#Testing Data Evalation

y_pred = lr.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print("Confusion Matrix: \n",cnf_matrix)
print("*"*80)

accuracy = accuracy_score(y_test,y_pred)
print("Accuarcy : ",accuracy)
print("*"*80)

clf_report = classification_report(y_test,y_pred)
print("Classification Report :\n",clf_report)

Confusion Matrix: 
 [[92  8]
 [35 19]]
********************************************************************************
Accuarcy :  0.7207792207792207
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

         0.0       0.72      0.92      0.81       100
         1.0       0.70      0.35      0.47        54

    accuracy                           0.72       154
   macro avg       0.71      0.64      0.64       154
weighted avg       0.72      0.72      0.69       154



In [18]:
#Training Data Evaluation

y_pred_train = lr.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred_train)
print("Confusion Matrix: \n",cnf_matrix)
print("*"*80)

accuracy = accuracy_score(y_train,y_pred_train)
print("Accuarcy : ",accuracy)
print("*"*80)

clf_report = classification_report(y_train,y_pred_train)
print("Classification Report :\n",clf_report)

Confusion Matrix: 
 [[351  49]
 [130  84]]
********************************************************************************
Accuarcy :  0.7084690553745928
********************************************************************************
Classification Report :
               precision    recall  f1-score   support

         0.0       0.73      0.88      0.80       400
         1.0       0.63      0.39      0.48       214

    accuracy                           0.71       614
   macro avg       0.68      0.64      0.64       614
weighted avg       0.70      0.71      0.69       614



# 8.API & testing

In [19]:
with open('model.pkl','wb') as file:
    pickle.dump(lr,file)

In [20]:
X.columns

Index(['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [24]:
d1= {"columns" : [
       'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
       'DiabetesPedigreeFunction', 'Age'],"result_values": {"Non Diabetic": 0, "Diabetic": 1}}

In [25]:
with open("asset.json",'w') as file:
    json.dump(d1,file)