# Female Diabetes Predicting Model

# Imports

In [33]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Dataset

In [2]:
ds=pd.read_csv("Data/Female-Diabetes.csv")

ds.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Some Observations and Calculations

In [3]:
ds.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
ds['Outcome'].value_counts() #in total 768 instances

0    500
1    268
Name: Outcome, dtype: int64

In [5]:
ds.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


# Checking Correlations

In [6]:
CorrelationMatrix=ds.corr()
CorrelationMatrix["Outcome"].sort_values(ascending=False)

# as a result, i won't remove any features

Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64

# Classifying Features and Labels

In [7]:
# below line means to include all data in x , except 'Outcome'
x=ds.drop(columns='Outcome',axis=1)  

y=ds['Outcome']

# Data Standardization

In [8]:
# creating a scaler for better accuracy of our model
Scaler=StandardScaler()

# fitting and then transforming the data of x as oer the standard scaler
Standard_x=Scaler.fit_transform(x)

# setting the value of x as the standard x value
x=Standard_x

# Separation of testing and training data

In [9]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=42)

# Model training

In [34]:
model=svm.SVC(kernel='linear')
# model=LogisticRegression()
# model=GaussianProcessClassifier()
# model=KNeighborsClassifier()
# model=RandomForestClassifier()
# model=MLPClassifier()

model.fit(x_train,y_train)

# Predictions and calculations

In [35]:
# on testing data

y_predictions=model.predict(x_test)

accuracy=accuracy_score(y_predictions,y_test)

print("The Model Accuracy with svm is :- ",accuracy)

# 0.7532467532467533----with RandomForestClassifier  [1st]
# 0.7402597402597403----with MLPClassifier
# 0.7207792207792207----with svm
# 0.7142857142857143----with LogisticRegression
# 0.7142857142857143----with GaussianProcessClassifier
# 0.7077922077922078----with KNeighborsClassifier

The Model Accuracy with svm is :-  0.7207792207792207


In [12]:
# on training data

y_predictions=model.predict(x_train)

accuracy=accuracy_score(y_predictions,y_train)

print("The Model Accuracy with svm is :- ",accuracy)

# 1.0               ----with RandomForestClassifier
# 0.9006514657980456----with GaussianProcessClassifier
# 0.8029315960912052----with KNeighborsClassifier
# 0.8013029315960912----with MLPClassifier
# 0.7915309446254072----with svm
# 0.7915309446254072----with Logistic Regression

The Model Accuracy with svm is :-  0.8078175895765473


# Some Private testing....

In [13]:
ds.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [45]:
weight=30
height=1.2
height_square=height*height

BMI=weight/height_square

print(BMI)

myData=[[0,0,30,5,0,BMI,0.0012,11]]

# print(x_test)

results=model.predict(myData)

if results[0]==1:
    print("You may have diabetes , its preferable to meet a doctor!")
else:
    print("According to the data, you may not have diabetes , but always be careful!")


20.833333333333336
You may have diabetes , its preferable to meet a doctor!


# Converting to .joblib

In [65]:
from joblib import dump

dump(model,"FemaleDiabetesPrediction.joblib")

['FemaleDiabetesPrediction.joblib']