In [None]:
# Import All The Libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
from sklearn.model_selection import train_test_split

#Import the Data set
df=pd.read_csv("diabetes.csv")
df.head()

#Data Cleaning
column="BMI"
mean_value1=df[column].mean()
df.fillna({column:mean_value1},inplace=True)
column2='DiabetesPedigreeFunction'
mean_value2=df[column].mean()
df.fillna({column2:mean_value2},inplace=True)

#Normalize the Dataset using z_score Normalization
z_scores=stats.zscore(df)
threshold=3
print("Size before removing outliers:",df.shape)
outliers_df=df[(z_scores>threshold).any(axis=1)]
df=df[(z_scores<=threshold).all(axis=1)]
print("Size after removing outliers:",df.shape)

#Data Preprocessing
x=df.drop('Outcome',axis=1)
y=df['Outcome']
scalar=StandardScaler()
x=pd.DataFrame(scalar.fit_transform(x),columns=x.columns)


#Train-Test-Split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=42)
print("*****Train-Test-Split-Shapes*****")
print("x_train:",x_train.shape)
print("y_train:",y_train.shape)
print("x_test:",x_test.shape)
print("y_test:",y_test.shape)

#Models Prediction and Its Accuracy
models=[]
models.append(('Logistic Regression',LogisticRegression(max_iter=1000)))
models.append(('KNN',KNeighborsClassifier()))
models.append(('RandomForestClassifier',RandomForestClassifier()))
models.append(('SVM',SVC(kernel='linear')))
for name,model in models:
    print("\n")
    print("*******************Model Name : ",name,"*********************")
    print()
    model.fit(x_train,y_train)
    x_pred=model.predict(x_test)
    print("Confusion-Matrix :",confusion_matrix(y_test,x_pred))
    print("Accuracy :",accuracy_score(y_test,x_pred))
    print("Precision :",precision_score(y_test,x_pred))
    print("Recall :",recall_score(y_test,x_pred))
    print("F1score :",f1_score(y_test,x_pred))

#Input Data
pregnancies=int(input("Enter The Pregnancies as integer value(For Women) :"))
glucose=int(input("Enter The Glucose as integer value:"))
bloodpressure=int(input("Enter The BloodPressure as integer value :"))
skinthickness=int(input("Enter The SkinThickness as integer value :"))
insulin=int(input("Enter The Insulin as integer value :"))
BMI=float(input("Enter The BMI as float value :"))
diabetespedigreefunction=float(input("Enter The DiabetesPedigreeFunction as float value :"))
age=int(input("Enter The Age as integer value :"))
input_data=(pregnancies,glucose,bloodpressure,skinthickness,insulin,BMI,diabetespedigreefunction,age)
I_D=np.asarray(input_data)
I_D_Reshaped=I_D.reshape(1,-1)
prediction=model.predict(I_D_Reshaped) 
if(prediction==1):
    print("The Predicted Value For The Given Input is That The Patient Have Diabetes")
else:
   print("The Predicted Value For The Given Input is That The Patient Does Not Have Diabetes")

Size before removing outliers: (768, 9)
Size after removing outliers: (729, 9)
*****Train-Test-Split-Shapes*****
x_train: (656, 8)
y_train: (656,)
x_test: (73, 8)
y_test: (73,)


*******************Model Name :  Logistic Regression *********************

Confusion-Matrix : [[47  7]
 [ 8 11]]
Accuracy : 0.7945205479452054
Precision : 0.6111111111111112
Recall : 0.5789473684210527
F1score : 0.5945945945945946


*******************Model Name :  KNN *********************

Confusion-Matrix : [[49  5]
 [ 6 13]]
Accuracy : 0.8493150684931506
Precision : 0.7222222222222222
Recall : 0.6842105263157895
F1score : 0.7027027027027027


*******************Model Name :  RandomForestClassifier *********************

Confusion-Matrix : [[44 10]
 [ 8 11]]
Accuracy : 0.7534246575342466
Precision : 0.5238095238095238
Recall : 0.5789473684210527
F1score : 0.55


*******************Model Name :  SVM *********************

Confusion-Matrix : [[49  5]
 [ 7 12]]
Accuracy : 0.8356164383561644
Precision : 0.7058

Enter The Pregnancies as integer value(For Women) : 6
Enter The Glucose as integer value: 148
Enter The BloodPressure as integer value : 72
Enter The SkinThickness as integer value : 35
Enter The Insulin as integer value : 0
Enter The BMI as float value : 33.6
Enter The DiabetesPedigreeFunction as float value : 0.627
Enter The Age as integer value : 50


The Predicted Value For The Given Input is That The Patient Have Diabetes




In [29]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
df.shape

(768, 9)

In [7]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [23]:
column="BMI"
mean_value1=df[column].mean()
df.fillna({column:mean_value1},inplace=True)
print(df[column])
column2='DiabetesPedigreeFunction'
mean_value2=df[column].mean()
df.fillna({column2:mean_value2},inplace=True)
print(df[column2])

0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: BMI, Length: 768, dtype: float64
0      0.627
1      0.351
2      0.672
3      0.167
4      2.288
       ...  
763    0.171
764    0.340
765    0.245
766    0.349
767    0.315
Name: DiabetesPedigreeFunction, Length: 768, dtype: float64


In [13]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
from sklearn.model_selection import train_test_split

In [17]:
models=[]
models.append(('Logistic Regression',LogisticRegression(max_iter=1000)))
models.append(('KNN',KNeighborsClassifier()))
models.append(('RandomForestClassifier',RandomForestClassifier()))
models.append(('SVM',SVC(kernel='linear')))

In [19]:
scalar=StandardScaler()
df_scale=pd.DataFrame(scalar.fit_transform(df),columns=df.columns)
x=df.drop('Outcome',axis=1)
y=df['Outcome']
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.1,random_state=42)
print("x_train:",x_train.shape)
print("y_train:",y_train.shape)
print("x_test:",x_test.shape)
print("y_test:",y_test.shape)

x_train: (691, 8)
y_train: (691,)
x_test: (77, 8)
y_test: (77,)


In [21]:
for name,model in models:
    print("\n")
    print("*******************Model Name : ",name,"*********************")
    print()
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print("Confusion-Matrix :",confusion_matrix(y_test,y_pred))
    print("Accuracy :",accuracy_score(y_test,y_pred))
    print("Precision :",precision_score(y_test,y_pred))
    print("Recall :",recall_score(y_test,y_pred))
    print("F1score :",f1_score(y_test,y_pred))



*******************Model Name :  Logistic Regression *********************

Confusion-Matrix : [[36 14]
 [ 9 18]]
Accuracy : 0.7012987012987013
Precision : 0.5625
Recall : 0.6666666666666666
F1score : 0.6101694915254238


*******************Model Name :  KNN *********************

Confusion-Matrix : [[35 15]
 [12 15]]
Accuracy : 0.6493506493506493
Precision : 0.5
Recall : 0.5555555555555556
F1score : 0.5263157894736842


*******************Model Name :  RandomForestClassifier *********************

Confusion-Matrix : [[36 14]
 [11 16]]
Accuracy : 0.6753246753246753
Precision : 0.5333333333333333
Recall : 0.5925925925925926
F1score : 0.5614035087719298


*******************Model Name :  SVM *********************

Confusion-Matrix : [[36 14]
 [ 9 18]]
Accuracy : 0.7012987012987013
Precision : 0.5625
Recall : 0.6666666666666666
F1score : 0.6101694915254238


In [22]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [78]:
#Input Data
pregnancies=int(input("Enter The Pregnancies as integer value(For Women) :"))
glucose=int(input("Enter The Glucose as integer value:"))
bloodpressure=int(input("Enter The BloodPressure as integer value :"))
skinthickness=int(input("Enter The SkinThickness as integer value :"))
insulin=int(input("Enter The Insulin as integer value :"))
BMI=int(input("Enter The BMI as float value :"))
diabetespedigreefunction=int(input("Enter The DiabetesPedigreeFunction as float value :"))
age=int(input("Enter The Age as integer value :"))
input_data=(pregnancies,glucose,bloodpressure,skinthickness,insulin,BMI,diabetespedigreefunction,age)
I_D=np.asarray(input_data)
I_D_Reshaped=I_D.reshape(1,-1)
prediction=model.predict(I_D_Reshaped) 
if(prediction==1):
    print("The Predicted Value For The Given Input is That The Patient Have Diabetes")
else:
    print("The Predicted Value For The Given Input is That The Patient Does Not Have Diabetes")

Enter The Pregnancies as integer value(For Women) : 0
Enter The Glucose as integer value: 130
Enter The BloodPressure as integer value : 80
Enter The SkinThickness as integer value : 6
Enter The Insulin as integer value : 0
Enter The BMI as float value : 20
Enter The DiabetesPedigreeFunction as float value : 0
Enter The Age as integer value : 56


The Predicted Value For The Given Input is That The Patient Does Not Have Diabetes




In [25]:
import pickle

In [27]:
with open('Diabetes_Prediction.pkl','wb') as file:
    pickle.dump(model,file)