In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [67]:
df=pd.read_csv('diabetes_prediction_dataset.csv')

In [68]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [69]:
df.tail()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0
99999,Female,57.0,0,0,current,22.43,6.6,90,0


In [70]:
df.shape

(100000, 9)

In [71]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [72]:
df.isna().sum()

Unnamed: 0,0
gender,0
age,0
hypertension,0
heart_disease,0
smoking_history,0
bmi,0
HbA1c_level,0
blood_glucose_level,0
diabetes,0


In [73]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [74]:
df['diabetes'].value_counts()

Unnamed: 0_level_0,count
diabetes,Unnamed: 1_level_1
0,91500
1,8500


In [75]:
X=df.drop(columns='diabetes',axis=1)
Y=df['diabetes']

In [76]:
categorical_features = ['gender', 'smoking_history']
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [77]:
X

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
0,80.0,0,1,25.19,6.6,140,False,False,False,False,False,True,False
1,54.0,0,0,27.32,6.6,80,False,False,False,False,False,False,False
2,28.0,0,0,27.32,5.7,158,True,False,False,False,False,True,False
3,36.0,0,0,23.45,5.0,155,False,False,True,False,False,False,False
4,76.0,1,1,20.14,4.8,155,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,80.0,0,0,27.32,6.2,90,False,False,False,False,False,False,False
99996,2.0,0,0,17.37,6.5,100,False,False,False,False,False,False,False
99997,66.0,0,0,27.83,5.7,155,True,False,False,False,True,False,False
99998,24.0,0,0,35.42,4.0,100,False,False,False,False,False,True,False


In [78]:
Y

Unnamed: 0,diabetes
0,0
1,0
2,0
3,0
4,0
...,...
99995,0
99996,0
99997,0
99998,0


In [79]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [80]:
X_train

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
83449,24.0,0,0,23.71,4.8,100,True,False,False,False,False,False,False
50853,26.0,0,0,24.05,5.0,160,False,False,False,False,False,False,False
34842,22.0,0,0,25.79,5.8,158,False,False,False,False,False,True,False
75890,18.0,0,0,20.66,4.8,155,False,False,False,False,False,True,False
84819,10.0,0,0,17.79,5.8,85,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
52997,80.0,0,0,26.80,4.0,100,False,False,False,False,False,True,False
18231,37.0,0,0,24.69,4.0,130,False,False,True,False,False,False,False
63158,31.0,0,0,20.44,3.5,85,False,False,False,False,False,True,False
63868,49.0,0,0,27.48,5.7,160,True,False,False,False,False,True,False


In [81]:
Y_train

Unnamed: 0,diabetes
83449,0
50853,0
34842,0
75890,0
84819,0
...,...
52997,0
18231,0
63158,0
63868,0


In [82]:
X_test

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,gender_Male,gender_Other,smoking_history_current,smoking_history_ever,smoking_history_former,smoking_history_never,smoking_history_not current
6172,22.0,0,0,27.32,3.5,159,True,False,False,False,False,False,False
28975,80.0,0,1,26.93,7.5,200,True,False,False,False,False,False,False
59534,60.0,0,0,29.53,4.8,100,True,False,False,False,False,False,False
80117,42.0,0,0,21.45,3.5,130,False,False,True,False,False,False,False
76421,77.0,0,0,19.73,3.5,100,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35649,39.0,0,0,27.32,6.1,159,True,False,False,False,False,False,False
46709,17.0,0,0,27.86,6.0,85,True,False,False,False,False,False,False
64747,25.0,0,0,21.03,6.6,140,False,False,True,False,False,False,False
94268,46.0,1,0,22.96,4.5,80,False,False,True,False,False,False,False


In [83]:
Y_test

Unnamed: 0,diabetes
6172,0
28975,1
59534,0
80117,0
76421,0
...,...
35649,0
46709,0
64747,0
94268,0


In [84]:
model=RandomForestClassifier()

In [85]:
model.fit(X_train,Y_train)

In [86]:
X_train_prediction=model.predict(X_train)
train_prediction_accuracy=accuracy_score(X_train_prediction,Y_train)

In [87]:
print('Accuracy on Training data : ', train_prediction_accuracy)

Accuracy on Training data :  0.9991375


In [88]:
X_test_prediction=model.predict(X_test)
test_prediction_accuracy=accuracy_score(X_test_prediction,Y_test)

In [89]:
print('Accuracy on Testing data : ', test_prediction_accuracy)

Accuracy on Testing data :  0.9714


In [90]:
# Input data based on the given row:
input_data = {
    'age': 80.0,
    'hypertension': 0,
    'heart_disease': 1,
    'bmi': 25.19,
    'HbA1c_level': 6.6,
    'blood_glucose_level': 140,
    'gender_Male': 0,  # 0 for Female
    'gender_Other': 0, # 0 for not Other
    'smoking_history_current': 0,  # 0 for never
    'smoking_history_ever':0, # 0 for never
    'smoking_history_former': 0, # 0 for never
    'smoking_history_never': 1, # 1 for never
    'smoking_history_not current': 0 # 0 for never
}

# Create a DataFrame from the input data:
input_df = pd.DataFrame([input_data])

# Make the prediction:
prediction = model.predict(input_df)

# Print the prediction and the result:
print(prediction)
if prediction[0] == 0:
    print('The Person does not have Diabetes')
else:
    print('The Person has Diabetes')

[0]
The Person does not have Diabetes


In [91]:
import pickle

In [93]:
filename = 'diabetes_prediction_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [95]:
loaded_model = pickle.load(open('diabetes_prediction_model.sav', 'rb'))

In [96]:
for column in X_train.columns:
  print(column)

age
hypertension
heart_disease
bmi
HbA1c_level
blood_glucose_level
gender_Male
gender_Other
smoking_history_current
smoking_history_ever
smoking_history_former
smoking_history_never
smoking_history_not current
