## Diabetes Prediction App

In [21]:
# Libraries

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import pickle

In [None]:
# Loading the dataset

data = pd.read_csv("../data/diabetes_prediction_dataset.csv")

# View the first few rows of the dataset

data.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [None]:
# Summary statistics of the dataset

data.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Check basic information

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [None]:
# Check for null values

data.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64

In [None]:
# Check for duplicate rows

print(data.duplicated().sum())

# Drop duplicate rows

data.drop_duplicates(inplace=True)

print("---Removed Duplicate---")
print(data.duplicated().sum())

3854
---Removed Duplicate---
0


In [13]:
# Incode the categorical columns

le = LabelEncoder()
data["gender"] = le.fit_transform(data["gender"])
data["smoking_history"] = le.fit_transform(data["smoking_history"])

In [14]:
# Selecting features and target variable

features = data.drop("diabetes", axis=1)

X = features
y = data["diabetes"]

In [15]:
# Scaling the features

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [16]:
# Applying PCA

pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Finding the optimal number of components

max_index = pca.explained_variance_ratio_.cumsum().argmax()

# Choose the number of components that explain 95% of the variance

n_components = max_index + 1

# Apply PCA with the optimal number of components

pca = PCA(n_components=n_components)
X_pca = pca.fit_transform(X_scaled)

In [19]:
# Splitting the data into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Initializing and training the model

model = XGBClassifier()
model.fit(X_train, y_train)

# Making predictions on the test set

y_pred = model.predict(X_test)

# Evaluating the model

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy:, {accuracy:.4f}")
print(f"Classification Report: \n {classification_report(y_test, y_pred)}")

Accuracy:, 0.9657
Classification Report: 
               precision    recall  f1-score   support

           0       0.97      0.99      0.98     17509
           1       0.92      0.67      0.78      1721

    accuracy                           0.97     19230
   macro avg       0.95      0.83      0.88     19230
weighted avg       0.96      0.97      0.96     19230



In [22]:
# Save the model

with open("../model/diabetes_model.pkl", "wb") as f:
    pickle.dump(model, f)