<a href="https://colab.research.google.com/github/Rudragupta-1/DiabeAI/blob/main/DiabAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Diabetes Prediction Model
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, accuracy_score, roc_curve
import joblib

In [None]:
# Step 2: Load the Dataset
# The path is absolute (filesystem root), so the CSV has to be placed there
# before this cell is run.
dataset_path = '/diabetes_prediction_dataset.csv'
df = pd.read_csv(dataset_path)

# Preview the first rows as the cell output
df.head()

In [None]:
# Step 3: Exploratory Data Analysis (EDA)

# LabelEncoder replaces every distinct category with an integer code so that
# df.corr() below can include the categorical columns.
from sklearn.preprocessing import LabelEncoder

# Fit one encoder per column and keep all of them. Re-fitting a single shared
# encoder on the second column would discard the gender mapping, which any
# inference code needs in order to encode new inputs the same way as training.
# NOTE: this overwrites the columns of df in place; re-running the cell without
# reloading df refits the encoders on the integer codes, not the original labels.
label_encoders = {}
for column in ['gender', 'smoking_history']:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder
    # classes_ is sorted; the position of a class in it is its integer code.
    print(f"{column} codes: {dict(enumerate(label_encoder.classes_.tolist()))}")

# Check if encoding was successful
print(df[['gender', 'smoking_history']].head())

# Correlation Heatmap to understand feature relationships
plt.figure(figsize=(10, 6))
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()


In [13]:
# Step 4: Data Preprocessing

# Column groups that are routed to the two preprocessing branches
numerical_cols = ['age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level', 'blood_glucose_level']
categorical_cols = ['gender', 'smoking_history']  # Assuming these are the only categorical features

# Numerical branch: fill missing values with the column median, then standardize
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode, dropping the first level to avoid the dummy variable trap
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(drop='first')),
])

# Apply each branch to its own column group and concatenate the results
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])


In [14]:
# Step 5: Splitting Data into Training and Testing sets

y = df['diabetes']  # Target variable
X = df.drop(columns=['diabetes'])  # Features: every remaining column

# Hold out 20% of the rows for testing. The split is seeded for repeatability
# and stratified on y so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [15]:
# Step 6: Model Training using Pipelines
# Each pipeline chains the shared preprocessing with one classifier, under a
# step named 'model' (the name the hyperparameter grid refers to).

# Random Forest Classifier pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# Logistic Regression pipeline
log_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(random_state=42)),
])

In [None]:

# Step 7: Hyperparameter Tuning and Cross-Validation

# Search space for the Random Forest; the 'model__' prefix addresses the
# pipeline step named 'model'. 3 x 4 x 3 = 36 candidates, each fitted on 5 folds.
param_grid = dict(
    model__n_estimators=[50, 100, 200],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
)

# Exhaustive 5-fold search scored on accuracy, using all available cores
grid_search_rf = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

# Keep the winning pipeline for evaluation and saving
best_rf_model = grid_search_rf.best_estimator_
print(f"Best Parameters for Random Forest: {grid_search_rf.best_params_}")

In [None]:
# Step 8: Model Evaluation


def evaluate_model(name, fitted_model, X_eval, y_eval):
    """Print the evaluation report for one fitted classifier.

    Prints a header, the confusion matrix, the classification report, the
    ROC-AUC score and the accuracy, in that order.

    Args:
        name: Label used in the printed header ("<name> Model Performance:").
        fitted_model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_eval: Features to evaluate on.
        y_eval: True labels for ``X_eval``.

    Returns:
        Tuple ``(y_pred, y_proba, auc)``: predicted labels, the second column
        of ``predict_proba``, and the ROC-AUC computed from that column.
    """
    y_pred = fitted_model.predict(X_eval)
    # Compute the probabilities and AUC once; the ROC plot below reuses them.
    y_proba = fitted_model.predict_proba(X_eval)[:, 1]
    auc = roc_auc_score(y_eval, y_proba)

    print(f"{name} Model Performance:")
    print(confusion_matrix(y_eval, y_pred))
    print(classification_report(y_eval, y_pred))
    print(f"ROC-AUC Score: {auc}")
    print(f"Accuracy: {accuracy_score(y_eval, y_pred)}")
    return y_pred, y_proba, auc


# Logistic Regression Evaluation (this pipeline has not been fitted yet)
log_pipeline.fit(X_train, y_train)
y_pred_log, y_pred_log_proba, auc_log = evaluate_model(
    "Logistic Regression", log_pipeline, X_test, y_test
)

# Random Forest Evaluation (with tuned hyperparameters)
y_pred_rf, y_pred_rf_proba, auc_rf = evaluate_model(
    "Random Forest", best_rf_model, X_test, y_test
)

# ROC Curve for both models
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_log_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf_proba)

plt.figure(figsize=(10, 6))
plt.plot(fpr_log, tpr_log, label='Logistic Regression (AUC = {:.2f})'.format(auc_log))
plt.plot(fpr_rf, tpr_rf, label='Random Forest (AUC = {:.2f})'.format(auc_rf))
plt.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()

In [None]:
# Step 9: Save the Best Model

# Persist the tuned Random Forest pipeline (assuming it performs better than
# Logistic Regression). The saved object is the whole pipeline, so the
# preprocessing steps are stored together with the classifier.
model_filename = 'diabetes_prediction_model.pkl'
joblib.dump(best_rf_model, model_filename)
print("Random Forest Model saved successfully!")

# Step 10: Load the Model for Future Predictions
# Example of reloading the saved pipeline later:
# loaded_model = joblib.load('diabetes_prediction_model.pkl')
# predictions = loaded_model.predict(new_data)


In [None]:
!pip install streamlit


In [None]:
# Save your trained model (assuming it's the Random Forest model).
# This writes the same object to the same file as the save in Step 9.
joblib.dump(value=best_rf_model, filename='diabetes_prediction_model.pkl')


In [None]:
import streamlit as st
import pandas as pd
import joblib
import numpy as np

# Load the trained model.
# The file is a pickle: only load a file that you produced yourself.
model = joblib.load(filename='diabetes_prediction_model.pkl')

# Function to predict diabetes
def predict_diabetes(gender, age, hypertension, heart_disease, smoking, bmi, hba1c, blood_glucose):
    """Predict diabetes for a single patient with the module-level ``model``.

    Args:
        gender: 'Female' or 'Male'; any value other than 'Female' is encoded
            like 'Male'.
        age: Patient age.
        hypertension: 0 or 1.
        heart_disease: 0 or 1.
        smoking: 'Yes' or 'No'; any value other than 'Yes' is encoded like 'No'.
        bmi: Body mass index.
        hba1c: HbA1c level.
        blood_glucose: Blood glucose level.

    Returns:
        The first element of ``model.predict`` for the one-row input frame.
    """
    # Training label-encoded gender with LabelEncoder, which numbers classes in
    # sorted order, so 'Female' (0) comes before 'Male' (1).
    gender_code = 0 if gender == 'Female' else 1

    # NOTE(review): training label-encoded the raw 'smoking_history' column, so
    # the meaning of codes 0 and 1 depends on that column's sorted categories.
    # Confirm against the fitted encoder's classes_ that 1 means "smoker" and
    # 0 means "non-smoker"; if not, adjust this mapping.
    smoking_code = 1 if smoking == 'Yes' else 0

    # Column names must match the training frame: the pipeline selects its
    # inputs by name and was trained with 'smoking_history', not 'smoking'.
    input_data = pd.DataFrame({
        'gender': [gender_code],
        'age': [age],
        'hypertension': [hypertension],
        'heart_disease': [heart_disease],
        'smoking_history': [smoking_code],
        'bmi': [bmi],
        'HbA1c_level': [hba1c],
        'blood_glucose_level': [blood_glucose]
    })

    # Make prediction
    prediction = model.predict(input_data)
    return prediction[0]

# Streamlit UI: page title and form header
st.title("Diabetes Prediction App")
st.header("Enter Patient Information")

# User input fields, one widget per model feature
gender = st.selectbox("Gender", options=["Male", "Female"])
age = st.number_input("Age", min_value=0, max_value=120, value=30)
hypertension = st.selectbox("Hypertension (0 = No, 1 = Yes)", options=[0, 1])
heart_disease = st.selectbox("Heart Disease (0 = No, 1 = Yes)", options=[0, 1])
smoking = st.selectbox("Smoking (Yes/No)", options=["Yes", "No"])
bmi = st.number_input("BMI", min_value=10.0, max_value=50.0, value=25.0)
hba1c = st.number_input("HbA1c Level (%)", min_value=0.0, max_value=15.0, value=5.0)
blood_glucose = st.number_input("Blood Glucose Level (mg/dL)", min_value=0.0, max_value=500.0, value=100.0)

# Prediction button: run the model only when the user clicks it
if st.button("Predict"):
    prediction = predict_diabetes(
        gender, age, hypertension, heart_disease, smoking, bmi, hba1c, blood_glucose
    )
    # Both outcomes are shown in the same success-styled box
    message = (
        "The model predicts that the patient **has diabetes**."
        if prediction == 1
        else "The model predicts that the patient **does not have diabetes**."
    )
    st.success(message)
