## Rabiul Ruhan
rabiulruhan381@gmail.com

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import gradio as gr
import joblib

: 

## 1. Data Loading (5 Marks)
Load the chosen dataset into your environment and display the first few rows along with the shape to verify correctness.


In [None]:
# Read the diabetes CSV straight from the GitHub raw URL into a DataFrame.
# Needs network access; `df` is the working frame used by every later cell.
url = "https://raw.githubusercontent.com/RABIUL-RUHAN/ML_files/refs/heads/main/diabetes.csv"
df = pd.read_csv(url)

In [None]:
# Show the first rows (pandas' default for head() is five) to sanity-check the load.
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

## 2. Data Preprocessing (10 Marks)
Perform and document at least 5 distinct preprocessing steps (e.g., handling missing values, encoding, scaling, outlier detection, feature engineering).


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

In [None]:
# Column dtypes and non-null counts before any cleaning.
df.info()

In [None]:
# Zeros in these columns are treated as missing readings and turned into NaN
# so they can be imputed afterwards.
# NOTE(review): presumably 0 is a placeholder for "not measured" in this
# dataset - confirm against the dataset documentation.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in zero_as_missing_cols:
    column_values = df[col]
    df[col] = column_values.replace(0, np.nan)

In [None]:
# Re-inspect dtypes and non-null counts now that zeros have been replaced by NaN.
df.info()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:

# Fill the missing values of each listed column with that column's median.
# NOTE(review): the medians are computed on the whole frame, before the
# train/test split further down, so held-out rows influence the imputation.
# Consider moving imputation into the modelling pipeline.
columns_to_impute = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in columns_to_impute:
    column_median = df[col].median()
    df[col] = df[col].fillna(column_median)




In [None]:
# Notebook shell command: install the profiling package imported by the next cell.
!pip install ydata-profiling


In [None]:
from ydata_profiling import ProfileReport

# Build an exploratory profiling report of the cleaned frame and write it
# next to the notebook as a standalone HTML file.
profile = ProfileReport(
    df,
    title="Pima Indians Diabetes dataset",
    explorative=True,
)
profile.to_file("ydata.html")

In [None]:
# Box plots of every numeric feature to eyeball outliers.
# `numeric_cols` is only assigned in the pipeline section further down, so
# referring to it here raises NameError on a clean top-to-bottom run. Derive
# the plotted columns from the frame itself instead: all numeric columns
# except the target.
boxplot_frame = df.select_dtypes(include='number').drop(
    columns=['Outcome'], errors='ignore'
)

plt.figure(figsize=(12,6))
sns.boxplot(data=boxplot_frame)
plt.xticks(rotation=45)  # feature names are long; tilt them so they don't overlap
plt.title("Boxplots of Numeric Features ")
plt.show()

In [None]:
def remove_outliers_iqr(df, column):
    """Cap the values of ``column`` at the 1.5 * IQR fences.

    Despite the name, no rows are dropped: values above the upper fence are
    set to the upper fence and values below the lower fence are set to the
    lower fence.

    Args:
        df: DataFrame holding ``column``. It is modified in place.
        column: Name of the column to cap.

    Returns:
        The same ``df`` object, with ``column`` replaced by its capped values.
    """
    q_low = df[column].quantile(0.25)
    q_high = df[column].quantile(0.75)
    spread = q_high - q_low
    floor = q_low - 1.5 * spread
    ceiling = q_high + 1.5 * spread

    values = df[column]
    # Since floor <= ceiling, one nested selection gives the same result as
    # capping the top first and the bottom second.
    capped = np.where(
        values > ceiling,
        ceiling,
        np.where(values < floor, floor, values),
    )
    df[column] = capped
    return df

# Cap extreme values in the two columns chosen for outlier treatment.
columns_to_cap = ['Insulin', 'SkinThickness']
for col in columns_to_cap:
    df = remove_outliers_iqr(df, col)

## 3. Pipeline Creation (10 Marks)
Construct a standard Machine Learning pipeline that integrates preprocessing and the model


In [None]:
# Model inputs: the eight numeric feature columns. Target: the 'Outcome' column.
numeric_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

X, y = df[numeric_cols], df['Outcome']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
# Standardise every feature listed in `numeric_cols`; this transformer is
# shared by all pipelines below.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
    ]
)


## 4. Primary Model Selection (5 Marks)
Choose a suitable algorithm and justify why this specific model was selected for the dataset.


## Model Selection
I am considering three models:
1. Random Forest:
Random forest captures non-linear relationships between features. It can handle feature interactions and is robust to outliers.
2. Logistic Regression:
It serves as a baseline model and works well on medium-sized datasets.
3. Gradient Boosting:
It works well on datasets with complex patterns and interactions between features.


## 5. Model Training (10 Marks)
Train your selected model using the training portion of your dataset.


In [None]:
# Candidate classifiers, keyed by the label used in the result tables.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42)
}

# Test-set metrics to report, in the column order of the result table.
test_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

results = []
for name, model in models.items():
    # Same preprocessing for every candidate; only the final estimator differs.
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    row = {"Model": name}
    for metric_label, metric_fn in test_metrics.items():
        row[metric_label] = metric_fn(y_test, y_pred)
    results.append(row)

# Best F1 first.
results_df = pd.DataFrame(results).sort_values("F1 Score", ascending=False)
results_df



## 6. Cross-Validation (10 Marks)
Apply Cross-Validation  to assess robustness and report the average score with standard deviation.


In [None]:
# 5-fold cross-validated F1 on the training split for each candidate model.
cv_results = []

for name, model in models.items():
    pipe = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    fold_f1 = cross_val_score(pipe, X_train, y_train, cv=5, scoring='f1')
    cv_results.append(
        {"Model": name, "CV Mean F1": fold_f1.mean(), "CV Std F1": fold_f1.std()}
    )

# Rank by mean F1 and round for display.
cv_results_df = (
    pd.DataFrame(cv_results)
    .sort_values("CV Mean F1", ascending=False)
    .round(4)
)
cv_results_df


## 7. Hyperparameter Tuning (10 Marks)
Optimize your model using search methods displaying both the parameters tested and the best results found.


In [None]:
# Random-forest hyperparameters to search; the `model__` prefix routes each
# key to the pipeline step named 'model'.
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 5, 10],
    'model__min_samples_split': [2, 5]
}

# Pipeline under tuning: shared preprocessing followed by a seeded random forest.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# Exhaustive search over the grid, scored by 5-fold F1, using all CPU cores.
grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)


In [None]:

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)



In [None]:

# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated F1 of the best parameter combination. Uses the same
# fixed four-decimal format as the other metric printouts (the previous
# spec ": .4" meant "space sign, 4 significant digits").
print(f"Grid best score: {grid_search.best_score_:.4f}")

## 8. Best Model Selection (10 Marks)
Select  the final best-performing model based on the hyperparameter tuning results.


In [None]:
# best_estimator_ is the pipeline GridSearchCV selected as best; it is used
# for the final evaluation and for the saved model.
best_model = grid_search.best_estimator_

print("Best model selected")
print(f"Best parameters: {grid_search.best_params_}")

## 9. Model Performance Evaluation (10 Marks)
Evaluate the model on the test set and print comprehensive metrics suitable for the problem type.


In [None]:
# Hard labels for the threshold metrics; positive-class probabilities for ROC-AUC.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

# Label-based metrics, printed in a fixed order with four decimals.
label_metrics = [
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
]
for metric_label, metric_fn in label_metrics:
    print(f"{metric_label}: {metric_fn(y_test, y_pred):.4f}")

# ROC-AUC is computed from probabilities, not hard labels.
print(f"ROC-AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



In [None]:
# Confusion matrix of the tuned model on the test split, drawn as a heatmap.
cm = confusion_matrix(y_test, y_pred)

# Tick labels for both axes, in the order class 0 then class 1.
class_names = ['Not Diabetic', 'Diabetic']
heatmap_options = dict(
    annot=True,   # write the count in every cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.figure(figsize=(6, 4))
sns.heatmap(cm, **heatmap_options)

plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

: 

## 10. Web Interface with Gradio (10 Marks)
Create a user-friendly Gradio web interface that takes user inputs and displays the prediction from your trained model.


In [None]:
import joblib

# Persist the tuned pipeline (preprocessing + classifier) so the web app can
# load it without retraining.
joblib.dump(best_model, "diabetes_model.pkl")
print("Model saved successfully!")

import gradio as gr
import pandas as pd
import joblib
import numpy as np

# Load trained model
# NOTE: joblib.load unpickles the file - only load model files you created
# yourself or otherwise trust.
model = joblib.load("diabetes_model.pkl")

# Feature order (MUST match training): exactly the eight columns the pipeline
# was fitted on. The former 'AgeGroup' and 'BMI_Category' entries were never
# created or used in training, and listing them here also overwrote the
# training feature list with a different one.
numeric_cols = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

def predict_diabetes(
    Pregnancies,
    Glucose,
    BloodPressure,
    SkinThickness,
    Insulin,
    BMI,
    DiabetesPedigreeFunction,
    Age,
    AgeGroup=None,
    BMI_Category=None,
):
    """Predict diabetes for one patient and format the result for display.

    Args:
        Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI,
        DiabetesPedigreeFunction, Age: The eight model features, in the
            order the model was trained on.
        AgeGroup, BMI_Category: Accepted only so existing ten-argument calls
            keep working; the model has no such features and ignores them.
            Making them optional lets the eight-input web form call this
            function (previously it failed with a missing-argument error).

    Returns:
        A two-line string with the predicted class and the predicted
        probability of diabetes, or a prompt to complete the form when any
        required value is missing.
    """
    feature_values = [
        Pregnancies,
        Glucose,
        BloodPressure,
        SkinThickness,
        Insulin,
        BMI,
        DiabetesPedigreeFunction,
        Age,
    ]

    # A blank numeric field reaches the function as None; stop early with a
    # readable message instead of letting the model raise.
    if any(value is None for value in feature_values):
        return "Please fill in every field before requesting a prediction."

    # Single-row frame with named columns so the pipeline's column selection
    # by name works.
    data = pd.DataFrame([feature_values], columns=numeric_cols)

    # Prediction
    pred = model.predict(data)[0]
    prob = model.predict_proba(data)[0][1]

    result = "Diabetic" if pred == 1 else "Not Diabetic"

    return f"Prediction: {result}\nProbability of Diabetes: {prob:.2f}"

# Gradio Interface
iface = gr.Interface(
    fn=predict_diabetes,
    inputs=[
        gr.Number(label="Pregnancies"),
        gr.Number(label="Glucose"),
        gr.Number(label="Blood Pressure"),
        gr.Number(label="Skin Thickness"),
        gr.Number(label="Insulin"),
        gr.Number(label="BMI"),
        gr.Number(label="Diabetes Pedigree Function"),
        gr.Number(label="Age"),

    ],
    outputs=gr.Textbox(label="Result"),
    title="ðŸ©º Diabetes Prediction System",
    description="Enter patient health parameters to predict diabetes using a trained Machine Learning model.",
)

iface.launch(share = True)


## 11. Deployment to Hugging Face (10 Marks)
Deploy the Gradio app to Hugging Face Spaces and ensure it is accessible via a public URL.
