In [1]:
# ==========================================
# INSTALL LIBRARIES (Run this first in Colab)
# ==========================================
# !pip install gradio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ==========================================
# 1. DATA LOADING (5 Marks)
# ==========================================
# Loading Pima Indians Diabetes dataset from a stable URL
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(url, names=columns)

print("Task 1: Data Loaded")
print(f"Shape: {df.shape}")
display(df.head())

# ==========================================
# 2. DATA PREPROCESSING (10 Marks)
# ==========================================
# Step 1: Handling Missing Values (In this dataset, 0 represents missing for some cols)
cols_with_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing] = df[cols_with_missing].replace(0, np.nan)

# Step 2: Feature Engineering (Creating a new feature 'AgeGroup')
# This adds a categorical feature we can Encode later
bins = [20, 30, 50, 100]
labels = ['Young', 'Middle-Aged', 'Senior']
df['AgeGroup'] = pd.cut(df['Age'], bins=bins, labels=labels)

# Step 3: Outlier Detection/Handling (Simple capping for demo)
# Cap Insulin at 99th percentile
q_hi = df['Insulin'].quantile(0.99)
df = df[df['Insulin'] <= q_hi] if pd.notna(q_hi) else df

# Separating X and y
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Splitting Data (Step 4 of process flow, though Pipeline handles the rest)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("\nTask 2: Preprocessing Steps Defined (Imputation, Feature Eng, Outlier Capping, Split)")

# ==========================================
# 3. PIPELINE CREATION (10 Marks)
# ==========================================
# Defining numeric and categorical columns
numeric_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
categorical_features = ['AgeGroup']

# Numeric Transformer: Impute median + Scale
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # Step 5: Scaling/Imputing
    ('scaler', StandardScaler())
])

# Categorical Transformer: Impute frequent + OneHotEncode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Combine into Preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

print("Task 3: Pipeline Created")

# ==========================================
# 4. PRIMARY MODEL SELECTION (5 Marks)
# ==========================================
# Selection: Random Forest Classifier
# Justification:
# 1. It handles non-linear relationships well (common in health data).
# 2. It is robust to outliers and less prone to overfitting than a single Decision Tree.
# 3. It provides feature importance, which is useful for medical diagnosis.

model = RandomForestClassifier(random_state=42)

# Create Full Pipeline
clf_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', model)])

print("Task 4: Random Forest Selected")

# ==========================================
# 5. MODEL TRAINING (10 Marks)
# ==========================================
clf_pipeline.fit(X_train, y_train)
print("Task 5: Model Trained")

# ==========================================
# 6. CROSS-VALIDATION (10 Marks)
# ==========================================
cv_scores = cross_val_score(clf_pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"\nTask 6: Cross-Validation Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# ==========================================
# 7. HYPERPARAMETER TUNING (10 Marks)
# ==========================================
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5]
}

grid_search = GridSearchCV(clf_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

print(f"\nTask 7: Best Params: {grid_search.best_params_}")

# ==========================================
# 8. BEST MODEL SELECTION (10 Marks)
# ==========================================
best_model = grid_search.best_estimator_
print("Task 8: Best Model Selected")

# ==========================================
# 9. MODEL PERFORMANCE EVALUATION (10 Marks)
# ==========================================
y_pred = best_model.predict(X_test)

print("\nTask 9: Evaluation Metrics")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# ==========================================
# 10. WEB INTERFACE WITH GRADIO (10 Marks)
# ==========================================
def predict_diabetes(pregnancies, glucose, bp, skin, insulin, bmi, dpf, age):
    # Create DataFrame from input matching original structure
    input_data = pd.DataFrame([[pregnancies, glucose, bp, skin, insulin, bmi, dpf, age]],
                              columns=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'])

    # Feature Engineering (Must repeat logic from Step 2)
    # Note: Pipeline handles imputation/scaling/encoding, but we must create 'AgeGroup'
    input_data['AgeGroup'] = pd.cut(input_data['Age'], bins=[20, 30, 50, 100], labels=['Young', 'Middle-Aged', 'Senior'])

    prediction = best_model.predict(input_data)
    proba = best_model.predict_proba(input_data)[0][1]

    result = "Diabetic" if prediction[0] == 1 else "Non-Diabetic"
    return f"{result} (Probability: {proba:.2f})"

# Define Gradio Interface
iface = gr.Interface(
    fn=predict_diabetes,
    inputs=[
        gr.Number(label="Pregnancies"),
        gr.Number(label="Glucose Level"),
        gr.Number(label="Blood Pressure"),
        gr.Number(label="Skin Thickness"),
        gr.Number(label="Insulin"),
        gr.Number(label="BMI"),
        gr.Number(label="Diabetes Pedigree Function"),
        gr.Number(label="Age")
    ],
    outputs="text",
    title="Diabetes Prediction System",
    description="Enter patient details to predict diabetes risk."
)

# Launch (for Colab)
iface.launch(share=True)

Task 1: Data Loaded
Shape: (768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1



Task 2: Preprocessing Steps Defined (Imputation, Feature Eng, Outlier Capping, Split)
Task 3: Pipeline Created
Task 4: Random Forest Selected
Task 5: Model Trained

Task 6: Cross-Validation Accuracy: 0.7885 (+/- 0.0621)

Task 7: Best Params: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Task 8: Best Model Selected

Task 9: Evaluation Metrics
Accuracy: 0.8076923076923077

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.88      0.86        52
           1       0.74      0.65      0.69        26

    accuracy                           0.81        78
   macro avg       0.79      0.77      0.78        78
weighted avg       0.80      0.81      0.80        78


Confusion Matrix:
 [[46  6]
 [ 9 17]]
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://fdf89a71ba9c3a4f8f.gradio.live

This share link expires in 1 

