Milestone 1: Data Understanding & Exploration
- Load the diabetes dataset using scikit-learn and display the first five rows
- Perform descriptive statistics (mean, median, min, max) and summarize class balance
- Visualize the distributions of at least two features using histograms or box plots

In [None]:
# Environment & Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

import joblib
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation and input-validation
# warnings; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Global plot appearance: matplotlib style sheet plus seaborn's "husl" color palette.
plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

In [None]:
# Load dataset
# as_frame=True makes the loader expose a pandas DataFrame via `.frame`;
# later cells read the label from its "target" column.
diabetes = load_diabetes(as_frame=True)
# Work on a copy so the loader's own frame is never modified.
df = diabetes.frame.copy()

print("Dataset shape:", df.shape)
# Last expression of the cell: the notebook renders the first five rows.
df.head()

Milestone 2: Data Preparation & Preprocessing
- Handle any missing values, duplicates, or outliers in the dataset
- Apply feature scaling/normalization
- Encode categorical variables (if present) with a suitable technique
- Perform basic correlation analysis and point out highly correlated features

In [None]:
# Descriptive statistics
# Transposed so that each column of df becomes one row of the summary table.
print(df.describe().T)

# Median
print("\nMedian values:")
print(df.median())

# Target distribution
# Histogram of the target column with a kernel density estimate overlaid.
plt.figure(figsize=(7,4))
sns.histplot(df["target"], kde=True)
plt.title("Target Value Distribution")
plt.show()


In [None]:
# Two feature views side by side in one figure: bmi as a histogram, bp as a box plot.
plt.figure(figsize=(12,5))

# Left panel: bmi histogram with a kernel density estimate.
plt.subplot(1,2,1)
sns.histplot(df['bmi'], kde=True)
plt.title("BMI Distribution")

# Right panel: horizontal box plot of bp.
plt.subplot(1,2,2)
sns.boxplot(x=df['bp'])
plt.title("Blood Pressure (bp) Boxplot")

plt.show()


In [None]:
# Number of missing values in each column.
print(df.isnull().sum())


In [None]:
# Number of rows that repeat an earlier row exactly.
print("Duplicates:", df.duplicated().sum())


In [None]:
# Box plots of bmi, bp and s5 on one shared axis as a visual outlier check.
plt.figure(figsize=(12,6))
sns.boxplot(data=df[['bmi', 'bp', 's5']])
plt.title("Outlier Detection")
plt.show()


In [None]:
from scipy.stats import zscore

# Column-wise z-scores for every column of df (the target column included).
abs_z = np.abs(zscore(df))
# A row is kept only if all of its values lie within 3 standard deviations.
within_three_sigma = (abs_z < 3).all(axis=1)
df_clean = df[within_three_sigma]


In [None]:
# Standardize the feature columns (zero mean, unit variance); the target is excluded.
# NOTE(review): the scaler is fitted on the full, unsplit `df`. Fit it on the
# training split only before using scaled features for modelling.
scaler = StandardScaler()
scaled = scaler.fit_transform(df.drop('target', axis=1))
# Take the labels and index from df by name rather than by position, so the result
# stays correctly labelled wherever the target column sits and stays aligned with df.
df_scaled = pd.DataFrame(scaled, columns=df.columns.drop('target'), index=df.index)
df_scaled.head()


In [None]:
# Heatmap of the pairwise correlation matrix over all columns of df (target included);
# cells are colored only, not annotated with values.
plt.figure(figsize=(10,7))
sns.heatmap(df.corr(), annot=False, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


In [None]:
# Correlation of every column with the target, most positive first
# (the listing includes the target's correlation with itself).
print(df.corr()["target"].sort_values(ascending=False))


Milestone 3: Model Selection & Training
- Split the dataset into training and test sets with a suitable ratio
- Choose and train at least two machine learning models (e.g., logistic regression, random forest)
- Include hyperparameter tuning with cross-validation for one chosen model

In [None]:
# 1. Train-Test Split
# Features are every column of the outlier-filtered frame except the label.
X = df_clean.drop(columns="target")
y = df_clean["target"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

In [None]:
# 2. Select Models
# Build each regressor and fit it on the training split in one step
# (fit() returns the fitted estimator itself).
model_lr = LinearRegression().fit(X_train, y_train)
model_rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predictions on the held-out test split
pred_lr = model_lr.predict(X_test)
pred_rf = model_rf.predict(X_test)


In [None]:
# Evaluation function
def evaluate(model_name, y_test, y_pred):
    """Print MAE, MSE, RMSE and R² for one model's predictions.

    Args:
        model_name: Label shown in the report header.
        y_test: True target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        None. The metrics are printed only.
    """
    print(f"\n----- {model_name} -----")
    print("MAE :", mean_absolute_error(y_test, y_pred))
    # Score MSE once and derive RMSE from it instead of computing it a second time.
    mse = mean_squared_error(y_test, y_pred)
    print("MSE :", mse)
    print("RMSE:", np.sqrt(mse))
    print("R² Score:", r2_score(y_test, y_pred))

# Model Performance
# Report test-set metrics for the two fitted regressors.
evaluate("Linear Regression", y_test, pred_lr)
evaluate("Random Forest", y_test, pred_rf)

In [None]:
# --------------------------------------------------
# 3. Hyperparameter Tuning with GridSearchCV
# --------------------------------------------------

# Candidate values per hyperparameter: 3 x 3 x 3 = 27 combinations in total.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_split": [2, 5, 10]
}

# Exhaustive search over the grid with 5-fold cross-validation, scored by R².
# n_jobs=-1 requests all available processors.
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    cv=5,
    scoring="r2",
    n_jobs=-1
)

In [None]:
# Run the search on the training split only, so the test split stays unseen during tuning.
grid.fit(X_train, y_train)

print("\nBest Parameters from Grid Search:")
print(grid.best_params_)



In [None]:
# Evaluate best model
# best_estimator_ is the estimator GridSearchCV refits with the winning parameters
# (refit is left at its default here).
best_model = grid.best_estimator_
best_pred = best_model.predict(X_test)

evaluate("Tuned Random Forest", y_test, best_pred)

Milestone 4: Model Evaluation & Interpretation
- Evaluate the trained models with appropriate metrics (accuracy, ROC-AUC, precision, recall)
- Plot and interpret a confusion matrix for the best model

In [None]:
# -------------------------------
# Milestone 4: Model Evaluation
# -------------------------------

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, RocCurveDisplay
)


In [None]:
# Turn the continuous target into a binary label for classification:
# 1 when the value is strictly above the median of df_clean's target, otherwise 0.
target = df_clean["target"]
y_class = target.gt(target.median()).astype(int)
X_class = df_clean.drop(columns="target")



In [None]:
# Split data
# Stratify on the binary label so the train and test splits keep the same class
# ratio; an unstratified 20% hold-out can drift away from the overall balance.
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X_class, y_class, test_size=0.2, random_state=42, stratify=y_class
)



In [None]:
# Train classifier
# Random forest with a fixed seed for reproducibility, fitted on the binary labels.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_c, y_train_c)



In [None]:
# Predictions
# Hard class labels for the test split.
y_pred_c = clf.predict(X_test_c)
# Predicted probability of the second class column (label 1), used for ROC-AUC.
y_prob_c = clf.predict_proba(X_test_c)[:, 1]



In [None]:
# Evaluation Metrics
# The first four metrics compare hard predictions against the true labels;
# ROC-AUC is computed from the predicted probabilities instead.
print("Accuracy :", accuracy_score(y_test_c, y_pred_c))
print("Precision:", precision_score(y_test_c, y_pred_c))
print("Recall   :", recall_score(y_test_c, y_pred_c))
print("F1 Score :", f1_score(y_test_c, y_pred_c))
print("ROC-AUC  :", roc_auc_score(y_test_c, y_prob_c))



In [None]:
# Confusion Matrix
# Rows are the true classes and columns the predicted classes, matching the axis labels below.
cm = confusion_matrix(y_test_c, y_pred_c)

plt.figure(figsize=(6,5))
# fmt='d' prints the cell counts as integers.
sns.heatmap(cm, annot=True, fmt='d', cmap="Blues")
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


In [None]:

# ROC Curve
plt.figure(figsize=(6,5))
# Pass the current axes explicitly. Without `ax`, from_estimator opens a figure of its
# own, so the figure created above is left empty and its figsize is never applied.
RocCurveDisplay.from_estimator(clf, X_test_c, y_test_c, ax=plt.gca())
plt.title("ROC Curve")
plt.show()


Milestone 5: Deployment & Reporting
- Save your final trained model for later use
- Load the model and make a new prediction

In [None]:
# --------------------------
# Milestone 5: Deployment
# --------------------------

import joblib
import numpy as np



In [None]:
# 1. Save the model
# Persist the fitted classifier `clf` to disk with joblib.
# NOTE(review): the file name says "best" model, but the object saved is the
# classifier `clf`, not the tuned regressor `best_model` — confirm this is intended.
model_path = "best_diabetes_model.pkl"
joblib.dump(clf, model_path)
print("Model saved as:", model_path)


In [None]:

# 2. Load the saved model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
loaded_model = joblib.load(model_path)
print("Model loaded successfully!")


In [None]:

# 3. Make a new prediction
# (Example new patient data — must be 10 features like the dataset)
# Wrap the values in a DataFrame carrying the feature names the model was fitted with.
# A bare array makes scikit-learn warn about missing feature names and offers no
# protection against a wrong column order or count.
new_data = pd.DataFrame(
    [[0.03, 0.05, 0.06, 0.02,
      -0.04, -0.03, -0.04, -0.01,
      0.02, -0.015]],
    columns=loaded_model.feature_names_in_,
)


In [None]:

# Predict class
# predict() returns one label per input row; [0] picks the label of the single sample.
new_pred = loaded_model.predict(new_data)[0]


In [None]:

# Translate the predicted label into a message: label 1 is the above-median
# ("high risk") class, any other label is reported as low risk.
print(
    "Prediction: High risk of diabetes progression."
    if new_pred == 1
    else "Prediction: Low risk of diabetes progression."
)
