# Train RandomForest on Diabetes dataset

This notebook loads the diabetes dataset from `ml/data/diabetes.csv`, cleans and preprocesses it, trains a RandomForest classifier, evaluates it on a held-out test split, and saves the trained artifacts (a pickled dictionary holding the model, the fitted scaler, and metadata) to `backend/model/model.pkl`.

Run each cell in order. The notebook is intended to be runnable in the project's Python environment.

In [None]:
# Section 1: Imports and setup
import time
import os
from pathlib import Path
import logging
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve

# Reproducibility: one fixed seed, applied to NumPy's global RNG here and
# available to later cells through RANDOM_STATE.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Logging: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(
    format='%(levelname)s: %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger('train')

In [None]:
# Section 2: Configure paths
# On Windows the project root is a fixed absolute checkout location; on any
# other OS it is the current working directory.
# NOTE(review): the Windows path is machine-specific - confirm it matches your checkout.
if os.name == 'nt':
    BASE_DIR = Path('c:/gdg/diabetes-diet-planner')
else:
    BASE_DIR = Path('.')

# Input CSV and output pickle locations, both relative to the project root.
DATA_PATH = BASE_DIR.joinpath('ml', 'data', 'diabetes.csv')
MODEL_DIR = BASE_DIR.joinpath('backend', 'model')
MODEL_FILE = MODEL_DIR.joinpath('model.pkl')

logger.info(f'DATA_PATH: {DATA_PATH}')
logger.info(f'MODEL_FILE: {MODEL_FILE}')

In [None]:
# Section 3: Load dataset
# Check for the CSV up front so a missing file produces a message that names
# the expected location.
if DATA_PATH.exists():
    df = pd.read_csv(DATA_PATH)
else:
    raise FileNotFoundError(f'Could not find dataset at {DATA_PATH}. Make sure the file exists.')

logger.info(f'Dataset loaded: shape={df.shape}')
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

In [None]:
# Section 4: Quick EDA
# Summary statistics for every column; `display` is provided by IPython/Jupyter.
display(df.describe(include='all'))
print('\nMissing values per column:')
print(df.isnull().sum())

# Identify common Pima columns that sometimes contain zeros treated as missing.
# Columns absent from this dataset are skipped rather than raising KeyError.
possible_missing_zero_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
present_zero_counts = {
    c: int((df[c] == 0).sum())
    for c in possible_missing_zero_cols
    if c in df.columns
}

print('\nZero counts (these may indicate missing values):')
print(present_zero_counts)

In [None]:
# Section 5: Data cleaning - replace zeros with NaN and impute
# Work on a copy so the raw `df` stays untouched.
df_clean = df.copy()
cols_to_fix = [c for c in ['Glucose','BloodPressure','SkinThickness','Insulin','BMI'] if c in df_clean.columns]
for c in cols_to_fix:
    n_zeros = int((df_clean[c] == 0).sum())
    if n_zeros > 0:
        logger.info(f'Replacing {n_zeros} zeros in {c} with NaN and imputing median')
        # Assign the result back to the column instead of calling inplace
        # methods on `df_clean[c]`: that selection can be a temporary object,
        # so inplace edits warn on recent pandas and are not propagated to
        # the frame when Copy-on-Write is enabled.
        with_nan = df_clean[c].replace(0, np.nan)
        # median() skips NaN, so the former zeros do not drag the value down.
        median_val = with_nan.median()
        df_clean[c] = with_nan.fillna(median_val)

# NOTE(review): the medians above are computed on the full dataset; if a
# train/test split follows, test rows influence the imputed values.

# Drop duplicates if any
n_before = len(df_clean)
df_clean.drop_duplicates(inplace=True)
n_after = len(df_clean)
logger.info(f'Dropped {n_before - n_after} duplicate rows')

df_clean.shape

In [None]:
# Section 6: Feature / target preparation
# Detect target column (common Pima column is 'Outcome'); the first candidate
# name present in the frame wins.
possible_target_names = ['Outcome','outcome','target','Target']
target_col = next((t for t in possible_target_names if t in df_clean.columns), None)
if target_col is None:
    # fallback: assume last column is target
    target_col = df_clean.columns[-1]
    logger.warning(f'No common target column found. Falling back to last column: {target_col}')

# Define features. The target is excluded explicitly: with the last-column
# fallback it may be one of the default feature names, and keeping it in X
# would leak the label into the model inputs.
default_features = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
features = [f for f in default_features if f in df_clean.columns and f != target_col]
if not features:
    # if none of the expected features exist, use all except target
    features = [c for c in df_clean.columns if c != target_col]

logger.info(f'Using target: {target_col} and features: {features}')
X = df_clean[features].copy()
y = df_clean[target_col].copy()

# Show class distribution (as fractions, not counts)
print('Target distribution:')
print(y.value_counts(normalize=True))

In [None]:
# Section 7: Train/test split and scaling
# 80/20 split, stratified on the target so both parts keep its class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
logger.info(f'Train shape: {X_train.shape}, Test shape: {X_test.shape}')

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Section 8: Train RandomForest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
# perf_counter() is a monotonic clock intended for measuring elapsed time,
# so the duration is unaffected by system clock adjustments.
t0 = time.perf_counter()
clf.fit(X_train_scaled, y_train)
train_time = time.perf_counter() - t0
logger.info(f'Training completed in {train_time:.2f} seconds')

# Evaluate on test set
y_pred = clf.predict(X_test_scaled)
acc = accuracy_score(y_test, y_pred)
logger.info(f'Test accuracy: {acc:.4f}')
print('\nClassification report:')
print(classification_report(y_test, y_pred))

# confusion matrix
cm = confusion_matrix(y_test, y_pred)
print('Confusion matrix:\n', cm)

# ROC AUC if possible
if hasattr(clf, 'predict_proba'):
    # Second probability column is used as the score.
    # NOTE(review): assumes a binary target whose positive class is
    # clf.classes_[1] - confirm.
    y_proba = clf.predict_proba(X_test_scaled)[:,1]
    try:
        auc = roc_auc_score(y_test, y_proba)
        logger.info(f'ROC AUC: {auc:.4f}')
    except Exception as e:
        # Best-effort metric: report why it was skipped instead of aborting the cell.
        logger.warning(f'ROC AUC not computed: {e}')

In [None]:
# Section 9: Save model and artifacts to backend/model
MODEL_DIR = Path(MODEL_DIR)
# Create the output directory (and any missing parents); no error if it exists.
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Bundle the estimator, the fitted scaler and bookkeeping metadata into one
# dict so they are pickled together. 'trained_at' is a Unix timestamp.
artifact = {
    'model': clf,
    'scaler': scaler,
    'features': features,
    'target_col': target_col,
    'trained_at': time.time(),
    'random_state': RANDOM_STATE,
}
with MODEL_FILE.open('wb') as f:
    pickle.dump(artifact, f)
logger.info(f'Model saved to: {MODEL_FILE}')

In [None]:
# Section 10: Quick sanity checks
# The pickle written by the previous cell must be on disk.
assert MODEL_FILE.exists(), 'Model file was not created'
print('Model file exists:', MODEL_FILE)

# Sample prediction: push the first three unscaled test rows through the same
# scaler and classifier that were just trained.
sample_X = X_test.head(3)
sample_scaled = scaler.transform(sample_X)
sample_pred = clf.predict(sample_scaled)
print('Sample predictions (first 3 test rows):', sample_pred)

## Notes & Next steps
- Consider using a scikit-learn Pipeline to combine preprocessing and estimator for cleaner serialization.
- Add hyperparameter tuning (GridSearchCV / RandomizedSearchCV) and persist CV results.
- Save model metadata (metrics, feature importances) alongside the artifact.
- If model file is large, prefer saving to an artifact store rather than committing to Git.