In [2]:
# ======================================================================
# model_training.ipynb  |  Disease-Prediction Mini-Hackathon
# FOCUSED: Train multiple models using selected features
# Author: <your name>  |  Python 3.10.11
# ======================================================================

# %% [markdown]
# # 1. Setup & Configuration

# %%
import os, warnings, logging, joblib, time
import numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import xgboost as xgb, lightgbm as lgb, catboost as cb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

warnings.filterwarnings("ignore")
SEED = 42

ROOT = Path.cwd().parent if Path.cwd().name.lower()=="notebook" else Path.cwd()
PROC = ROOT / "data" / "processed" 
MODELS = ROOT / "models"

print("🏋️ Model Training Pipeline") 
print("=" * 50)

# %% [markdown]
# # 2. Load Selected Features & Data

# %%
# Load selected features
selected_features = joblib.load(MODELS / "selected_features.pkl")
print(f"Loaded {len(selected_features)} selected features")

# Load training data with selected features
X_train = pd.read_csv(PROC / "X_train_selected.csv")
y_train = pd.read_csv(PROC / "y_train.csv").squeeze()
X_valid = pd.read_csv(PROC / "X_valid_selected.csv") 
y_valid = pd.read_csv(PROC / "y_valid.csv").squeeze()

print(f"Training: {X_train.shape}, Validation: {X_valid.shape}")

# %% [markdown]
# # 3. Model Training Function

# %%
def train_evaluate_model(model, X_tr, y_tr, X_val, y_val, name):
    """Train and evaluate a single model"""
    start_time = time.time()
    
    # Train
    model.fit(X_tr, y_tr)
    
    # Predictions
    train_pred = model.predict(X_tr)
    valid_pred = model.predict(X_val)
    
    # Metrics
    train_acc = accuracy_score(y_tr, train_pred)
    valid_acc = accuracy_score(y_val, valid_pred)
    
    # Cross-validation
    cv_scores = cross_val_score(model, X_tr, y_tr, cv=5, scoring='accuracy')
    training_time = time.time() - start_time
    
    results = {
        'model': model,
        'train_acc': train_acc,
        'valid_acc': valid_acc,
        'cv_mean': cv_scores.mean(),
        'cv_std': cv_scores.std(),
        'training_time': training_time
    }
    
    print(f"\n{name} Results:")
    print(f"  Train Acc: {train_acc:.4f}")
    print(f"  Valid Acc: {valid_acc:.4f}")
    print(f"  CV Score:  {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")
    print(f"  Time:      {training_time:.2f}s")
    
    return results

# %% [markdown]
# # 4. Train Individual Models

# %%
print("\n🏋️ TRAINING CHAMPION MODELS")
print("=" * 50)

trained_models = {}

# XGBoost
xgb_model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED,
    n_jobs=-1,
    verbosity=0
)
trained_models['XGBoost'] = train_evaluate_model(xgb_model, X_train, y_train, X_valid, y_valid, "XGBoost")

# LightGBM
lgb_model = lgb.LGBMClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.1,
    feature_fraction=0.8,
    random_state=SEED,
    n_jobs=-1,
    verbose=-1
)
trained_models['LightGBM'] = train_evaluate_model(lgb_model, X_train, y_train, X_valid, y_valid, "LightGBM")

# CatBoost
cb_model = cb.CatBoostClassifier(
    iterations=300,
    depth=6,
    learning_rate=0.1,
    random_state=SEED,
    verbose=False
)
trained_models['CatBoost'] = train_evaluate_model(cb_model, X_train, y_train, X_valid, y_valid, "CatBoost")

# Random Forest
rf_model = RandomForestClassifier(
    n_estimators=300,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=SEED,
    n_jobs=-1
)
trained_models['RandomForest'] = train_evaluate_model(rf_model, X_train, y_train, X_valid, y_valid, "RandomForest")

# %% [markdown]
# # 5. Create Ensemble

# %%
print("\n🎯 CREATING ENSEMBLE")
print("=" * 50)

# Select top 3 models for ensemble
best_models = sorted(trained_models.items(), key=lambda x: x[1]['cv_mean'], reverse=True)[:3]
print("Top 3 models for ensemble:")
for i, (name, results) in enumerate(best_models, 1):
    print(f"  {i}. {name}: {results['cv_mean']:.4f}")

# Create voting ensemble
voting_ensemble = VotingClassifier(
    estimators=[(name, results['model']) for name, results in best_models],
    voting='soft'
)

trained_models['VotingEnsemble'] = train_evaluate_model(
    voting_ensemble, X_train, y_train, X_valid, y_valid, "Voting Ensemble"
)

# %% [markdown]
# # 6. Save All Trained Models

# %%
print("\n💾 SAVING TRAINED MODELS")
print("=" * 50)

# Save individual models
for name, results in trained_models.items():
    model_filename = MODELS / f"model_{name.lower()}.pkl"
    joblib.dump(results['model'], model_filename)
    print(f"✅ Saved {model_filename}")

# Save training results summary
training_summary = {
    name: {
        'train_accuracy': float(results['train_acc']),
        'valid_accuracy': float(results['valid_acc']),
        'cv_mean': float(results['cv_mean']),
        'cv_std': float(results['cv_std']),
        'training_time': float(results['training_time'])
    }
    for name, results in trained_models.items()
}

with open(MODELS / "training_results.json", 'w') as f:
    import json
    json.dump(training_summary, f, indent=2)

print(f"✅ All models trained and saved!")
print(f"🏆 Ready for model_selection.ipynb")


🏋️ Model Training Pipeline
Loaded 93 selected features
Training: (3936, 93), Validation: (984, 93)

🏋️ TRAINING CHAMPION MODELS

XGBoost Results:
  Train Acc: 1.0000
  Valid Acc: 1.0000
  CV Score:  0.9995 ± 0.0010
  Time:      96.71s

LightGBM Results:
  Train Acc: 1.0000
  Valid Acc: 1.0000
  CV Score:  0.9995 ± 0.0010
  Time:      39.44s

CatBoost Results:
  Train Acc: 1.0000
  Valid Acc: 1.0000
  CV Score:  1.0000 ± 0.0000
  Time:      129.94s

RandomForest Results:
  Train Acc: 1.0000
  Valid Acc: 1.0000
  CV Score:  0.9985 ± 0.0019
  Time:      5.32s

🎯 CREATING ENSEMBLE
Top 3 models for ensemble:
  1. CatBoost: 1.0000
  2. XGBoost: 0.9995
  3. LightGBM: 0.9995

Voting Ensemble Results:
  Train Acc: 1.0000
  Valid Acc: 1.0000
  CV Score:  1.0000 ± 0.0000
  Time:      232.18s

💾 SAVING TRAINED MODELS
✅ Saved d:\Portfolio\disease\models\model_xgboost.pkl
✅ Saved d:\Portfolio\disease\models\model_lightgbm.pkl
✅ Saved d:\Portfolio\disease\models\model_catboost.pkl
✅ Saved d:\Portfoli