# Sprint 6: Clinical Model Training (PyCaret)

**Objective:** Train and optimize the machine learning model using the new NHANES clinical biomarkers.
**Key Changes:**
- Uses `SAMPLE_FRAC = 0.005` (a random 0.5% sample of the rows) for development speed; raise it for a full training run.
- Explicitly defines **numeric** and **categorical** features from `models/model_config.json`.
- Applies `RobustScaler` to handle varying clinical scales.

In [None]:
import pandas as pd
import numpy as np
from pycaret.classification import setup, compare_models, save_model, pull
import os
import json

# --- PERFORMANCE CRITICAL CONFIG ---
# Fraction of the loaded rows that is kept when the data is sampled.
# A small value keeps development runs fast.
SAMPLE_FRAC = 0.005
# -----------------------------------

# Echo the active fraction as a percentage so the run is self-describing.
_sample_pct = SAMPLE_FRAC * 100
print(f"⚙️ Config: Using {_sample_pct}% of available data.")

## 1. Load Configuration

In [None]:
# Locate the model configuration: try one directory above the working
# directory first, then fall back to a path relative to the working directory.
config_path = "../models/model_config.json"
if not os.path.exists(config_path):
    config_path = "models/model_config.json"

# Read the feature lists and the target name from the configuration.
# On any load/parse/schema failure the notebook keeps running with empty
# feature lists and the default target name 'HeartDisease'.
try:
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(config_path, 'r', encoding='utf-8') as f:
        config = json.load(f)
    numeric_feats = config['numeric_features']
    categorical_feats = config['categorical_features']
    target = config['target']
except (OSError, ValueError, KeyError, TypeError) as e:
    # OSError: file missing/unreadable; ValueError: invalid JSON or bad
    # encoding; KeyError/TypeError: required key absent or config not a dict.
    print(f"❌ Error loading config: {e}")
    numeric_feats = []
    categorical_feats = []
    target = 'HeartDisease'
else:
    # Report success only after every required key has been read.
    print("✅ Configuration loaded.")

## 2. Load and Prepare Data

In [None]:
# Locate the processed dataset: try one directory above the working directory
# first, then fall back to a path relative to the working directory.
data_path = "../data/02_intermediate/process_data.parquet"
if not os.path.exists(data_path):
    data_path = "data/02_intermediate/process_data.parquet"

if os.path.exists(data_path):
    df = pd.read_parquet(data_path)

    # Rename Spanish columns to English Schema
    col_map = {
        'Presion_Sistolica': 'SystolicBP', 'Colesterol_Total': 'TotalCholesterol',
        'LDL': 'LDL', 'Triglycerides': 'Triglycerides', 'HbA1c': 'HbA1c',
        'Glucosa': 'Glucose', 'Acido_Urico': 'UricAcid', 'Creatinina': 'Creatinine',
        'BMI': 'BMI', 'Cintura': 'WaistCircumference', 'Sexo': 'Sex',
        'Fumador': 'Smoking', 'Actividad_Fisica': 'PhysicalActivity',
        'TARGET': 'HeartDisease'
    }
    df.rename(columns=col_map, inplace=True)

    # Configured categorical features absent from the data are added as a
    # constant-0 placeholder column.
    missing_categorical = [c for c in categorical_feats if c not in df.columns]
    for col in missing_categorical:
        df[col] = 0

    # Configured numeric features absent from the data are added as all-NaN
    # columns. No imputation happens here; the values stay missing.
    # Computed after the categorical pass so a name listed in both groups is
    # only created once (as categorical).
    missing_numeric = [c for c in numeric_feats if c not in df.columns]
    for col in missing_numeric:
        df[col] = np.nan

    # Placeholder columns carry no information; make their presence visible
    # instead of fabricating them silently.
    if missing_categorical or missing_numeric:
        print(
            "⚠️ Placeholder columns added for features missing from the data — "
            f"categorical (0): {missing_categorical}; numeric (NaN): {missing_numeric}"
        )

    # --- SAMPLING ---
    # Fixed random_state keeps the sample reproducible between runs.
    df = df.sample(frac=SAMPLE_FRAC, random_state=42).reset_index(drop=True)
    print(f"✅ Data loaded & Sampled. Active Shape: {df.shape}")
    if df.empty:
        # A tiny fraction of a small dataset can round down to zero rows.
        print("⚠️ Sample contains 0 rows; increase SAMPLE_FRAC.")

else:
    print("❌ Data file not found.")
    # Empty frame lets later cells detect the failure via df.empty.
    df = pd.DataFrame()

## 3. PyCaret Setup

In [None]:
# Initialise the PyCaret experiment, but only when there is data to train on
# and the target column is present in it.
if df.empty or target not in df.columns:
    print("❌ Setup skipped (No data or target).")
else:
    # Keep only the configured feature names that exist in the sampled frame.
    valid_numeric = [name for name in numeric_feats if name in df.columns]
    valid_categorical = [name for name in categorical_feats if name in df.columns]

    # Experiment options, grouped by purpose.
    setup_options = {
        # Data and feature typing
        'data': df,
        'target': target,
        'numeric_features': valid_numeric,
        'categorical_features': valid_categorical,
        # Scaling
        'normalize': True,
        'normalize_method': 'robust',
        # Missing-value handling
        'imputation_type': 'simple',
        'numeric_imputation': 'mean',
        'categorical_imputation': 'constant',
        # Reproducibility and output
        'session_id': 42,
        'verbose': True,
        'html': False,
    }
    exp = setup(**setup_options)

    print("✅ PyCaret Setup Complete.")

## 4. Compare Models & Save Best Pipeline

In [None]:
# Compare candidate models and persist the best pipeline.
# The guard matches the one used for the PyCaret setup step: without the
# target column no experiment was initialised, so comparing models would fail.
if not df.empty and target in df.columns:
    # Rank candidates by Recall and keep the single best one.
    best_model = compare_models(sort='Recall', n_select=1)
    # Show the comparison grid produced by the call above.
    print(pull())

    if isinstance(best_model, list) and not best_model:
        # An empty list means no candidate finished training; there is
        # nothing to save.
        print("❌ No model could be trained; nothing saved.")
    else:
        # Follow the same path fallback as the config/data lookups: use
        # "models" only when it exists and "../models" does not; otherwise
        # keep the original "../models" location (created if needed).
        if os.path.isdir("models") and not os.path.isdir("../models"):
            models_dir = "models"
        else:
            models_dir = "../models"
        os.makedirs(models_dir, exist_ok=True)

        save_path = f"{models_dir}/best_pipeline"
        # save_model appends the ".pkl" extension itself.
        save_model(best_model, save_path)
        print(f"✅ Best pipeline saved to {save_path}.pkl")
else:
    print("❌ Training skipped (No data or target).")