In [None]:
import pandas as pd
from pycaret.classification import *
import os
import json

# ==========================================
# CONFIGURATION
# ==========================================
# Inputs: the processed dataset and the JSON schema config read by the load step
DATA_PATH = "../data/02_intermediate/process_data.parquet"
CONFIG_PATH = "../models/model_config.json"

# Output: directory and base file name used when the trained model is saved
MODEL_DIR = "../models"
MODEL_NAME = "best_pipeline"

# Fraction of rows to train on; use 1.0 to train on the full dataset
SAMPLE_FRAC = 0.5

print(f"Running Training with SAMPLE_FRAC = {SAMPLE_FRAC}")

In [None]:
# ==========================================
# 1. LOAD DATA
# ==========================================
# Reads the processed dataset and the schema config, keeps only the configured
# feature/target columns, and optionally down-samples the rows.
# Produces: df, config, features, target, numeric_features, categorical_features.

# Fail fast, with a clear message, if either input file is missing.
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found at {DATA_PATH}")
if not os.path.exists(CONFIG_PATH):
    raise FileNotFoundError(f"Config file not found at {CONFIG_PATH}")

# Reject a non-positive fraction up front: frac=0 would silently give an empty
# frame and only fail later, far from the actual cause.
if SAMPLE_FRAC <= 0:
    raise ValueError(f"SAMPLE_FRAC must be > 0, got {SAMPLE_FRAC}")

df = pd.read_parquet(DATA_PATH)
print(f"Original Data Shape: {df.shape}")

# Load Schema Config.
# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
    config = json.load(f)

# Report every missing key at once instead of failing on the first lookup.
missing_keys = [
    key
    for key in ('features', 'target', 'numeric_features', 'categorical_features')
    if key not in config
]
if missing_keys:
    raise KeyError(f"Config at {CONFIG_PATH} is missing required keys: {missing_keys}")

features = config['features']
target = config['target']
numeric_features = config['numeric_features']
categorical_features = config['categorical_features']

# Check that the schema matches the data before selecting columns.
missing_columns = [col for col in features + [target] if col not in df.columns]
if missing_columns:
    raise KeyError(
        f"Columns listed in {CONFIG_PATH} are missing from the data: {missing_columns}"
    )

# Filter only relevant columns
df = df[features + [target]]

if SAMPLE_FRAC < 1.0:
    # Fixed random_state keeps the sampled subset identical between runs.
    df = df.sample(frac=SAMPLE_FRAC, random_state=42)
    print(f"Sampled Data Shape: {df.shape}")
else:
    print("Using Full Dataset")


In [None]:
# ==========================================
# 2. SETUP PYCARET
# ==========================================
# Initialise the PyCaret classification experiment on the filtered frame.
# Column typing comes from the schema config; the preprocessing switches are
# grouped below.
exp = setup(
    # --- data and schema ---
    data=df,
    target=target,
    categorical_features=categorical_features,
    numeric_features=numeric_features,
    # --- preprocessing ---
    normalize=True,
    normalize_method='robust',  # RobustScaler
    remove_outliers=True,
    fix_imbalance=True,
    # --- run settings ---
    session_id=123,  # fixed id, kept constant between runs
    verbose=True,
)

In [None]:
# ==========================================
# 3. COMPARE & TRAIN
# ==========================================
# Candidate models are ranked by Recall (Sensitivity), the metric called for
# by the medical requirements.
best_model = compare_models(sort='Recall')

# Header and model repr on separate lines, emitted with a single print call.
print("Best Model Found:", best_model, sep="\n")

In [None]:
# ==========================================
# 4. FINALIZE & SAVE
# ==========================================
# Destination without extension; the message below assumes save_model adds
# ".pkl" — confirm against the PyCaret docs.
save_path = os.path.join(MODEL_DIR, MODEL_NAME)

# Finalize the selected model, make sure the output directory exists, then
# persist the finalized model.
final_model = finalize_model(best_model)
os.makedirs(MODEL_DIR, exist_ok=True)
save_model(final_model, save_path)

print(f"Model saved successfully to {save_path}.pkl")