# 🤖 Model Training Pipeline

**Project:** Predicting Paid Amount for Medical Claims  
**Stage:** Model Training & Selection  

---

## Overview

1. **Data Preparation** - Load processed features, train/test split
2. **Linear Models** - Lasso, Ridge regression
3. **Ensemble Models** - Random Forest, Gradient Boosting
4. **Model Comparison** - Compare all models
5. **Model Selection** - Save best model for deployment

In [None]:
# Notebook setup: make the project package importable, configure logging,
# and define the directories used by the later cells.
import sys
from pathlib import Path
import warnings

# NOTE: this silences every warning category for the whole session.
warnings.filterwarnings('ignore')

# The project root is taken to be the parent of the current working
# directory, so the notebook must be started from a direct subfolder of it.
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np

# These imports must come after the sys.path tweak above; `src` is resolved
# relative to project_root.
from src.utils.logger import setup_logging, get_logger, PipelineLogger
from src.models.model_trainer import ModelTrainer, ModelRegistry
from src.models.model_evaluator import ModelEvaluator

setup_logging(log_level="INFO")
logger = get_logger(__name__)

# Paths
PROCESSED_DIR = project_root / "data" / "processed"
MODELS_DIR = project_root / "models"
# Create the models directory up front so saving later cannot fail on a
# missing folder.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print("✓ Setup complete")

## 1. Load Data and Split

In [None]:
# Load the processed claims table (or build a synthetic stand-in), separate
# the target column from the features, and create the train/test split.
parquet_path = PROCESSED_DIR / "processed_claims.parquet"
TARGET = 'AMT_PAID'

if parquet_path.exists():
    df = pd.read_parquet(parquet_path)
else:
    # Make the fallback visible: every model trained below, including the one
    # saved at the end of the notebook, is fitted on synthetic data.
    logger.warning(
        f"Processed data not found at {parquet_path}; "
        "falling back to synthetic demo data"
    )
    # Create demo data: 20 standard-normal features, with the target being a
    # linear function of the first two features plus Gaussian noise.
    np.random.seed(42)
    n = 50000
    n_features = 20
    X_demo = pd.DataFrame(np.random.randn(n, n_features),
                          columns=[f'feature_{i}' for i in range(n_features)])
    y_demo = 500 + 200 * X_demo['feature_0'] + 100 * X_demo['feature_1'] + np.random.randn(n) * 100
    df = pd.concat([X_demo, pd.Series(y_demo, name=TARGET)], axis=1)

# Fail with an explicit message instead of a bare KeyError when the loaded
# table lacks the target column.
if TARGET not in df.columns:
    raise KeyError(f"Target column {TARGET!r} not found in loaded data")

# Separate features and target
y = df[TARGET]
X = df.drop(columns=[TARGET])

print(f"✓ Data loaded: {X.shape[0]:,} samples, {X.shape[1]} features")

# Train/test split (the trainer is configured with test_size=0.2 and a fixed
# random_state; the split itself is implemented in ModelTrainer.split_data).
trainer = ModelTrainer(random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = trainer.split_data(X, y)

## 2. Train Linear Models

In [None]:
# Train the two linear models inside one PipelineLogger context.
with PipelineLogger("Training Linear Models", logger):
    # Both calls receive the same four data arguments, in this order.
    # NOTE(review): the test split is presumably used by train_model to
    # compute validation metrics; confirm against ModelTrainer.
    linear_data_args = (X_train, y_train, X_test, y_test)

    # Model key 'lasso' with alpha=0.1
    lasso_result = trainer.train_model('lasso', *linear_data_args,
                                       params=dict(alpha=0.1))

    # Model key 'ridge' with alpha=0.5
    ridge_result = trainer.train_model('ridge', *linear_data_args,
                                       params=dict(alpha=0.5))

## 3. Train Ensemble Models

In [None]:
# Train the two tree-ensemble models inside one PipelineLogger context.
with PipelineLogger("Training Ensemble Models", logger):
    # Both calls receive the same four data arguments, in this order.
    ensemble_data_args = (X_train, y_train, X_test, y_test)

    # Model key 'random_forest': 100 estimators, depth capped at 20, n_jobs=-1
    rf_result = trainer.train_model(
        'random_forest',
        *ensemble_data_args,
        params=dict(n_estimators=100, max_depth=20, n_jobs=-1),
    )

    # Model key 'gradient_boosting': 100 estimators, depth capped at 5
    gb_result = trainer.train_model(
        'gradient_boosting',
        *ensemble_data_args,
        params=dict(n_estimators=100, max_depth=5),
    )

## 4. Model Comparison

In [None]:
# Compare all models: tabulate each trained model's validation metrics,
# print the table sorted by R² (best first), then report the trainer's pick.
print("\n" + "="*60)
print("📊 MODEL COMPARISON")
print("="*60)

# One row per entry in trainer.trained_models; the metrics are read from
# each result's metadata.validation_metrics mapping.
results = [
    {
        'Model': name,
        'R²': result.metadata.validation_metrics['r2'],
        'RMSE': result.metadata.validation_metrics['rmse'],
        'MAE': result.metadata.validation_metrics['mae'],
    }
    for name, result in trainer.trained_models.items()
]

comparison_df = pd.DataFrame(results).sort_values('R²', ascending=False)
print(comparison_df.to_string(index=False))

# Get best model (the selection criterion is defined by
# ModelTrainer.get_best_model, not by the sort order above).
best_result = trainer.get_best_model()
print(f"\n🏆 Best Model: {best_result.metadata.model_type}")
print(f"   Validation R²: {best_result.metadata.validation_metrics['r2']:.4f}")

## 5. Save Best Model

In [None]:
# Persist the best model through the registry and mark it as the
# production model.
registry = ModelRegistry(str(MODELS_DIR))

# Single source for the registry name, so the save and the promotion
# cannot drift apart.
MODEL_NAME = "claims_predictor"

# Fetch the best model again here so this cell does not depend on the
# comparison cell having been run.
best_result = trainer.get_best_model()
model_path = registry.save_model(best_result, model_name=MODEL_NAME)

# Set as production model
registry.set_production_model(MODEL_NAME)

print(f"\n💾 Model saved to: {model_path}")
print("\n✅ Training completed! Next: Run 05_model_evaluation.ipynb")