# 03 - ML Modeling

Train and evaluate machine learning models that predict whether a project is high-risk (binary target: `risk_level == 'High'`).

## Models to Compare
- Random Forest
- XGBoost
- LightGBM

In [None]:
import pandas as pd
import numpy as np
import sys
# Put the parent directory first on the module search path so the
# project-local `src` package imported below can be found
# (presumably this notebook sits one level below the project root).
sys.path.insert(0, '..')

from src.models.ml import MLTrainer, ModelEvaluator
from src.data import FeatureEngineer

from sklearn.model_selection import train_test_split

# Only reached when every import above succeeded
print("Modules loaded!")

In [None]:
# Read the raw sample projects from disk
raw_path = '../data/raw/sample_projects.csv'
raw_df = pd.read_csv(raw_path)

# Run the project's feature engineering; `df` is the frame every later
# cell works from
fe = FeatureEngineer()
df = fe.create_features(raw_df)

print(f"Data shape: {df.shape}")

In [None]:
# Prepare features and target
# Binary target: 1 where risk_level is exactly 'High', 0 for every other
# value (the comparison is case-sensitive, and missing values compare False)
df['target'] = (df['risk_level'] == 'High').astype(int)

# Candidate feature columns; not every one is guaranteed to exist in df
candidate_cols = [
    'completion_rate', 'team_size', 'budget', 'spent',
    'schedule_performance_index', 'cost_performance_index',
    'budget_variance_pct', 'team_stability'
]

# Filter to available columns, reporting the ones that were dropped so a
# renamed or missing feature does not silently vanish from the model
feature_cols = [c for c in candidate_cols if c in df.columns]
missing_cols = [c for c in candidate_cols if c not in df.columns]
if missing_cols:
    print(f"Skipping unavailable features: {missing_cols}")
if not feature_cols:
    # Fail here with a clear message instead of handing a zero-column
    # feature matrix to the split and training cells
    raise ValueError(
        "None of the candidate feature columns are present in df"
    )

# Missing feature values are imputed with the constant 0
X = df[feature_cols].fillna(0)
y = df['target']

print(f"Features: {len(feature_cols)}")
print(f"Target distribution: {y.value_counts().to_dict()}")

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y so the
# train and test subsets keep approximately the same class balance, and the
# fixed seed makes reruns produce the same partition.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train: {len(X_train)}, Test: {len(X_test)}")

In [None]:
# Compare models
# Only the training split is passed in, so the held-out test set plays no
# part in choosing a model.
# NOTE(review): presumably compares the models listed in the notebook header
# (Random Forest, XGBoost, LightGBM) — confirm in MLTrainer.compare_models.
trainer = MLTrainer()
comparison = trainer.compare_models(X_train, y_train)
# Bare expression: displayed as the cell's output when run in a notebook
comparison

In [None]:
# Train best model
# NOTE(review): model_type is hard-coded to 'random_forest' rather than taken
# from `comparison` — confirm it is the top performer, or update this line.
trainer = MLTrainer(model_type='random_forest')
# Trains with cross-validation (per the method name) on the training split;
# the returned mapping supplies the 'mean_score' and 'std_score' printed below.
# NOTE(review): the evaluation cell calls trainer.model.predict, so this call
# presumably also leaves a fitted estimator on trainer.model — confirm.
result = trainer.train_with_cv(X_train, y_train)

print(f"CV Score: {result['mean_score']:.3f} (+/- {result['std_score']:.3f})")

In [None]:
# Score the held-out test set with the model trained in the previous cell
fitted_model = trainer.model
y_pred = fitted_model.predict(X_test)
# Second probability column is used as the score for target == 1
# (assumes scikit-learn-style class ordering — TODO confirm)
class_probabilities = fitted_model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Compute the metrics, then print the evaluator's report
evaluator = ModelEvaluator()
results = evaluator.evaluate(y_test, y_pred, y_proba)
evaluator.print_report()

In [None]:
# Save model (disabled by default; uncomment the line below to write the trained model to disk)
# trainer.save_model('../models/ml/best_model.pkl')