# 10: Random Forest Classifier

Train and evaluate a Random Forest classifier for head impact prediction.

## Model Configuration

- **Algorithm**: RandomForestClassifier from sklearn
- **Task**: Classification (severity 0-5)
- **Features**: 21 RF features
- **Hyperparameters**:
  - n_estimators: 100
  - max_depth: 20
  - class_weight: balanced (handles class imbalance)
  - max_features: sqrt (not set explicitly in the training config below; the default is relied on)


In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
sys.path.append('../../src')

from sledhead_imu.models.random_forest import (
    train_random_forest,
    predict_random_forest,
    evaluate_random_forest
)
from sklearn.metrics import classification_report, confusion_matrix

# Resolve the data directories: splits are read from 09_splits, model artifacts go to 10_models
data_dir = Path('../data')
splits_dir = data_dir / '09_splits'
models_dir = data_dir / '10_models'

# One feature/label CSV pair per split
train_X_file, train_y_file = (splits_dir / 'train' / fname for fname in ('X_train.csv', 'y_train.csv'))
val_X_file, val_y_file = (splits_dir / 'val' / fname for fname in ('X_val.csv', 'y_val.csv'))
test_X_file, test_y_file = (splits_dir / 'test' / fname for fname in ('X_test.csv', 'y_test.csv'))


def _read_labels(csv_path):
    """Return the first column of the CSV at *csv_path* as a Series."""
    # read_csv always yields a DataFrame, so the label vector is its first column
    return pd.read_csv(csv_path).iloc[:, 0]


# Train and validation files are mandatory; the test pair is optional
required_files = (train_X_file, train_y_file, val_X_file, val_y_file)

if not all(path.exists() for path in required_files):
    print("⚠️  Splits not found. Run 09_train_test_split.ipynb first.")
else:
    X_train = pd.read_csv(train_X_file)
    y_train = _read_labels(train_y_file)
    X_val = pd.read_csv(val_X_file)
    y_val = _read_labels(val_y_file)

    print("✓ Loaded splits")
    print(f"  Train: {X_train.shape} features, {len(y_train)} samples")
    print(f"  Val: {X_val.shape} features, {len(y_val)} samples")

    # X_test / y_test are only defined when both test files are present;
    # the test-evaluation cell checks for X_test before using them
    if test_X_file.exists() and test_y_file.exists():
        X_test = pd.read_csv(test_X_file)
        y_test = _read_labels(test_y_file)
        print(f"  Test: {X_test.shape} features, {len(y_test)} samples")

    print(f"\n✓ Feature columns: {list(X_train.columns)[:5]}... ({X_train.shape[1]} total)")


In [None]:
# Fit the Random Forest; skipped when the split-loading cell did not define X_train.
# At cell/module top level globals() and locals() are the same namespace.
if 'X_train' in globals():
    print("Training Random Forest...")

    # Hyperparameters handed to the project's training helper
    config = dict(
        n_estimators=100,
        max_depth=20,
        min_samples_split=2,
        min_samples_leaf=1,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )

    # NOTE(review): how the helper uses the validation split is not visible here — confirm
    model = train_random_forest(X_train, y_train, X_val, y_val, config)

    # Echo the key settings the model was trained with
    print(f"✓ Model trained with {config['n_estimators']} trees")
    print(f"  Max depth: {config['max_depth']}")
    print(f"  Class weight: {config['class_weight']}")


In [None]:
# Score the trained model on the validation split.
# Runs only when both the model and the validation features exist in the session.
if all(name in globals() for name in ('model', 'X_val')):
    print("Evaluating on validation set...")

    # `results` is kept at top level: the save cell reads its feature importances
    results = evaluate_random_forest(model, X_val, y_val)

    val_accuracy = results['accuracy']
    print(f"\n✓ Validation Accuracy: {val_accuracy:.3f}")

    print("\nConfusion Matrix:")
    print(results['confusion_matrix'])

    print("\nClassification Report:")
    val_report = classification_report(y_val, results['predictions'])
    print(val_report)

    print("\nTop 10 Feature Importances:")
    top_features = results['feature_importance'].head(10)
    print(top_features)


In [None]:
# Score the trained model on the held-out test split, when one was loaded.
# X_test and y_test are defined together by the split-loading cell.
if 'model' not in globals() or 'X_test' not in globals():
    print("Test set not available")
else:
    print("Evaluating on test set...")

    test_results = evaluate_random_forest(model, X_test, y_test)

    test_accuracy = test_results['accuracy']
    print(f"\n✓ Test Accuracy: {test_accuracy:.3f}")

    print("\nConfusion Matrix:")
    print(test_results['confusion_matrix'])

    print("\nClassification Report:")
    test_report = classification_report(y_test, test_results['predictions'])
    print(test_report)


In [None]:
# Persist the trained model, the validation feature importances (if computed),
# and the training configuration under <models_dir>/rf/.
if 'model' in locals():
    import pickle
    import json

    # parents=True creates models_dir as well, so one mkdir covers both levels
    rf_dir = models_dir / 'rf'
    rf_dir.mkdir(parents=True, exist_ok=True)

    # Save model
    model_file = rf_dir / 'model.pkl'
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    print(f"✓ Saved model to {model_file}")

    # Save feature importances (only available once the validation cell has run)
    if 'results' in locals():
        feature_importance_file = rf_dir / 'feature_importance.csv'
        results['feature_importance'].to_csv(feature_importance_file, index=False)
        print(f"✓ Saved feature importances to {feature_importance_file}")

    # Save config.
    # Serialise a copy: adding 'model_type' to `config` itself would leave a
    # non-hyperparameter key in the dict that the training cell passes to
    # train_random_forest if it is reused without being rebuilt.
    config_file = rf_dir / 'config.json'
    saved_config = {**config, 'model_type': 'RandomForestClassifier'}
    with open(config_file, 'w', encoding='utf-8') as f:
        json.dump(saved_config, f, indent=2)
    print(f"✓ Saved config to {config_file}")

    print("\n✅ Model training and evaluation complete!")
