# Train All Models - Model Comparison

This notebook trains and compares all available model types:

**Boosting Models:**
- XGBoost
- LightGBM
- CatBoost

**Neural Models:**
- LSTM
- GRU
- TCN

**Classical Models:**
- Random Forest
- Logistic Regression
- SVM

**Ensemble Models:**
- Voting Ensemble
- Stacking Ensemble
- Blending Ensemble (listed for reference; this notebook does not train it)

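At the end, we compare all models on the same test split. Sequence models (LSTM, GRU, TCN) are scored on that split minus its first `seq_len` rows, which are consumed as input history.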

## Setup

In [None]:
# Install dependencies (uncomment if needed)
# !pip install -q xgboost lightgbm catboost scikit-learn torch pandas numpy matplotlib tqdm pyarrow

import sys
sys.path.insert(0, '..')

from src.utils.notebook import (
    setup_notebook, display_metrics, download_sample_data,
    plot_confusion_matrix, plot_model_comparison, get_sample_config
)

env = setup_notebook()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import time
import warnings
warnings.filterwarnings('ignore')

from src.models import ModelRegistry
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Print every registered model name, grouped by model family
print("Available Models:")
all_models = ModelRegistry.list_models()
for family, models in all_models.items():
    print(f"  {family}: " + ', '.join(models))

## Prepare Data

In [None]:
# Obtain the sample dataset and load the parquet file registered for "SAMPLE"
sample_paths = download_sample_data(symbols=["SAMPLE"], output_dir="../data/sample")
df = pd.read_parquet(sample_paths["SAMPLE"])
print(f"Loaded {len(df):,} samples")

In [None]:
def compute_features(df):
    """Compute technical-indicator features from OHLCV bars.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``close``, ``high``, ``low`` and ``volume`` columns.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with these columns appended: ``log_return``,
        ``return_{5,10,20}``, ``sma_{10,20,50}``, ``close_to_sma_{10,20,50}``,
        ``rsi``, ``atr_14``, ``atr_pct``, ``bb_position``, ``volume_sma``,
        ``volume_ratio``, ``volatility_20``, ``macd``, ``macd_signal`` and
        ``macd_hist``. Rows containing a NaN in any column (e.g. the rolling
        warm-up rows) are dropped.
    """
    df = df.copy()
    close = df['close']

    # Returns
    df['log_return'] = np.log(close / close.shift(1))
    for p in [5, 10, 20]:
        df[f'return_{p}'] = close.pct_change(p)

    # Moving averages and the relative distance of close from each of them
    for p in [10, 20, 50]:
        df[f'sma_{p}'] = close.rolling(p).mean()
        df[f'close_to_sma_{p}'] = close / df[f'sma_{p}'] - 1

    # RSI (14-bar simple averages of gains and losses)
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    # Plain division: a window with gains and no losses gives rs = inf, which
    # maps to RSI 100. (Replacing a zero loss with inf forced rs to 0 and
    # reported RSI 0 for the strongest possible up-move.)
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    # A completely flat window (no gains, no losses) is 0/0; report neutral 50.
    # Warm-up rows stay NaN because NaN == 0 is False.
    df['rsi'] = rsi.mask((gain == 0) & (loss == 0), 50.0)

    # ATR: 14-bar mean of the true range
    prev_close = close.shift(1)
    tr = pd.concat([
        df['high'] - df['low'],
        abs(df['high'] - prev_close),
        abs(df['low'] - prev_close),
    ], axis=1).max(axis=1)
    df['atr_14'] = tr.rolling(14).mean()
    df['atr_pct'] = df['atr_14'] / close

    # Bollinger: position of close inside the 20-bar +/- 2 std band
    # (0 = lower band, 1 = upper band)
    sma20 = close.rolling(20).mean()
    std20 = close.rolling(20).std()
    df['bb_position'] = (close - (sma20 - 2*std20)) / (4*std20)

    # Volume relative to its 20-bar mean
    df['volume_sma'] = df['volume'].rolling(20).mean()
    df['volume_ratio'] = df['volume'] / df['volume_sma']

    # Volatility: 20-bar std of log returns
    df['volatility_20'] = df['log_return'].rolling(20).std()

    # MACD (12/26 EMA difference, 9-span signal line, histogram)
    ema12 = close.ewm(span=12).mean()
    ema26 = close.ewm(span=26).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    return df.dropna()

def create_labels(df, horizon=20):
    """Create triple-barrier labels.

    For each row ``i`` the barriers are ``close[i] + 1.5 * atr_14[i]`` (upper)
    and ``close[i] - 1.0 * atr_14[i]`` (lower). The next ``horizon`` bars are
    scanned in order; the label is ``1`` if a bar's high reaches the upper
    barrier first, ``-1`` if a bar's low reaches the lower barrier first, and
    ``0`` if neither is touched. When one bar touches both barriers the upper
    barrier wins, because it is tested first.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``close``, ``high``, ``low`` and ``atr_14`` columns. Not modified.
    horizon : int
        Number of future bars to scan.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a float ``label`` column. The last ``horizon`` rows
        are never scanned and keep the default label ``0``.
    """
    df = df.copy()
    n_rows = len(df)
    labels = np.zeros(n_rows)

    # Pull the columns out once; per-element .iloc lookups inside the double
    # loop are far slower than indexing plain arrays.
    close = df['close'].values
    atr = df['atr_14'].values
    high = df['high'].values
    low = df['low'].values
    upper = close + 1.5 * atr
    lower = close - 1.0 * atr

    # i stops at n_rows - horizon - 1, so i + horizon is always a valid index
    # and no bounds check is needed inside the scan.
    for i in range(n_rows - horizon):
        for j in range(i + 1, i + horizon + 1):
            if high[j] >= upper[i]:
                labels[i] = 1
                break
            if low[j] <= lower[i]:
                labels[i] = -1
                break

    df['label'] = labels
    return df

# Process data: add the indicator columns, then attach the barrier labels
df_features = compute_features(df)
df_labeled = create_labels(df_features)

# Define features: every column except the raw identifier/OHLCV columns and the target.
# NOTE(review): level-valued columns created by compute_features (sma_*, atr_14,
# volume_sma) are not excluded here and therefore end up in feature_cols.
exclude_cols = ['datetime', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'label']
feature_cols = [c for c in df_labeled.columns if c not in exclude_cols]

print(f"Features: {len(feature_cols)}")

In [None]:
# Chronological 70% / 15% / 15% split; `purge` rows are dropped on both sides
# of each boundary so neighbouring splits do not touch.
n = len(df_labeled)
purge = 60
train_end, val_end = int(n * 0.70), int(n * 0.85)

train_df = df_labeled.iloc[:train_end - purge]
val_df = df_labeled.iloc[train_end + purge:val_end - purge]
test_df = df_labeled.iloc[val_end + purge:]

# Tabular data (for boosting/classical)
X_train, X_val, X_test = (
    part[feature_cols].values for part in (train_df, val_df, test_df)
)
# Labels are shifted by +1 so that -1/0/1 become the class ids 0/1/2
y_train, y_val, y_test = (
    part['label'].values.astype(int) + 1 for part in (train_df, val_df, test_df)
)

# Sequence data (for neural models)
# Number of consecutive feature rows in each input window
seq_len = 60

def create_sequences(X, y, seq_len):
    """Turn row-wise data into fixed-length windows with one target each.

    For every index ``i`` from ``seq_len`` to ``len(X) - 1`` the window is
    ``X[i - seq_len:i]`` (the ``seq_len`` rows before ``i``, row ``i`` itself
    excluded) and the target is ``y[i]``.

    Returns a ``(windows, targets)`` pair of NumPy arrays; both are empty when
    ``len(X) <= seq_len``.
    """
    ends = range(seq_len, len(X))
    windows = [X[end - seq_len:end] for end in ends]
    targets = [y[end] for end in ends]
    return np.array(windows), np.array(targets)

# Build the windowed inputs for the sequence models from the same three splits.
# NOTE(review): these windows come from the unscaled X_* arrays (the StandardScaler
# is only fitted in a later cell) - confirm the neural models normalise internally.
X_train_seq, y_train_seq = create_sequences(X_train, y_train, seq_len)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, seq_len)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, seq_len)

# Report the tabular array shapes next to the windowed sequence shapes
print(f"Tabular shapes: Train {X_train.shape}, Val {X_val.shape}, Test {X_test.shape}")
print(f"Sequence shapes: Train {X_train_seq.shape}, Val {X_val_seq.shape}, Test {X_test_seq.shape}")

## Train All Models

In [None]:
# Store results for comparison
# Maps a model name to the result dict returned by train_and_evaluate
all_results = {}

def train_and_evaluate(model_name, X_train, y_train, X_val, y_val, X_test, y_test, config=None):
    """Fit one registry model and score it on the test split.

    The model is built with ``ModelRegistry.create(model_name, config=config)``,
    fitted on the train/validation arrays and then asked to predict ``X_test``.
    A banner, the timing and the metrics are printed along the way.

    Returns
    -------
    dict
        On success: ``model``, ``training_metrics`` (as a dict),
        ``evaluation_metrics`` (``accuracy`` and ``macro_f1``),
        ``predictions``, ``train_time`` in seconds and ``success=True``.
        If any step raises, the error is printed and
        ``{'success': False, 'error': <message>}`` is returned instead.
    """
    rule = '=' * 60
    print(f"\n{rule}")
    print(f"Training: {model_name.upper()}")
    print(f"{rule}")

    try:
        model = ModelRegistry.create(model_name, config=config)
        print(f"Model family: {model.model_family}")

        # Wall-clock time spent inside fit()
        fit_started = time.time()
        training_metrics = model.fit(X_train, y_train, X_val, y_val)
        train_time = time.time() - fit_started

        # Score the held-out test split
        y_pred = model.predict(X_test).class_predictions
        evaluation_metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'macro_f1': f1_score(y_test, y_pred, average='macro'),
        }

        print(f"\nResults:")
        print(f"  Train Time: {train_time:.1f}s")
        print(f"  Val F1: {training_metrics.val_f1:.4f}")
        print(f"  Test Accuracy: {evaluation_metrics['accuracy']:.4f}")
        print(f"  Test F1 (macro): {evaluation_metrics['macro_f1']:.4f}")

        return {
            'model': model,
            'training_metrics': training_metrics.to_dict(),
            'evaluation_metrics': evaluation_metrics,
            'predictions': y_pred,
            'train_time': train_time,
            'success': True,
        }

    except Exception as e:
        # Notebook-level boundary: report the failure and let the other models run
        print(f"ERROR: {e}")
        return {'success': False, 'error': str(e)}

### Boosting Models

In [None]:
# XGBoost on the tabular splits
all_results['xgboost'] = train_and_evaluate(
    'xgboost',
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    config=dict(n_estimators=150, max_depth=6, early_stopping_rounds=20),
)

In [None]:
# LightGBM on the tabular splits
all_results['lightgbm'] = train_and_evaluate(
    'lightgbm',
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    config=dict(n_estimators=150, max_depth=6, early_stopping_rounds=20),
)

In [None]:
# CatBoost on the tabular splits
all_results['catboost'] = train_and_evaluate(
    'catboost',
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    config=dict(iterations=150, depth=6, early_stopping_rounds=20),
)

### Classical Models

In [None]:
# Random Forest on the tabular splits
all_results['random_forest'] = train_and_evaluate(
    'random_forest',
    X_train, y_train,
    X_val, y_val,
    X_test, y_test,
    config=dict(n_estimators=100, max_depth=10),
)

In [None]:
# Logistic Regression (needs scaled data)
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply it to all three splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

all_results['logistic'] = train_and_evaluate(
    'logistic',
    X_train_scaled, y_train,
    X_val_scaled, y_val,
    X_test_scaled, y_test,
    config=dict(max_iter=500),
)

In [None]:
# SVM (use subset for speed): fit on at most the first 5000 scaled training rows
subset_size = min(len(X_train_scaled), 5000)
all_results['svm'] = train_and_evaluate(
    'svm',
    X_train_scaled[:subset_size], y_train[:subset_size],
    X_val_scaled, y_val,
    X_test_scaled, y_test,
    config=dict(kernel='rbf', C=1.0),
)

### Neural Models (GPU required for best performance)

In [None]:
# Report whether a GPU is present; the neural models are trained either way
if not env['gpu_available']:
    print("No GPU available - neural models will be slow. Training anyway...")
else:
    print(f"GPU available: {env['gpu_name']} - training neural models")

# Both paths enable neural training; the cells below use the same configs regardless
train_neural = True

In [None]:
if train_neural:
    # LSTM on the windowed splits
    all_results['lstm'] = train_and_evaluate(
        'lstm',
        X_train_seq, y_train_seq,
        X_val_seq, y_val_seq,
        X_test_seq, y_test_seq,
        config=dict(hidden_size=64, num_layers=1, epochs=10, batch_size=64),
    )

In [None]:
if train_neural:
    # GRU on the windowed splits
    all_results['gru'] = train_and_evaluate(
        'gru',
        X_train_seq, y_train_seq,
        X_val_seq, y_val_seq,
        X_test_seq, y_test_seq,
        config=dict(hidden_size=64, num_layers=1, epochs=10, batch_size=64),
    )

In [None]:
if train_neural:
    # TCN on the windowed splits
    all_results['tcn'] = train_and_evaluate(
        'tcn',
        X_train_seq, y_train_seq,
        X_val_seq, y_val_seq,
        X_test_seq, y_test_seq,
        config=dict(num_channels=[32, 32], epochs=10, batch_size=64),
    )

### Ensemble Models

In [None]:
# Voting Ensemble (using boosting models)
# Collect the fitted boosting models whose training run succeeded
successful_models = {
    name: outcome['model']
    for name, outcome in all_results.items()
    if name in ('xgboost', 'lightgbm', 'catboost') and outcome.get('success')
}

if len(successful_models) < 2:
    print("Not enough successful models for voting ensemble")
else:
    all_results['voting'] = train_and_evaluate(
        'voting',
        X_train, y_train,
        X_val, y_val,
        X_test, y_test,
        config=dict(base_models=list(successful_models.values()), voting='soft'),
    )

In [None]:
# Stacking Ensemble over the boosting models gathered in `successful_models`
if len(successful_models) > 1:
    all_results['stacking'] = train_and_evaluate(
        'stacking',
        X_train, y_train,
        X_val, y_val,
        X_test, y_test,
        config=dict(
            base_models=list(successful_models.values()),
            meta_learner='logistic',
        ),
    )

## Compare All Models

In [None]:
# Build comparison table: one row per model that trained successfully
comparison_data = []

for model_name, result in all_results.items():
    if result.get('success'):
        comparison_data.append({
            'Model': model_name,
            'Test Accuracy': result['evaluation_metrics']['accuracy'],
            'Test F1 (macro)': result['evaluation_metrics']['macro_f1'],
            'Val F1': result['training_metrics'].get('val_f1', 0),
            'Train Time (s)': result['train_time'],
        })

# Declare the columns explicitly so the frame keeps its schema, and the sort
# below keeps working, even when no model trained successfully.
comparison_df = pd.DataFrame(
    comparison_data,
    columns=['Model', 'Test Accuracy', 'Test F1 (macro)', 'Val F1', 'Train Time (s)'],
)
# Best test F1 first
comparison_df = comparison_df.sort_values('Test F1 (macro)', ascending=False)
comparison_df = comparison_df.reset_index(drop=True)

print("\n" + "="*80)
print(" MODEL COMPARISON - Sorted by Test F1")
print("="*80)
print(comparison_df.to_string(index=False))

In [None]:
# Plot test macro-F1 for every model that trained successfully
plot_model_comparison(
    {name: outcome for name, outcome in all_results.items() if outcome.get('success')},
    title='Model Comparison - Test F1 Score',
    metric='macro_f1',
)

In [None]:
# Training time comparison
if comparison_df.empty:
    # No model trained successfully, so there is nothing to draw
    print("No successful models to plot")
else:
    fig, ax = plt.subplots(figsize=(10, 6))

    models = comparison_df['Model'].tolist()
    times = comparison_df['Train Time (s)'].tolist()
    f1_scores = comparison_df['Test F1 (macro)'].tolist()

    # Colour each bar by its F1 relative to the best model. If the best F1 is 0
    # the ratio would be 0/0 (NaN colours), so fall back to the low end of the
    # colormap for every bar.
    best_f1 = max(f1_scores)
    if best_f1 > 0:
        relative_f1 = np.array(f1_scores) / best_f1
    else:
        relative_f1 = np.zeros(len(f1_scores))
    colors = plt.cm.RdYlGn(relative_f1)

    # comparison_df is sorted best-first; reverse so the best model is the top bar
    bars = ax.barh(models[::-1], times[::-1], color=colors[::-1])

    ax.set_xlabel('Training Time (seconds)')
    ax.set_title('Training Time by Model (color = F1 score)')
    ax.grid(True, axis='x', alpha=0.3)

    # Add time labels just past the end of each bar
    for bar, t in zip(bars, times[::-1]):
        ax.text(t + 0.5, bar.get_y() + bar.get_height()/2, f'{t:.1f}s', va='center')

    plt.tight_layout()
    plt.show()

## Best Model Analysis

In [None]:
# The comparison table is sorted by test F1 (descending), so its first row is the winner
best_model_name = comparison_df['Model'].iloc[0]
best_result = all_results[best_model_name]

print(f"Best Model: {best_model_name.upper()}")
print(f"Test F1: {best_result['evaluation_metrics']['macro_f1']:.4f}")
print(f"Test Accuracy: {best_result['evaluation_metrics']['accuracy']:.4f}")

In [None]:
# Confusion matrix for best model
# Sequence models were scored against the windowed labels, the rest against the tabular ones
y_test_best = y_test_seq if best_model_name in ('lstm', 'gru', 'tcn') else y_test

plot_confusion_matrix(
    y_test_best,
    best_result['predictions'],
    labels=['Short', 'Neutral', 'Long'],
    title=f'{best_model_name.upper()} - Confusion Matrix',
)

In [None]:
# Classification report for best model
print(f"\n{best_model_name.upper()} - Classification Report:")
# Pass the encoded class ids explicitly (labels were shifted to 0/1/2 when the
# splits were built) so the three target names always line up, even when a
# class is missing from both the test labels and the predictions.
print(classification_report(
    y_test_best, best_result['predictions'],
    labels=[0, 1, 2],
    target_names=['Short', 'Neutral', 'Long']
))

## Save Results

In [None]:
from pathlib import Path
# NOTE(review): json is imported but not used in this cell
import json

# Save comparison results
# Create the output folder (and any missing parents) if it does not exist yet
output_dir = Path('../outputs/model_comparison')
output_dir.mkdir(parents=True, exist_ok=True)

# Save comparison table
comparison_df.to_csv(output_dir / 'model_comparison.csv', index=False)
print(f"Saved comparison to {output_dir / 'model_comparison.csv'}")

# Save best model
# The target path is named after the winning model and handed to the model's own save()
best_model_path = output_dir / f'best_model_{best_model_name}'
best_result['model'].save(best_model_path)
print(f"Saved best model to {best_model_path}")

## Summary

This notebook compared all available model types:

**Key Findings:**
1. Boosting models (XGBoost, LightGBM, CatBoost) typically offer best accuracy/speed tradeoff
2. Neural models (LSTM, GRU, TCN) can capture temporal patterns but require more training time
3. Ensemble methods can improve robustness by combining multiple models

**Next Steps:**
- Use `03_cross_validation.ipynb` for proper cross-validation
- Fine-tune the best model's hyperparameters
- Build ensemble of top-performing models