# Cross-Validation Pipeline

This notebook demonstrates proper time-series cross-validation with:

1. **Purged K-Fold CV** - Prevents information leakage between folds
2. **Walk-Forward Feature Selection** - Select features that generalize
3. **Out-of-Fold Predictions** - Generate OOF predictions for stacking
4. **Hyperparameter Tuning** - Optuna-based optimization

This is critical for developing robust trading strategies that don't overfit.

## Setup

In [None]:
# Install dependencies
# !pip install -q xgboost lightgbm catboost scikit-learn optuna pandas numpy matplotlib tqdm pyarrow

import sys
# Put the parent directory first on the import path (presumably where the
# project-local `src` package lives, so the imports below resolve).
sys.path.insert(0, '..')

from src.utils.notebook import setup_notebook, download_sample_data, plot_confusion_matrix
# NOTE(review): setup_notebook() is project-local; confirm what it configures
# and what `env` holds before relying on it in later cells.
env = setup_notebook()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

from src.models import ModelRegistry
from src.cross_validation import (
    PurgedKFold, PurgedKFoldConfig,
    WalkForwardFeatureSelector,
    OOFGenerator,
    CrossValidationRunner,
)
from sklearn.metrics import accuracy_score, f1_score

## Prepare Data

In [None]:
# Obtain the sample dataset via the project helper and load the "SAMPLE" symbol
sample_paths = download_sample_data(
    symbols=["SAMPLE"],
    output_dir="../data/sample",
)
df = pd.read_parquet(sample_paths["SAMPLE"])
print(f"Loaded {df.shape[0]:,} samples")

In [None]:
def compute_features(df):
    """Compute technical-indicator features from an OHLCV frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``close``, ``high``, ``low`` and ``volume`` columns, with
        rows in chronological order. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the feature columns appended. Any row holding a
        NaN or an infinite value in any column (indicator warm-up, zero-width
        Bollinger/stochastic windows, zero average volume, ...) is dropped.
    """
    df = df.copy()

    # Returns
    df['log_return'] = np.log(df['close'] / df['close'].shift(1))
    for p in [5, 10, 20, 50]:
        df[f'return_{p}'] = df['close'].pct_change(p)

    # Moving averages
    for p in [10, 20, 50, 100]:
        df[f'sma_{p}'] = df['close'].rolling(p).mean()
        df[f'close_to_sma_{p}'] = df['close'] / df[f'sma_{p}'] - 1

    # RSI (simple-moving-average variant)
    # The price change does not depend on the period, so compute it once.
    delta = df['close'].diff()
    gains = delta.where(delta > 0, 0)
    losses = delta.where(delta < 0, 0).abs()
    for period in [7, 14, 21]:
        # clip() only guards against tiny negative rounding residue
        avg_gain = gains.rolling(period).mean().clip(lower=0)
        avg_loss = losses.rolling(period).mean().clip(lower=0)
        # A window without losses gives rs = +inf and therefore RSI = 100.
        # (Replacing a zero loss by +inf, as before, forced RSI to 0 instead.)
        rs = avg_gain / avg_loss
        rsi = 100 - (100 / (1 + rs))
        # No movement at all in the window: 0/0 is undefined, use neutral 50.
        flat = (avg_gain == 0) & (avg_loss == 0)
        df[f'rsi_{period}'] = rsi.mask(flat, 50.0)

    # ATR
    tr = pd.concat([
        df['high'] - df['low'],
        abs(df['high'] - df['close'].shift(1)),
        abs(df['low'] - df['close'].shift(1))
    ], axis=1).max(axis=1)
    for period in [7, 14, 21]:
        df[f'atr_{period}'] = tr.rolling(period).mean()
        df[f'atr_{period}_pct'] = df[f'atr_{period}'] / df['close']

    # Bollinger
    for period in [10, 20]:
        sma = df['close'].rolling(period).mean()
        std = df['close'].rolling(period).std()
        df[f'bb_position_{period}'] = (df['close'] - (sma - 2*std)) / (4*std)
        df[f'bb_width_{period}'] = (4*std) / sma

    # Volume
    df['volume_sma_10'] = df['volume'].rolling(10).mean()
    df['volume_sma_20'] = df['volume'].rolling(20).mean()
    df['volume_ratio'] = df['volume'] / df['volume_sma_20']

    # Volatility
    for period in [10, 20, 50]:
        df[f'volatility_{period}'] = df['log_return'].rolling(period).std()

    # MACD
    ema12 = df['close'].ewm(span=12).mean()
    ema26 = df['close'].ewm(span=26).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    # Stochastic
    low_14 = df['low'].rolling(14).min()
    high_14 = df['high'].rolling(14).max()
    df['stoch_k'] = 100 * (df['close'] - low_14) / (high_14 - low_14)
    df['stoch_d'] = df['stoch_k'].rolling(3).mean()

    # Divisions by a zero denominator yield +/-inf, which dropna() alone would
    # keep; treat them as missing so no infinite feature reaches a model.
    return df.replace([np.inf, -np.inf], np.nan).dropna()

def create_labels(df, horizon=20, upper_mult=1.5, lower_mult=1.0):
    """Create triple-barrier labels.

    For every bar ``i`` the entry price is ``close[i]`` and the barriers are
    ``close[i] + upper_mult * atr_14[i]`` and ``close[i] - lower_mult * atr_14[i]``.
    The following ``horizon`` bars are scanned in order: the label is ``1`` if
    a bar's ``high`` reaches the upper barrier first, ``-1`` if a bar's ``low``
    reaches the lower barrier first, and ``0`` if neither is touched. When both
    barriers are touched on the same bar, the upper barrier wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``close``, ``high``, ``low`` and ``atr_14`` columns. Not modified.
    horizon : int, default 20
        Number of bars to look ahead.
    upper_mult, lower_mult : float, default 1.5 and 1.0
        ATR multiples for the upper and lower barrier.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a float ``label`` column.

    Raises
    ------
    ValueError
        If ``horizon`` is negative.

    Notes
    -----
    The last ``horizon`` rows have no complete look-ahead window; they are not
    scanned and keep the placeholder label ``0``. Callers that should not
    treat them as genuine neutral outcomes need to drop them.
    """
    if horizon < 0:
        raise ValueError(f"horizon must be non-negative, got {horizon}")

    df = df.copy()
    n_rows = len(df)
    labels = np.zeros(n_rows)

    # Plain arrays avoid a pandas .iloc lookup on every inner-loop step.
    close = df['close'].to_numpy()
    high = df['high'].to_numpy()
    low = df['low'].to_numpy()
    atr = df['atr_14'].to_numpy()

    for i in range(n_rows - horizon):
        upper = close[i] + upper_mult * atr[i]
        lower = close[i] - lower_mult * atr[i]

        # i < n_rows - horizon, so this window always lies inside the frame.
        ahead = slice(i + 1, i + horizon + 1)
        up_hits = np.flatnonzero(high[ahead] >= upper)
        down_hits = np.flatnonzero(low[ahead] <= lower)

        if up_hits.size and (not down_hits.size or up_hits[0] <= down_hits[0]):
            labels[i] = 1
        elif down_hits.size:
            labels[i] = -1

    df['label'] = labels
    return df

# Process data
LABEL_HORIZON = 20  # look-ahead window (bars) of the triple-barrier labels

df_features = compute_features(df)

# create_labels() only scans rows that have a full `horizon`-bar look-ahead
# window; the final LABEL_HORIZON rows keep the 0 placeholder. Drop them so
# they are not learned as genuine "neutral" outcomes.
df_labeled = create_labels(df_features, horizon=LABEL_HORIZON).iloc[:-LABEL_HORIZON]

# Define features
exclude_cols = ['datetime', 'symbol', 'open', 'high', 'low', 'close', 'volume', 'label']
feature_cols = [c for c in df_labeled.columns if c not in exclude_cols]

print(f"Features: {len(feature_cols)}")
print(f"Samples: {len(df_labeled):,}")

In [None]:
# Build the model inputs as NumPy arrays
X = df_labeled[feature_cols].to_numpy()
# Labels are -1 / 0 / 1; shift by one so the classes become 0 / 1 / 2
y = df_labeled['label'].to_numpy().astype(int) + 1

class_counts = np.bincount(y)
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Label distribution: {class_counts}")

## 1. Purged K-Fold Cross-Validation

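Standard K-Fold CV leaks information in time series: each label is computed from several future bars and neighbouring samples are serially correlated, so training rows adjacent to a validation block already contain part of its outcome. Purged K-Fold: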
- Maintains chronological order
- Adds a "purge" gap between train and validation to remove overlapping labels
- Adds an "embargo" period after validation to prevent serial correlation leakage

In [None]:
# Purged K-Fold settings
cv_config = PurgedKFoldConfig(
    n_splits=5,
    purge_bars=60,      # bars removed around each train/val boundary (3x the 20-bar label horizon)
    embargo_bars=288,   # bars embargoed after validation (~1 day; presumably 5-minute bars)
)
cv = PurgedKFold(cv_config)

# Echo the configuration so the run is self-describing
print("\n".join([
    "CV Configuration:",
    f"  Splits: {cv_config.n_splits}",
    f"  Purge bars: {cv_config.purge_bars}",
    f"  Embargo bars: {cv_config.embargo_bars}",
]))

In [None]:
# Draw one row per fold: training samples in blue, validation samples in red
fig, ax = plt.subplots(figsize=(14, 6))

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # Only the first fold contributes legend entries (empty labels are skipped)
    is_first = fold_idx == 0
    ax.scatter(
        train_idx, np.full(len(train_idx), fold_idx),
        c='blue', alpha=0.3, s=1, label='Train' if is_first else '',
    )
    ax.scatter(
        val_idx, np.full(len(val_idx), fold_idx),
        c='red', alpha=0.5, s=1, label='Val' if is_first else '',
    )

    print(f"Fold {fold_idx + 1}: Train {len(train_idx):,} samples, Val {len(val_idx):,} samples")

ax.set(
    xlabel='Sample Index',
    ylabel='Fold',
    title='Purged K-Fold Split Visualization',
)
ax.legend()
plt.tight_layout()
plt.show()

In [None]:
# Cross-validate an XGBoost model over the purged folds
model_config = dict(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    early_stopping_rounds=15,
)

fold_results = []
fold_iterator = tqdm(cv.split(X, y), total=cv_config.n_splits, desc="CV Folds")

for fold_idx, (train_idx, val_idx) in enumerate(fold_iterator):
    # Slice this fold's train / validation arrays
    X_train, y_train = X[train_idx], y[train_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # Fit a fresh model.
    # NOTE(review): the validation pair is handed to fit(), presumably for early
    # stopping; if so the fold scores below are mildly optimistic - confirm.
    model = ModelRegistry.create('xgboost', config=model_config)
    metrics = model.fit(X_train, y_train, X_val, y_val)

    # Score the validation fold
    predictions = model.predict(X_val)
    predicted_classes = predictions.class_predictions
    accuracy = accuracy_score(y_val, predicted_classes)
    f1 = f1_score(y_val, predicted_classes, average='macro')

    fold_results.append(dict(
        fold=fold_idx + 1,
        train_samples=len(train_idx),
        val_samples=len(val_idx),
        train_f1=metrics.train_f1,
        val_f1=f1,
        accuracy=accuracy,
    ))

# Per-fold table followed by mean and spread of the validation metrics
results_df = pd.DataFrame(fold_results)
val_f1_col = results_df['val_f1']
accuracy_col = results_df['accuracy']

print("\nCross-Validation Results:")
print(results_df.to_string(index=False))
print(f"\nMean Val F1: {val_f1_col.mean():.4f} (+/- {val_f1_col.std():.4f})")
print(f"Mean Accuracy: {accuracy_col.mean():.4f} (+/- {accuracy_col.std():.4f})")

## 2. Walk-Forward Feature Selection

Select features that generalize across time periods using walk-forward validation.

In [None]:
# Walk-forward feature selector, using the same purge/embargo as the CV above
selector_settings = dict(
    n_splits=3,
    purge_bars=60,
    embargo_bars=288,
    method='mdi',  # Mean Decrease Impurity
    n_top_features=20,
)
feature_selector = WalkForwardFeatureSelector(**selector_settings)

print("Running walk-forward feature selection...")
selected_features = feature_selector.fit_select(X, y, feature_names=feature_cols)

# List the chosen features with a 1-based position
print(f"\nSelected {len(selected_features)} features:")
for position, feature_name in enumerate(selected_features, start=1):
    print(f"  {position}. {feature_name}")

In [None]:
# Importance table produced by the selector
importance_df = feature_selector.get_importance_scores()

top_n = 20
top_features = importance_df.head(top_n)
# Reverse the row order for barh (presumably the table is sorted descending,
# which puts the first row at the top of the chart)
bars = top_features.iloc[::-1]

# Horizontal bar chart of the leading features
fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(bars['feature'], bars['importance'], color='steelblue')
ax.set(
    xlabel='Mean Importance',
    title=f'Top {top_n} Features by Walk-Forward Importance',
)
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Compare CV performance with selected features vs all features
#
# `selected_features` was chosen using every row of X, including the rows each
# fold validates on. Scoring that fixed subset with CV on the same data would
# be optimistically biased, so the selector is re-fitted inside every fold on
# that fold's training rows only (same settings as the global selector).
selected_indices = [feature_cols.index(f) for f in selected_features]
X_selected = X[:, selected_indices]

print(f"Original features: {X.shape[1]}")
print(f"Selected features: {X_selected.shape[1]}")

# Re-run CV with per-fold feature selection
fold_results_selected = []

for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    y_train, y_val = y[train_idx], y[val_idx]

    # Select features from the training portion of this fold only
    fold_selector = WalkForwardFeatureSelector(
        n_splits=3,
        purge_bars=60,
        embargo_bars=288,
        method='mdi',
        n_top_features=20,
    )
    fold_features = fold_selector.fit_select(X[train_idx], y_train, feature_names=feature_cols)
    fold_indices = [feature_cols.index(f) for f in fold_features]

    X_train = X[train_idx][:, fold_indices]
    X_val = X[val_idx][:, fold_indices]

    model = ModelRegistry.create('xgboost', config=model_config)
    metrics = model.fit(X_train, y_train, X_val, y_val)

    predictions = model.predict(X_val)
    f1 = f1_score(y_val, predictions.class_predictions, average='macro')

    fold_results_selected.append({
        'fold': fold_idx + 1,
        'n_features': len(fold_indices),
        'val_f1': f1,
    })

results_selected_df = pd.DataFrame(fold_results_selected)

print(f"\nAll Features - Mean Val F1: {results_df['val_f1'].mean():.4f}")
print(f"Selected Features - Mean Val F1: {results_selected_df['val_f1'].mean():.4f}")

## 3. Out-of-Fold Predictions for Stacking

Generate OOF predictions that can be used as meta-features for stacking ensembles.

In [None]:
# Initialize OOF generator
oof_generator = OOFGenerator(
    cv=cv,
    return_probabilities=True,
)

# Generate OOF predictions for multiple models
model_configs = {
    'xgboost': {'n_estimators': 100, 'max_depth': 6, 'early_stopping_rounds': 15},
    'lightgbm': {'n_estimators': 100, 'max_depth': 6, 'early_stopping_rounds': 15},
    'catboost': {'iterations': 100, 'depth': 6, 'early_stopping_rounds': 15},
}

oof_predictions = {}

for model_name, config in model_configs.items():
    print(f"\nGenerating OOF predictions for {model_name}...")

    # This generates predictions for each fold's validation set.
    # The factory binds the current name/config through default arguments: a
    # bare closure would look the loop variables up at call time and build the
    # wrong model if the generator ever invoked it after the loop advanced.
    oof_result = oof_generator.generate(
        X=X,
        y=y,
        model_factory=lambda name=model_name, cfg=config: ModelRegistry.create(name, config=cfg),
    )

    oof_predictions[model_name] = oof_result

    # Calculate OOF metrics.
    # NOTE(review): assumes rows without an OOF prediction carry a negative
    # sentinel in `predictions` - confirm against OOFGenerator.
    mask = oof_result['predictions'] >= 0  # Valid predictions
    oof_f1 = f1_score(y[mask], oof_result['predictions'][mask], average='macro')
    print(f"  OOF F1: {oof_f1:.4f}")

In [None]:
# Stack OOF predictions for meta-learning
n_samples = len(y)
n_classes = 3

# Create stacking features from OOF probabilities (models whose result has no
# 'probabilities' entry are skipped)
stacking_features = [
    oof_result['probabilities']
    for oof_result in oof_predictions.values()
    if 'probabilities' in oof_result
]

# np.hstack([]) fails with an opaque message; explain what is actually missing
if not stacking_features:
    raise ValueError("No OOF probabilities available to stack; generate OOF predictions first")

X_meta = np.hstack(stacking_features)
assert X_meta.shape[0] == n_samples, "stacking features must have one row per sample"

print(f"Stacking features shape: {X_meta.shape}")
# Report what was really stacked rather than what was configured
print(f"  {len(stacking_features)} models x {n_classes} classes = {X_meta.shape[1]} features")

In [None]:
# Train meta-learner on stacking features
from sklearn.linear_model import LogisticRegression

# Get valid samples (those with OOF predictions).
# NOTE(review): assumes rows without OOF predictions hold zeros (or NaN) in
# every probability column - confirm against OOFGenerator.
valid_mask = X_meta.sum(axis=1) > 0  # Rows with predictions
X_meta_valid = X_meta[valid_mask]
y_valid = y[valid_mask]

# Chronological split for evaluation. Labels look ahead over several bars, so
# the rows right after the training block are purged (same width as the CV
# purge) to keep overlapping labels out of the test set.
n_valid = len(y_valid)
train_end = int(n_valid * 0.8)
test_start = train_end + cv_config.purge_bars
if test_start >= n_valid:
    raise ValueError(
        f"Not enough valid OOF rows ({n_valid}) for an 80/20 split with a "
        f"{cv_config.purge_bars}-row purge gap"
    )

X_meta_train = X_meta_valid[:train_end]
y_meta_train = y_valid[:train_end]
X_meta_test = X_meta_valid[test_start:]
y_meta_test = y_valid[test_start:]

# Train meta-learner. The default solver already fits a multinomial model for
# multi-class targets; the explicit `multi_class` argument is deprecated in
# recent scikit-learn releases, so it is not passed.
meta_learner = LogisticRegression(max_iter=1000)
meta_learner.fit(X_meta_train, y_meta_train)

# Evaluate
meta_predictions = meta_learner.predict(X_meta_test)
meta_f1 = f1_score(y_meta_test, meta_predictions, average='macro')
meta_accuracy = accuracy_score(y_meta_test, meta_predictions)

print(f"\nMeta-Learner Performance:")
print(f"  F1 (macro): {meta_f1:.4f}")
print(f"  Accuracy: {meta_accuracy:.4f}")

## 4. Hyperparameter Tuning with Optuna

In [None]:
import optuna
from optuna.samplers import TPESampler

def objective(trial):
    """Score one Optuna trial: mean macro-F1 of XGBoost over the purged CV folds.

    Samples a hyperparameter set from ``trial``, trains one model per fold of
    the notebook-level ``cv`` on ``X`` / ``y`` and returns the average
    validation macro-F1 (the study maximizes this value).
    """
    # Search space; early stopping is fixed rather than tuned
    params = dict(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 3, 10),
        learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        subsample=trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree=trial.suggest_float('colsample_bytree', 0.6, 1.0),
        min_child_weight=trial.suggest_int('min_child_weight', 1, 10),
        early_stopping_rounds=20,
    )

    def score_fold(train_idx, val_idx):
        """Train on one fold's training rows and return its validation macro-F1."""
        X_val, y_val = X[val_idx], y[val_idx]
        model = ModelRegistry.create('xgboost', config=params)
        model.fit(X[train_idx], y[train_idx], X_val, y_val)
        predicted = model.predict(X_val).class_predictions
        return f1_score(y_val, predicted, average='macro')

    return np.mean([score_fold(tr, va) for tr, va in cv.split(X, y)])

# Launch the study: maximize the CV macro-F1 returned by `objective`
print("Starting hyperparameter optimization...")

tpe_sampler = TPESampler(seed=42)  # fixed seed for the sampler
study = optuna.create_study(sampler=tpe_sampler, direction='maximize')

# n_trials is kept small here; increase for better results
study.optimize(objective, n_trials=20, show_progress_bar=True)

In [None]:
# Report the winning hyperparameters and their CV score
tuned_params = study.best_params

print("\nBest Parameters:")
for name in tuned_params:
    print(f"  {name}: {tuned_params[name]}")

print(f"\nBest CV F1: {study.best_value:.4f}")

In [None]:
# Visualize optimization history
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Optimization history.
# Failed or pruned trials have value None, which cannot be plotted; keep only
# the trials that finished.
ax = axes[0]
completed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
trial_numbers = [t.number for t in completed_trials]
trial_scores = [t.value for t in completed_trials]
ax.plot(trial_numbers, trial_scores, 'o-', alpha=0.7)
ax.axhline(y=study.best_value, color='r', linestyle='--', label=f'Best: {study.best_value:.4f}')
ax.set_xlabel('Trial')
ax.set_ylabel('CV F1 Score')
ax.set_title('Optimization History')
ax.legend()
ax.grid(True, alpha=0.3)

# Parameter importance (separate names so the history lists above are not
# overwritten by the importance values)
ax = axes[1]
importance = optuna.importance.get_param_importances(study)
param_names = list(importance.keys())
param_scores = list(importance.values())
ax.barh(param_names[::-1], param_scores[::-1], color='steelblue')
ax.set_xlabel('Importance')
ax.set_title('Hyperparameter Importance')
ax.grid(True, axis='x', alpha=0.3)

plt.tight_layout()
plt.show()

## 5. Final Model Training with Best Parameters

In [None]:
# Train final model with best parameters
best_params = study.best_params.copy()
best_params['early_stopping_rounds'] = 30

# Split data for final evaluation: chronological 70/15/15.
# Labels look ahead over several bars, so rows right before a boundary share
# price information with rows right after it. A purge gap (same width as the
# CV purge) is dropped after each boundary to keep the segments independent.
n = len(X)
train_end = int(n * 0.7)
val_end = int(n * 0.85)
purge_gap = cv_config.purge_bars

X_train_final = X[:train_end]
y_train_final = y[:train_end]
X_val_final = X[train_end + purge_gap:val_end]
y_val_final = y[train_end + purge_gap:val_end]
X_test_final = X[val_end + purge_gap:]
y_test_final = y[val_end + purge_gap:]

if len(X_val_final) == 0 or len(X_test_final) == 0:
    raise ValueError(
        f"Not enough samples ({n}) for a 70/15/15 split with a {purge_gap}-bar purge gap"
    )

print(f"Final splits:")
print(f"  Train: {len(X_train_final):,}")
print(f"  Val: {len(X_val_final):,}")
print(f"  Test: {len(X_test_final):,}")
print(f"  Purge gap after each boundary: {purge_gap:,} bars")

# Train
final_model = ModelRegistry.create('xgboost', config=best_params)
final_metrics = final_model.fit(X_train_final, y_train_final, X_val_final, y_val_final)

# Evaluate on test set.
# NOTE: the Optuna study cross-validated over all of X, which includes these
# test rows, so this score is not a fully untouched out-of-sample estimate.
final_predictions = final_model.predict(X_test_final)
final_f1 = f1_score(y_test_final, final_predictions.class_predictions, average='macro')
final_accuracy = accuracy_score(y_test_final, final_predictions.class_predictions)

print(f"\nFinal Model Performance:")
print(f"  Test F1 (macro): {final_f1:.4f}")
print(f"  Test Accuracy: {final_accuracy:.4f}")

In [None]:
# Test-set confusion matrix of the final model
# Display names in encoded-class order: 0 = Short, 1 = Neutral, 2 = Long
class_names = ['Short', 'Neutral', 'Long']

plot_confusion_matrix(
    y_test_final,
    final_predictions.class_predictions,
    labels=class_names,
    title='Final Model - Test Set Confusion Matrix',
)

## Summary

This notebook demonstrated:

1. **Purged K-Fold CV** - Proper time-series cross-validation that prevents information leakage
2. **Walk-Forward Feature Selection** - Selecting features that generalize across time
3. **OOF Predictions** - Generating predictions for stacking ensembles
4. **Hyperparameter Tuning** - Using Optuna for automated optimization

**Key Takeaways:**
- Always use purged CV for time series to prevent lookahead bias
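- Feature selection should be done within the CV loop: fit the selector on each fold's training data only, because selecting on the full dataset and then cross-validating on it inflates the scores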
- OOF predictions enable powerful stacking ensembles
- Hyperparameter tuning should use CV, not a single train/val split

**Next Steps:**
- Apply these techniques to your real trading data
- Build stacking ensembles using the OOF predictions
- Run backtests to validate real-world performance