# ML Basis Ensemble - Model Training Notebook

This notebook trains XGBoost and LSTM models on historical BTC spot/perp data.

**Requirements:**
- Run on QuantConnect Research or local environment with QC data access
- Train on at least 2 years of hourly data (2019-2023 recommended)
- Export models to `models/` directory for use in main.py

**Checklist**: Follow MODEL_TRAINING_CHECKLIST.md step by step

## 1. Environment Setup & Imports

In [None]:
from datetime import datetime, timedelta
from pathlib import Path
import numpy as np
import pandas as pd
import pickle
import json
from collections import OrderedDict

# ML libraries
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# PyTorch for LSTM
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# QuantConnect Research
qb = QuantBook()

# Seed the NumPy and PyTorch global RNGs so that a fresh "Restart & Run All"
# reproduces the same LSTM weight initialisation, dropout masks and
# DataLoader shuffling. (XGBoost is seeded separately via its random_state.)
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Create models directory
models_dir = Path('models')
models_dir.mkdir(exist_ok=True)

print("Environment ready!")

## 2. FEATURE_ORDER Constant (MUST MATCH main.py)

**CRITICAL**: This list must exactly match the feature list in main.py (lines 41-49), including the order of the entries.

In [None]:
# Canonical column order of the model input vector, one name per line and
# grouped by feature family so that a diff shows exactly which entry moved.
FEATURE_ORDER = [
    # Bar-based features
    'spot_return_1h',
    'perp_return_1h',
    'spot_return_24h',
    'perp_return_24h',
    'spot_vol_24h',
    'perp_vol_24h',
    'spot_momentum',
    'perp_momentum',
    'spot_volume_ratio',
    'perp_volume_ratio',
    # Basis features
    'basis',
    'basis_mean_48h',
    'basis_std_48h',
    'basis_zscore',
    'basis_change_1h',
    'basis_change_24h',
    'basis_momentum',
    # Funding-rate features
    'funding_rate',
    'funding_rate_ma24h',
    'funding_rate_std24h',
    'funding_rate_change_1h',
    'funding_pressure',
    # Cross-asset features
    'eth_basis',
    'ethbtc_ratio',
    # Time features
    'hour_of_day',
    'day_of_week',
]

print(f"Total features: {len(FEATURE_ORDER)}")

# Guard against an entry being added or dropped by accident.
assert len(FEATURE_ORDER) == 26, "Must have exactly 26 features!"

## 3. Data Collection (≥2 Years)

Load historical data for BTC and ETH spot/perp pairs

In [None]:
# Date range: At least 2 years
start_date = datetime(2020, 1, 1)
end_date = datetime(2023, 12, 31)

print(f"Loading data from {start_date} to {end_date}...")


def _try_add_perp(ticker):
    """Subscribe to the hourly crypto future for `ticker`.

    Tries the ticker-only overload first and the explicit-Binance overload
    second. Returns the resulting Symbol, or None when both attempts raise.
    Each failure is printed so the reason for a fallback is visible; only
    `Exception` is caught, so KeyboardInterrupt still stops the cell.
    """
    attempts = (
        # Method 1: ticker only (let QC resolve the market)
        lambda: qb.AddCryptoFuture(ticker, Resolution.Hour),
        # Method 2: explicit market
        lambda: qb.AddCryptoFuture(ticker, Resolution.Hour, Market.Binance),
    )
    for attempt in attempts:
        try:
            return attempt().Symbol
        except Exception as exc:
            print(f"AddCryptoFuture({ticker!r}) attempt failed: {exc}")
    return None


# Spot subscriptions are required; a failure here is left to propagate.
btc_spot = qb.AddCrypto('BTCUSDT', Resolution.Hour, Market.Binance).Symbol
eth_spot = qb.AddCrypto('ETHUSDT', Resolution.Hour, Market.Binance).Symbol

btc_perp = _try_add_perp('BTCUSDT')
if btc_perp is None:
    # Method 3: If futures don't work, use spot as proxy for both
    print(" Warning: Could not load crypto futures. Using spot prices only.")
    print("This is acceptable for initial training - basis features will be computed differently.")
    btc_perp = btc_spot

eth_perp = _try_add_perp('ETHUSDT')
if eth_perp is None:
    # Same proxy rule as BTC, but reported instead of applied silently.
    print(" Warning: Could not load ETH crypto futures. Using ETH spot as proxy.")
    eth_perp = eth_spot

# Get history for all symbols
print("Fetching historical data...")

# Pair each distinct symbol with the column label it must end up under.
# A perp that fell back to its spot symbol (same object) is not requested
# twice; its column is synthesised from spot further down.
labelled_symbols = [('BTC_SPOT', btc_spot), ('ETH_SPOT', eth_spot)]
if btc_perp is not btc_spot:
    labelled_symbols.append(('BTC_PERP', btc_perp))
if eth_perp is not eth_spot:
    labelled_symbols.append(('ETH_PERP', eth_perp))

symbols = [symbol for _, symbol in labelled_symbols]
history = qb.History(symbols, start_date, end_date, Resolution.Hour)

if 'close' not in history.columns:
    raise ValueError(
        "History request returned no 'close' column - check data access "
        "and the requested date range."
    )

# One column per symbol. The column order produced by unstack() is not the
# request order, so each column is selected by its symbol, never by position.
# NOTE(review): relies on QuantConnect's pandas integration accepting Symbol
# objects as column keys - confirm against the LEAN version in use.
close_by_symbol = history['close'].unstack(level=0)

closes = {}
for label, symbol in labelled_symbols:
    try:
        closes[label] = close_by_symbol[symbol]
    except KeyError:
        if label.endswith('_SPOT'):
            raise ValueError(f"No close prices returned for {label}")
        # A perp without any bars in the window is synthesised below.

df = pd.DataFrame(closes)

synthetic_perps = [label for label in ('BTC_PERP', 'ETH_PERP') if label not in df.columns]
for label in synthetic_perps:
    spot_label = label.replace('_PERP', '_SPOT')
    df[label] = df[spot_label] * 1.0001  # Add small basis for testing
if synthetic_perps:
    print(" Using spot prices + small markup for perp (for testing)")

df = df[['BTC_SPOT', 'BTC_PERP', 'ETH_SPOT', 'ETH_PERP']].dropna()

if df.empty:
    raise ValueError("No timestamps have prices for all four series.")

print(f"\n Loaded {len(df)} hours of data")
print(f"Date range: {df.index[0]} to {df.index[-1]}")
print(f"\nFirst few rows:")
display(df.head())
print(f"\nData statistics:")
display(df.describe())

## 4. Feature Engineering

Compute all 26 features matching main.py exactly

In [None]:
def compute_features(df):
    """
    Build the 26-column feature frame from hourly close prices.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'BTC_SPOT', 'BTC_PERP', 'ETH_SPOT' and
        'ETH_PERP'; its index must expose `.hour` and `.dayofweek`.

    Returns
    -------
    pandas.DataFrame
        Indexed like `df`. Rolling and differenced columns start with NaN
        until their window is filled; volume-ratio and funding columns are
        constant placeholders (1.0 and 0.0 respectively).
    """
    btc_spot_px, btc_perp_px = df['BTC_SPOT'], df['BTC_PERP']
    eth_spot_px, eth_perp_px = df['ETH_SPOT'], df['ETH_PERP']

    # Hourly log returns of each BTC leg.
    spot_ret = np.log(btc_spot_px).diff()
    perp_ret = np.log(btc_perp_px).diff()

    # Perp premium over spot, relative to spot, and its 48-bar statistics.
    btc_basis = (btc_perp_px - btc_spot_px) / btc_spot_px
    basis_mean = btc_basis.rolling(48).mean()
    basis_std = btc_basis.rolling(48).std()

    day_scale = np.sqrt(24)

    columns = {
        # ===== Bar-based Features =====
        'spot_return_1h': spot_ret,
        'perp_return_1h': perp_ret,
        'spot_return_24h': spot_ret.rolling(24).sum(),
        'perp_return_24h': perp_ret.rolling(24).sum(),
        'spot_vol_24h': spot_ret.rolling(24).std() * day_scale,
        'perp_vol_24h': perp_ret.rolling(24).std() * day_scale,
        # Last 12 bars' return minus the 24-bar return as of 12 bars ago.
        'spot_momentum': spot_ret.rolling(12).sum() - spot_ret.rolling(24).sum().shift(12),
        'perp_momentum': perp_ret.rolling(12).sum() - perp_ret.rolling(24).sum().shift(12),
        # Volume is not part of the input frame: constant placeholders.
        'spot_volume_ratio': 1.0,
        'perp_volume_ratio': 1.0,
        # ===== Carry/Basis Features =====
        'basis': btc_basis,
        'basis_mean_48h': basis_mean,
        'basis_std_48h': basis_std,
        # Small epsilon keeps the z-score finite when the window is flat.
        'basis_zscore': (btc_basis - basis_mean) / (basis_std + 1e-8),
        'basis_change_1h': btc_basis.diff(),
        'basis_change_24h': btc_basis.diff(24),
        'basis_momentum': btc_basis.diff(1).rolling(12).mean(),
        # ===== Funding Rate Features =====
        # No funding data is read here: constant zero placeholders.
        'funding_rate': 0.0,
        'funding_rate_ma24h': 0.0,
        'funding_rate_std24h': 0.0,
        'funding_rate_change_1h': 0.0,
        'funding_pressure': 0.0,
        # ===== Cross-asset Features =====
        'eth_basis': (eth_perp_px - eth_spot_px) / eth_spot_px,
        'ethbtc_ratio': eth_spot_px / btc_spot_px,
        # ===== Time Features =====
        'hour_of_day': np.asarray(df.index.hour) / 24.0,
        'day_of_week': np.asarray(df.index.dayofweek) / 7.0,
    }

    return pd.DataFrame(columns, index=df.index)

# Compute features, then discard the leading rows whose rolling windows
# are not yet filled (they contain NaN).
print("Computing features...")
features_df = compute_features(df).dropna()

computed_columns = list(features_df.columns)

print(f"Features computed: {len(features_df)} samples")
print(f"Feature columns: {computed_columns}")
print(f"\nFeature stats:")
display(features_df.describe())

# The frame's column order is the model's input order, so it must be
# identical to FEATURE_ORDER.
assert computed_columns == FEATURE_ORDER, "Feature order mismatch!"
print(" Feature order matches FEATURE_ORDER constant")

## 5. Label Creation

Target: 6-hour forward basis change (matches main.py prediction horizon)

In [None]:
# Create labels: basis `prediction_horizon` rows ahead minus basis now.
prediction_horizon = 6  # hours

current_basis = features_df['basis']
future_basis = current_basis.shift(-prediction_horizon)
labels = (future_basis - current_basis).rename('target_basis_change_6h')

# The final `prediction_horizon` rows have no future value, so their label
# is NaN; keep only the rows that do have a label in both objects.
valid_idx = labels.dropna().index
labels = labels.loc[valid_idx]
features_df = features_df.loc[valid_idx]

print(f"Labels created: {len(labels)} samples")
print(f"Label distribution:")
display(labels.describe())
print(f"\nLabel should be centered near zero (mean-reverting basis)")
label_mean, label_std = labels.mean(), labels.std()
print(f"Mean: {label_mean:.6f}")
print(f"Std: {label_std:.6f}")

## 6. Expanding Window Normalization

Use expanding window (NOT fixed mean/std) to prevent look-ahead bias

In [None]:
def expanding_normalize(features_df, min_periods=168, eps=1e-8):
    """
    Z-score every column using expanding-window statistics.

    Each row is normalised with the mean and standard deviation of all rows
    up to and including itself, so no later data enters the statistics.

    Parameters
    ----------
    features_df : pandas.DataFrame
        Numeric feature frame, ordered in time. It is not modified.
    min_periods : int, default 168
        Rows required before statistics are produced (168 hourly rows is
        one week). Earlier rows come back as NaN.
    eps : float, default 1e-8
        Added to the standard deviation so constant columns divide safely.

    Returns
    -------
    normalized : pandas.DataFrame
        Same index and columns as the input.
    means, stds : dict
        Column name -> expanding mean / std at the last row, as floats.
        NOTE(review): these final-row values differ from the per-row
        statistics used for `normalized`; confirm the consumer of the
        exported scaler expects that.

    Raises
    ------
    ValueError
        If `features_df` has no rows or `min_periods` is below 1.
    """
    if min_periods < 1:
        raise ValueError("min_periods must be at least 1")
    if len(features_df) == 0:
        raise ValueError("features_df is empty; nothing to normalize")

    # One expanding pass over the whole frame instead of a per-column loop.
    window = features_df.expanding(min_periods=min_periods)
    running_mean = window.mean()
    running_std = window.std()

    normalized = (features_df - running_mean) / (running_std + eps)

    # Final-row statistics, kept for export.
    means = {col: float(running_mean[col].iloc[-1]) for col in features_df.columns}
    stds = {col: float(running_std[col].iloc[-1]) for col in features_df.columns}

    return normalized, means, stds

print("Applying expanding window normalization...")
normalized_all, feature_means, feature_stds = expanding_normalize(features_df)

# Warm-up rows come back as NaN; drop them and keep the labels on exactly
# the same timestamps.
features_normalized = normalized_all.dropna()
labels = labels.loc[features_normalized.index]

print(f"Normalized features: {len(features_normalized)} samples")
print(f"\nNormalized feature stats (should be ~N(0,1)):")
display(features_normalized.describe())

# Sanity checks: the frame handed to the models must be fully finite.
nan_count = int(features_normalized.isnull().sum().sum())
inf_count = int(np.isinf(features_normalized.values).sum())
assert nan_count == 0, "NaN values found!"
assert inf_count == 0, "Inf values found!"
print(" No NaN or Inf values in features")

## 7. Train/Val/Test Split (Time-based, NO Shuffling)

In [None]:
# Time-based split: 80% train, 10% val, 10% test
n = len(features_normalized)
train_end = int(n * 0.8)
val_end = int(n * 0.9)

# Every label is computed from the basis `prediction_horizon` rows ahead.
# Without a gap, the last labels of one split would be built from rows that
# belong to the next split, so that many rows are dropped at each boundary.
embargo = prediction_horizon
val_start = train_end + embargo
test_start = val_end + embargo

# Validate before any indexing so a too-small dataset fails with this
# message rather than an IndexError in the prints below.
assert 0 < train_end < val_start < val_end < test_start < n, "Split indices invalid"

X = features_normalized.values
y = labels.values
assert len(X) == len(y), "Features and labels are misaligned"

X_train = X[:train_end]
y_train = y[:train_end]

X_val = X[val_start:val_end]
y_val = y[val_start:val_end]

X_test = X[test_start:]
y_test = y[test_start:]

sample_index = features_normalized.index
print(f"Train: {len(X_train)} samples ({sample_index[0]} to {sample_index[train_end-1]})")
print(f"Val:   {len(X_val)} samples ({sample_index[val_start]} to {sample_index[val_end-1]})")
print(f"Test:  {len(X_test)} samples ({sample_index[test_start]} to {sample_index[-1]})")
print(f"Embargo: {embargo} rows dropped at each split boundary")

print(" Time-based split complete (no shuffling)")

## 8. Train XGBoost Model

In [None]:
# XGBoost parameters (MUST MATCH main.py)
xgb_params = dict(
    n_estimators=100,
    max_depth=3,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1.0,
    random_state=42,
)

print("Training XGBoost...")
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

# Predictions and mean squared error on each split.
y_train_pred, y_val_pred, y_test_pred = [
    xgb_model.predict(split) for split in (X_train, X_val, X_test)
]
train_mse, val_mse, test_mse = [
    mean_squared_error(actual, predicted)
    for actual, predicted in (
        (y_train, y_train_pred),
        (y_val, y_val_pred),
        (y_test, y_test_pred),
    )
]

print(f"\nXGBoost Results:")
print(f"Train MSE: {train_mse:.6f}")
print(f"Val MSE:   {val_mse:.6f}")
print(f"Test MSE:  {test_mse:.6f}")

# Share of test rows where the predicted and realised change share a sign.
same_sign = np.sign(y_test_pred) == np.sign(y_test)
test_dir_acc = same_sign.mean()
print(f"Test Directional Accuracy: {test_dir_acc*100:.2f}%")

# Feature importances, labelled by feature name, largest first.
importances = pd.Series(
    xgb_model.feature_importances_, index=FEATURE_ORDER
).sort_values(ascending=False)
print(f"\nTop 10 Features:")
print(importances.head(10))

# Acceptance gates: any failure stops the notebook before export.
assert test_mse < 0.001, f" Test MSE too high: {test_mse}"
assert test_dir_acc > 0.52, f" Directional accuracy too low: {test_dir_acc}"
assert train_mse / test_mse > 0.5, " Severe overfitting detected"
print("\n XGBoost performance acceptable")

## 9. Train LSTM Model

In [None]:
# LSTM Model Definition (MUST MATCH main.py lines 29-37)
class LSTMModel(nn.Module):
    """Two-layer LSTM followed by a 32-unit hidden layer and a scalar output.

    Attribute names (`lstm`, `fc1`, `fc2`) form the state_dict keys and the
    layers are created in a fixed order, so neither should be changed.
    """

    def __init__(self, input_size, hidden_size=64):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            dropout=0.2,
        )
        self.dropout = nn.Dropout(p=0.3)
        self.fc1 = nn.Linear(in_features=hidden_size, out_features=32)
        self.fc2 = nn.Linear(in_features=32, out_features=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a (batch, time, features) tensor to a (batch, 1) prediction."""
        sequence_out, _ = self.lstm(x)
        # Only the output at the final time step feeds the dense head.
        last_step = sequence_out[:, -1, :]
        hidden = self.relu(self.fc1(self.dropout(last_step)))
        return self.fc2(self.dropout(hidden))

# Create sequences
def create_sequences(X, y, seq_length=48):
    """
    Slice row-aligned features and labels into overlapping windows.

    `y[k]` is taken to be the target belonging to feature row `X[k]`. Each
    window therefore uses the label of its LAST row: window `i` covers rows
    `i .. i+seq_length-1` and is paired with `y[i+seq_length-1]`. Pairing it
    with `y[i+seq_length]` instead would train on the following row's target.

    Parameters
    ----------
    X : array-like, shape (n_rows, ...)
    y : array-like, shape (n_rows, ...)
    seq_length : int, default 48
        Rows per window; must be at least 1.

    Returns
    -------
    X_seq : numpy.ndarray, shape (n_windows, seq_length, ...)
    y_seq : numpy.ndarray, shape (n_windows, ...)
        `n_windows = n_rows - seq_length + 1`; both arrays are empty (with
        the trailing dimensions kept) when there are too few rows.

    Raises
    ------
    ValueError
        If `X` and `y` differ in length or `seq_length` is below 1.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")
    if seq_length < 1:
        raise ValueError("seq_length must be at least 1")

    n_windows = len(X) - seq_length + 1
    if n_windows <= 0:
        empty_X = np.empty((0, seq_length) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    X_seq = np.stack([X[start:start + seq_length] for start in range(n_windows)])
    # Copy so the result does not alias the caller's label array.
    y_seq = y[seq_length - 1:].copy()
    return X_seq, y_seq

seq_length = 48
print(f"Creating sequences (seq_length={seq_length})...")

# Window each split on its own so no sequence straddles a split boundary.
X_train_seq, y_train_seq = create_sequences(X_train, y_train, seq_length)
X_val_seq, y_val_seq = create_sequences(X_val, y_val, seq_length)
X_test_seq, y_test_seq = create_sequences(X_test, y_test, seq_length)

for split_name, split_windows in (
    ("Train", X_train_seq),
    ("Val", X_val_seq),
    ("Test", X_test_seq),
):
    print(f"{split_name} sequences: {len(split_windows)}")

# Convert to float32 PyTorch tensors; targets become column vectors so they
# match the model's (batch, 1) output.
X_train_t = torch.tensor(X_train_seq, dtype=torch.float32)
X_val_t = torch.tensor(X_val_seq, dtype=torch.float32)
X_test_t = torch.tensor(X_test_seq, dtype=torch.float32)
y_train_t = torch.tensor(y_train_seq, dtype=torch.float32).reshape(-1, 1)
y_val_t = torch.tensor(y_val_seq, dtype=torch.float32).reshape(-1, 1)
y_test_t = torch.tensor(y_test_seq, dtype=torch.float32).reshape(-1, 1)

# Mini-batches of training windows, reshuffled every epoch.
batch_size = 64
train_dataset = TensorDataset(X_train_t, y_train_t)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, loss, optimiser and a scheduler that halves the learning rate after
# 5 epochs without validation improvement.
input_size = len(FEATURE_ORDER)
hidden_size = 64
lstm_model = LSTMModel(input_size, hidden_size)
criterion = nn.MSELoss()
optimizer = optim.Adam(lstm_model.parameters(), lr=0.0005, weight_decay=1e-5)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Training loop with early stopping
print("\nTraining LSTM...")
epochs = 100
best_val_loss = float('inf')
patience = 15
patience_counter = 0


def _snapshot_weights(model):
    """Return an independent copy of the model's current state_dict.

    `state_dict()` hands back references to the live tensors, which the
    optimiser keeps updating in place. Cloning freezes the values as they
    are right now.
    """
    return {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}


# Start from the initial weights so the restore after the loop always has a
# state to load, even if the validation loss never improves (e.g. NaN).
best_model_state = _snapshot_weights(lstm_model)

for epoch in range(epochs):
    lstm_model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = lstm_model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        # Cap the gradient norm to keep updates bounded.
        torch.nn.utils.clip_grad_norm_(lstm_model.parameters(), max_norm=1.0)
        optimizer.step()
        epoch_loss += loss.item()

    # Validation
    lstm_model.eval()
    with torch.no_grad():
        val_pred = lstm_model(X_val_t)
        val_loss = criterion(val_pred, y_val_t).item()

    # Read the learning rate around the scheduler step to report reductions.
    old_lr = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    new_lr = optimizer.param_groups[0]['lr']

    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {epoch_loss/len(train_loader):.6f}, Val Loss: {val_loss:.6f}, LR: {new_lr:.6f}")

    if new_lr < old_lr:
        print(f"Reducing learning rate to {new_lr:.6f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Clone: a bare state_dict() reference would track later epochs.
        best_model_state = _snapshot_weights(lstm_model)
    else:
        patience_counter += 1

    if patience_counter >= patience:
        print(f"Early stopping at epoch {epoch+1}")
        break

# Load best model
lstm_model.load_state_dict(best_model_state)

# Final evaluation: eval mode disables dropout, no_grad skips autograd.
lstm_model.eval()
with torch.no_grad():
    train_pred, val_pred, test_pred = [
        lstm_model(split_tensor).numpy()
        for split_tensor in (X_train_t, X_val_t, X_test_t)
    ]

lstm_train_mse, lstm_val_mse, lstm_test_mse = [
    mean_squared_error(targets, predictions)
    for targets, predictions in (
        (y_train_seq, train_pred),
        (y_val_seq, val_pred),
        (y_test_seq, test_pred),
    )
]

print(f"\nLSTM Results:")
print(f"Train MSE: {lstm_train_mse:.6f}")
print(f"Val MSE:   {lstm_val_mse:.6f}")
print(f"Test MSE:  {lstm_test_mse:.6f}")

# Share of test windows where prediction and target share a sign; the
# (n, 1) prediction array is flattened to line up with the 1-D targets.
predicted_sign = np.sign(test_pred.flatten())
actual_sign = np.sign(y_test_seq)
lstm_dir_acc = np.mean(predicted_sign == actual_sign)
print(f"Test Directional Accuracy: {lstm_dir_acc*100:.2f}%")

# Acceptance gates: any failure stops the notebook before export.
assert lstm_test_mse < 0.001, f"LSTM Test MSE too high: {lstm_test_mse}"
assert lstm_dir_acc > 0.50, f"LSTM Directional accuracy too low: {lstm_dir_acc}"
print("\nLSTM performance acceptable")

## 10. Export Model Artifacts

Save all 4 required files to models/ directory

In [None]:
print("Exporting model artifacts...")

xgb_path = models_dir / 'xgb_model.pkl'
lstm_path = models_dir / 'lstm_model.pth'
scaler_path = models_dir / 'scaler_config.json'
config_path = models_dir / 'model_config.json'

# 1. XGBoost model: the fitted estimator object, pickled as-is.
with xgb_path.open('wb') as xgb_file:
    pickle.dump(xgb_model, xgb_file)
print(f"Saved xgb_model.pkl ({xgb_path.stat().st_size / 1024:.1f} KB)")

# 2. LSTM model state dict.
# NOTE(review): written with pickle rather than torch.save despite the .pth
# suffix - confirm the loader in main.py reads it the same way.
with lstm_path.open('wb') as lstm_file:
    pickle.dump(lstm_model.state_dict(), lstm_file)
print(f"Saved lstm_model.pth ({lstm_path.stat().st_size / 1024:.1f} KB)")

# 3. Scaler configuration: per-feature mean/std from normalization.
scaler_config = {'means': feature_means, 'stds': feature_stds}
with scaler_path.open('w') as scaler_file:
    json.dump(scaler_config, scaler_file, indent=2)
print(f"Saved scaler_config.json")

# 4. Model configuration: shapes, data provenance and test metrics.
first_sample, last_sample = features_normalized.index[0], features_normalized.index[-1]
model_config = {
    'input_size': len(FEATURE_ORDER),
    'hidden_size': hidden_size,
    'sequence_length': seq_length,
    'prediction_horizon': prediction_horizon,
    'training_date': datetime.now().isoformat(),
    'data_start': str(first_sample),
    'data_end': str(last_sample),
    'train_samples': len(X_train),
    'val_samples': len(X_val),
    'test_samples': len(X_test),
    'xgb_test_mse': float(test_mse),
    'xgb_dir_acc': float(test_dir_acc),
    'lstm_test_mse': float(lstm_test_mse),
    'lstm_dir_acc': float(lstm_dir_acc),
    'feature_order': FEATURE_ORDER
}
with config_path.open('w') as config_file:
    json.dump(model_config, config_file, indent=2)
print(f"Saved model_config.json")

banner = "=" * 60
print("\n" + banner)
print("ALL MODEL ARTIFACTS EXPORTED SUCCESSFULLY!")
print(banner)
print(f"\nFiles saved to: {models_dir.absolute()}")

# Upload to ObjectStore (QuantConnect cloud storage)
rule = "=" * 60
print("\n" + rule)
print("UPLOADING TO OBJECTSTORE...")
print(rule)

artifact_names = ('xgb_model.pkl', 'lstm_model.pth', 'scaler_config.json', 'model_config.json')

try:
    for filename in artifact_names:
        # Every artifact is sent as raw bytes under the key models/<name>.
        file_bytes = (models_dir / filename).read_bytes()
        qb.ObjectStore.SaveBytes(f"models/{filename}", file_bytes)
        print(f"Uploaded {filename} ({len(file_bytes) / 1024:.1f} KB)")

    print("\n" + rule)
    print("ALL MODELS UPLOADED TO OBJECTSTORE!")
    print(rule)
    print("\nYour models are now accessible in main.py")
    print("Set use_pretrained_models = True and run backtest")

except Exception as e:
    # Best-effort upload: report the error and print the manual procedure
    # instead of failing the cell.
    print(f"\nObjectStore upload failed: {str(e)}")
    print("\nManual upload required:")
    for instruction in (
        "1. Download files from Research notebook",
        "2. Go to main.py algorithm in QC IDE",
        "3. Click Object Store tab",
        "4. Upload all 4 files to models/ folder",
    ):
        print(instruction)

print("\nSee DEPLOYMENT_GUIDE.md for next steps")

## 11. Final Validation Summary

In [None]:
def _verdict(passed):
    """Return the marker printed next to a metric: 'PASS' or 'FAIL'."""
    return 'PASS' if passed else 'FAIL'


print("="*60)
print("MODEL TRAINING SUMMARY")
print("="*60)

print(f"\n DATA:")
print(f"  Date Range: {features_normalized.index[0]} to {features_normalized.index[-1]}")
print(f"  Total Samples: {len(features_normalized):,}")
print(f"  Train: {len(X_train):,} | Val: {len(X_val):,} | Test: {len(X_test):,}")

# Thresholds below repeat the acceptance asserts in the training cells
# (XGBoost: MSE < 0.001, accuracy > 0.52; LSTM: MSE < 0.001, accuracy > 0.50).
print(f"\n XGBOOST:")
print(f"  Test MSE: {test_mse:.6f} {_verdict(test_mse < 0.001)}")
print(f"  Directional Accuracy: {test_dir_acc*100:.2f}% {_verdict(test_dir_acc > 0.52)}")
print(f"  Top Feature: {importances.index[0]} ({importances.values[0]:.3f})")

print(f"\n LSTM:")
print(f"  Test MSE: {lstm_test_mse:.6f} {_verdict(lstm_test_mse < 0.001)}")
print(f"  Directional Accuracy: {lstm_dir_acc*100:.2f}% {_verdict(lstm_dir_acc > 0.50)}")
print(f"  Architecture: {input_size} → LSTM({hidden_size}, 2 layers) → FC(32) → FC(1)")

print(f"\n EXPORTED FILES:")
missing_files = []
for filename in ['xgb_model.pkl', 'lstm_model.pth', 'scaler_config.json', 'model_config.json']:
    filepath = models_dir / filename
    if filepath.exists():
        size_kb = filepath.stat().st_size / 1024
        print(f"   {filename} ({size_kb:.1f} KB)")
    else:
        missing_files.append(filename)
        print(f"   {filename} MISSING!")

# Declare readiness only when every artifact is actually on disk.
if missing_files:
    print(f"\n NOT READY: missing artifacts: {', '.join(missing_files)}")
else:
    print(f"\n READY FOR DEPLOYMENT!")
    print(f"\nFollow DEPLOYMENT_GUIDE.md Phase 2 to upload to QuantConnect.")