# Phase 1: AI Ensemble Trading - Data Preparation Pipeline

This notebook runs the complete Phase 1 pipeline for preparing your trading data:
1. **Data Ingestion** - Load raw 1-minute OHLCV data
2. **Data Cleaning** - Fix gaps, outliers, duplicates; resample to 5-minute bars
3. **Feature Engineering** - Generate 50+ technical indicators
4. **Triple-Barrier Labeling** - Create trading labels
5. **GA Optimization** - Optimize label parameters
6. **Final Labels + Weights** - Apply quality-based weighting
7. **Time-Based Splits** - Create train/val/test sets
8. **Validation + Backtest** - Verify data quality

## Instructions
1. Upload your data files (MES_1m.csv, MGC_1m.csv) to Google Drive
2. Update the `DATA_PATHS` dictionary below with the paths to your files
3. Run all cells in order
4. Download the final artifacts from the `outputs/` folder

## Step 0: Setup & Configuration

In [None]:
# Install required packages
!pip install -q pandas numpy numba deap scipy matplotlib seaborn tqdm pyarrow

In [None]:
# Mount Google Drive to access your data.
# NOTE(review): `google.colab` is presumably only importable inside a Colab runtime;
# when running elsewhere, skip this cell and point DATA_PATHS at local files.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Configuration - UPDATE THESE PATHS
import os

# Where the raw 1-minute file of each symbol lives.
DATA_PATHS = dict(
    MES='/content/drive/MyDrive/trading_data/MES_1m.csv',  # Update this path
    MGC='/content/drive/MyDrive/trading_data/MGC_1m.csv',  # Update this path
)

# Pipeline configuration
CONFIG = dict(
    symbols=['MES', 'MGC'],
    horizons=[1, 5, 20],   # Prediction horizons in bars
    train_ratio=0.70,
    val_ratio=0.15,
    test_ratio=0.15,
    purge_bars=20,
    embargo_bars=288,      # ~1 day for 5-min data
    ga_population=30,      # Reduced for Colab speed
    ga_generations=20,
)

# Create output directories (one sub-folder per pipeline stage)
for _stage in ('raw', 'clean', 'features', 'labels',
               'final', 'splits', 'reports', 'ga_results'):
    os.makedirs(f'outputs/{_stage}', exist_ok=True)

print("Configuration loaded!")
print(f"Symbols: {CONFIG['symbols']}")
print(f"Horizons: {CONFIG['horizons']}")

## Step 1: Data Ingestion

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

def load_and_standardize(filepath, symbol):
    """Load raw bars from disk and standardize them to the pipeline schema.

    Args:
        filepath: Path to a ``.parquet`` file; any other extension is read as CSV.
        symbol: Value written to the added ``symbol`` column.

    Returns:
        DataFrame sorted by ``datetime`` with at least the columns
        ``datetime, open, high, low, close, volume, symbol``. ``high``/``low``
        are widened where needed so they bracket ``open`` and ``close``.

    Raises:
        ValueError: If two source columns map to the same standard name
            (e.g. separate ``Date`` and ``Time`` columns) or if a required
            column cannot be found.
    """
    print(f"Loading {symbol} from {filepath}...")

    # Load data
    if filepath.endswith('.parquet'):
        df = pd.read_parquet(filepath)
    else:
        df = pd.read_csv(filepath)

    print(f"  Raw shape: {df.shape}")
    print(f"  Columns: {df.columns.tolist()}")

    # Standardize column names
    aliases = {
        'open': ('open', 'o'),
        'high': ('high', 'h'),
        'low': ('low', 'l'),
        'close': ('close', 'c'),
        'volume': ('volume', 'vol', 'v'),
    }
    col_map = {}
    claimed_by = {}  # standard name -> source column that supplied it
    for col in df.columns:
        # str() guards against non-string labels (e.g. headerless integer columns)
        col_lower = str(col).strip().lower()
        if 'time' in col_lower or 'date' in col_lower:
            target = 'datetime'
        else:
            target = next((name for name, names in aliases.items()
                           if col_lower in names), None)
        if target is None:
            continue
        if target in claimed_by:
            # Renaming both would create duplicate column labels and silently
            # break every later `df[target]` access, so refuse explicitly.
            raise ValueError(
                f"{symbol}: columns {claimed_by[target]!r} and {col!r} both map to "
                f"'{target}'; keep only one of them in {filepath}"
            )
        claimed_by[target] = col
        col_map[col] = target

    df = df.rename(columns=col_map)

    required = ['datetime', 'open', 'high', 'low', 'close', 'volume']
    missing = [name for name in required if name not in df.columns]
    if missing:
        raise ValueError(
            f"{symbol}: could not find column(s) {missing} in {filepath}"
        )

    # Convert datetime
    df['datetime'] = pd.to_datetime(df['datetime'])
    df = df.sort_values('datetime').reset_index(drop=True)

    # Add symbol
    df['symbol'] = symbol

    # Validate OHLC: make sure high/low enclose open and close
    df['high'] = df[['high', 'open', 'close']].max(axis=1)
    df['low'] = df[['low', 'open', 'close']].min(axis=1)

    print(f"  Standardized shape: {df.shape}")
    print(f"  Date range: {df['datetime'].min()} to {df['datetime'].max()}")

    return df

# Load all symbols (a missing file is reported and skipped)
raw_data = {}
for symbol, path in DATA_PATHS.items():
    if os.path.exists(path):
        raw_data[symbol] = load_and_standardize(path, symbol)
        # Save as parquet
        raw_data[symbol].to_parquet(f'outputs/raw/{symbol}_1m.parquet', index=False)
    else:
        print(f"WARNING: File not found: {path}")

print(f"\nLoaded {len(raw_data)} symbols")

# Fail here rather than several cells later: every downstream step iterates
# over raw_data, and the combine step cannot concatenate an empty collection.
if not raw_data:
    raise FileNotFoundError(
        "None of the files in DATA_PATHS exist; update the paths in the configuration cell."
    )

## Step 2: Data Cleaning

In [None]:
def clean_data(df, symbol, max_gap_fill=30):
    """Clean 1-minute bars and resample them to 5-minute bars.

    Steps: drop duplicate timestamps, report gaps, fill short gaps, resample
    to 5 minutes, then neutralise single-bar price spikes.

    Only gaps of at most ``max_gap_fill`` consecutive missing minutes are
    filled. A plain ``ffill(limit=...)`` would also pad the first
    ``max_gap_fill`` minutes of every long gap (overnight, weekend), creating
    bars for periods with no trading. Filled minutes are flat bars at the last
    close with zero volume, so resampled volume is not double counted.

    Args:
        df: Frame with ``datetime, open, high, low, close, volume, symbol``.
        symbol: Name used in progress messages.
        max_gap_fill: Longest gap, in minutes, that is filled.

    Returns:
        DataFrame of 5-minute bars with columns
        ``datetime, open, high, low, close, volume, symbol``.
    """
    print(f"\nCleaning {symbol}...")

    # 1. Remove duplicates
    df = df.drop_duplicates(subset=['datetime'], keep='first')
    print(f"  After dedup: {len(df)} rows")

    # 2. Detect gaps
    df = df.set_index('datetime')
    time_diff = df.index.to_series().diff().dt.total_seconds() / 60
    gaps = time_diff[time_diff > 2].dropna()
    print(f"  Gaps found: {len(gaps)} (largest: {gaps.max() if len(gaps) > 0 else 0:.0f} min)")

    # 3. Fill small gaps (runs of at most max_gap_fill missing minutes)
    observed_index = df.index
    full_index = pd.date_range(start=observed_index.min(), end=observed_index.max(), freq='1min')
    df = df.reindex(full_index)

    # Rows introduced by the reindex, grouped into runs of consecutive minutes
    synthetic = pd.Series(~full_index.isin(observed_index), index=full_index)
    run_id = synthetic.astype(int).diff().ne(0).cumsum()
    run_length = synthetic.groupby(run_id).transform('sum')
    fillable = synthetic & (run_length <= max_gap_fill)

    carried = df.ffill()  # carries symbol (and any extra columns) forward
    last_close = carried['close'].copy()
    for price_col in ['open', 'high', 'low', 'close']:
        carried.loc[fillable, price_col] = last_close[fillable]
    carried.loc[fillable, 'volume'] = 0  # nothing traded in a filled minute

    df = carried[~synthetic | fillable].dropna()
    print(f"  After gap fill: {len(df)} rows")

    # 4. Resample to 5-minute bars (empty buckets are dropped)
    df_5m = df.resample('5min').agg({
        'open': 'first',
        'high': 'max',
        'low': 'min',
        'close': 'last',
        'volume': 'sum',
        'symbol': 'first'
    }).dropna()

    df_5m = df_5m.rename_axis('datetime').reset_index()

    # 5. Remove outliers (price spikes > 5 ATRs)
    atr = (df_5m['high'] - df_5m['low']).rolling(20).mean()
    price_change = df_5m['close'].diff().abs()
    spike_mask = price_change > (5 * atr)
    if spike_mask.any():
        print(f"  Removed {spike_mask.sum()} spike outliers")
        # Blank the spike's prices and carry the previous bar forward
        df_5m.loc[spike_mask, ['open', 'high', 'low', 'close']] = np.nan
        df_5m = df_5m.ffill().dropna()

    print(f"  Final: {len(df_5m)} 5-min bars")

    return df_5m

# Clean all symbols and persist each cleaned 5-minute frame
clean_data_dict = {}
for symbol, df in raw_data.items():
    # clean_data gets a copy so the raw frame stays untouched
    clean_data_dict[symbol] = clean_data(df.copy(), symbol)
    clean_path = f'outputs/clean/{symbol}_5m_clean.parquet'
    clean_data_dict[symbol].to_parquet(clean_path, index=False)

print("\nCleaning complete!")

## Step 3: Feature Engineering

In [None]:
def compute_features(df):
    """Compute the technical feature set (52 columns) from 5-minute OHLCV bars.

    Feature groups: returns, SMA/EMA distances, RSI, MACD, Bollinger Bands,
    ATR, Stochastic, ADX/DI, volume, cyclical time encodings and two regime
    flags. Rows holding NaN or +/-inf in any column (mainly the 200-bar
    warm-up of the longest moving average) are dropped.

    Args:
        df: Frame with ``datetime, open, high, low, close, volume`` columns.
            The input is not modified.

    Returns:
        A new DataFrame holding the input columns plus the features, keeping
        the input's index labels for the surviving rows.
    """
    df = df.copy()

    # === Returns ===
    df['log_return'] = np.log(df['close'] / df['close'].shift(1))
    df['simple_return'] = df['close'].pct_change()
    for p in [5, 10, 20]:
        df[f'return_{p}'] = df['close'].pct_change(p)

    # === Moving Averages ===
    for p in [10, 20, 50, 100, 200]:
        df[f'sma_{p}'] = df['close'].rolling(p).mean()
        df[f'close_to_sma_{p}'] = df['close'] / df[f'sma_{p}'] - 1

    for p in [9, 21, 50]:
        df[f'ema_{p}'] = df['close'].ewm(span=p, adjust=False).mean()
        df[f'close_to_ema_{p}'] = df['close'] / df[f'ema_{p}'] - 1

    # === RSI ===
    delta = df['close'].diff()
    gain = delta.where(delta > 0, 0).rolling(14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    # No losses in the window means maximum strength (100), not 0;
    # a completely flat window is treated as neutral (50).
    rsi = rsi.mask((loss == 0) & (gain > 0), 100.0)
    rsi = rsi.mask((loss == 0) & (gain == 0), 50.0)
    df['rsi'] = rsi
    df['rsi_oversold'] = (df['rsi'] < 30).astype(int)
    df['rsi_overbought'] = (df['rsi'] > 70).astype(int)

    # === MACD ===
    ema12 = df['close'].ewm(span=12, adjust=False).mean()
    ema26 = df['close'].ewm(span=26, adjust=False).mean()
    df['macd'] = ema12 - ema26
    df['macd_signal'] = df['macd'].ewm(span=9, adjust=False).mean()
    df['macd_hist'] = df['macd'] - df['macd_signal']

    # === Bollinger Bands ===
    sma20 = df['close'].rolling(20).mean()
    std20 = df['close'].rolling(20).std()
    df['bb_upper'] = sma20 + 2 * std20
    df['bb_lower'] = sma20 - 2 * std20
    df['bb_width'] = (df['bb_upper'] - df['bb_lower']) / sma20
    df['bb_position'] = (df['close'] - df['bb_lower']) / (df['bb_upper'] - df['bb_lower'])

    # === ATR ===
    tr = pd.concat([
        df['high'] - df['low'],
        abs(df['high'] - df['close'].shift(1)),
        abs(df['low'] - df['close'].shift(1))
    ], axis=1).max(axis=1)

    for p in [7, 14, 21]:
        df[f'atr_{p}'] = tr.rolling(p).mean()
        df[f'atr_{p}_pct'] = df[f'atr_{p}'] / df['close']

    # === Stochastic ===
    low_14 = df['low'].rolling(14).min()
    high_14 = df['high'].rolling(14).max()
    df['stoch_k'] = 100 * (df['close'] - low_14) / (high_14 - low_14)
    df['stoch_d'] = df['stoch_k'].rolling(3).mean()

    # === ADX ===
    high_diff = df['high'].diff()
    low_diff = -df['low'].diff()
    # Build the DM series on df's own index: a default RangeIndex would
    # misalign with atr14 whenever df is not indexed 0..n-1.
    plus_dm = pd.Series(
        np.where((high_diff > low_diff) & (high_diff > 0), high_diff, 0),
        index=df.index,
    )
    minus_dm = pd.Series(
        np.where((low_diff > high_diff) & (low_diff > 0), low_diff, 0),
        index=df.index,
    )
    atr14 = tr.rolling(14).mean()
    plus_di = 100 * plus_dm.rolling(14).mean() / atr14
    minus_di = 100 * minus_dm.rolling(14).mean() / atr14
    dx = 100 * abs(plus_di - minus_di) / (plus_di + minus_di)
    df['adx'] = dx.rolling(14).mean()
    df['plus_di'] = plus_di
    df['minus_di'] = minus_di

    # === Volume Features ===
    df['volume_sma_20'] = df['volume'].rolling(20).mean()
    df['volume_ratio'] = df['volume'] / df['volume_sma_20']
    df['obv'] = (np.sign(df['close'].diff()) * df['volume']).cumsum()

    # === Temporal Features ===
    df['hour'] = df['datetime'].dt.hour
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['dow'] = df['datetime'].dt.dayofweek
    # dayofweek runs 0..6, so the cycle length is 7; a period of 5 would make
    # Saturday/Sunday bars indistinguishable from Monday/Tuesday bars.
    df['dow_sin'] = np.sin(2 * np.pi * df['dow'] / 7)
    df['dow_cos'] = np.cos(2 * np.pi * df['dow'] / 7)
    df['is_rth'] = ((df['hour'] >= 9) & (df['hour'] < 16)).astype(int)
    df = df.drop(columns=['hour', 'dow'])

    # === Regime Features ===
    vol_20 = df['log_return'].rolling(20).std()
    vol_60 = df['log_return'].rolling(60).std()
    df['vol_regime'] = np.where(vol_20 > vol_60, 1, -1)
    df['trend_regime'] = np.where(df['sma_20'] > df['sma_50'], 1, -1)

    # Drop rows with NaN or infinite values (warm-up rows, zero-range windows)
    df = df.replace([np.inf, -np.inf], np.nan).dropna()

    return df

# Generate features for all symbols
feature_data = {}
_base_columns = ('datetime', 'symbol', 'open', 'high', 'low', 'close', 'volume')
for symbol, df in clean_data_dict.items():
    print(f"\nGenerating features for {symbol}...")
    feature_data[symbol] = compute_features(df)
    # Everything that is not a raw OHLCV/identifier column counts as a feature
    n_features = sum(1 for column in feature_data[symbol].columns
                     if column not in _base_columns)
    print(f"  Generated {n_features} features")
    print(f"  Shape: {feature_data[symbol].shape}")
    feature_data[symbol].to_parquet(f'outputs/features/{symbol}_5m_features.parquet', index=False)

print("\nFeature engineering complete!")

## Step 4: Triple-Barrier Labeling

In [None]:
from numba import njit, prange

@njit(parallel=True)
def apply_triple_barrier_numba(close, high, low, atr, k_up, k_down, max_bars):
    """Numba-optimized triple barrier labeling.

    For every bar ``i`` in ``[0, n - max_bars)`` the entry price is
    ``close[i]`` and the barriers are ``entry + k_up * atr[i]`` (upper) and
    ``entry - k_down * atr[i]`` (lower). The following ``max_bars`` bars are
    scanned in order; the first barrier touched decides the label.

    Args:
        close, high, low, atr: Per-bar arrays of equal length (callers in this
            notebook pass float64 arrays -- assumed 1-D, TODO confirm).
        k_up, k_down: ATR multiples for the upper / lower barrier.
        max_bars: Number of bars scanned after the entry bar.

    Returns:
        Tuple ``(labels, bars_to_hit, mae, mfe)`` of arrays of length ``n``:
        ``labels`` is +1 (upper hit), -1 (lower hit) or 0 (neither within
        ``max_bars``); ``bars_to_hit`` is the offset of the hit bar, or
        ``max_bars`` when nothing was hit; ``mae`` / ``mfe`` are the largest
        adverse / favorable moves as a fraction of the entry price, both
        measured from a long position's point of view.

    Notes:
        Bars that are never evaluated -- the last ``max_bars`` bars and bars
        whose ATR is NaN or <= 0 -- keep the initial zeros in all four
        outputs, so there ``labels == 0`` with ``bars_to_hit == 0`` means
        "unlabeled", not "neutral".
    """
    n = len(close)
    labels = np.zeros(n, dtype=np.int32)
    bars_to_hit = np.zeros(n, dtype=np.int32)
    mae = np.zeros(n, dtype=np.float64)
    mfe = np.zeros(n, dtype=np.float64)
    
    # Each iteration writes only to index i of the output arrays
    for i in prange(n - max_bars):
        entry = close[i]
        curr_atr = atr[i]
        
        # Skip bars without a usable volatility estimate (outputs stay 0)
        if curr_atr <= 0 or np.isnan(curr_atr):
            continue
        
        upper = entry + k_up * curr_atr
        lower = entry - k_down * curr_atr
        
        max_adverse = 0.0
        max_favorable = 0.0
        hit_bar = max_bars  # default when neither barrier is touched
        hit_label = 0
        
        for j in range(1, max_bars + 1):
            idx = i + j
            # Defensive bound check; the outer loop already stops at n - max_bars
            if idx >= n:
                break
            
            # Track MFE/MAE (long-side perspective; the hit bar itself is included)
            favorable = (high[idx] - entry) / entry
            adverse = (entry - low[idx]) / entry
            max_favorable = max(max_favorable, favorable)
            max_adverse = max(max_adverse, adverse)
            
            # Check barriers: the upper barrier is tested first, so a bar that
            # touches both barriers is labeled +1
            if high[idx] >= upper:
                hit_bar = j
                hit_label = 1
                break
            if low[idx] <= lower:
                hit_bar = j
                hit_label = -1
                break
        
        labels[i] = hit_label
        bars_to_hit[i] = hit_bar
        mae[i] = max_adverse
        mfe[i] = max_favorable
    
    return labels, bars_to_hit, mae, mfe

def apply_labels(df, horizon, k_up, k_down, max_bars):
    """Run the triple-barrier labeler and store its outputs on ``df``.

    The ATR column is ``atr_<p>`` with ``p = clamp(2 * horizon, 7, 21)``,
    falling back to ``atr_14`` when that column does not exist. Four columns
    are written in place -- ``label_h<h>``, ``bars_to_hit_h<h>``, ``mae_h<h>``
    and ``mfe_h<h>`` -- and the same frame is returned.
    """
    preferred_atr = f'atr_{max(7, min(horizon * 2, 21))}'
    atr_col = preferred_atr if preferred_atr in df.columns else 'atr_14'

    def as_float64(column):
        return df[column].values.astype(np.float64)

    outputs = apply_triple_barrier_numba(
        as_float64('close'),
        as_float64('high'),
        as_float64('low'),
        as_float64(atr_col),
        k_up, k_down, max_bars
    )

    # Outputs arrive in the order: labels, bars_to_hit, mae, mfe
    for prefix, values in zip(('label', 'bars_to_hit', 'mae', 'mfe'), outputs):
        df[f'{prefix}_h{horizon}'] = values

    return df

# Initial labeling parameters: horizon -> (k_up, k_down, max_bars)
INIT_PARAMS = {
    1: (1.0, 1.0, 2),
    5: (1.5, 1.0, 10),
    20: (2.0, 1.5, 40),
}

# Apply initial labels
labeled_data = {}
for symbol, df in feature_data.items():
    print(f"\nLabeling {symbol}...")
    df = df.copy()  # apply_labels writes columns in place

    for horizon in CONFIG['horizons']:
        df = apply_labels(df, horizon, *INIT_PARAMS[horizon])

        # Print distribution
        counts = df[f'label_h{horizon}'].value_counts().sort_index()
        n_long, n_neutral, n_short = (counts.get(value, 0) for value in (1, 0, -1))
        print(f"  H{horizon}: Long={n_long:,} Neutral={n_neutral:,} Short={n_short:,}")

    labeled_data[symbol] = df
    df.to_parquet(f'outputs/labels/{symbol}_labels_init.parquet', index=False)

print("\nInitial labeling complete!")

## Step 5: GA Optimization (Optional - Takes ~5-10 min)

In [None]:
RUN_GA_OPTIMIZATION = True  # Set to False to skip GA and use initial params

if RUN_GA_OPTIMIZATION:
    from deap import base, creator, tools, algorithms
    import random
    import json

    GA_SEED = 42               # makes the stochastic search repeatable
    GA_SUBSET_FRACTION = 0.2   # share of each symbol's history used for the search
    GA_K_BOUNDS = (0.5, 3.0)   # search range for both ATR multipliers

    def evaluate_params(individual, df, horizon):
        """Score one (k_up, k_down, max_bars) candidate; returns a 1-tuple fitness.

        The fitness blends three terms: the share of non-neutral labels
        (target 35%), the long/short balance (target 50%) and how quickly
        barriers are hit. Invalid candidates and candidates yielding fewer
        than 100 signals score 0.
        """
        k_up, k_down, max_bars = individual
        max_bars = int(max_bars)

        # Defensive guard: non-positive multipliers put a barrier on the wrong
        # side of the entry, and max_bars < 1 would divide by zero below or
        # make the labeling kernel index past the end of its arrays.
        if k_up <= 0 or k_down <= 0 or max_bars < 1:
            return (0.0,)

        # Apply labels
        atr_col = f'atr_{max(7, min(horizon * 2, 21))}'
        if atr_col not in df.columns:
            atr_col = 'atr_14'

        labels, bars_to_hit, _, _ = apply_triple_barrier_numba(
            df['close'].values.astype(np.float64),
            df['high'].values.astype(np.float64),
            df['low'].values.astype(np.float64),
            df[atr_col].values.astype(np.float64),
            k_up, k_down, max_bars
        )

        # Compute fitness
        n_long = np.sum(labels == 1)
        n_short = np.sum(labels == -1)
        total = len(labels)

        if n_long + n_short < 100:
            return (0.0,)

        # Balance score (prefer 30-40% signals)
        signal_ratio = (n_long + n_short) / total
        balance_score = 1 - abs(signal_ratio - 0.35) * 2

        # Long-share score (prefer 45-55% of signals being longs)
        win_rate = n_long / (n_long + n_short)
        winrate_score = 1 - abs(win_rate - 0.50) * 4

        # Speed score (faster hits are better)
        avg_bars = np.mean(bars_to_hit[labels != 0])
        speed_score = 1 - (avg_bars / max_bars)

        fitness = 0.4 * balance_score + 0.3 * winrate_score + 0.3 * speed_score
        return (max(0.0, float(fitness)),)

    def _clamp_to_bounds(bounds):
        """Decorator factory: clip every gene of returned offspring into ``bounds``."""
        def decorator(operator):
            def wrapper(*args, **kwargs):
                offspring = operator(*args, **kwargs)
                for child in offspring:
                    for gene, (low, high) in enumerate(bounds):
                        child[gene] = min(max(child[gene], low), high)
                return offspring
            return wrapper
        return decorator

    def run_ga(df, horizon, pop_size=30, n_gen=20):
        """Run GA optimization for a horizon.

        Searches k_up / k_down in ``GA_K_BOUNDS`` and max_bars in
        ``[horizon, 5 * horizon]``. Offspring are clipped back into those
        ranges after crossover and mutation, so no invalid parameter set is
        ever evaluated or returned.

        Returns:
            dict with keys ``k_up``, ``k_down``, ``max_bars`` (int) and ``fitness``.
        """
        # Setup DEAP (drop classes left over from a previous run of this cell)
        if hasattr(creator, 'FitnessMax'):
            del creator.FitnessMax
        if hasattr(creator, 'Individual'):
            del creator.Individual

        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        creator.create("Individual", list, fitness=creator.FitnessMax)

        toolbox = base.Toolbox()

        min_bars = max(1, horizon)
        bounds = [GA_K_BOUNDS, GA_K_BOUNDS, (min_bars, min_bars * 5)]

        # Attributes
        toolbox.register("attr_k_up", random.uniform, *bounds[0])
        toolbox.register("attr_k_down", random.uniform, *bounds[1])
        toolbox.register("attr_max_bars", random.uniform, *bounds[2])

        toolbox.register("individual", tools.initCycle, creator.Individual,
                        (toolbox.attr_k_up, toolbox.attr_k_down, toolbox.attr_max_bars), n=1)
        toolbox.register("population", tools.initRepeat, list, toolbox.individual)

        toolbox.register("evaluate", evaluate_params, df=df, horizon=horizon)
        toolbox.register("mate", tools.cxBlend, alpha=0.5)
        toolbox.register("mutate", tools.mutGaussian, mu=0, sigma=0.3, indpb=0.3)
        toolbox.register("select", tools.selTournament, tournsize=3)

        # Blend crossover and Gaussian mutation are unbounded: keep genes valid
        toolbox.decorate("mate", _clamp_to_bounds(bounds))
        toolbox.decorate("mutate", _clamp_to_bounds(bounds))

        # Run GA
        pop = toolbox.population(n=pop_size)
        hof = tools.HallOfFame(1)
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("max", np.max)
        stats.register("avg", np.mean)

        pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.7, mutpb=0.2,
                                        ngen=n_gen, stats=stats, halloffame=hof, verbose=False)

        best = hof[0]
        return {
            'k_up': float(best[0]),
            'k_down': float(best[1]),
            'max_bars': int(best[2]),
            'fitness': float(best.fitness.values[0])
        }

    # Seed once so repeated runs of this cell give the same parameters
    random.seed(GA_SEED)

    # Run GA for each symbol and horizon
    ga_results = {}
    for symbol, df in labeled_data.items():
        print(f"\nOptimizing {symbol}...")
        ga_results[symbol] = {}

        # Use a subset for speed. It must be a contiguous slice: the labeler
        # scans the bars that follow each entry, so randomly sampled rows would
        # be evaluated against unrelated bars. Taking the earliest bars also
        # keeps the search away from the later (validation/test) period.
        subset = df.iloc[:max(1, int(len(df) * GA_SUBSET_FRACTION))]

        for horizon in CONFIG['horizons']:
            print(f"  Horizon {horizon}...", end=" ")
            result = run_ga(subset, horizon,
                           pop_size=CONFIG['ga_population'],
                           n_gen=CONFIG['ga_generations'])
            ga_results[symbol][horizon] = result
            print(f"k_up={result['k_up']:.2f}, k_down={result['k_down']:.2f}, "
                  f"max_bars={result['max_bars']}, fitness={result['fitness']:.3f}")

    # Save GA results (JSON object keys must be strings)
    with open('outputs/ga_results/ga_optimization_results.json', 'w') as f:
        json.dump({s: {str(h): v for h, v in hs.items()}
                   for s, hs in ga_results.items()}, f, indent=2)

    print("\nGA optimization complete!")
else:
    ga_results = None
    print("GA optimization skipped. Using initial parameters.")

## Step 6: Final Labels with Quality Weights

In [None]:
def compute_quality_weights(df, horizon):
    """Compute quality scores and sample weights.

    Reads ``label_h<h>``, ``bars_to_hit_h<h>``, ``mae_h<h>`` and ``mfe_h<h>``
    and writes ``quality_h<h>`` (0..1) and ``sample_weight_h<h>`` onto ``df``
    in place; the same frame is returned.

    Quality = 0.3 * speed + 0.4 * (low adverse excursion) + 0.3 * (high
    favorable excursion). The stored MAE/MFE are long-side measures, so for
    short labels (-1) the two are swapped before scoring; otherwise every
    short would be penalised for the very move that made it a short.
    Weights: 1.5 for the top quality quintile, 0.5 for the bottom, else 1.0.
    """
    labels = df[f'label_h{horizon}']
    bars = df[f'bars_to_hit_h{horizon}']
    mae = df[f'mae_h{horizon}']
    mfe = df[f'mfe_h{horizon}']

    # Direction-aware excursions
    is_short = labels == -1
    adverse = mae.where(~is_short, mfe)
    favorable = mfe.where(~is_short, mae)

    # Speed score (faster = better), relative to the slowest outcome observed.
    # bars == 0 marks rows the labeler never evaluated; they get no speed credit.
    max_bars = bars.max()
    if max_bars > 0:
        speed_score = (1 - (bars / max_bars)).where(bars > 0, 0.0)
    else:
        speed_score = pd.Series(0.0, index=df.index)

    # MAE score (lower adverse excursion = better), capped at 5%
    mae_score = 1 - adverse.clip(0, 0.05) / 0.05

    # MFE score (higher favorable excursion = better), capped at 5%
    mfe_score = favorable.clip(0, 0.05) / 0.05

    # Combined quality
    quality = 0.3 * speed_score + 0.4 * mae_score + 0.3 * mfe_score
    quality = quality.clip(0, 1)

    # Assign weights based on quality tiers
    q80 = quality.quantile(0.8)
    q20 = quality.quantile(0.2)

    weights = np.where(quality >= q80, 1.5,  # Top 20%
              np.where(quality <= q20, 0.5,  # Bottom 20%
                       1.0))                  # Middle 60%

    df[f'quality_h{horizon}'] = quality
    df[f'sample_weight_h{horizon}'] = weights

    return df

# Apply final labels using GA-optimized params (or initial params)
final_data = {}
for symbol, df in labeled_data.items():
    print(f"\nFinalizing {symbol}...")
    df = df.copy()

    for horizon in CONFIG['horizons']:
        # Get parameters: GA result for this symbol when available, else defaults
        use_ga = bool(ga_results) and symbol in ga_results
        if use_ga:
            params = ga_results[symbol][horizon]
            k_up, k_down, max_bars = (params[key] for key in ('k_up', 'k_down', 'max_bars'))
        else:
            k_up, k_down, max_bars = INIT_PARAMS[horizon]

        # Re-apply labels with the chosen params; this overwrites the
        # label/bars_to_hit/mae/mfe columns written by the initial pass
        df = apply_labels(df, horizon, k_up, k_down, max_bars)

        # Compute quality weights
        df = compute_quality_weights(df, horizon)

        # Print stats
        counts = df[f'label_h{horizon}'].value_counts().sort_index()
        avg_quality = df[f'quality_h{horizon}'].mean()
        print(f"  H{horizon}: Long={counts.get(1, 0):,} Neutral={counts.get(0, 0):,} "
              f"Short={counts.get(-1, 0):,} | Avg Quality={avg_quality:.3f}")

    final_data[symbol] = df
    df.to_parquet(f'outputs/final/{symbol}_final_labeled.parquet', index=False)

print("\nFinal labeling complete!")

## Step 7: Time-Based Splits

In [None]:
import json

# Combine all symbols
print("Creating combined dataset...")
combined_df = pd.concat(final_data.values(), ignore_index=True)
# Sort on (datetime, symbol) so rows sharing a timestamp always come out in the
# same order; the saved split indices are positions in this frame.
combined_df = combined_df.sort_values(['datetime', 'symbol']).reset_index(drop=True)
print(f"Combined dataset: {len(combined_df):,} rows")

# Save combined dataset
combined_df.to_parquet('outputs/final/combined_final_labeled.parquet', index=False)

# Create splits
n = len(combined_df)
train_end = int(n * CONFIG['train_ratio'])
val_end = int(n * (CONFIG['train_ratio'] + CONFIG['val_ratio']))

# Apply purging and embargo.
# purge_bars / embargo_bars are expressed in bars, but the combined frame
# interleaves all symbols, so one bar of wall-clock time spans up to
# n_symbols rows. Scale the gaps so they keep their intended duration.
n_symbols = max(1, combined_df['symbol'].nunique())
purge_rows = CONFIG['purge_bars'] * n_symbols
embargo_rows = CONFIG['embargo_bars'] * n_symbols

train_end_purged = max(0, train_end - purge_rows)
val_start = min(train_end + embargo_rows, val_end)
test_start = min(val_end + embargo_rows, n)

# Create indices
train_indices = np.arange(0, train_end_purged)
val_indices = np.arange(val_start, val_end)
test_indices = np.arange(test_start, n)

for split_name, split_indices in (('train', train_indices),
                                  ('val', val_indices),
                                  ('test', test_indices)):
    if len(split_indices) == 0:
        raise ValueError(
            f"The {split_name} split is empty ({n:,} rows total); reduce "
            f"purge_bars/embargo_bars or provide more data."
        )

print(f"\nSplit sizes:")
print(f"  Train: {len(train_indices):,} samples ({len(train_indices)/n*100:.1f}%)")
print(f"  Val:   {len(val_indices):,} samples ({len(val_indices)/n*100:.1f}%)")
print(f"  Test:  {len(test_indices):,} samples ({len(test_indices)/n*100:.1f}%)")

# Get date ranges
split_dates = {
    split_name: combined_df['datetime'].iloc[split_indices]
    for split_name, split_indices in (('train', train_indices),
                                      ('val', val_indices),
                                      ('test', test_indices))
}
print(f"\nDate ranges:")
print(f"  Train: {split_dates['train'].min()} to {split_dates['train'].max()}")
print(f"  Val:   {split_dates['val'].min()} to {split_dates['val'].max()}")
print(f"  Test:  {split_dates['test'].min()} to {split_dates['test'].max()}")

# Save indices
np.save('outputs/splits/train_indices.npy', train_indices)
np.save('outputs/splits/val_indices.npy', val_indices)
np.save('outputs/splits/test_indices.npy', test_indices)

# Save metadata
split_config = {
    'total_samples': n,
    'train_samples': len(train_indices),
    'val_samples': len(val_indices),
    'test_samples': len(test_indices),
    'purge_bars': CONFIG['purge_bars'],
    'embargo_bars': CONFIG['embargo_bars'],
    'n_symbols': int(n_symbols),
    'purge_rows': int(purge_rows),
    'embargo_rows': int(embargo_rows),
    'train_date_start': str(split_dates['train'].min()),
    'train_date_end': str(split_dates['train'].max()),
    'val_date_start': str(split_dates['val'].min()),
    'val_date_end': str(split_dates['val'].max()),
    'test_date_start': str(split_dates['test'].min()),
    'test_date_end': str(split_dates['test'].max()),
}

with open('outputs/splits/split_config.json', 'w') as f:
    json.dump(split_config, f, indent=2)

print("\nSplits saved!")

## Step 8: Validation & Baseline Backtest

In [None]:
import matplotlib.pyplot as plt

print("="*60)
print("PHASE 1 VALIDATION REPORT")
print("="*60)

# Data Integrity
print("\n--- Data Integrity ---")
print(f"Total samples: {len(combined_df):,}")
print(f"Duplicate timestamps: {combined_df.duplicated(subset=['datetime', 'symbol']).sum()}")
print(f"NaN values: {combined_df.isnull().sum().sum()}")
print(f"Inf values: {np.isinf(combined_df.select_dtypes(include=[np.number])).sum().sum()}")

# Feature Statistics
print("\n--- Feature Statistics ---")
feature_cols = [c for c in combined_df.columns 
                if c not in ['datetime', 'symbol', 'open', 'high', 'low', 'close', 'volume']
                and not c.startswith('label_') and not c.startswith('bars_to_hit_')
                and not c.startswith('mae_') and not c.startswith('mfe_')
                and not c.startswith('quality_') and not c.startswith('sample_weight_')]
print(f"Number of features: {len(feature_cols)}")

# Label Distribution
print("\n--- Label Distribution ---")
for horizon in CONFIG['horizons']:
    col = f'label_h{horizon}'
    if col in combined_df.columns:
        dist = combined_df[col].value_counts().sort_index()
        total = len(combined_df)
        print(f"\nHorizon {horizon}:")
        print(f"  Short (-1): {dist.get(-1, 0):,} ({dist.get(-1, 0)/total*100:.1f}%)")
        print(f"  Neutral (0): {dist.get(0, 0):,} ({dist.get(0, 0)/total*100:.1f}%)")
        print(f"  Long (+1): {dist.get(1, 0):,} ({dist.get(1, 0)/total*100:.1f}%)")

# One panel per configured horizon. squeeze=False keeps `axes` two-dimensional
# even for a single horizon, so the loops below work for any number of horizons.
n_panels = len(CONFIG['horizons'])
label_colors = {-1: 'red', 0: 'gray', 1: 'green'}

# Plot label distributions
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)
for ax, horizon in zip(axes[0], CONFIG['horizons']):
    counts = combined_df[f'label_h{horizon}'].value_counts().sort_index()
    # Colour by label value: a positional colour list would mislabel the bars
    # whenever one of the three classes is absent.
    counts.plot(kind='bar', ax=ax,
                color=[label_colors.get(value, 'gray') for value in counts.index])
    ax.set_title(f'Horizon {horizon} Label Distribution')
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('outputs/reports/label_distribution.png', dpi=150)
plt.show()

# Quality distribution
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), squeeze=False)
for ax, horizon in zip(axes[0], CONFIG['horizons']):
    combined_df[f'quality_h{horizon}'].hist(bins=50, ax=ax, color='steelblue', alpha=0.7)
    ax.set_title(f'Horizon {horizon} Quality Score Distribution')
    ax.set_xlabel('Quality Score')
    ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('outputs/reports/quality_distribution.png', dpi=150)
plt.show()

In [None]:
# Simple Baseline Backtest
# NOTE: the labels are themselves built from the bars that follow each entry,
# so this is a consistency check of labels and quality scores, not an estimate
# of tradable performance.
print("\n--- Baseline Backtest ---")
print("(Trading in label direction when quality > 0.5)\n")

# Assumes 252 trading days of 78 five-minute bars each
bars_per_year = 252 * 78

# combined_df interleaves symbols, so every shift must stay within one symbol;
# shifting the raw columns would pair one instrument's signal with another
# instrument's price.
by_symbol = combined_df.groupby('symbol', sort=False)

for horizon in CONFIG['horizons']:
    # Previous bar's label and quality for the same symbol
    signals = by_symbol[f'label_h{horizon}'].shift(1)
    quality = by_symbol[f'quality_h{horizon}'].shift(1)
    # Forward return over the next `horizon` bars of the same symbol
    returns = by_symbol['close'].shift(-horizon) / combined_df['close'] - 1
    
    # Filter by quality
    mask = (quality > 0.5) & (signals != 0)
    
    # Calculate strategy returns
    strategy_returns = signals * returns
    strategy_returns = strategy_returns[mask].dropna()
    
    if len(strategy_returns) > 0:
        n_trades = len(strategy_returns)
        win_rate = (strategy_returns > 0).mean()
        avg_return = strategy_returns.mean()
        # Each observation spans `horizon` bars, so a year holds
        # bars_per_year / horizon such periods (overlap between trades is ignored)
        periods_per_year = bars_per_year / horizon
        sharpe = (strategy_returns.mean() / strategy_returns.std() * np.sqrt(periods_per_year)
                  if strategy_returns.std() > 0 else 0)
        
        # Profit factor
        wins = strategy_returns[strategy_returns > 0].sum()
        losses = abs(strategy_returns[strategy_returns < 0].sum())
        pf = wins / losses if losses > 0 else np.inf
        
        print(f"Horizon {horizon}:")
        print(f"  Trades: {n_trades:,}")
        print(f"  Win Rate: {win_rate*100:.1f}%")
        print(f"  Avg Return: {avg_return*100:.3f}%")
        print(f"  Profit Factor: {pf:.2f}")
        print(f"  Sharpe Ratio: {sharpe:.2f}")
        print()

print("\n" + "="*60)
print("PHASE 1 COMPLETE!")
print("="*60)

## Download Results

In [None]:
# Create a summary of outputs: a short markdown report that is printed, saved to
# outputs/reports/phase1_summary.md, and zipped together with every other artifact.
# Relies on combined_df, feature_cols and the *_indices arrays from earlier cells.
import os
from datetime import datetime

# Report header; the "Output Files" section is filled in by the loop below.
summary = f"""
# Phase 1 Pipeline Results
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Dataset Summary
- Total samples: {len(combined_df):,}
- Symbols: {', '.join(CONFIG['symbols'])}
- Features: {len(feature_cols)}
- Horizons: {CONFIG['horizons']}

## Splits
- Train: {len(train_indices):,} samples
- Validation: {len(val_indices):,} samples  
- Test: {len(test_indices):,} samples

## Output Files
"""

# List every file currently under outputs/ with its size. The summary file itself
# is written afterwards, so it only appears in the list if it existed beforehand.
for root, dirs, files in os.walk('outputs'):
    for f in files:
        path = os.path.join(root, f)
        size = os.path.getsize(path) / 1024 / 1024  # MB
        summary += f"- {path}: {size:.2f} MB\n"

print(summary)

with open('outputs/reports/phase1_summary.md', 'w') as f:
    f.write(summary)

# Zip outputs for download (IPython shell escape: runs the system `zip` command)
!zip -r phase1_outputs.zip outputs/

print("\n" + "="*60)
print("Download 'phase1_outputs.zip' from the Files panel on the left")
print("="*60)

## Usage Example for Phase 2

In [None]:
# Example: Load data for model training
print("Example: Loading data for Phase 2 model training\n")

# Load splits (row positions into the combined frame)
train_idx, val_idx, test_idx = (
    np.load(f'outputs/splits/{split}_indices.npy')
    for split in ('train', 'val', 'test')
)

# Load data
df = pd.read_parquet('outputs/final/combined_final_labeled.parquet')

# Get train/val/test splits
train_df, val_df, test_df = (df.iloc[idx] for idx in (train_idx, val_idx, test_idx))

# Get features (exclude non-feature columns)
exclude_cols = ['datetime', 'symbol', 'open', 'high', 'low', 'close', 'volume']
_target_prefixes = ('label_', 'bars_to_hit_', 'mae_', 'mfe_', 'quality_', 'sample_weight_')
label_cols = [c for c in df.columns if c.startswith(_target_prefixes)]
_non_features = exclude_cols + label_cols
feature_cols = [c for c in df.columns if c not in _non_features]

# Prepare training data for horizon 5
horizon = 5
_label_col = f'label_h{horizon}'

X_train, y_train = train_df[feature_cols].values, train_df[_label_col].values
weights_train = train_df[f'sample_weight_h{horizon}'].values

X_val, y_val = val_df[feature_cols].values, val_df[_label_col].values

X_test, y_test = test_df[feature_cols].values, test_df[_label_col].values

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"\nFeatures: {len(feature_cols)}")
print(f"Sample features: {feature_cols[:10]}...")

print("\n" + "="*60)
print("Data ready for Phase 2 model training!")
print("="*60)