# 02 Model Building

### Table of Contents

### [1. Pre-Processing](#2-pre-processing)
- [1.1 Feature Engineering](#21-feature-engineering)
- [1.2 Data splitting](#22-data-splitting)
- [1.3 Optional PCA](#23-optional-pca)
- [1.4 Standardization](#24-standardization)
- [1.5 Data Leakage Check](#25-data-leakage-check)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mplfinance as mpf
import lightgbm as lgbm
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load your data
df = pd.read_csv('../data/dataset_storage/final/all_matched_data.csv')

# Filter: Only coins that listed BEFORE May 1, 2025
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
cutoff_date = pd.Timestamp('2025-05-01')

# Get first timestamp per coin
first_timestamps = df.groupby('instrument_id')['timestamp'].min()
coins_before_cutoff = first_timestamps[first_timestamps < cutoff_date].index

# Filter dataframe
df = df[df['instrument_id'].isin(coins_before_cutoff)].copy()

print(f"Coins listed before {cutoff_date.date()}: {len(coins_before_cutoff)}")
print(f"Total rows: {len(df)}")
df


Coins listed before 2025-05-01: 199
Total rows: 522732


Unnamed: 0,open,high,sol_close,btc_close,metrics_long_short_ratio,metrics_open_interest,doge_close,metrics_funding_rate,timestamp,close,fng_fng,ts_since_listing,eth_close,volume,low,instrument_id
2658,0.001702,0.001880,153.74,67249.2,0.000000,0.000000e+00,0.14028,0.0001,2024-06-11 13:15:00,0.001880,74,1,3538.99,6.514891e+08,0.001647,1000000BABYDOGEUSDT-LINEAR
2659,0.001880,0.001910,151.76,66967.1,0.000000,0.000000e+00,0.13886,0.0001,2024-06-11 13:30:00,0.001812,74,2,3521.26,1.501967e+09,0.001782,1000000BABYDOGEUSDT-LINEAR
2660,0.001812,0.001887,151.18,66803.4,0.000000,0.000000e+00,0.13838,0.0001,2024-06-11 13:45:00,0.001887,74,3,3520.21,8.974216e+08,0.001790,1000000BABYDOGEUSDT-LINEAR
2661,0.001887,0.002162,152.82,67010.0,0.000000,1.020377e+09,0.13960,0.0001,2024-06-11 14:00:00,0.002078,74,4,3536.32,2.147484e+09,0.001850,1000000BABYDOGEUSDT-LINEAR
2662,0.002078,0.002134,152.03,66900.0,0.000000,1.020377e+09,0.13925,0.0001,2024-06-11 14:15:00,0.002066,74,5,3524.73,2.147484e+09,0.002014,1000000BABYDOGEUSDT-LINEAR
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868170,3.997400,4.003100,155.96,64230.8,1.896871,4.552538e+06,0.12181,0.0001,2024-07-17 22:45:00,3.998500,69,2624,3393.40,4.421150e+04,3.986000,ZROUSDT-LINEAR
868171,3.998500,3.998500,155.40,64136.7,1.896871,4.552538e+06,0.12160,0.0001,2024-07-17 23:00:00,3.984500,69,2625,3387.61,5.672660e+04,3.974500,ZROUSDT-LINEAR
868172,3.984500,4.039400,155.97,64261.2,1.896871,4.552538e+06,0.12211,0.0001,2024-07-17 23:15:00,4.027000,69,2626,3400.17,1.285346e+05,3.983200,ZROUSDT-LINEAR
868173,4.027000,4.034800,155.79,64190.9,1.896871,4.552538e+06,0.12206,0.0001,2024-07-17 23:30:00,4.015600,69,2627,3395.61,7.128870e+04,4.010900,ZROUSDT-LINEAR


### 1.1 Feature Engineering

In [3]:
X = df[["open", "high", "low", "close", "volume", "metrics_long_short_ratio", "metrics_funding_rate", "metrics_open_interest", "btc_close", "doge_close", "sol_close", "fng_fng", "eth_close", "ts_since_listing"]]
X

Unnamed: 0,open,high,low,close,volume,metrics_long_short_ratio,metrics_funding_rate,metrics_open_interest,btc_close,doge_close,sol_close,fng_fng,eth_close,ts_since_listing
2658,0.001702,0.001880,0.001647,0.001880,6.514891e+08,0.000000,0.0001,0.000000e+00,67249.2,0.14028,153.74,74,3538.99,1
2659,0.001880,0.001910,0.001782,0.001812,1.501967e+09,0.000000,0.0001,0.000000e+00,66967.1,0.13886,151.76,74,3521.26,2
2660,0.001812,0.001887,0.001790,0.001887,8.974216e+08,0.000000,0.0001,0.000000e+00,66803.4,0.13838,151.18,74,3520.21,3
2661,0.001887,0.002162,0.001850,0.002078,2.147484e+09,0.000000,0.0001,1.020377e+09,67010.0,0.13960,152.82,74,3536.32,4
2662,0.002078,0.002134,0.002014,0.002066,2.147484e+09,0.000000,0.0001,1.020377e+09,66900.0,0.13925,152.03,74,3524.73,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868170,3.997400,4.003100,3.986000,3.998500,4.421150e+04,1.896871,0.0001,4.552538e+06,64230.8,0.12181,155.96,69,3393.40,2624
868171,3.998500,3.998500,3.974500,3.984500,5.672660e+04,1.896871,0.0001,4.552538e+06,64136.7,0.12160,155.40,69,3387.61,2625
868172,3.984500,4.039400,3.983200,4.027000,1.285346e+05,1.896871,0.0001,4.552538e+06,64261.2,0.12211,155.97,69,3400.17,2626
868173,4.027000,4.034800,4.010900,4.015600,7.128870e+04,1.896871,0.0001,4.552538e+06,64190.9,0.12206,155.79,69,3395.61,2627


## Model 1 (Binary Classifier)  
#### Predict after the first day if the coin will keep dropping and never climb above the first day or not  
  
X: First day of data with all features  
Y: Will price be above the highest price from day 1 ever again throughout all data (Yes/No)

### 1.2 Define Target

In [4]:
def first_day_breakout_label(
    df: pd.DataFrame,
    existing: dict | None = None,
    instrument_col: str = "instrument_id",
    ts_col: str = "timestamp",
    high_col: str = "high",
) -> dict:

    if existing is None:
        existing = {}
    else:
        existing = dict(existing)  # copy so we don't mutate caller's dict

    d = df[[instrument_col, ts_col, high_col]].copy()
    d[ts_col] = pd.to_datetime(d[ts_col], errors="coerce")
    d = d.dropna(subset=[instrument_col, ts_col, high_col]).sort_values([instrument_col, ts_col])

    for inst, g in d.groupby(instrument_col, sort=False):
        t0 = g[ts_col].iloc[0]
        t1 = t0 + pd.Timedelta(days=1)

        first_day = g[(g[ts_col] >= t0) & (g[ts_col] <= t1)]
        if first_day.empty:
            # If somehow no data in first day window, default to 1 (no breakout detected)
            existing[inst] = 1
            continue

        max_high_1d = first_day[high_col].max()

        after = g[g[ts_col] > t1]
        breakout = (after[high_col] > max_high_1d).any() if not after.empty else False

        existing[inst] = 0 if breakout else 1

    return existing

first_day_breakout_label(df)

{'1000000BABYDOGEUSDT-LINEAR': 1,
 '1000000CHEEMSUSDT-LINEAR': 0,
 '1000000MOGUSDT-LINEAR': 0,
 '10000ELONUSDT-LINEAR': 1,
 '10000QUBICUSDT-LINEAR': 1,
 '1000CATUSDT-LINEAR': 0,
 '1000NEIROCTOUSDT-LINEAR': 0,
 '1000TOSHIUSDT-LINEAR': 1,
 '1000TURBOUSDT-LINEAR': 0,
 '1000XUSDT-LINEAR': 0,
 'ACTUSDT-LINEAR': 0,
 'ACXUSDT-LINEAR': 1,
 'AEROUSDT-LINEAR': 0,
 'AEVOUSDT-LINEAR': 0,
 'AIOZUSDT-LINEAR': 1,
 'AIXBTUSDT-LINEAR': 0,
 'AKTUSDT-LINEAR': 0,
 'ALCHUSDT-LINEAR': 1,
 'ALEOUSDT-LINEAR': 0,
 'ALTUSDT-LINEAR': 0,
 'ALUUSDT-LINEAR': 1,
 'ANIMEUSDT-LINEAR': 1,
 'ARCUSDT-LINEAR': 0,
 'ATHUSDT-LINEAR': 1,
 'AVAAIUSDT-LINEAR': 0,
 'AVAUSDT-LINEAR': 1,
 'AVLUSDT-LINEAR': 0,
 'B3USDT-LINEAR': 1,
 'BABYUSDT-LINEAR': 0,
 'BANANAS31USDT-LINEAR': 0,
 'BANANAUSDT-LINEAR': 1,
 'BANUSDT-LINEAR': 1,
 'BBUSDT-LINEAR': 0,
 'BERAUSDT-LINEAR': 1,
 'BIOUSDT-LINEAR': 0,
 'BLASTUSDT-LINEAR': 1,
 'BMTUSDT-LINEAR': 0,
 'BOMEUSDT-LINEAR': 0,
 'BRETTUSDT-LINEAR': 0,
 'BROCCOLIUSDT-LINEAR': 1,
 'BRUSDT-LINEAR': 0,


In [5]:
labels = first_day_breakout_label(df)
bearishs = [k for k, v in labels.items() if v == 1]
print(f"{len(bearishs)} out of {len(labels.items())} never break their first day high")

82 out of 199 never break their first day high


### 1.3 Data Splitting

In [6]:
def first_day_breakout_label(df):
    labels = {}
    d = df[['instrument_id', 'timestamp', 'high']].copy()
    d['timestamp'] = pd.to_datetime(d['timestamp'], errors='coerce')
    d = d.dropna(subset=['instrument_id', 'timestamp', 'high']).sort_values(['instrument_id', 'timestamp'])
    
    for inst, g in d.groupby('instrument_id', sort=False):
        t0 = g['timestamp'].iloc[0]
        t1 = t0 + pd.Timedelta(days=1)
        
        first_day = g[(g['timestamp'] >= t0) & (g['timestamp'] <= t1)]
        if first_day.empty:
            labels[inst] = 1
            continue
        
        max_high_1d = first_day['high'].max()
        after = g[g['timestamp'] > t1]
        breakout = (after['high'] > max_high_1d).any() if not after.empty else False
        labels[inst] = 0 if breakout else 1
    
    return labels

labels_dict = first_day_breakout_label(df)
print(f"Labels created: {len(labels_dict)} coins")
print(f"Breakout (0): {sum(1 for v in labels_dict.values() if v == 0)}")
print(f"No breakout (1): {sum(1 for v in labels_dict.values() if v == 1)}")



Labels created: 199 coins
Breakout (0): 117
No breakout (1): 82


In [7]:
df_work = df.copy()
df_work['timestamp'] = pd.to_datetime(df_work['timestamp'], errors='coerce')
combined = pd.concat([df_work[['instrument_id', 'timestamp']], X], axis=1)
combined = combined.sort_values(['instrument_id', 'timestamp'])

first_day_data = []
for inst, g in combined.groupby('instrument_id', sort=False):
    t0 = g['timestamp'].iloc[0]
    t1 = t0 + pd.Timedelta(days=1)
    first_day = g[(g['timestamp'] >= t0) & (g['timestamp'] <= t1)]
    
    if first_day.empty:
        continue
    
    agg_dict = {'instrument_id': inst}
    for col in X.columns:
        if first_day[col].dtype in [np.float64, np.float32, np.int64, np.int32]:
            agg_dict[f'{col}_last'] = first_day[col].iloc[-1]
            agg_dict[f'{col}_last10_mean'] = first_day[col].loc[-10:-1].mean()

    
    first_day_data.append(agg_dict)

first_day_df = pd.DataFrame(first_day_data)
print(f"\nFirst day aggregated shape: {first_day_df.shape}")


First day aggregated shape: (199, 29)


In [8]:
first_day_df['target'] = first_day_df['instrument_id'].map(labels_dict)
first_day_df = first_day_df.dropna(subset=['target'])

feature_cols = [col for col in first_day_df.columns if col not in ['instrument_id', 'target']]
X_model = first_day_df[feature_cols]
y_model = first_day_df['target'].astype(int)

print(f"Final dataset: {X_model.shape[0]} coins, {X_model.shape[1]} features")
print(f"Training on ALL data (no test split)")

# Train on ALL data
model = LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=6,
    num_leaves=31,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

model.fit(X_model, y_model)
print(f"\nModel trained on {len(X_model)} coins")


Final dataset: 199 coins, 28 features
Training on ALL data (no test split)

Model trained on 199 coins

Model trained on 199 coins


In [9]:
# Training statistics (on training data itself since no test split)
y_pred_train = model.predict(X_model)
y_pred_proba_train = model.predict_proba(X_model)[:, 1]

train_acc = accuracy_score(y_model, y_pred_train)
train_auc = roc_auc_score(y_model, y_pred_proba_train)

print("\n" + "="*60)
print("TRAINING RESULTS (No Test Data)")
print("="*60)
print(f"Train Accuracy: {train_acc:.4f}")
print(f"Train AUC-ROC:  {train_auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_model, y_pred_train, target_names=['Breakout', 'No Breakout']))

print("\nConfusion Matrix:")
cm = confusion_matrix(y_model, y_pred_train)
print(f"              Predicted")
print(f"            Breakout  No Break")
print(f"Breakout       {cm[0,0]:4d}     {cm[0,1]:4d}")
print(f"No Break       {cm[1,0]:4d}     {cm[1,1]:4d}")

print("\nTop 15 Features:")
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

for idx, row in feature_importance.head(15).iterrows():
    print(f"{row['feature']:40s}: {row['importance']:8.1f}")

# Save model as pickle with date in filename
import pickle
with open('lgbm_model_trained_until_2025-05-01.pkl', 'wb') as f:
    pickle.dump(model, f)
print("\nModel saved as lgbm_model_trained_until_2025-05-01.pkl")



TRAINING RESULTS (No Test Data)
Train Accuracy: 0.9146
Train AUC-ROC:  0.9726

Classification Report:
              precision    recall  f1-score   support

    Breakout       0.92      0.94      0.93       117
 No Breakout       0.91      0.88      0.89        82

    accuracy                           0.91       199
   macro avg       0.91      0.91      0.91       199
weighted avg       0.91      0.91      0.91       199


Confusion Matrix:
              Predicted
            Breakout  No Break
Breakout        110        7
No Break         10       72

Top 15 Features:
open_last                               :    101.0
metrics_open_interest_last              :     91.0
doge_close_last                         :     80.0
volume_last                             :     70.0
fng_fng_last                            :     67.0
btc_close_last                          :     67.0
eth_close_last                          :     60.0
sol_close_last                          :     49.0
high_last   

## Model 2 (Multiclass Classifier) ``[tbd]``
#### Predict Lows of coin
  
X: All X features up to point + distance entry-high  
Y: Will price drop down 1:1, 2:1, 3:1, 5:1?

## Backtest Implementation

In [10]:
# NOTE Vibe coded - tst properly

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

class ShortBacktest:
    def __init__(self, df, X, model, test_instruments, 
                 starting_equity=10000, risk_per_trade=0.05, max_positions=20):
        self.df = df[df['instrument_id'].isin(test_instruments)].copy()
        self.df['timestamp'] = pd.to_datetime(self.df['timestamp'])
        self.df = self.df.sort_values('timestamp').reset_index(drop=True)
        
        self.X = X
        self.model = model
        self.test_instruments = test_instruments
        self.starting_equity = starting_equity
        self.risk_per_trade = risk_per_trade
        self.max_positions = max_positions
        
        self.equity = starting_equity
        self.open_positions = {}
        self.closed_trades = []
        self.equity_curve = []
        
        # Get first day data and last timestamps per coin
        self.first_day_highs = {}
        self.listing_times = {}
        self.entry_times = {}  # NEW: When we can actually enter (1 day after listing)
        self.last_timestamps = {}
        self.predictions = {}
        
        self._prepare_data()
    
    def _prepare_data(self):
        """Get first day highs, listing times, and predictions"""
        combined = pd.concat([self.df[['instrument_id', 'timestamp']], 
                             self.X.loc[self.df.index]], axis=1)
        
        for inst in self.test_instruments:
            inst_data = self.df[self.df['instrument_id'] == inst].sort_values('timestamp')
            if len(inst_data) == 0:
                continue
            
            t0 = inst_data['timestamp'].iloc[0]
            t1 = t0 + pd.Timedelta(days=1)
            first_day = inst_data[(inst_data['timestamp'] >= t0) & 
                                 (inst_data['timestamp'] <= t1)]
            
            if not first_day.empty:
                self.first_day_highs[inst] = first_day['high'].max()
                self.listing_times[inst] = t0
                self.last_timestamps[inst] = inst_data['timestamp'].max()
                
                # NEW: Find the first timestamp AFTER the first day
                after_first_day = inst_data[inst_data['timestamp'] > t1]
                if not after_first_day.empty:
                    self.entry_times[inst] = after_first_day['timestamp'].iloc[0]
                
                # Get prediction
                first_day_combined = combined[(combined['instrument_id'] == inst) & 
                                             (combined['timestamp'] >= t0) & 
                                             (combined['timestamp'] <= t1)]
                
                if not first_day_combined.empty:
                    features = {}
                    for col in self.X.columns:
                        if first_day_combined[col].dtype in [np.float64, np.float32, np.int64, np.int32]:
                            features[f'{col}_last'] = first_day_combined[col].iloc[-1]
                            features[f'{col}_last10_mean'] = first_day_combined[col].iloc[-10:].mean()
                    
                    X_pred = pd.DataFrame([features])
                    self.predictions[inst] = self.model.predict(X_pred)[0]
    
    def _enter_position(self, inst, entry_price, entry_time):
        """Enter a short position"""
        stop_loss = self.first_day_highs[inst] * 1.001
        sl_distance_pct = (stop_loss - entry_price) / entry_price
        
        if sl_distance_pct <= 0:
            print(f"  SKIP: {inst} - Entry ${entry_price:.4f} already above SL ${stop_loss:.4f}")
            return False
        
        equity_risked = self.equity * self.risk_per_trade
        position_size = equity_risked / sl_distance_pct
        
        self.open_positions[inst] = {
            'entry_price': entry_price,
            'entry_time': entry_time,
            'entry_equity': self.equity,
            'stop_loss': stop_loss,
            'position_size': position_size,
            'equity_risked': equity_risked,
        }
        print(f"  ENTRY: {inst} @ ${entry_price:.4f} | SL: ${stop_loss:.4f} | Risk: ${equity_risked:.2f} | Size: ${position_size:.2f} | Equity: ${self.equity:.2f}")
        return True
    
    def _close_position(self, inst, exit_price, exit_time, reason):
        """Close a position and record P&L"""
        if inst not in self.open_positions:
            return 0
            
        pos = self.open_positions[inst]
        
        # Calculate P&L for shorts: profit when price goes down
        price_change_pct = (pos['entry_price'] - exit_price) / pos['entry_price']
        pnl = pos['position_size'] * price_change_pct
        
        self.equity += pnl
        
        print(f"  EXIT: {inst} @ ${exit_price:.4f} | {reason} | P&L: ${pnl:.2f} ({price_change_pct*100:.2f}%) | New Equity: ${self.equity:.2f}")
        
        self.closed_trades.append({
            'instrument': inst,
            'entry_time': pos['entry_time'],
            'entry_price': pos['entry_price'],
            'entry_equity': pos['entry_equity'],
            'exit_time': exit_time,
            'exit_price': exit_price,
            'stop_loss': pos['stop_loss'],
            'exit_reason': reason,
            'position_size': pos['position_size'],
            'pnl': pnl,
            'pnl_pct': (pnl / pos['equity_risked']) * 100
        })
        
        del self.open_positions[inst]
        return pnl
    
    def run(self):
        """Run the backtest"""
        timestamps = sorted(self.df['timestamp'].unique())
        
        print("="*80)
        print("BACKTEST: SHORT NEW LISTINGS")
        print("="*80)
        print(f"Starting Equity: ${self.starting_equity:,.2f}")
        print(f"Test Coins: {len(self.test_instruments)}")
        print(f"Predicted Shorts: {sum(1 for v in self.predictions.values() if v == 1)}")
        print(f"Coins with entry times: {len(self.entry_times)}")
        print(f"Timerange: {timestamps[0]} to {timestamps[-1]}")
        print(f"Risk per Trade: {self.risk_per_trade*100}%")
        print(f"Max Positions: {self.max_positions}")
        print("="*80)
        
        for current_time in timestamps:
            current_data = self.df[self.df['timestamp'] == current_time]
            
            # 1. Check existing positions for stops/exits
            for inst in list(self.open_positions.keys()):
                pos = self.open_positions[inst]
                inst_data = current_data[current_data['instrument_id'] == inst]
                
                # Check if this is the last timestamp for this coin
                if current_time == self.last_timestamps.get(inst):
                    if not inst_data.empty:
                        current_close = inst_data['close'].iloc[0]
                        current_high = inst_data['high'].iloc[0]
                        
                        # Check stop loss first
                        if current_high >= pos['stop_loss']:
                            self._close_position(inst, pos['stop_loss'], current_time, 'Stop Loss')
                        # Check month end
                        elif current_time >= pos['entry_time'] + pd.Timedelta(days=30):
                            self._close_position(inst, current_close, current_time, 'Month End')
                        else:
                            self._close_position(inst, current_close, current_time, 'Data Ended')
                    continue
                
                # Check if coin's data has completely ended
                if current_time > self.last_timestamps.get(inst, current_time):
                    continue
                
                if inst_data.empty:
                    continue
                
                current_high = inst_data['high'].iloc[0]
                current_close = inst_data['close'].iloc[0]
                
                # Check stop loss
                if current_high >= pos['stop_loss']:
                    self._close_position(inst, pos['stop_loss'], current_time, 'Stop Loss')
                    continue
                
                # Check if month is over
                if current_time >= pos['entry_time'] + pd.Timedelta(days=30):
                    self._close_position(inst, current_close, current_time, 'Month End')
                    continue
            
            # 2. Check for new entries (AFTER first day completes)
            for inst in current_data['instrument_id'].unique():
                # Check if this is the entry time (first candle after first day)
                if inst not in self.entry_times:
                    continue
                if self.entry_times[inst] != current_time:
                    continue
                
                # Do we have a prediction?
                if inst not in self.predictions:
                    continue
                
                # Only short if model predicts 1 (no breakout)
                if self.predictions[inst] != 1:
                    continue
                
                # Can we open more positions?
                if len(self.open_positions) >= self.max_positions:
                    continue
                
                # Enter at the close of the first candle after first day
                entry_price = current_data[current_data['instrument_id'] == inst]['close'].iloc[0]
                self._enter_position(inst, entry_price, current_time)
            
            # 3. Record equity curve
            unrealized_pnl = 0
            for inst, pos in self.open_positions.items():
                inst_data = current_data[current_data['instrument_id'] == inst]
                if not inst_data.empty:
                    current_price = inst_data['close'].iloc[0]
                    price_change_pct = (pos['entry_price'] - current_price) / pos['entry_price']
                    unrealized_pnl += pos['position_size'] * price_change_pct
            
            total_equity = self.equity + unrealized_pnl
            self.equity_curve.append({
                'timestamp': current_time,
                'equity': total_equity,
                'realized_equity': self.equity,
                'unrealized_pnl': unrealized_pnl,
                'open_positions': len(self.open_positions)
            })
        
        # Close any remaining positions
        for inst in list(self.open_positions.keys()):
            pos = self.open_positions[inst]
            final_data = self.df[self.df['instrument_id'] == inst].sort_values('timestamp')
            if not final_data.empty:
                final_price = final_data['close'].iloc[-1]
                self._close_position(inst, final_price, timestamps[-1], 'End of Backtest')
        
        self._print_results()
        return self._get_results()
    
    def _print_results(self):
        """Print backtest results"""
        equity_df = pd.DataFrame(self.equity_curve)
        trades_df = pd.DataFrame(self.closed_trades)
        
        total_pnl = self.equity - self.starting_equity
        return_pct = (total_pnl / self.starting_equity) * 100
        
        # Calculate drawdown
        equity_df['peak'] = equity_df['equity'].cummax()
        equity_df['drawdown'] = (equity_df['equity'] - equity_df['peak']) / equity_df['peak'] * 100
        max_drawdown = equity_df['drawdown'].min()
        
        # Trade stats
        total_trades = len(trades_df)
        if total_trades > 0:
            winners = trades_df[trades_df['pnl'] > 0]
            losers = trades_df[trades_df['pnl'] <= 0]
            win_rate = len(winners) / total_trades
            avg_win = winners['pnl'].mean() if len(winners) > 0 else 0
            avg_loss = losers['pnl'].mean() if len(losers) > 0 else 0
            
            print("\n" + "="*80)
            print("RESULTS")
            print("="*80)
            print(f"Final Equity:    ${self.equity:,.2f}")
            print(f"Total P&L:       ${total_pnl:,.2f} ({return_pct:+.2f}%)")
            print(f"Max Drawdown:    {max_drawdown:.2f}%")
            print(f"\nTotal Trades:    {total_trades}")
            print(f"Winners:         {len(winners)} ({win_rate*100:.1f}%)")
            print(f"Losers:          {len(losers)} ({(1-win_rate)*100:.1f}%)")
            print(f"Avg Win:         ${avg_win:.2f}")
            print(f"Avg Loss:        ${avg_loss:.2f}")
            
            if len(losers) > 0 and losers['pnl'].sum() != 0:
                profit_factor = abs(winners['pnl'].sum() / losers['pnl'].sum())
                print(f"Profit Factor:   {profit_factor:.2f}")
            
            # Exit reasons
            print("\nExit Reasons:")
            print(trades_df['exit_reason'].value_counts())
            
            print("\n" + "="*80)
            print("SAMPLE TRADES (First 10)")
            print("="*80)
            print(trades_df[['instrument', 'entry_time', 'entry_price', 'exit_price', 'stop_loss', 'exit_reason', 'position_size', 'pnl', 'pnl_pct']].head(10).to_string(index=False))
    
    def _get_results(self):
        """Return results dictionary"""
        return {
            'equity_curve': pd.DataFrame(self.equity_curve),
            'trades': pd.DataFrame(self.closed_trades),
            'final_equity': self.equity,
            'total_pnl': self.equity - self.starting_equity,
            'return_pct': ((self.equity - self.starting_equity) / self.starting_equity) * 100
        }
    
    def plot(self):
        """Plot results"""
        equity_df = pd.DataFrame(self.equity_curve)
        
        fig, axes = plt.subplots(3, 1, figsize=(14, 10))
        
        # Equity curve
        axes[0].plot(equity_df['timestamp'], equity_df['equity'], label='Total Equity', linewidth=2)
        axes[0].plot(equity_df['timestamp'], equity_df['realized_equity'], 
                    label='Realized Equity', alpha=0.7, linestyle='--')
        axes[0].axhline(y=self.starting_equity, color='gray', linestyle=':', alpha=0.5, label='Starting Equity')
        axes[0].set_title('Equity Curve', fontsize=14, fontweight='bold')
        axes[0].set_ylabel('Equity ($)')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)
        
        # Drawdown
        equity_df['peak'] = equity_df['equity'].cummax()
        equity_df['drawdown'] = (equity_df['equity'] - equity_df['peak']) / equity_df['peak'] * 100
        axes[1].fill_between(equity_df['timestamp'], equity_df['drawdown'], 0, alpha=0.3, color='red')
        axes[1].plot(equity_df['timestamp'], equity_df['drawdown'], color='red', linewidth=1)
        axes[1].set_title('Drawdown', fontsize=14, fontweight='bold')
        axes[1].set_ylabel('Drawdown (%)')
        axes[1].grid(True, alpha=0.3)
        
        # Open positions
        axes[2].plot(equity_df['timestamp'], equity_df['open_positions'], color='green', linewidth=1.5)
        axes[2].fill_between(equity_df['timestamp'], equity_df['open_positions'], 0, alpha=0.2, color='green')
        axes[2].axhline(y=self.max_positions, color='red', linestyle='--', alpha=0.5, label=f'Max ({self.max_positions})')
        axes[2].set_title('Open Positions Over Time', fontsize=14, fontweight='bold')
        axes[2].set_xlabel('Date')
        axes[2].set_ylabel('# Positions')
        axes[2].legend()
        axes[2].grid(True, alpha=0.3)
        
        plt.tight_layout()
        plt.show()


# ============================================================================
# RUN BACKTEST
# ============================================================================

backtest = ShortBacktest(
    df=df,
    X=X,
    model=model,
    test_instruments=test_instruments,
    starting_equity=10000,
    risk_per_trade=0.05,
    max_positions=20
)

results = backtest.run()
backtest.plot()

print(f"\nFinal Return: {results['return_pct']:.2f}%")

NameError: name 'test_instruments' is not defined