# Imports

In [1]:
import os

import pandas as pd
import numpy as np

import plotly.express as px

from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report

from pandas_market_calendars import get_calendar
from datetime import datetime, timedelta

from tqdm.notebook import tqdm

from databuilder import build_spread_backtest_dataset


# Fetch data + add relevant info

In [6]:
polygon_api_key = os.getenv("POLYGON_API_KEY")
polygon_api_key

import os
print(os.environ)  # See all environment variables accessible in this environment


environ({'COMMAND_MODE': 'unix2003', 'CONDA_DEFAULT_ENV': 'base', 'CONDA_EXE': '/opt/anaconda3/bin/conda', 'CONDA_PREFIX': '/opt/anaconda3', 'CONDA_PROMPT_MODIFIER': '(base) ', 'CONDA_PYTHON_EXE': '/opt/anaconda3/bin/python', 'CONDA_SHLVL': '1', 'GSETTINGS_SCHEMA_DIR': '/opt/anaconda3/share/glib-2.0/schemas', 'HOME': '/Users/teymour', 'LOGNAME': 'teymour', 'MallocNanoZone': '0', 'OLDPWD': '/', 'ORIGINAL_XDG_CURRENT_DESKTOP': 'undefined', 'PATH': '/Users/teymour/Desktop/qnt-projs/env/bin:/opt/anaconda3/bin:/opt/anaconda3/condabin:/Library/Frameworks/Python.framework/Versions/3.12/bin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin', 'PWD': '/', 'SHELL': '/bin/zsh', 'SHLVL': '1', 'SSH_AUTH_SOCK': '/private/tmp/com.apple.launchd.PYPRWmAoAk/Li

In [2]:
polygon_api_key = os.getenv("POLYGON_API_KEY")

calendar = get_calendar("NYSE")
trading_dates = calendar.schedule(start_date="2023-04-20", end_date=datetime.today()).index.strftime("%Y-%m-%d").values

#  Call the function from databuilder.py to generate a DataFrame with all relevant info for base strategy backtesting
base_backtest_df = build_spread_backtest_dataset(dates=trading_dates, ticker='I:SPX', index_ticker="I:VIX1D", 
                                              options_ticker="SPX", trade_time="09:35", move_adjustment=0.5, spread_width=1, api_key=polygon_api_key)

name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygon_api_key' is not defined
name 'polygo

KeyError: "None of ['date'] are in the columns"

In [None]:
# trading assumptions and max loss calculations
base_backtest_df['nat_price_cost'] = base_backtest_df['short_bid_price'] - base_backtest_df['long_ask_price']
base_backtest_df['max_nat_price_loss'] = abs(base_backtest_df['short_strike'].iloc[0] - base_backtest_df['long_strike'].iloc[0]) - base_backtest_df['nat_price_cost']
base_backtest_df['mid_price_cost'] = base_backtest_df['short_mid_price'] - base_backtest_df['long_mid_price']
base_backtest_df['max_mid_price_loss'] = abs(base_backtest_df['short_strike'].iloc[0] - base_backtest_df['long_strike'].iloc[0]) - base_backtest_df['mid_price_cost']
base_backtest_df["contracts"] = 1
base_backtest_df["fees"] = base_backtest_df["contracts"] * 0.04

In [None]:
# implied and realized volatility metrics
base_backtest_df['trade_to_close_vol'] = abs((base_backtest_df['underlying_price_at_trade'] - base_backtest_df['underlying_closing_price']) / base_backtest_df['underlying_price_at_trade']) * 100
base_backtest_df['current_day_IV'] = base_backtest_df['vix1d_value'] / np.sqrt(252)
base_backtest_df['current_day_VRP'] = base_backtest_df['current_day_IV'] - base_backtest_df['trade_to_close_vol']

# Backtest

In [None]:
def calculate_pnl(row):
    if row['direction'] == 1:
        settlement = row['underlying_closing_price'] - row['short_strike']
        if settlement > 0:
            settlement = 0
            final_pnl = row['mid_price_cost']
        else:
            final_pnl = settlement + row['mid_price_cost']
            
    elif row['direction'] == 0:
        settlement = row['short_strike'] - row['underlying_closing_price']
        if settlement > 0:
            settlement = 0
            final_pnl = row['mid_price_cost']
        else:
            final_pnl = settlement + row['mid_price_cost']

    gross_pnl = np.maximum(final_pnl, row['max_mid_price_loss'] * -1)
    
    return gross_pnl

In [None]:
base_backtest_df['gross_pnl'] = base_backtest_df.apply(calculate_pnl, axis=1)
base_backtest_df['net_pnl'] = (base_backtest_df['gross_pnl'] * base_backtest_df['contracts']) - base_backtest_df['fees']

capital = 100

base_backtest_df['net_capital'] = capital + (base_backtest_df['net_pnl']*100).cumsum()
base_backtest_df['cumulative_pnl'] = base_backtest_df['net_pnl'].cumsum()

# Meta-Labeling

In [None]:
query = "SELECT * FROM sp500_daily_OHLCV"
ml_data = pd.read_sql(query, engine)
ml_data.set_index('t', inplace=True)

In [None]:
# create features from underlying OHLC data
for days in range(1, 6):
    ml_data[f'return_{days}d'] = ml_data['c'].pct_change(periods=days)

for lag in range(1, 6):
    ml_data[f'lag_{lag}d'] = ml_data['c'].shift(lag)

for lag in range(3, 6):
    ml_data[f'serial_corr_{lag}d'] = ml_data['c'].rolling(window=lag).apply(lambda x: x.autocorr(), raw=False)

ml_data['50d_volatility'] = ml_data['c'].rolling(window=50).std()

vol_windows = [10, 20, 50, 100]
for window in vol_windows:
    ml_data[f'{window}d_volatility'] = ml_data['c'].rolling(window=window).std()

ml_data['high_low_range'] = ml_data['h'] - ml_data['l']
for window in vol_windows:
    ml_data[f'{window}d_high_low_vol'] = ml_data['high_low_range'].rolling(window=window).std()

ml_data['tr'] = np.maximum((ml_data['h'] - ml_data['l']), 
                           np.maximum(abs(ml_data['h'] - ml_data['c'].shift(1)),
                                      abs(ml_data['l'] - ml_data['c'].shift(1))))
ml_data['14d_ATR'] = ml_data['tr'].rolling(window=14).mean()


aligned_vrp = base_backtest_df['current_day_VRP'].reindex(ml_data.index)
ml_data = pd.concat([ml_data, aligned_vrp], axis=1)

def compute_rsi(data, window=14):
    delta = data.diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=window).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=window).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

ml_data['14d_RSI'] = compute_rsi(ml_data['c'], window=14)

ma_windows = [10, 50, 100, 200]
for window in ma_windows:
    ml_data[f'{window}d_MA'] = ml_data['c'].rolling(window=window).mean()

ml_data['12d_EMA'] = ml_data['c'].ewm(span=12, adjust=False).mean()
ml_data['26d_EMA'] = ml_data['c'].ewm(span=26, adjust=False).mean()
ml_data['MACD'] = ml_data['12d_EMA'] - ml_data['26d_EMA']
ml_data['MACD_signal'] = ml_data['MACD'].ewm(span=9, adjust=False).mean()

ml_data['momentum_5d'] = ml_data['c'] / ml_data['c'].shift(5) - 1
ml_data['momentum_10d'] = ml_data['c'] / ml_data['c'].shift(10) - 1

ml_data['20d_MA'] = ml_data['c'].rolling(window=20).mean()
ml_data['20d_stddev'] = ml_data['c'].rolling(window=20).std()
ml_data['upper_band'] = ml_data['20d_MA'] + (ml_data['20d_stddev'] * 2)
ml_data['lower_band'] = ml_data['20d_MA'] - (ml_data['20d_stddev'] * 2)

ml_data['volatility_ratio'] = ml_data['10d_volatility'] / ml_data['50d_volatility']

ml_data['20d_high'] = ml_data['h'].rolling(window=20).max()
ml_data['20d_low'] = ml_data['l'].rolling(window=20).min()

ml_data['14d_ATRP'] = ml_data['14d_ATR'] / ml_data['c'] * 100

ml_data['MA_crossover_10_50'] = np.where(ml_data['10d_MA'] > ml_data['50d_MA'], 1, 0)

ml_data['price_to_50d_MA'] = ml_data['c'] / ml_data['50d_MA']
ml_data['price_to_200d_MA'] = ml_data['c'] / ml_data['200d_MA']

# Final cleanup: Drop intermediate calculation columns that were temporary
ml_data.drop(['tr', '20d_stddev', '20d_MA'], axis=1, inplace=True)

ml_data = ml_data.dropna()

In [None]:
# target variable - whether we should have traded on a given day or not
base_backtest_df['to_trade'] = np.where(base_backtest_df['net_pnl'] >= 0, 1, 0 )
aligned_target = base_backtest_df['to_trade'].reindex(ml_data.index)

ml_data = pd.concat([ml_data, aligned_target], axis=1)
ml_data = ml_data.rename(columns={'to_trade': 'target'})

X = ml_data.drop(['o', 'c', 'h', 'l', 'target'], axis=1)

In [None]:
def metalabel(data, training_periods, testing_periods, quant_feature_list, cat_feature_list):
    data = data[:-1].copy()
    
    keys, backtest_keys, period_data_dict, backtest_period_data_dict = fr.group_by_period(
        data, training_periods, testing_periods
    )

    agg_backtest_df = pd.DataFrame()
    num_iterations = len(keys)

    for i in tqdm(range(num_iterations)):
        model_key = keys[i]
        train_df = period_data_dict[model_key].copy()
        
        scaler = StandardScaler()
        train_df[quant_feature_list] = scaler.fit_transform(train_df[quant_feature_list])

        all_features = quant_feature_list + cat_feature_list
        
        train_features = train_df[all_features]
        train_target = train_df['target'].values.flatten()

        model = CatBoostClassifier(
                                   loss_function='Logloss',
                                   eval_metric='Logloss',
                                   task_type='CPU',
                                   cat_features=[f for f in all_features if f in cat_feature_list],
                                   verbose=False)

        model.fit(train_features, train_target,
                  early_stopping_rounds=20,
                  plot=False)

        backtest_key = backtest_keys[i]
        backtest_df = backtest_period_data_dict[backtest_key].copy()
        
        backtest_df[quant_feature_list] = scaler.transform(backtest_df[quant_feature_list])
        
        test_features = backtest_df[all_features]

        probabilities = model.predict_proba(test_features)[:, 1] 
        predictions = (probabilities > 0.5).astype(int) 
        confidence = np.maximum(probabilities, 1 - probabilities)  

        prediction_df = pd.DataFrame({
            'predicted_trade_action': predictions,
            'prediction_confidence': confidence
        }, index=backtest_df.index)

        backtest_df = backtest_df.join(prediction_df)

        agg_backtest_df = pd.concat([agg_backtest_df, backtest_df], axis=0)

    return agg_backtest_df

metalabeled_backtest_df = metalabel(data=ml_data, training_periods=150, testing_periods=1, quant_feature_list=list(X.columns), cat_feature_list=[])

In [None]:
y_true = backtest_df['target']
y_pred = backtest_df['predicted_trade_action']

# Calculate the classification metrics
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

# Print the metrics
print("Classification Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Display the confusion matrix
conf_matrix = confusion_matrix(y_true, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)

# Detailed classification report
report = classification_report(y_true, y_pred, target_names=['No Trade', 'Trade'])
print("\nClassification Report:")
print(report)

In [None]:
metalabeled_data = data.copy()
aligned_preds = backtest_df['predicted_trade_action'].reindex(metalabeled_data.index)
metalabeled_data['predicted_trade_action'] = aligned_preds
metalabeled_data = metalabeled_data.dropna()
metalabeled_data['predicted_trade_action'] = metalabeled_data['predicted_trade_action'].astype(int)

In [None]:
metalabeled_data

In [None]:
metalabeled_data['gross_pnl'] = metalabeled_data.apply(calculate_pnl, axis=1)
metalabeled_data['net_pnl'] = np.where(metalabeled_data['predicted_trade_action'] == 1, metalabeled_data['gross_pnl'] * metalabeled_data['contracts'] - metalabeled_data['fees'], 0)

capital = 3000

metalabeled_data['net_capital'] = capital + (metalabeled_data['net_pnl']*100).cumsum()
metalabeled_data['cumulative_pnl'] = metalabeled_data['net_pnl'].cumsum()

In [None]:
metalabeled_data

In [None]:
data

In [None]:
px.line(metalabeled_data['net_capital'])

In [None]:
px.line(data['net_capital'])

In [None]:
data = backtest_data