In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    mean_absolute_error,
    mean_absolute_percentage_error,
)
import warnings
from pandas.errors import SettingWithCopyWarning

# suppress pandas copy warnings
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:

# 1. LOAD EXPANDED DATASET & FILTER YEARS
# ---------------------------------------------------
# Read the CSV with "Date" parsed as datetimes and put the rows in date order.
mr = pd.read_csv("updated_QQQ_MarketRisk.csv", parse_dates=["Date"]).sort_values("Date")

# Keep only calendar years 2015 through 2023 (both ends inclusive), then
# renumber the surviving rows from 0.
mr = mr[mr["Date"].dt.year.between(2015, 2023)].reset_index(drop=True)

In [3]:
# Preview up to ten random rows. The fixed seed makes the preview identical on
# every Restart-and-Run-All instead of changing with each execution, and the
# min() guard keeps the cell from raising when the frame has fewer than ten rows.
print(mr.sample(min(10, len(mr)), random_state=42))

           Date  QQQ_Return  QQQ_Realized_Volatility  QQQ_Volume        VIX  \
1156 2019-09-27   -0.012409                 0.143274    40773500  17.219999   
416  2016-10-17   -0.002732                 0.099715    14182700  16.209999   
2088 2023-06-12    0.016897                 0.176719    44606500  15.010000   
1454 2020-12-02    0.001285                 0.233855    23620000  21.170000   
1934 2022-10-28    0.030601                 0.337867    62651300  25.750000   
650  2017-09-22   -0.000969                 0.093450    21702400   9.590000   
161  2015-10-13   -0.006461                 0.185388    26999600  17.670000   
1438 2020-11-09   -0.020434                 0.307692    86537100  25.750000   
2051 2023-04-19   -0.000470                 0.154783    38960100  16.459999   
2118 2023-07-26   -0.003327                 0.146302    47075700  13.190000   

         RSI_14        VXN  FedRate  
1156  38.058972  20.889999    1.730  
416   36.997203  17.580000    0.295  
2088  72.415060 

In [4]:
# 2. ENGINEER NEXT-DAY REALIZED VOLATILITY
# ---------------------------------------------------
# Each row's target is the realized volatility stored on the row that follows it;
# shifting by -1 leaves the last row without a target (NaN).
next_row_vol = mr["QQQ_Realized_Volatility"].shift(periods=-1)
mr["NextVol"] = next_row_vol

In [5]:
# 3. SELECT FEATURES & CLEAN
# ---------------------------------------------------
# Columns used as model inputs, in the order they are passed to the regressor.
features = [
    "QQQ_Return",
    "QQQ_Realized_Volatility",
    "QQQ_Volume",
    "VIX",
    "RSI_14",
    "VXN",
    "FedRate",
]

# Every column the model touches: the inputs plus the target.
model_cols = [*features, "NextVol"]

# Force those columns to numeric (unparseable entries become NaN), discard any
# row that is missing an input or the target, and renumber the rows from 0.
mr = (
    mr.assign(**{name: pd.to_numeric(mr[name], errors="coerce") for name in model_cols})
    .dropna(subset=model_cols)
    .reset_index(drop=True)
)

In [6]:
# 4. ROLLING 3-YEAR WINDOW → PREDICT ONLY FOR 2020–2023
# ---------------------------------------------------
# Walk-forward evaluation: for every row dated inside the test years, fit a
# fresh XGBoost regressor on the rows dated within the preceding three years
# (window [today - 3y, today)) and forecast that row's "NextVol" from the row's
# own feature values.

# Settings a reader may want to tune, gathered here instead of being scattered
# through the loop as literals.
TEST_FIRST_YEAR = 2020
TEST_LAST_YEAR = 2023
TRAIN_WINDOW = pd.DateOffset(years=3)
MIN_TRAIN_ROWS = 200
XGB_PARAMS = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "n_estimators": 50,
    "learning_rate": 0.05,
    "random_state": 42,
}

results = []
first_date = mr["Date"].min()
# Loop-invariant: the first date that has a full training window behind it.
earliest_forecast_date = first_date + TRAIN_WINDOW

# Iterate by position instead of DataFrame.iterrows(). iterrows() packs each
# row into one Series, which for a frame mixing datetimes and numbers is
# object-dtype, so the prediction input used to be an object NumPy array with
# no column names. Slicing with iloc keeps the original dtypes and the same
# named columns the model is fitted on.
for pos, today in enumerate(mr["Date"]):
    year = today.year

    # only record predictions in the test window
    if not (TEST_FIRST_YEAR <= year <= TEST_LAST_YEAR):
        continue

    # require a full training window of history before the first forecast
    if today < earliest_forecast_date:
        continue

    # training rows: dated in [today - window, today), i.e. strictly before today
    window_start = today - TRAIN_WINDOW
    in_window = (mr["Date"] >= window_start) & (mr["Date"] < today)
    train_df = mr[in_window]

    # skip dates whose window holds too few rows to fit on
    if len(train_df) < MIN_TRAIN_ROWS:
        continue

    X_train = train_df[features]
    y_train = train_df["NextVol"]

    # today's features as a one-row DataFrame → one-step forecast
    X_today = mr.iloc[[pos]][features]
    y_true = mr["NextVol"].iloc[pos]

    # train & predict (a new model per date; nothing is carried between dates)
    model = xgb.XGBRegressor(**XGB_PARAMS)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_today)[0]

    results.append({
        "Date":              today,
        "PredictedNextVol":  y_pred,
        "ActualNextVol":     y_true,
    })

In [7]:
# 5. ASSEMBLE & SAVE
# ---------------------------------------------------
# One constant feeds both the write and the message, so the reported path is
# always the path that was written. The name uses "3yr" to match the three-year
# training window; an earlier literal wrote "rolling_4yr_preds_2020_23.csv"
# while the message reported the 3yr name.
# NOTE(review): anything that still reads the old 4yr filename must be repointed.
PREDICTIONS_CSV = "rolling_3yr_preds_2020_23.csv"

# Naming the columns explicitly keeps the expected schema even when `results`
# is empty (a bare DataFrame([]) would have no columns at all).
out_df = pd.DataFrame(results, columns=["Date", "PredictedNextVol", "ActualNextVol"])
out_df.to_csv(PREDICTIONS_CSV, index=False)
print(f"Saved {len(out_df)} predictions → {PREDICTIONS_CSV}")

Saved 1005 predictions → rolling_3yr_preds_2020_23.csv


In [8]:
# 6. COMPUTE TEST METRICS (2020–2023)
# ---------------------------------------------------
# Compare the saved forecasts against the realized targets.
y_true = out_df["ActualNextVol"]
y_pred = out_df["PredictedNextVol"]

# Error metrics over every recorded prediction; RMSE is derived from MSE.
mse  = mean_squared_error(y_true, y_pred)
mae  = mean_absolute_error(y_true, y_pred)
mape = mean_absolute_percentage_error(y_true, y_pred)
r2   = r2_score(y_true, y_pred)
rmse = np.sqrt(mse)

# Build the report line by line and emit it with a single print call.
report_lines = [
    "\n--- Test Metrics (2020–2023) ---",
    f"Test MSE : {mse:.6f}",
    f"Test RMSE: {rmse:.6f}",
    f"Test R²  : {r2:.4f}",
    f"Test MAE : {mae:.6f}",
    f"Test MAPE: {mape:.2%}",
]
print("\n".join(report_lines))


--- Test Metrics (2020–2023) ---
Test MSE : 0.000778
Test RMSE: 0.027893
Test R²  : 0.9521
Test MAE : 0.014613
Test MAPE: 5.91%
