In [13]:
# 0) Installs + imports
!pip install statsmodels scikit-learn openpyxl --quiet

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import (
    r2_score,
    mean_absolute_error,
    mean_squared_error
)

# 1) File paths and data loading
def load_df(path):
    """Read one Excel split and return it ordered and indexed by trading day.

    Args:
        path: Location of the Excel workbook to read.

    Returns:
        DataFrame sorted by ``trading_day`` with that column as its index.
        The ``trading_day`` and ``Date`` columns are parsed as dates on read.
    """
    raw = pd.read_excel(path, parse_dates=["trading_day", "Date"])
    ordered = raw.sort_values("trading_day")
    return ordered.set_index("trading_day")

# Load the train / validation / test workbooks; the order of the paths
# matches the order of the names being assigned.
train_df, val_df, test_df = map(
    load_df,
    (
        "/content/drive/MyDrive/Masters Thesis/Colab notebook/combined_training_data_.xlsx",
        "/content/drive/MyDrive/Masters Thesis/Colab notebook/combined_validation_data.xlsx",
        "/content/drive/MyDrive/Masters Thesis/Colab notebook/combined_test_data.xlsx",
    ),
)

# 2) Simplified exogenous set
TARGET = "return_t+1"
EXOG_SIMPLE = ["return_t", "VIX", "Sentiment_score"]

# Discard, in place, every row that is missing a regressor or the target.
for df in (train_df, val_df, test_df):
    df.dropna(subset=[*EXOG_SIMPLE, TARGET], inplace=True)

# Split endog/exog: targets first, then the regressor frames.
y_train, y_val, y_test = (
    frame[TARGET] for frame in (train_df, val_df, test_df)
)
X_train, X_val, X_test = (
    frame[EXOG_SIMPLE] for frame in (train_df, val_df, test_df)
)

# 3) Fit ARIMAX with simple exog
# order is (p, d, q): one AR lag, no differencing, one MA lag. The same tuple
# is reused for the train+val refit and the rolling CV further down.
order = (1, 0, 1)
simple_mod = sm.tsa.SARIMAX(
    endog=y_train, exog=X_train,
    order=order,
    # Both constraints are switched off, so the estimated AR / MA coefficients
    # are not forced into the stationary / invertible region.
    enforce_stationarity=False,
    enforce_invertibility=False
)
# disp=False suppresses the optimizer's convergence printout.
simple_res = simple_mod.fit(disp=False)
print(simple_res.summary())

# 4) Forecast utility
def forecast_and_metrics(res, y_true, X_exog, start_pos, label):
    """Forecast a hold-out span with a fitted model and print its scores.

    Prints R², MAE, directional accuracy and an annualised Sharpe ratio of a
    sign-following strategy (long when the forecast is positive, short when
    it is negative).

    Args:
        res: Fitted results object exposing ``predict(start, end, exog)``.
        y_true: Realised target values for the hold-out span.
        X_exog: Exogenous regressors for the same span, one row per element
            of ``y_true``; ``None`` is passed through unchanged.
        start_pos: Position handed to ``res.predict`` as ``start``; the
            forecast runs for ``len(y_true)`` consecutive positions.
        label: Name printed in the report header.

    Returns:
        Tuple ``(predictions, pnl, cumulative_pnl)`` of NumPy arrays.

    Raises:
        ValueError: If ``y_true`` is empty, or ``X_exog`` is given with a
            different number of rows than ``y_true``.
    """
    n_obs = len(y_true)
    if n_obs == 0:
        raise ValueError("y_true is empty; there is nothing to forecast")
    if X_exog is not None and len(X_exog) != n_obs:
        raise ValueError(
            f"X_exog has {len(X_exog)} rows but y_true has {n_obs} observations"
        )

    # Predict
    y_pred = res.predict(
        start=start_pos,
        end=start_pos + n_obs - 1,
        exog=X_exog
    )
    # Convert to numpy so the metrics compare by position, not by index label
    yt = np.asarray(y_true)
    yp = np.asarray(y_pred)

    # Metrics
    r2 = r2_score(yt, yp)
    mae = mean_absolute_error(yt, yp)
    da = np.mean(np.sign(yp) == np.sign(yt))

    # P&L simulation: long if pred>0, short if pred<0, flat if pred==0
    pnl = np.sign(yp) * yt
    cum_pnl = np.cumsum(pnl)

    # A flat P&L has zero volatility; report NaN rather than dividing by zero.
    vol = np.std(pnl)
    if vol > 0:
        sharpe = np.mean(pnl) / vol * np.sqrt(252)
    else:
        sharpe = float("nan")

    print(f"--- {label} ---")
    print(f"R²: {r2:+.3f}, MAE: {mae:.4f}, DirAcc: {da:.3f}, Sharpe: {sharpe:.2f}")
    return yp, pnl, cum_pnl

# 5) Evaluate on validation
# Forecast positions continue right after the last training observation.
start_val = len(y_train)
pred_val, pnl_val, cum_val = forecast_and_metrics(
    res=simple_res,
    y_true=y_val,
    X_exog=X_val,
    start_pos=start_val,
    label="Validation",
)

# 6) Evaluate on test via re-fitting on train+validation
# Combine train and validation sets
def concat_endog_exog(y_train, X_train, y_val, X_val):
    """Stack the training and validation splits into one target / regressor pair.

    Each target is wrapped in a Series carrying the index of its matching
    regressor frame before the two pieces are concatenated.

    Args:
        y_train: Training targets.
        X_train: Training regressors.
        y_val: Validation targets.
        X_val: Validation regressors.

    Returns:
        Tuple ``(full_y, full_X)``: the concatenated targets and regressors,
        training rows first.
    """
    aligned_targets = [
        pd.Series(target, index=features.index)
        for target, features in ((y_train, X_train), (y_val, X_val))
    ]
    stacked_features = pd.concat([X_train, X_val])
    return pd.concat(aligned_targets), stacked_features

full_y, full_X = concat_endog_exog(y_train, X_train, y_val, X_val)
# Re-fit ARIMAX on combined data, with the same order and settings as the
# train-only model.
full_mod = sm.tsa.SARIMAX(
    endog=full_y,
    exog=full_X,
    order=order,
    enforce_stationarity=False,
    enforce_invertibility=False
)
full_res = full_mod.fit(disp=False)
# Print the banner on its own line: passing the summary as a second print()
# argument puts the banner in front of the table's centred title and pushes
# that title out of alignment.
print("Re-fitted on train+val.")
print(full_res.summary())

# Forecast on test only: positions continue right after the last train+val row.
start_full = len(full_y)
pred_test, pnl_test, cum_test = forecast_and_metrics(
    full_res, y_test, X_test, start_full, "Test"
)

# 7) Rolling-origin CV with an expanding training window (one-step ahead)
def rolling_arimax_cv(endog, exog, order, initial_train):
    """Run one-step-ahead cross-validation with an expanding training window.

    For every position ``t`` from ``initial_train`` up to ``len(endog) - 1``,
    a fresh SARIMAX model is fitted on the first ``t`` observations and then
    asked for a single forecast at position ``t`` using that row of ``exog``.

    Args:
        endog: pandas Series of target values (accessed through ``.iloc``).
        exog: pandas object of regressors with at least one row per ``endog``
            observation, in the same order.
        order: ``(p, d, q)`` order passed to ``SARIMAX``.
        initial_train: Number of observations in the first training window.

    Returns:
        Tuple ``(trues, preds)`` of NumPy arrays holding the realised values
        and their forecasts. Both are empty when ``initial_train`` is not
        smaller than ``len(endog)``.

    Raises:
        ValueError: If ``initial_train`` is smaller than 1, or ``exog`` has
            fewer rows than ``endog``.
    """
    if initial_train < 1:
        # 0 would fit on an empty sample; a negative value would slice from
        # the end and ask for forecasts at negative positions.
        raise ValueError("initial_train must be at least 1")
    if len(exog) < len(endog):
        raise ValueError(
            f"exog has {len(exog)} rows but endog has {len(endog)} observations"
        )

    preds, trues = [], []
    for t in range(initial_train, len(endog)):
        # Slice by position everywhere so the training window, the forecast
        # row and the realised value are all selected the same way.
        mod = sm.tsa.SARIMAX(
            endog=endog.iloc[:t], exog=exog.iloc[:t], order=order,
            enforce_stationarity=False, enforce_invertibility=False
        )
        res = mod.fit(disp=False)
        # one-step forecast: position t is the first one past the training data
        x_next = exog.iloc[t:t + 1]
        p = res.predict(start=t, end=t, exog=x_next)
        preds.append(p.iloc[0])
        trues.append(endog.iloc[t])
    return np.array(trues), np.array(preds)

# 8) Run rolling CV on combined train+val, then evaluate
# Stack train on top of validation, once for the target and once for the
# regressors.
combo_endog, combo_exog = (
    pd.concat([train_df[cols], val_df[cols]]) for cols in (TARGET, EXOG_SIMPLE)
)
# The first window is the whole training split, so every forecast falls on a
# validation row.
true_cv, pred_cv = rolling_arimax_cv(
    endog=combo_endog,
    exog=combo_exog,
    order=order,
    initial_train=len(y_train),
)

# Share of steps where the forecast and the outcome carry the same sign.
da_cv = (np.sign(pred_cv) == np.sign(true_cv)).mean()
print(f"Rolling CV directional accuracy: {da_cv:.3f}")

# (Feel free to adjust `EXOG_SIMPLE`, `order`, or `initial_train` for more robust backtests.)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                               SARIMAX Results                                
Dep. Variable:             return_t+1   No. Observations:                  459
Model:               SARIMAX(1, 0, 1)   Log Likelihood                 870.901
Date:                Mon, 05 May 2025   AIC                          -1729.802
Time:                        14:45:21   BIC                          -1705.054
Sample:                             0   HQIC                         -1720.055
                                - 459                                         
Covariance Type:                  opg                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
return_t            0.0534      0.121      0.442      0.659      -0.184       0.291
VIX              -6.75e-07   2.37e-05     -0.028      0.977   -4.71e-05    4.58e-05
Sentiment_score    -0.0099      

  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Re-fitted on train+val.                                SARIMAX Results                                
Dep. Variable:             return_t+1   No. Observations:                  606
Model:               SARIMAX(1, 0, 1)   Log Likelihood                1839.563
Date:                Mon, 05 May 2025   AIC                          -3667.126
Time:                        14:45:22   BIC                          -3640.704
Sample:                             0   HQIC                         -3656.843
                                - 606                                         
Covariance Type:                  opg                                         
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
return_t            0.0389      0.102      0.380      0.704      -0.162       0.240
VIX              1.896e-05   2.04e-05      0.931      0.352    -2.1e-05    5.89e-05
Sentimen

  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates

Rolling CV directional accuracy: 0.551


  return get_prediction_index(
  return get_prediction_index(
