In [4]:
import os
import sys
import numpy as np
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm
current_dir = os.path.dirname(r"C:\Users\JNoot\Documents\University\Bachelor Thesis\New Code\simulation_study\simulation_1")
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
sys.path.insert(0, parent_dir)
sys.path.append(os.path.dirname(os.path.abspath(r"C:\Users\JNoot\Documents\University\Bachelor Thesis\New Code\simulation_study\simulation_1\simulation_1.ipynb")) + '/..')
from simulation_study.models.random_forest import train_test_rf
from simulation_study.models.local_linear_forest import LocalLinearForestRegressor
from simulation_study.models.bart import predict_bart
from simulation_study.models.xgboost import predict_xgboost
from simulation_study.models.lasso_rf import LassoRandomForest
from simulation_study.hypertuning.hypertune import hypertune_model
from simulation_study.utils.qlike import get_qlike

def generate_GJR(n, omega, alpha, gamma, beta, mu, sigma):
    """Simulate a GJR-GARCH(1,1)-style path of returns and volatilities.

    Args:
        n: Number of time steps to simulate (must be at least 1).
        omega: Constant term of the variance recursion (must be >= 0).
        alpha: Coefficient on the squared lagged draw.
        gamma: Extra coefficient applied when the lagged draw is negative.
        beta: Coefficient on the lagged variance.
        mu: Constant added to every simulated return.
        sigma: Standard deviation of the normal draws.

    Returns:
        A tuple ``(returns, volatility)`` of two length-``n`` arrays.
        ``returns[0]`` is left at 0 because returns are only generated
        from ``t = 1`` onwards.

    Raises:
        ValueError: If ``n < 1``, ``omega < 0`` or
            ``alpha + gamma / 2 + beta >= 1``. In the last two cases the
            starting volatility would otherwise be NaN (or a division by
            zero) and silently contaminate the whole path.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if omega < 0:
        raise ValueError("omega must be non-negative")
    stationarity_gap = 1 - alpha - gamma / 2 - beta
    if stationarity_gap <= 0:
        raise ValueError(
            "alpha + gamma / 2 + beta must be smaller than 1 for a finite "
            "initial volatility"
        )

    # Re-seed the global NumPy generator from fresh entropy so that every
    # call produces an independent path.
    np.random.seed()
    residuals = np.random.normal(scale=sigma, size=n)
    volatility = np.zeros(n)
    returns = np.zeros(n)

    # Start the recursion at the unconditional volatility level.
    volatility[0] = np.sqrt(omega / stationarity_gap)

    for t in range(1, n):
        previous_draw = residuals[t - 1]
        # The leverage term gamma only applies after a negative draw.
        # NOTE(review): the recursion uses the raw draw, not
        # volatility[t-1] * draw as in the textbook GJR-GARCH - confirm
        # that this is intended.
        leverage = gamma if previous_draw < 0 else 0.0
        variance = (
            omega
            + (alpha + leverage) * (previous_draw ** 2)
            + beta * (volatility[t - 1] ** 2)
        )
        volatility[t] = np.sqrt(variance)

        # Return is the constant mean plus the volatility-scaled draw.
        returns[t] = mu + volatility[t] * residuals[t]

    return returns, volatility

# Build the matrix of lagged values used as forecasting features
def create_lagged_features(data, lag=1):
    """Return a matrix whose column ``i`` holds ``data`` shifted by ``i + 1`` steps.

    Args:
        data: Series to lag.
        lag: Number of lags, i.e. number of columns produced.

    Returns:
        The stacked shifted copies of ``data`` with the first ``lag`` rows
        removed; those rows contain values that ``np.roll`` wrapped around
        from the end of the series.
    """
    shifted_columns = []
    for shift in range(1, lag + 1):
        shifted_columns.append(np.roll(data, shift))
    lag_matrix = np.column_stack(shifted_columns)
    return lag_matrix[lag:]

def generate_features(returns, volatility, lag=5, p = 10):
    """Assemble the feature matrix and the target vector for one simulated path.

    Args:
        returns: Simulated return series; its lags form the first ``lag``
            feature columns.
        volatility: Simulated volatility series; provides the targets.
        lag: Number of lagged returns to include.
        p: Number of additional uniform(0, 1) random columns to append.
            When ``p`` is not positive, only the lagged returns are used.

    Returns:
        A tuple ``(X, y)`` where ``y`` is ``volatility`` from index ``lag``
        onwards.
    """
    lag_block = create_lagged_features(returns, lag)
    blocks = [lag_block]
    if p > 0:
        # Random columns get the same number of rows as the lagged block.
        blocks.append(np.random.rand(lag_block.shape[0], p))
    features = np.hstack(blocks) if len(blocks) > 1 else lag_block
    targets = volatility[lag:]
    return features, targets

def get_parameters(tuning_size, n_trials, sigma, lag, p):
    """Tune every model once on a freshly simulated data set.

    The GJR-GARCH coefficients (``omega``, ``alpha``, ``gamma``, ``beta``,
    ``mu``) are read from module-level variables, so they must be defined
    before this function is called.

    Args:
        tuning_size: Length of the simulated series used for tuning.
        n_trials: Number of tuning trials for every model except BART,
            which always runs 3 trials.
        sigma: Standard deviation of the simulated draws.
        lag: Number of lagged returns used as features.
        p: Number of additional random feature columns.

    Returns:
        Dict mapping "LLF", "RF", "LRF", "XGB" and "BART" to whatever
        ``hypertune_model`` returns for the corresponding model.
    """
    tune_returns, tune_volatility = generate_GJR(
        tuning_size, omega, alpha, gamma, beta, mu, sigma
    )
    X_tune, Y_tune = generate_features(tune_returns, tune_volatility, lag, p)

    # (result key, name understood by hypertune_model, number of trials);
    # the tuple order is also the order in which the models are tuned.
    tuning_plan = (
        ("LLF", "LocalLinearForest", n_trials),
        ("RF", "RandomForest", n_trials),
        ("LRF", "LASSO-RF", n_trials),
        ("XGB", "XGBoost", n_trials),
        ("BART", "BART", 3),
    )
    return {
        key: hypertune_model(model_name, X_tune, Y_tune, n_trials = trials)
        for key, model_name, trials in tuning_plan
    }

# Run the repeated simulate / fit / forecast experiment
def simulation_run(n, p, sigma, omega, alpha, gamma, beta, mu, num_reps=50, lag=5, parameters = None):
    """Average the one-step forecast losses of all models over repeated simulations.

    Each repetition simulates a new path, trains every model on all
    observations except the last one, and scores the prediction for that
    last observation.

    Args:
        n: Length of each simulated series.
        p: Number of random feature columns added to the lagged returns.
        sigma: Standard deviation of the simulated draws.
        omega, alpha, gamma, beta, mu: GJR-GARCH coefficients passed to
            ``generate_GJR``.
        num_reps: Number of repetitions.
        lag: Number of lagged returns used as features.
        parameters: Dict with keys "RF", "LRF", "LLF", "BART" and "XGB"
            holding keyword arguments for the respective model. Although
            the default is ``None``, a dict is required.

    Returns:
        A tuple ``(mean_errors, mean_qlikes)`` of dicts keyed by "LLF",
        "RF", "Lasso RF", "BART" and "XGBoost", holding the mean MSE and
        the mean QLIKE across repetitions.
    """
    mse_records = []
    qlike_records = []

    def _score(predictions, actual):
        # MSE first, then QLIKE, for one model's predictions.
        return mean_squared_error(actual, predictions), get_qlike(predictions, actual)

    for _ in tqdm(range(num_reps)):
        returns, volatility = generate_GJR(n, omega, alpha, gamma, beta, mu, sigma)
        X, y = generate_features(returns, volatility, lag, p)

        # Hold out only the final observation as the test point.
        split = len(X) - 1
        X_train, X_test = X[:split], X[split:]
        Y_train, y_test = y[:split], y[split:]

        # Random forest
        rf_preds = train_test_rf(X_train, Y_train, X_test, **parameters["RF"])
        rf_mse, rf_qlike = _score(rf_preds, y_test)

        # Lasso random forest
        lasso_rf = LassoRandomForest(**parameters["LRF"])
        lasso_rf.fit(X_train, Y_train)
        lrf_mse, lrf_qlike = _score(lasso_rf.predict(X_test), y_test)

        # Local linear forest
        local_linear = LocalLinearForestRegressor(**parameters["LLF"])
        local_linear.fit(X_train, Y_train)
        llf_mse, llf_qlike = _score(local_linear.predict_LLF(X_test), y_test)

        # Bayesian additive regression trees
        bart_preds = predict_bart(X_train, Y_train, X_test, **parameters["BART"])
        bart_mse, bart_qlike = _score(bart_preds, y_test)

        # XGBoost
        xgb_preds = predict_xgboost(X_train, Y_train, X_test, **parameters["XGB"])
        xgb_mse, xgb_qlike = _score(xgb_preds, y_test)

        # Store this repetition's losses; key order fixes the column order
        # of the final results.
        mse_records.append({
            "LLF": llf_mse,
            "RF": rf_mse,
            "Lasso RF": lrf_mse,
            "BART": bart_mse,
            "XGBoost": xgb_mse,
        })
        qlike_records.append({
            "LLF": llf_qlike,
            "RF": rf_qlike,
            "Lasso RF": lrf_qlike,
            "BART": bart_qlike,
            "XGBoost": xgb_qlike,
        })

    mean_errors = {}
    for name in mse_records[0]:
        mean_errors[name] = np.mean([record[name] for record in mse_records])

    mean_qlikes = {}
    for name in qlike_records[0]:
        mean_qlikes[name] = np.mean([record[name] for record in qlike_records])

    return mean_errors, mean_qlikes

# Simulation settings
num_reps = 10
efficient_run = True

# Coefficients of the GJR-GARCH(1,1) model
omega = 0.0000908
alpha = 0.03569
beta = 0.87636
gamma = 0.06178693
mu= -0.000215
lags = 5

# Grid of sample sizes, numbers of random features and draw scales.
# An efficient run restricts the grid to the largest sample size only.
ns = [5000] if efficient_run else [1000, 5000]
ps = [0,5,10]
sigmas = [1,5,10]

args = [(n, p, sigma) for n in ns for p in ps for sigma in sigmas]
mses_results = []
qlikes_results = []

# Tune all models once (tuning_size=1000, n_trials=50, sigma=10, lag=5, p=10)
# and reuse the resulting parameters for every grid point.
model_parameters = get_parameters(1000, 50, 10, 5, 10)

for arguments in tqdm(args):
    print(arguments)
    mses, qlikes = simulation_run(*arguments, omega, alpha, gamma, beta, mu, num_reps, lags, model_parameters)

    # Square root of the mean losses, rounded to three decimals.
    # NOTE(review): the square root is applied to the QLIKE means as well
    # as to the MSE means - confirm that this is intended.
    rmse_row = [*np.round(np.sqrt(list(mses.values())), 3)]
    qlike_row = [*np.round(np.sqrt(list(qlikes.values())), 3)]

    mses_results.append(rmse_row)
    qlikes_results.append(qlike_row)
    print(rmse_row)
    print(qlike_row)

# One row per grid point, one column per model.
mses_results = np.array(mses_results)
qlikes_results = np.array(qlikes_results)

print("Mean RMSE: ", mses_results)
print("Mean QLIKE:", qlikes_results)

  0%|          | 0/50 [00:00<?, ?it/s]

LocalLinearForest {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 2, 'lam': 0.05}


  0%|          | 0/50 [00:00<?, ?it/s]

RandomForest {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 6, 'min_samples_leaf': 2}


  0%|          | 0/50 [00:00<?, ?it/s]

LASSO-RF {'lasso_alpha': 0.046012689864369204, 'n_estimators': 500, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 2}


  0%|          | 0/50 [00:00<?, ?it/s]

XGBoost {'n_estimators': 500, 'max_depth': 7, 'learning_rate': 0.5500292037407584}


  0%|          | 0/3 [00:00<?, ?it/s]

BART {'n_chains': 4, 'n_trees': 200, 'n_burn': 300, 'n_samples': 500}


  0%|          | 0/9 [00:00<?, ?it/s]

(5000, 0, 1)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.099, 0.051, 0.047, 0.054, 0.033]
[0.108, 0.061, 0.056, 0.058, 0.039]
(5000, 0, 5)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.43, 0.508, 0.483, 0.476, 0.512]
[0.084, 0.103, 0.098, 0.091, 0.104]
(5000, 0, 10)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.317, 0.429, 0.421, 0.598, 0.815]
[0.031, 0.045, 0.049, 0.061, 0.089]
(5000, 5, 1)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.139, 0.042, 0.043, 0.043, 0.027]
[0.14, 0.05, 0.052, 0.046, 0.027]
(5000, 5, 5)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.378, 0.443, 0.235, 0.384, 0.378]
[0.083, 0.088, 0.057, 0.073, 0.069]
(5000, 5, 10)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.626, 0.527, 0.638, 0.495, 0.444]
[0.066, 0.057, 0.063, 0.053, 0.045]
(5000, 10, 1)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.054, 0.057, 0.06, 0.054, 0.063]
[0.054, 0.059, 0.061, 0.056, 0.068]
(5000, 10, 5)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.341, 0.231, 0.241, 0.345, 0.247]
[0.076, 0.056, 0.053, 0.068, 0.048]
(5000, 10, 10)


  0%|          | 0/10 [00:00<?, ?it/s]

[0.487, 0.319, 0.394, 0.385, 0.449]
[0.048, 0.032, 0.038, 0.036, 0.04]
Mean RMSE:  [[0.099 0.051 0.047 0.054 0.033]
 [0.43  0.508 0.483 0.476 0.512]
 [0.317 0.429 0.421 0.598 0.815]
 [0.139 0.042 0.043 0.043 0.027]
 [0.378 0.443 0.235 0.384 0.378]
 [0.626 0.527 0.638 0.495 0.444]
 [0.054 0.057 0.06  0.054 0.063]
 [0.341 0.231 0.241 0.345 0.247]
 [0.487 0.319 0.394 0.385 0.449]]
Mean QLIKE: [[0.108 0.061 0.056 0.058 0.039]
 [0.084 0.103 0.098 0.091 0.104]
 [0.031 0.045 0.049 0.061 0.089]
 [0.14  0.05  0.052 0.046 0.027]
 [0.083 0.088 0.057 0.073 0.069]
 [0.066 0.057 0.063 0.053 0.045]
 [0.054 0.059 0.061 0.056 0.068]
 [0.076 0.056 0.053 0.068 0.048]
 [0.048 0.032 0.038 0.036 0.04 ]]
