# Model Training and Evaluation

**Description:**
This notebook trains several models and evaluates them.

In [2]:
#Installing Tensor flow

%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt_einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting termcolor>=1.1.0 (from tensorflow)
  Using cached termcolor-3.2.0-py3-none-any.whl.metadata (6.4 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Using cached tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting keras>=3.10.0 (from tensorflow)
  

In [5]:
#Setting everything up
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.arima.model import ARIMA
import tensorflow as tf
from tensorflow.keras import layers, models

# Make the parent directory importable so the `scripts` package resolves.
# The membership guard keeps this cell idempotent: re-running it does not
# append a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from scripts.data_process import make_supervised_frame, time_series_split
from scripts.data_modeling import compute_regression_metrics, directional_accuracy, make_lstm_sequences

# Seed NumPy's and TensorFlow's global random generators with the same value
# so stochastic steps repeat across runs.
np.random.seed(42)
tf.random.set_seed(42)

# Default plot styling for every figure in this notebook: 10x5 canvas, grid on.
plt.rcParams.update(
    {
        "figure.figsize": (10, 5),
        "axes.grid": True,
    }
)

In [12]:
# Read the S&P 500 table from CSV, indexing rows by the parsed "Date" column.
df = pd.read_csv(
    "../data/sp500.csv",
    index_col="Date",
    parse_dates=True,
)

# Build the supervised frame for LogReturn with horizon=1 and 3 lags.
# In the displayed rows, "y" equals the following row's LogReturn and
# LogReturn_lag1..3 equal the LogReturn of the 1..3 preceding rows.
df_sup = make_supervised_frame(df, target_col="LogReturn", horizon=1, lags=3)

df_sup.head()

Unnamed: 0_level_0,Close,High,Low,Open,Volume,MA10,MA50,EMA10,EMA50,Return,LogReturn,Volatility20,Momentum10,MACD,MACD_signal,y,LogReturn_lag1,LogReturn_lag2,LogReturn_lag3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1990-03-13,336.0,338.670013,335.359985,338.670013,145440000,336.166,335.755001,335.94302,337.53614,-0.007884,-0.007915,0.007626,5.73999,0.922821,-0.212164,0.002586,0.002187,-0.006901,0.009805
1990-03-14,336.869995,337.630005,334.929993,336.0,145060000,336.663998,335.298601,336.111561,337.510017,0.002589,0.002586,0.007623,4.97998,0.965092,0.023287,0.003556,-0.007915,0.002187,-0.006901
1990-03-15,338.070007,338.910004,336.869995,336.869995,144410000,337.197,334.884801,336.467642,337.531977,0.003562,0.003556,0.007632,5.330017,1.08294,0.235218,0.011295,0.002586,-0.007915,0.002187
1990-03-16,341.910004,341.910004,338.070007,338.070007,222520000,337.834,334.609601,337.457163,337.703664,0.011359,0.011295,0.007797,6.369995,1.469254,0.482025,0.004727,0.003556,0.002586,-0.007915
1990-03-19,343.529999,343.76001,339.119995,341.910004,142300000,338.813,334.436201,338.561315,337.932148,0.004738,0.004727,0.007628,9.790009,1.884409,0.762502,-0.005722,0.011295,0.003556,0.002586


In [13]:
# Split the supervised frame into train / validation / test parts
# (60% / 20% requested; the printed sizes show the remainder going to test).
# NOTE(review): presumably the split preserves time order -- confirm in
# scripts.data_process.time_series_split.
df_train, df_val, df_test = time_series_split(df_sup, train_frac=0.6, val_frac=0.2)

# Every column except the target "y" is used as a model input.
feature_cols = [col for col in df_sup.columns if col != "y"]

# Feature matrices and target vectors, one per split.
X_train, X_val, X_test = (part[feature_cols] for part in (df_train, df_val, df_test))
y_train, y_val, y_test = (part["y"] for part in (df_train, df_val, df_test))

print("Train size:", len(X_train))
print("Val size:", len(X_val))
print("Test size:", len(X_test))

Train size: 5400
Val size: 1800
Test size: 1801


In [14]:
# Naive baseline: use the current row's LogReturn as the prediction for "y".
y_test_naive = df_test["LogReturn"]

# Score the baseline on the test split.
# NOTE(review): arguments are passed as (y_test, prediction); presumably the
# helpers expect (y_true, y_pred) -- confirm in scripts.data_modeling.
baseline_metrics, baseline_dir_acc = (
    compute_regression_metrics(y_test, y_test_naive),
    directional_accuracy(y_test, y_test_naive),
)

# NOTE(review): the reported MAPE is extremely large; presumably because the
# actual returns are close to zero -- interpret RMSE/MAE/R2 instead, and verify.
print("Naive baseline metrics:", baseline_metrics)
print("Naive baseline directional accuracy:", baseline_dir_acc)

Naive baseline metrics: {'RMSE': 0.019374860234720747, 'MAE': 0.012262720702485339, 'MAPE': 59087086.12040067, 'R2': -1.3104790606449535}
Naive baseline directional accuracy: 0.48917268184342033
