In [3]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from scipy.stats import randint
import math

In [7]:
# Read the master dataset with augmented columns and engineered features
df = pd.read_csv("../data/master_dataset.csv")

## Helper Functions

In [5]:
def forward_selection(data, response_col, alpha=0.05):
    selected_features = []
    remaining_features = list(data.columns.drop(response_col))

    while remaining_features:
        best_pvalue = float('inf')
        best_feature = None

        for feature in remaining_features:
            X = data[selected_features + [feature]]
            X = sm.add_constant(X)  # Add a constant term for the intercept
            y = data[response_col]

            model = sm.OLS(y, X).fit()
            pvalue = model.pvalues[feature]

            if pvalue < best_pvalue:
                best_pvalue = pvalue
                best_feature = feature

        if best_pvalue < alpha:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
            print(f"Added {best_feature} with p-value {best_pvalue:.4f}")
        else:
            break

    return selected_features

In [16]:
def preprocess_and_predict(df, y_var_name="High", regressor = RandomForestRegressor(random_state=42), param_dist = None ):
    # Break into x and y
    X = df.drop(y_var_name, axis=1)

    # Predicts the change (delta) from the Open: for instance High - Open
    y = df[y_var_name] - df["Open"]
    y = pd.DataFrame(y)

    numeric_cols = X.select_dtypes(include=[np.number]).columns
    X = X[numeric_cols]

    # Avoid leakage
    drop_mask = []

    for col in X:
        if 'high' in str(col).lower():
            
            if 'lag' in str(col).lower():
                pass
            else:
                print(col)
                drop_mask.append(col)
        if 'low' in str(col).lower():
            
            if 'lag' in str(col).lower():
                pass
            else:
                print(col)
                drop_mask.append(col)

        if 'close' in str(col).lower():
            
            if 'lag' in str(col).lower():
                pass
            else:
                print(col)
                drop_mask.append(col)

    # Drop elems in drop mask
    X = X.drop(drop_mask, axis=1)
    X.drop('Unnamed: 0', axis=1, inplace=True)

    X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
    
    X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size = 0.25, random_state=42)

    # Variable Selection
    data_x = pd.DataFrame(X, columns=X.columns)
    data_y = pd.DataFrame(y, columns=[y_var_name])
    data = data_x.copy()
    data[y_var_name] = data_y
    data
    
    # Perform forward variable selection
    response_column = y_var_name
    selected_features = forward_selection(data, response_column)

    # Create a ColumnTransformer to apply scaling to numerical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), X.columns)
        ])

    # Create the pipeline with preprocessing and XGBoost
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    # Instantiate RandomizedSearchCV
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1, random_state=42)

    # Fit the random search to the data
    random_search.fit(X_train, y_train)

    # Print the best hyperparameters
    print("Best Hyperparameters:", random_search.best_params_)

    # Predict on the test set using the best model
    y_pred = random_search.best_estimator_.predict(X_val)

    # Evaluate the model with the Mean Squared Error and R2 error
    mse = mean_squared_error(y_val, y_pred)
    print(f'Mean Squared Error: {mse}')

    r2 = r2_score(y_val, y_pred)
    print(f"R2 score: {r2}")

    # Also predict on the train dataset to get metrics on train data as well
    y_pred_train = random_search.best_estimator_.predict(X_train)

    # Evaluate the model with the Mean Squared Error and R2 error for train data
    mse_train = mean_squared_error(y_train, y_pred_train)
    print(f'Train Mean Squared Error: {mse_train}')

    r2_train = r2_score(y_train, y_pred_train)
    print(f"Train R2 score: {r2_train}")

    dataframes = {'X_train' : X_train,'X_val' : X_val,'X_test' : X_test,'y_train' : y_train,'y_val' : y_val,'y_test' : y_test}
    predictions = y_pred

    return random_search.best_estimator_, predictions, dataframes

## Experiment #1: Linear Regressors

In [31]:
# Define the parameter grid for random search

param_dist = {
    'regressor__fit_intercept': [True, False],
    'regressor__positive': [True, False],
}

best_model_lr, high_delta_pred_lr, df_list_lr = preprocess_and_predict(df, "High", regressor=LinearRegression(), param_dist=param_dist)

Low
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 4 candidates, totalling 20 fits




Best Hyperparameters: {'regressor__positive': False, 'regressor__fit_intercept': False}
Mean Squared Error: 7.841955270602665
R2 score: 0.380188887026385
Train Mean Squared Error: 7.677822204467188
Train R2 score: 0.5055896763242659


In [32]:
best_model_low_lr, low_delta_pred_lr, df_list_lr = preprocess_and_predict(df, "Low", regressor=LinearRegression(), param_dist=param_dist)

High
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Hyperparameters: {'regressor__positive': False, 'regressor__fit_intercept': False}
Mean Squared Error: 7.934293865890573
R2 score: 0.5006839908726448
Train Mean Squared Error: 8.260819916765513
Train R2 score: 0.5416021013395058




## Experiment #2: Random Forest Regressors (default)

In [17]:
# Define the parameter grid for random search
param_dist = {
    'regressor__n_estimators': [50, 100, 150, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2']
}

best_model_rf, high_delta_pred_rf, df_list_rf = preprocess_and_predict(df, "High", param_dist=param_dist)

Low
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Hyperparameters: {'regressor__n_estimators': 200, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 4, 'regressor__max_features': 'log2', 'regressor__max_depth': 30}
Mean Squared Error: 8.372897300726871
R2 score: 0.338224382096167
Train Mean Squared Error: 4.395697441174484
Train R2 score: 0.7169408021187058


In [18]:
best_model_low_rf, low_delta_pred_rf, df_list_rf = preprocess_and_predict(df, "Low", param_dist=param_dist)

High
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Hyperparameters: {'regressor__n_estimators': 50, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'sqrt', 'regressor__max_depth': 30}
Mean Squared Error: 8.717670565791952
R2 score: 0.4513850193384956
Train Mean Squared Error: 3.4808703734837128
Train R2 score: 0.80684439549685


## Experiment #3: XGBoost Regressors

In [19]:
# Define the parameter grid for random search
param_dist = {
    'regressor__n_estimators': [50, 100, 150, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2']
}

best_model_xg, high_delta_pred_xg, df_list_xg = preprocess_and_predict(df, "High", regressor=xgb.XGBRegressor(), param_dist=param_dist)

Low
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits


Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



Best Hyperparameters: {'regressor__n_estimators': 50, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'log2', 'regressor__max_depth': 10}
Mean Squared Error: 10.410101574510882
R2 score: 0.17720818081507572
Train Mean Squared Error: 0.001357334901714871
Train R2 score: 0.9999125949559365


In [20]:
best_model_low_xg, low_delta_pred_xg, df_list_xg = preprocess_and_predict(df, "Low", regressor=xgb.XGBRegressor(), param_dist=param_dist)

High
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters: {'regressor__n_estimators': 50, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 2, 'regressor__max_features': 'log2', 'regressor__max_depth': None}
Mean Squared Error: 8.402875761069414
R2 score: 0.4711954887066403
Train Mean Squared Error: 0.3545757246020291
Train R2 score: 0.9803243783654307


Parameters: { "max_features", "min_samples_leaf", "min_samples_split" } are not used.



## Experiment #4: Neural Network Regressors

In [23]:
# Define the parameter grid for random search

param_dist = {
    'regressor__hidden_layer_sizes': [(50, 50), (100, 50, 25), (200, 100, 50)],
    'regressor__activation': ['relu', 'tanh', 'logistic'],
    'regressor__solver': ['adam', 'sgd'],
    'regressor__alpha': [0.0001, 0.001, 0.01],
    'regressor__learning_rate': ['constant', 'adaptive'],
    'regressor__learning_rate_init': [0.001, 0.01, 0.1],
    'regressor__max_iter': [200, 300, 400]
}

best_model_nn, high_delta_pred_nn, df_list_nn = preprocess_and_predict(df, "High", regressor=MLPRegressor(random_state=42), param_dist=param_dist)

Low
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
  y = column_or_1d(y, warn=True)


Best Hyperparameters: {'regressor__solver': 'sgd', 'regressor__max_iter': 200, 'regressor__learning_rate_init': 0.01, 'regressor__learning_rate': 'adaptive', 'regressor__hidden_layer_sizes': (50, 50), 'regressor__alpha': 0.01, 'regressor__activation': 'logistic'}
Mean Squared Error: 9.128672218718945
R2 score: 0.27848957401402274
Train Mean Squared Error: 9.7835403679447
Train R2 score: 0.36999278816383774




In [24]:
best_model_low_nn, low_delta_pred_nn, df_list_nn = preprocess_and_predict(df, "Low", regressor=MLPRegressor(random_state=42), param_dist=param_dist)

High
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights
  y = column_or_1d(y, warn=True)


Best Hyperparameters: {'regressor__solver': 'sgd', 'regressor__max_iter': 200, 'regressor__learning_rate_init': 0.01, 'regressor__learning_rate': 'adaptive', 'regressor__hidden_layer_sizes': (50, 50), 'regressor__alpha': 0.01, 'regressor__activation': 'logistic'}
Mean Squared Error: 8.763454310017995
R2 score: 0.44850378543964387
Train Mean Squared Error: 9.427883155032863
Train R2 score: 0.4768410556544508




# Taking Predictions from the Best Model and Saving Data

In [33]:
# The best model is Linear Regression!
data_train = df[df.index.isin(df_list_lr["X_train"].index)]
data_val = df[df.index.isin(df_list_lr["X_val"].index)]
data_test = df[df.index.isin(df_list_lr["X_test"].index)]

In [34]:
data_train

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,T10Y2Y_Ten_Two,Date_lag_Ten_Two,...,Close_USL,Adj Close_USL,Volume_USL,Date_lag_USL,Open_lag_USL,High_lag_USL,Low_lag_USL,Close_lag_USL,Adj Close_lag_USL,Volume_lag_USL
0,0,2006-04-10 00:00:00,546.000000,548.000000,541.359985,544.159973,544.159973,484738,0.08,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,1,2006-04-11 00:00:00,546.559998,547.119995,538.400024,545.599976,545.599976,162138,0.05,3 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,2,2006-04-12 00:00:00,545.760010,550.479980,542.479980,542.719971,542.719971,156038,0.07,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,4,2006-04-17 00:00:00,553.599976,559.200012,549.440002,558.320007,558.320007,114713,0.1,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
5,5,2006-04-18 00:00:00,560.799988,568.400024,556.559998,566.000000,566.000000,115338,0.15,3 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4412,4412,2023-10-19 00:00:00,79.370003,81.750000,78.940002,81.680000,81.680000,5924400,-0.16,1 days 00:00:00,...,40.799999,40.799999,4300.0,1 days 00:00:00,0.689998,0.590000,0.880002,0.480004,0.480004,17200.0
4413,4413,2023-10-20 00:00:00,81.589996,81.980003,80.169998,80.699997,80.699997,4660500,-0.14,1 days 00:00:00,...,40.330002,40.330002,6200.0,1 days 00:00:00,-0.049999,0.509998,-0.009999,0.649997,0.649997,-19900.0
4414,4414,2023-10-23 00:00:00,80.220001,80.269997,78.349998,78.889999,78.889999,4607100,-0.19,1 days 00:00:00,...,39.689999,39.689999,29300.0,1 days 00:00:00,0.739998,0.040001,0.309998,-0.469997,-0.469997,1900.0
4415,4415,2023-10-24 00:00:00,78.040001,78.220001,76.309998,76.930000,76.930000,6629600,-0.19,3 days 00:00:00,...,39.000000,39.000000,12600.0,3 days 00:00:00,-0.500000,-0.560001,-0.779999,-0.640003,-0.640003,23100.0


In [35]:
data_train["pred_high_open"] = best_model_lr.predict(df_list_lr["X_train"])
data_train["actual_high_open"] = data_train["High"] - data_train["Open"]
data_train["pred_low_open"] = best_model_low_lr.predict(df_list_lr["X_train"])
data_train["actual_low_open"] = data_train["Low"] - data_train["Open"]


data_val["pred_high_open"] = best_model_lr.predict(df_list_lr["X_val"])
data_val["actual_high_open"] = data_val["High"] - data_val["Open"]
data_val["pred_low_open"] = best_model_low_lr.predict(df_list_lr["X_val"])
data_val["actual_low_open"] = data_val["Low"] - data_val["Open"]

data_test["pred_high_open"] = best_model_lr.predict(df_list_lr["X_test"])
data_test["actual_high_open"] = data_test["High"] - data_test["Open"]
data_test["pred_low_open"] = best_model_low_lr.predict(df_list_lr["X_test"])
data_test["actual_low_open"] = data_test["Low"] - data_test["Open"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train["pred_high_open"] = best_model_lr.predict(df_list_lr["X_train"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train["actual_high_open"] = data_train["High"] - data_train["Open"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train["pred_low_open"] = best_model_low_lr.predict(

In [36]:
data_train.to_csv('../data/data_train.csv')
data_val.to_csv('../data/data_val.csv')
data_test.to_csv('../data/data_test.csv')