In [2]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
import math

In [14]:
df = pd.read_csv("../data/master_dataset.csv")

## Helper Functions

In [15]:
def forward_selection(data, response_col, alpha=0.05):
    selected_features = []
    remaining_features = list(data.columns.drop(response_col))

    while remaining_features:
        best_pvalue = float('inf')
        best_feature = None

        for feature in remaining_features:
            X = data[selected_features + [feature]]
            X = sm.add_constant(X)  # Add a constant term for the intercept
            y = data[response_col]

            model = sm.OLS(y, X).fit()
            pvalue = model.pvalues[feature]

            if pvalue < best_pvalue:
                best_pvalue = pvalue
                best_feature = feature

        if best_pvalue < alpha:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)
            print(f"Added {best_feature} with p-value {best_pvalue:.4f}")
        else:
            break

    return selected_features

In [16]:
def preprocess(df, y_var_name="High", regressor = RandomForestRegressor(), param_dist = None ):
    # Break into x and y
    X = df.drop(y_var_name, axis=1)
    y_archive = df[y_var_name]
    y = df[y_var_name] - df["Open"]
    y = pd.DataFrame(y)

    numeric_cols = X.select_dtypes(include=[np.number]).columns
    X = X[numeric_cols]

    # Avoid leakage
    drop_mask = []

    for col in X:
        if 'high' in str(col).lower():
            
            if 'lag' in str(col).lower():
                pass
            else:
                print(col)
                drop_mask.append(col)
        if 'low' in str(col).lower():
            
            if 'lag' in str(col).lower():
                pass
            else:
                print(col)
                drop_mask.append(col)

        if 'close' in str(col).lower():
            
            if 'lag' in str(col).lower():
                pass
            else:
                print(col)
                drop_mask.append(col)

    # Drop elems in drop mask
    X = X.drop(drop_mask, axis=1)
    X.drop('Unnamed: 0', axis=1, inplace=True)

    X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.25, random_state=3)
    
    X_train, X_val, y_train, y_val = train_test_split(X_dev, y_dev, test_size = 0.25)

    # Variable Selection
    data_x = pd.DataFrame(X, columns=X.columns)
    data_y = pd.DataFrame(y, columns=[y_var_name])
    data = data_x.copy()
    data[y_var_name] = data_y
    data
    
    # Perform forward variable selection
    response_column = y_var_name
    selected_features = forward_selection(data, response_column)

    # print("Starting Columns:" , len(data.columns)-1)
    # print("Final Columns:" , len(selected_features))
    # Print the selected features
    # print("Selected Features:", selected_features)

    numerical_features = X_train.columns

    # Create a ColumnTransformer to apply scaling to numerical features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', MinMaxScaler(), numerical_features)
        ])

    # Instantiate the XGBoost regressor
    # xg_reg = xgb.XGBRegressor()

    # Create the pipeline with preprocessing and XGBoost
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])

    # Instantiate RandomizedSearchCV
    random_search = RandomizedSearchCV(pipeline, param_distributions=param_dist, n_iter=20, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=-1)

    # Fit the random search to the data
    random_search.fit(X_train, y_train)

    # Print the best hyperparameters
    print("Best Hyperparameters:", random_search.best_params_)

    # Predict on the test set using the best model
    y_pred = random_search.best_estimator_.predict(X_val)

    # Evaluate the model
    mse = mean_squared_error(y_val, y_pred)
    print(f'Mean Squared Error: {mse}')

    dataframes = {'X_train' : X_train,'X_val' : X_val,'X_test' : X_test,'y_train' : y_train,'y_val' : y_val,'y_test' : y_test}
    predictions = y_pred

    return random_search.best_estimator_, predictions, dataframes

## Get Predictions

In [17]:
# Define the parameter grid for random search
param_dist = {
    'regressor__n_estimators': [50, 100, 150, 200],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2']
}

best_model, high_delta_pred, df_list = preprocess(df, "High", param_dist=param_dist)

Low
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Hyperparameters: {'regressor__n_estimators': 150, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 2, 'regressor__max_features': 'sqrt', 'regressor__max_depth': None}
Mean Squared Error: 10.073836585810072


In [18]:
best_model_low, low_delta_pred, df_list = preprocess(df, "Low", param_dist=param_dist)

High
Close
Adj Close
High_DXY
Low_DXY
Close_DXY
Adj Close_DXY
High_CHF
Low_CHF
High_BNO
Low_BNO
Close_BNO
Adj Close_BNO
High_GSCI
Low_GSCI
High_USL
Low_USL
Close_USL
Adj Close_USL
Fitting 5 folds for each of 20 candidates, totalling 100 fits


  return fit_method(estimator, *args, **kwargs)


Best Hyperparameters: {'regressor__n_estimators': 200, 'regressor__min_samples_split': 10, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'log2', 'regressor__max_depth': 20}
Mean Squared Error: 9.170739320479084


In [19]:
data_train = df[df.index.isin(df_list["X_train"].index)]
data_val = df[df.index.isin(df_list["X_val"].index)]
data_test = df[df.index.isin(df_list["X_test"].index)]

In [20]:
data_train

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,T10Y2Y_Ten_Two,Date_lag_Ten_Two,...,Close_USL,Adj Close_USL,Volume_USL,Date_lag_USL,Open_lag_USL,High_lag_USL,Low_lag_USL,Close_lag_USL,Adj Close_lag_USL,Volume_lag_USL
2,2,2006-04-12 00:00:00,545.760010,550.479980,542.479980,542.719971,542.719971,156038,0.07,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,3,2006-04-13 00:00:00,540.000000,551.919983,539.200012,550.559998,550.559998,70088,0.09,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,4,2006-04-17 00:00:00,553.599976,559.200012,549.440002,558.320007,558.320007,114713,0.1,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
8,8,2006-04-21 00:00:00,567.119995,585.840027,566.400024,582.479980,582.479980,133225,0.11,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
9,9,2006-04-24 00:00:00,571.200012,579.599976,566.320007,568.080017,568.080017,130288,0.1,1 days 00:00:00,...,0.000000,0.000000,0.0,0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4407,4407,2023-10-12 00:00:00,76.580002,76.610001,74.620003,75.419998,75.419998,4244100,-0.36,1 days 00:00:00,...,38.099998,38.099998,18000.0,1 days 00:00:00,-0.340000,-0.200001,-0.529999,-0.190003,-0.190003,6700.0
4408,4408,2023-10-13 00:00:00,77.709999,79.180000,77.160004,78.989998,78.989998,6765100,-0.41,1 days 00:00:00,...,39.680000,39.680000,19600.0,1 days 00:00:00,0.239998,0.160000,0.110001,-0.160000,-0.160000,-2800.0
4409,4409,2023-10-16 00:00:00,78.629997,78.889999,77.870003,78.389999,78.389999,5156200,-0.38,1 days 00:00:00,...,39.540001,39.540001,29600.0,1 days 00:00:00,0.639999,1.169998,1.169998,1.580002,1.580002,1600.0
4412,4412,2023-10-19 00:00:00,79.370003,81.750000,78.940002,81.680000,81.680000,5924400,-0.16,1 days 00:00:00,...,40.799999,40.799999,4300.0,1 days 00:00:00,0.689998,0.590000,0.880002,0.480004,0.480004,17200.0


In [21]:
data_train["pred_high_open"] = best_model.predict(df_list["X_train"])
data_train["actual_high_open"] = data_train["High"] - data_train["Open"]
data_train["pred_low_open"] = best_model_low.predict(df_list["X_train"])
data_train["actual_low_open"] = data_train["Low"] - data_train["Open"]


data_val["pred_high_open"] = best_model.predict(df_list["X_val"])
data_val["actual_high_open"] = data_val["High"] - data_val["Open"]
data_val["pred_low_open"] = best_model_low.predict(df_list["X_val"])
data_val["actual_low_open"] = data_val["Low"] - data_val["Open"]

data_test["pred_high_open"] = best_model.predict(df_list["X_test"])
data_test["actual_high_open"] = data_test["High"] - data_test["Open"]
data_test["pred_low_open"] = best_model_low.predict(df_list["X_test"])
data_test["actual_low_open"] = data_test["Low"] - data_test["Open"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train["pred_high_open"] = best_model.predict(df_list["X_train"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train["actual_high_open"] = data_train["High"] - data_train["Open"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_train["pred_low_open"] = best_model_low.predict(df_list["

In [22]:
data_train.to_csv('../data/data_train.csv')
data_val.to_csv('../data/data_val.csv')
data_test.to_csv('../data/data_test.csv')
