In [2]:
import math
import warnings

import numpy as np
import pandas as pd
from constants import SHARED_RANDOM_STATE
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, ParameterGrid, TimeSeriesSplit
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from supervised_learning_data_preprocessing_functions import (
    gen_df_for_supervised_learning,
)

# Stock symbol analysed by this notebook; passed to gen_df_for_supervised_learning below.
ticker = "AAPL"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Build the modelling frame for `ticker` with the project helper.
# Later cells rely on it exposing "close", "week_year" and every column listed in
# `features_to_use`.
# NOTE(review): presumably one row per trading day in chronological order -- confirm in
# supervised_learning_data_preprocessing_functions, since TimeSeriesSplit depends on row order.
df = gen_df_for_supervised_learning(ticker)

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


In [33]:
def grid_cv_test_model(
    model,
    model_parameters,
    data_frame,
    features_to_use,
    feature_to_predict,
    scoring_method,
    cv_train_size,
    cv_test_size,
    lag_time,
):
    """Grid-search ``model`` behind a MinMax scaler using rolling time-series folds.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn regressor; becomes the ``"model"`` pipeline step.
    model_parameters : dict
        Maps hyper-parameter names of ``model`` to lists of candidate values.
    data_frame : pandas.DataFrame
        Source of both features and target. Rows are split positionally, so they
        are expected to already be in time order.
    features_to_use : list of str
        Columns of ``data_frame`` used as predictors.
    feature_to_predict : str
        Column of ``data_frame`` used as the regression target.
    scoring_method : str
        Scorer name forwarded to ``GridSearchCV``.
    cv_train_size : int
        Maximum number of rows in each training window.
    cv_test_size : int
        Number of rows in each test window.
    lag_time : int
        Number of rows left out between the end of a training window and the
        start of its test window.

    Returns
    -------
    dict
        The fitted search's ``cv_results_``.
    """
    # Only the predictors pass through the scaler; the target is left untouched.
    estimator = Pipeline(
        [
            ("scaler", MinMaxScaler((-1, 1))),
            ("model", model),
        ]
    )

    # GridSearchCV addresses parameters of a pipeline step as "<step>__<parameter>".
    prefixed_grid = {}
    for name, candidates in model_parameters.items():
        prefixed_grid[f"model__{name}"] = candidates

    features = data_frame[features_to_use]
    target = data_frame[feature_to_predict]

    # One fold per non-overlapping (train + test) window that fits in the frame,
    # but never fewer than five folds.
    window_count = math.floor(len(data_frame) / (cv_train_size + cv_test_size))
    splitter = TimeSeriesSplit(
        n_splits=max(5, window_count),
        max_train_size=cv_train_size,
        test_size=cv_test_size,
        gap=lag_time,
    )

    search = GridSearchCV(
        estimator,
        param_grid=prefixed_grid,
        scoring=scoring_method,
        cv=splitter.split(features),
        n_jobs=-1,
    )
    search.fit(features, target)

    return search.cv_results_

In [34]:
def iterative_grid_cv_model_testing(
    model, model_parameters, data_settings_grid_list, features_to_use
):
    """Run ``grid_cv_test_model`` once per data setting and stack the results.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn regressor forwarded to ``grid_cv_test_model``.
    model_parameters : dict
        Hyper-parameter grid for ``model`` (parameter name -> list of candidates).
    data_settings_grid_list : iterable of dict
        Each dict must provide ``"data_frame"`` as a ``(frame, label)`` pair plus
        ``"cv_train_size"``, ``"cv_test_size"``, ``"lag_time"``,
        ``"feature_to_predict"`` and ``"scoring_method"``.
    features_to_use : list of str
        Predictor columns; stored comma-joined in the ``"features_to_use"`` column.

    Returns
    -------
    pandas.DataFrame
        One row per (data setting, hyper-parameter candidate): the columns of
        ``cv_results_`` plus ``data_frame`` (the label), ``cv_train_size``,
        ``cv_test_size``, ``lag_time``, ``scoring_method`` and ``features_to_use``.
        Each search keeps its own row index, so index values repeat. An empty
        frame is returned when no setting produced results.

    Notes
    -----
    Settings whose training window is shorter than the test window are skipped
    silently. A setting whose search raises is skipped with a ``RuntimeWarning``
    naming the setting and the error, so one infeasible combination does not
    abort the sweep. ``KeyboardInterrupt`` and ``SystemExit`` are not
    intercepted, so a long run can still be stopped.
    """
    per_setting_results = []
    for data_settings in data_settings_grid_list:
        if data_settings["cv_train_size"] < data_settings["cv_test_size"]:
            continue
        try:
            res = grid_cv_test_model(
                model,
                model_parameters=model_parameters,
                data_frame=data_settings["data_frame"][0],
                cv_train_size=data_settings["cv_train_size"],
                cv_test_size=data_settings["cv_test_size"],
                features_to_use=features_to_use,
                lag_time=data_settings["lag_time"],
                feature_to_predict=data_settings["feature_to_predict"],
                scoring_method=data_settings["scoring_method"],
            )
            res_df = pd.DataFrame(res)
            res_df["data_frame"] = data_settings["data_frame"][1]
            res_df["cv_train_size"] = data_settings["cv_train_size"]
            res_df["cv_test_size"] = data_settings["cv_test_size"]
            res_df["lag_time"] = data_settings["lag_time"]
            res_df["scoring_method"] = data_settings["scoring_method"]
            res_df["features_to_use"] = ",".join(features_to_use)
        except Exception as exc:
            # Leave the frame itself out of the message; it would flood the output.
            described = {
                key: value
                for key, value in data_settings.items()
                if key != "data_frame"
            }
            warnings.warn(
                f"Skipping data settings {described}: {type(exc).__name__}: {exc}",
                RuntimeWarning,
                stacklevel=2,
            )
            continue
        per_setting_results.append(res_df)

    # pd.concat rejects an empty list, so keep the "nothing succeeded" result explicit.
    if not per_setting_results:
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying a growing frame each iteration.
    return pd.concat(per_setting_results)

In [48]:
# Experiment configuration: every combination of the lists below is evaluated for each model.

# (frame, label) pairs; the label is what ends up in the "data_frame" results column.
# The weekly frame is the per-"week_year" mean of every column of `df`.
# NOTE(review): groupby sorts by the "week_year" key -- confirm that sort order is
# chronological, because TimeSeriesSplit splits rows by position.
dataframes_to_test = [(df, "daily"), (df.groupby("week_year").mean(), "weekly")]
# Window sizes and gap are counted in rows of whichever frame is used
# (rows of `df` for "daily", one row per week_year group for "weekly").
cv_train_size = [5, 10, 15, 20, 60, 252]
cv_test_size = [5, 10, 15, 20, 60]
lag_time = [5, 10, 15, 20, 60]
features_to_use = [
    "open",
    "prev_high",
    "prev_low",
    "prev_close",
    # "prev_volume",
    # "dividends",
    # "stock_splits",
    "positive",
    "negative",
    "neutral",
    "day_of_month",
    "day_of_week",
    "quarter",
    # "month",
    # "year",
]
# Single-element lists: ParameterGrid needs a sequence of candidates for every key.
feature_to_predict = ["close"]
scoring_method = ["neg_mean_absolute_error"]

param_grid = {
    "data_frame": dataframes_to_test,
    "cv_train_size": cv_train_size,
    "cv_test_size": cv_test_size,
    "lag_time": lag_time,
    "feature_to_predict": feature_to_predict,
    "scoring_method": scoring_method,
}
# Cartesian product of the lists above (2 * 6 * 5 * 5 = 300 setting dicts).
# iterative_grid_cv_model_testing skips those where cv_train_size < cv_test_size.
data_settings_grid_list = list(ParameterGrid(param_grid))

In [49]:
# k-nearest-neighbours sweep: 4 neighbour counts x 2 Minkowski powers
# (p=1 Manhattan, p=2 Euclidean) = 8 candidates per data setting.
knn_results = iterative_grid_cv_model_testing(
    model=KNeighborsRegressor(),
    model_parameters={"n_neighbors": [2, 3, 4, 5], "p": [1, 2]},
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [50]:
# Scores are negated MAE, so a descending sort lists the lowest-error rows first.
# rank_test_score is relative to each individual grid search, not to this combined table.
knn_results.sort_values("mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,param_model__p,params,split0_test_score,split1_test_score,split2_test_score,...,split99_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
4,0.00121,0.000128,0.000689,7.1e-05,4,1,"{'model__n_neighbors': 4, 'model__p': 1}",-2.836488,-4.831215,-1.924111,...,,-3.230301,1.076565,1,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001279,0.000136,0.000681,4.9e-05,2,1,"{'model__n_neighbors': 2, 'model__p': 1}",-3.668221,-4.772339,-1.777481,...,,-3.341926,1.039812,1,weekly,60,5,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
6,0.001133,7.2e-05,0.000642,2.3e-05,5,1,"{'model__n_neighbors': 5, 'model__p': 1}",-3.907797,-5.097598,-2.047805,...,,-3.386091,1.131287,2,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001275,6.4e-05,0.000685,2.9e-05,2,1,"{'model__n_neighbors': 2, 'model__p': 1}",-3.742049,-5.637442,-1.777481,...,,-3.391917,1.297149,3,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001243,0.000109,0.000684,3.5e-05,2,1,"{'model__n_neighbors': 2, 'model__p': 1}",-2.622792,-4.772339,-1.777481,...,,-3.448815,1.540479,1,weekly,60,5,10,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."


In [51]:
# Gradient-boosting sweep: 4 * 4 * 4 * 4 = 256 candidates per data setting.
# random_state is a one-element list so it is fixed for every candidate.
gbr_results = iterative_grid_cv_model_testing(
    model=GradientBoostingRegressor(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
        "learning_rate": [0.001, 0.01, 0.1, 1],
        "n_estimators": [10, 20, 50, 100],
        "max_depth": [None, 2, 4, 6],
        "max_leaf_nodes": [None, 5, 10, 20],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [52]:
# Scores are negated MAE, so a descending sort lists the lowest-error rows first.
# rank_test_score is relative to each individual grid search, not to this combined table.
gbr_results.sort_values("mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__random_state,params,...,split99_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
179,0.072952,0.008876,0.001142,0.000662,0.1,6.0,,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,-1.524142,0.396175,1,weekly,252,5,10,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
178,0.038001,0.00835,0.000794,2.3e-05,0.1,6.0,,50,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,-1.538826,0.365821,2,weekly,252,5,10,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
139,0.051637,0.002969,0.000808,1.4e-05,0.1,,10.0,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,-1.54859,0.301207,1,weekly,252,10,5,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
146,0.011576,0.001995,0.000675,1.8e-05,0.1,2.0,,50,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,-1.55008,0.202065,1,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
150,0.012204,0.001755,0.000681,2.9e-05,0.1,2.0,5.0,50,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,-1.55008,0.202065,1,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."


In [53]:
# Random-forest sweep: 4 * 4 * 4 = 64 candidates per data setting.
# random_state is a one-element list so it is fixed for every candidate.
rf_results = iterative_grid_cv_model_testing(
    model=RandomForestRegressor(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
        "n_estimators": [10, 20, 50, 100],
        "max_depth": [None, 2, 4, 6],
        "max_leaf_nodes": [None, 5, 10, 20],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [54]:
# Scores are negated MAE, so a descending sort lists the lowest-error rows first.
# rank_test_score is relative to each individual grid search, not to this combined table.
rf_results.sort_values("mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__random_state,params,split0_test_score,...,split99_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
49,0.015585,0.003885,0.000929,0.000107,6.0,,20,1337,"{'model__max_depth': 6, 'model__max_leaf_nodes...",-1.258821,...,,-1.504649,0.304974,1,weekly,252,5,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
1,0.009927,0.000108,0.000926,2.6e-05,,,20,1337,"{'model__max_depth': None, 'model__max_leaf_no...",-1.689443,...,,-1.518526,0.172859,1,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
51,0.069867,0.009114,0.001929,5.4e-05,6.0,,100,1337,"{'model__max_depth': 6, 'model__max_leaf_nodes...",-0.871234,...,,-1.523314,0.39884,2,weekly,252,5,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
3,0.045528,0.002825,0.002098,0.000379,,,100,1337,"{'model__max_depth': None, 'model__max_leaf_no...",-1.496904,...,,-1.52885,0.180989,2,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
15,0.050486,0.006143,0.002026,0.000244,,20.0,100,1337,"{'model__max_depth': None, 'model__max_leaf_no...",-1.442597,...,,-1.531256,0.290892,3,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."


In [55]:
# Linear-regression baseline: the empty grid makes each search evaluate exactly one
# candidate (the estimator's default settings), so only the data settings vary.
lin_reg_results = iterative_grid_cv_model_testing(
    model=LinearRegression(),
    model_parameters={},
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [59]:
# Scores are negated MAE, so a descending sort lists the lowest-error rows first.
# With a single candidate per search, rank_test_score is always 1 here.
lin_reg_results.sort_values("mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,...,split99_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
0,0.001343,9.8e-05,0.000526,1.8e-05,{},-0.443764,-0.46044,-0.830993,-0.672874,-0.988981,...,,-0.67941,0.21087,1,weekly,252,20,5,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001399,3e-05,0.000528,9e-06,{},-0.443628,-0.452805,-0.84287,-0.695252,-1.007096,...,,-0.68833,0.219496,1,weekly,252,20,10,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001296,0.000111,0.000515,1.4e-05,{},-0.456656,-0.458111,-0.846297,-0.699487,-1.009154,...,,-0.693941,0.216574,1,weekly,252,20,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001319,0.000131,0.00052,1.3e-05,{},-0.450672,-0.4192,-0.859622,-0.726183,-1.02718,...,,-0.696571,0.234165,1,weekly,252,20,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.00113,0.000101,0.000478,1.5e-05,{},-0.418126,-0.464272,-0.957667,-0.695292,-0.998486,...,,-0.706768,0.240956,1,weekly,60,20,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
