In [1]:
import numpy as np
import pandas as pd
from constants import SHARED_RANDOM_STATE
from db_helper_functions import (
    get_stock_news_with_finbert_scores_from_db,
    get_stock_news_with_finbert_tone_scores_from_db,
    get_stock_news_with_finbert_whole_article_scores_from_db,
    get_stock_news_with_finbert_tone_whole_article_scores_from_db,
    get_stock_news_with_bertopic_sentiment_scores_from_db,
)
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsRegressor

from supervised_learning_data_preprocessing_functions import (
    gen_df_for_supervised_learning,
)
from grid_model_search_functions import iterative_grid_cv_model_testing

ticker = "AAPL"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
finbert_summary_sentiment_df = gen_df_for_supervised_learning(
    ticker=ticker,
    sentiment_df_retrieval_function=get_stock_news_with_finbert_scores_from_db,
)
finbert_tone_summary_sentiment_df = gen_df_for_supervised_learning(
    ticker=ticker,
    sentiment_df_retrieval_function=get_stock_news_with_finbert_tone_scores_from_db,
)
finbert_whole_article_sentiment_df = gen_df_for_supervised_learning(
    ticker=ticker,
    sentiment_df_retrieval_function=get_stock_news_with_finbert_whole_article_scores_from_db,
)
finbert_tone_article_sentiment_df = gen_df_for_supervised_learning(
    ticker=ticker,
    sentiment_df_retrieval_function=get_stock_news_with_finbert_tone_whole_article_scores_from_db,
)
bert_topic_article_sentiment_df = gen_df_for_supervised_learning(
    ticker=ticker,
    sentiment_df_retrieval_function=get_stock_news_with_bertopic_sentiment_scores_from_db,
)

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')
  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


In [3]:
dataframes_to_test = [
    (finbert_summary_sentiment_df, "finbert_summary_sentiment_df"),
    (finbert_tone_summary_sentiment_df, "finbert_tone_summary_sentiment_df"),
    (finbert_whole_article_sentiment_df, "finbert_whole_article_sentiment_df"),
    (finbert_tone_article_sentiment_df, "finbert_tone_article_sentiment_df"),
    (bert_topic_article_sentiment_df, "finbert_tone_article_sentiment_df"),
]
cv_train_size = [5, 10, 15, 20, 60, 252, 504]
cv_test_size = [5, 10, 15, 20, 60]
lag_time = [5, 10, 15, 20, 60]
features_to_use = [
    "open",
    "prev_high",
    "prev_low",
    "prev_close",
    # "prev_volume",
    # "dividends",
    # "stock_splits",
    "positive",
    "negative",
    "neutral",
    # "day_of_month",
    # "day_of_week",
    # "quarter",
    # "month",
    # "year",
]
feature_to_predict = ["close"]
scoring_method = ["neg_mean_absolute_error"]

param_grid = {
    "data_frame": dataframes_to_test,
    "cv_train_size": cv_train_size,
    "cv_test_size": cv_test_size,
    "lag_time": lag_time,
    "feature_to_predict": feature_to_predict,
    "scoring_method": scoring_method,
}
data_settings_grid_list = list(ParameterGrid(param_grid))

In [4]:
knn_results = iterative_grid_cv_model_testing(
    model=KNeighborsRegressor(),
    model_parameters={"n_neighbors": [2, 3, 4, 5], "p": [1]},
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [5]:
print(len(knn_results))
knn_results.sort_values("mean_test_score", ascending=False).head()

2500


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,param_model__p,params,split0_test_score,split1_test_score,split2_test_score,...,split9_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
3,0.001081,0.000107,0.000655,7.3e-05,5,1,"{'model__n_neighbors': 5, 'model__p': 1}",-2.362925,-2.232975,-2.434821,...,-3.276525,-2.519181,0.5488,1,finbert_tone_article_sentiment_df,252,15,60,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
2,0.001243,0.000107,0.000692,3.2e-05,4,1,"{'model__n_neighbors': 4, 'model__p': 1}",-1.972594,-1.988757,-3.001805,...,-2.662168,-2.519777,0.579761,1,finbert_tone_article_sentiment_df,504,15,5,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
3,0.001213,8.5e-05,0.000691,5.1e-05,5,1,"{'model__n_neighbors': 5, 'model__p': 1}",-2.294795,-2.013992,-2.834135,...,-2.601449,-2.528514,0.542876,2,finbert_tone_article_sentiment_df,504,15,5,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
2,0.001132,0.000107,0.000659,5.5e-05,4,1,"{'model__n_neighbors': 4, 'model__p': 1}",-2.572367,-1.650667,-2.80973,...,-3.627752,-2.543435,0.620915,1,finbert_tone_article_sentiment_df,252,15,60,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
2,0.001508,0.000457,0.000754,7.6e-05,4,1,"{'model__n_neighbors': 4, 'model__p': 1}",-1.684749,-2.76564,-2.73942,...,-2.924476,-2.55193,0.658498,1,finbert_tone_article_sentiment_df,504,10,60,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."


In [6]:
gbr_results = iterative_grid_cv_model_testing(
    model=GradientBoostingRegressor(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
        "learning_rate": [0.01, 0.1, 1],
        "n_estimators": [50, 100, 300],
        "max_depth": [None, 2, 5],
        "max_leaf_nodes": [None, 5, 10],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [15]:
print(len(gbr_results))
gbr_results.sort_values("mean_test_score", ascending=False).head()

50625


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__random_state,params,...,split9_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
33,0.061975,0.003848,0.000753,2.5e-05,0.1,,10.0,50,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,-2.365498,-2.215701,0.531494,1,finbert_tone_article_sentiment_df,504,15,60,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
34,0.148579,0.008524,0.00079,2.2e-05,0.1,,10.0,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,-2.31753,-2.240229,0.534359,2,finbert_tone_article_sentiment_df,504,15,60,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
37,0.075027,0.006582,0.000989,0.000482,0.1,2.0,,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,-2.591704,-2.254601,0.579071,1,finbert_tone_article_sentiment_df,504,15,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
43,0.083531,0.011218,0.001546,0.002181,0.1,2.0,10.0,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,-2.591704,-2.254601,0.579071,1,finbert_tone_article_sentiment_df,504,15,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
40,0.075951,0.005878,0.000835,0.000194,0.1,2.0,5.0,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,-2.591704,-2.254601,0.579071,1,finbert_tone_article_sentiment_df,504,15,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."


In [16]:
rf_results = iterative_grid_cv_model_testing(
    model=RandomForestRegressor(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
        "n_estimators": [10, 20, 100],
        "max_depth": [None, 2, 5],
        "max_leaf_nodes": [None, 5, 10],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [54]:
rf_results.sort_values("mean_test_score", ascending=False).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__random_state,params,split0_test_score,...,split99_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
49,0.015585,0.003885,0.000929,0.000107,6.0,,20,1337,"{'model__max_depth': 6, 'model__max_leaf_nodes...",-1.258821,...,,-1.504649,0.304974,1,weekly,252,5,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
1,0.009927,0.000108,0.000926,2.6e-05,,,20,1337,"{'model__max_depth': None, 'model__max_leaf_no...",-1.689443,...,,-1.518526,0.172859,1,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
51,0.069867,0.009114,0.001929,5.4e-05,6.0,,100,1337,"{'model__max_depth': 6, 'model__max_leaf_nodes...",-0.871234,...,,-1.523314,0.39884,2,weekly,252,5,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
3,0.045528,0.002825,0.002098,0.000379,,,100,1337,"{'model__max_depth': None, 'model__max_leaf_no...",-1.496904,...,,-1.52885,0.180989,2,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
15,0.050486,0.006143,0.002026,0.000244,,20.0,100,1337,"{'model__max_depth': None, 'model__max_leaf_no...",-1.442597,...,,-1.531256,0.290892,3,weekly,60,5,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."


In [10]:
lin_reg_results = iterative_grid_cv_model_testing(
    model=LinearRegression(),
    model_parameters={},
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [11]:
print(len(lin_reg_results))
lin_reg_results.sort_values("mean_test_score", ascending=False).head()

625


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,...,split9_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
0,0.001334,9.1e-05,0.000499,1.9e-05,{},-2.032348,-1.423444,-1.371791,-1.213113,-1.307788,...,-2.460166,-1.866182,0.479337,1,finbert_tone_article_sentiment_df,504,60,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001324,8.5e-05,0.000491,2.2e-05,{},-2.029319,-1.403279,-1.377488,-1.241795,-1.311704,...,-2.478244,-1.866664,0.475687,1,finbert_summary_sentiment_df,504,60,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001244,0.000112,0.000506,5.1e-05,{},-2.035644,-1.406769,-1.36546,-1.208647,-1.312731,...,-2.45135,-1.866819,0.480117,1,finbert_tone_article_sentiment_df,504,60,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001247,0.000121,0.000488,1.2e-05,{},-2.041573,-1.432794,-1.371827,-1.206308,-1.305426,...,-2.468679,-1.867009,0.479059,1,finbert_tone_summary_sentiment_df,504,60,15,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.00126,0.00011,0.000479,1.2e-05,{},-2.048549,-1.403492,-1.3714,-1.236804,-1.323817,...,-2.458205,-1.870727,0.478331,1,finbert_tone_article_sentiment_df,504,60,20,neg_mean_absolute_error,"open,prev_high,prev_low,prev_close,positive,ne..."


In [16]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score

dataframes_to_test = [
    (finbert_summary_sentiment_df, "finbert_summary_sentiment_df"),
    (finbert_tone_summary_sentiment_df, "finbert_tone_summary_sentiment_df"),
    (finbert_whole_article_sentiment_df, "finbert_whole_article_sentiment_df"),
    (finbert_tone_article_sentiment_df, "finbert_tone_article_sentiment_df"),
]

for df, name in dataframes_to_test:
    df["closed_higher"] = np.where(df.close > df.open, 1, 0)


feature_to_predict = ["closed_higher"]
scoring_method = ["accuracy", "average_precision"]

param_grid = {
    "data_frame": dataframes_to_test,
    "cv_train_size": cv_train_size,
    "cv_test_size": cv_test_size,
    "lag_time": lag_time,
    "feature_to_predict": feature_to_predict,
    "scoring_method": scoring_method,
}
data_settings_grid_list = list(ParameterGrid(param_grid))

In [17]:
for df, name in dataframes_to_test:
    print(len(df[df["closed_higher"] == 1]) / len(df))

0.532803180914513
0.532803180914513
0.532803180914513
0.532803180914513


In [18]:
gbr_clf_results = iterative_grid_cv_model_testing(
    model=GradientBoostingClassifier(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
        "learning_rate": [0.1, 1],
        "n_estimators": [100, 200],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
)

In [64]:
gbr_clf_results[(~gbr_clf_results["mean_test_score"].isna())].sort_values(
    "mean_test_score", ascending=False
).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__random_state,params,...,split99_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
12,0.377333,0.021985,0.001182,1.6e-05,0.001,,10,300,1337,"{'model__learning_rate': 0.001, 'model__max_de...",...,,0.793333,0.206989,1,daily,252,5,5,average_precision,"open,prev_high,prev_low,prev_close,positive,ne..."
13,0.474685,0.039957,0.002075,0.001528,0.001,,10,400,1337,"{'model__learning_rate': 0.001, 'model__max_de...",...,,0.76,0.205913,2,daily,252,5,5,average_precision,"open,prev_high,prev_low,prev_close,positive,ne..."
128,0.160582,0.015546,0.001668,0.001231,0.1,4.0,5,400,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,0.747778,0.237492,1,daily,60,5,60,average_precision,"open,prev_high,prev_low,prev_close,positive,ne..."
98,0.169567,0.0125,0.001559,0.000753,0.1,,5,400,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,0.747778,0.237492,1,daily,60,5,60,average_precision,"open,prev_high,prev_low,prev_close,positive,ne..."
127,0.127568,0.022608,0.001986,0.00195,0.1,4.0,5,300,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,,0.744444,0.23544,3,daily,60,5,60,average_precision,"open,prev_high,prev_low,prev_close,positive,ne..."


In [65]:
len(gbr_clf_results)

36000