In [51]:
import numpy as np
import pandas as pd
from constants import SHARED_RANDOM_STATE
from db_helper_functions import (
    get_stock_news_with_finbert_scores_from_db,
    get_stock_news_with_finbert_tone_scores_from_db,
    get_stock_news_with_finbert_whole_article_scores_from_db,
    get_stock_news_with_finbert_tone_whole_article_scores_from_db,
    get_stock_news_with_bertopic_sentiment_scores_from_db,
)

from sklearn.model_selection import ParameterGrid
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

from supervised_learning_data_preprocessing_functions import (
    gen_df_for_supervised_learning,
)
from grid_model_search_functions import (
    iterative_grid_cv_model_testing,
    grid_cv_test_model,
)
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

ticker = "AAPL"

In [52]:
# finbert_summary_sentiment_df = gen_df_for_supervised_learning(
#     ticker=ticker,
#     sentiment_df_retrieval_function=get_stock_news_with_finbert_scores_from_db,
# )
# finbert_tone_summary_sentiment_df = gen_df_for_supervised_learning(
#     ticker=ticker,
#     sentiment_df_retrieval_function=get_stock_news_with_finbert_tone_scores_from_db,
# )
# finbert_whole_article_sentiment_df = gen_df_for_supervised_learning(
#     ticker=ticker,
#     sentiment_df_retrieval_function=get_stock_news_with_finbert_whole_article_scores_from_db,
# )
# finbert_tone_article_sentiment_df = gen_df_for_supervised_learning(
#     ticker=ticker,
#     sentiment_df_retrieval_function=get_stock_news_with_finbert_tone_whole_article_scores_from_db,
# )
bert_topic_article_sentiment_df = gen_df_for_supervised_learning(
    ticker=ticker,
    sentiment_df_retrieval_function=get_stock_news_with_bertopic_sentiment_scores_from_db,
)

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


In [53]:
save_results = False
dataframes_to_test = [
    # (finbert_summary_sentiment_df, "finbert_summary_sentiment_df"),
    # (finbert_tone_summary_sentiment_df, "finbert_tone_summary_sentiment_df"),
    # (finbert_whole_article_sentiment_df, "finbert_whole_article_sentiment_df"),
    # (finbert_tone_article_sentiment_df, "finbert_tone_article_sentiment_df"),
    (bert_topic_article_sentiment_df, "bert_topic_article_sentiment_df"),
]
cv_train_size = [126]
cv_test_size = [31]
lag_time = [5]
tss_splits = 5
features_to_use = [
    "open",
    "prev_high",
    "prev_low",
    "prev_close",
    "positive",
    "negative",
    "neutral",
]


feature_to_predict = ["closed_higher"]
scoring_method = ["accuracy"]

param_grid = {
    "data_frame": dataframes_to_test,
    "cv_train_size": cv_train_size,
    "cv_test_size": cv_test_size,
    "lag_time": lag_time,
    "feature_to_predict": feature_to_predict,
    "scoring_method": scoring_method,
}
data_settings_grid_list = list(ParameterGrid(param_grid))

In [54]:
log_reg_clf_results = iterative_grid_cv_model_testing(
    model=LogisticRegression(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
    tss_splits=tss_splits,
)

In [55]:
log_reg_clf_results = log_reg_clf_results.sort_values(
    "mean_test_score", ascending=False
)
log_reg_clf_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
0,0.003566,9.3e-05,0.000909,6.9e-05,1337,{'model__random_state': 1337},0.548387,0.548387,0.548387,0.354839,0.516129,0.503226,0.075238,1,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."


In [56]:
gb_clf_results = iterative_grid_cv_model_testing(
    model=GradientBoostingClassifier(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
        "learning_rate": [0.1, 0.2, 0.3, 0.4],
        "n_estimators": [5, 10, 20, 100],
        "max_depth": [None, 2, 5, 10, 15, 20],
        "max_leaf_nodes": [None, 2, 5, 10, 15, 20],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
    tss_splits=tss_splits,
)

In [57]:
gb_clf_results = gb_clf_results.sort_values(
    ["mean_test_score", "std_test_score"], ascending=[False, True]
)
gb_clf_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__learning_rate,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__random_state,params,...,split4_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
482,0.017043,0.003399,0.000799,2.9e-05,0.4,5.0,,20,1337,"{'model__learning_rate': 0.4, 'model__max_dept...",...,0.677419,0.625806,0.126754,1,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
207,0.06268,0.006832,0.000913,8e-06,0.2,5.0,10.0,100,1337,"{'model__learning_rate': 0.2, 'model__max_dept...",...,0.645161,0.619355,0.089861,2,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
71,0.08464,0.009732,0.000907,1e-05,0.1,5.0,20.0,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,0.612903,0.619355,0.068884,3,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
51,0.088165,0.016335,0.00097,6.8e-05,0.1,5.0,,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,0.580645,0.606452,0.102822,4,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
15,0.075937,0.006935,0.001063,0.000119,0.1,,10.0,100,1337,"{'model__learning_rate': 0.1, 'model__max_dept...",...,0.580645,0.606452,0.11966,4,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."


In [58]:
rf_clf_results = iterative_grid_cv_model_testing(
    model=RandomForestClassifier(),
    model_parameters={
        "random_state": [SHARED_RANDOM_STATE],
        "n_estimators": [4, 5, 10, 15, 20, 50, 100],
        "max_depth": [None, 2, 5, 10, 15, 20],
        "max_leaf_nodes": [None, 2, 5, 10, 15, 20],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
    tss_splits=tss_splits,
)

In [59]:
rf_clf_results = rf_clf_results.sort_values("mean_test_score", ascending=False)
rf_clf_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__max_leaf_nodes,param_model__n_estimators,param_model__random_state,params,split0_test_score,...,split4_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
124,0.028919,0.004026,0.002499,0.002067,5,20.0,50,1337,"{'model__max_depth': 5, 'model__max_leaf_nodes...",0.516129,...,0.580645,0.580645,0.04562,1,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
118,0.054862,0.009012,0.002066,2e-05,5,15.0,100,1337,"{'model__max_depth': 5, 'model__max_leaf_nodes...",0.548387,...,0.580645,0.574194,0.042795,2,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
125,0.047883,0.005954,0.002044,2.9e-05,5,20.0,100,1337,"{'model__max_depth': 5, 'model__max_leaf_nodes...",0.516129,...,0.580645,0.574194,0.031606,2,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
90,0.057687,0.008099,0.002095,0.000106,5,,100,1337,"{'model__max_depth': 5, 'model__max_leaf_nodes...",0.548387,...,0.580645,0.567742,0.032897,4,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
117,0.024544,0.002268,0.001429,4.2e-05,5,15.0,50,1337,"{'model__max_depth': 5, 'model__max_leaf_nodes...",0.516129,...,0.580645,0.56129,0.03871,5,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."


In [60]:
knn_clf_results = iterative_grid_cv_model_testing(
    model=KNeighborsClassifier(),
    model_parameters={
        "n_neighbors": [2, 3, 4, 5, 15, 20],
    },
    data_settings_grid_list=data_settings_grid_list,
    features_to_use=features_to_use,
    tss_splits=tss_splits,
)

In [61]:
knn_clf_results = knn_clf_results.sort_values("mean_test_score", ascending=False)
knn_clf_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,data_frame,cv_train_size,cv_test_size,lag_time,scoring_method,features_to_use
5,0.001216,5.5e-05,0.001411,1.9e-05,20,{'model__n_neighbors': 20},0.548387,0.451613,0.580645,0.548387,0.612903,0.548387,0.053978,1,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
1,0.001332,6.3e-05,0.001512,8e-06,3,{'model__n_neighbors': 3},0.548387,0.451613,0.516129,0.548387,0.645161,0.541935,0.062551,2,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
4,0.001356,6.2e-05,0.001505,5.3e-05,15,{'model__n_neighbors': 15},0.548387,0.548387,0.483871,0.483871,0.612903,0.535484,0.048279,3,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
0,0.001511,7.2e-05,0.001619,6.9e-05,2,{'model__n_neighbors': 2},0.548387,0.354839,0.483871,0.516129,0.580645,0.496774,0.077955,4,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."
3,0.001413,6.9e-05,0.00156,4.8e-05,5,{'model__n_neighbors': 5},0.548387,0.548387,0.290323,0.516129,0.516129,0.483871,0.097844,5,bert_topic_article_sentiment_df,126,31,5,accuracy,"open,prev_high,prev_low,prev_close,positive,ne..."


In [62]:
if save_results:
    log_reg_clf_results.to_csv(
        "./model_results/logistic_regression_classifier_results.csv"
    )
    gb_clf_results.to_csv("./model_results/gb_classifier_results.csv")
    rf_clf_results.to_csv("./model_results/rf_classifier_results.csv")
    knn_clf_results.to_csv("./model_results/knn_classifier_results.csv")

### Total Models Tested


In [63]:
(
    len(knn_clf_results)
    + len(gb_clf_results)
    + len(rf_clf_results)
    + len(log_reg_clf_results)
)

835

### Model Results


In [64]:
for df, name in [
    (log_reg_clf_results, "log_reg_clf_results"),
    (knn_clf_results, "knn_clf_results"),
    (rf_clf_results, "rf_clf_results"),
    (gb_clf_results, "gb_clf_results"),
]:
    print(name)
    print(round(df.iloc[0].mean_test_score, 5))
    print(round(df.iloc[0].std_test_score, 5))
    print("#####")

log_reg_clf_results
0.50323
0.07524
#####
knn_clf_results
0.54839
0.05398
#####
rf_clf_results
0.58065
0.04562
#####
gb_clf_results
0.62581
0.12675
#####
