In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    root_mean_squared_error,
)
import yfinance as yf
from constants import DATA_END_DATE, DATA_START_DATE
from db_helper_functions import (
    get_stock_news_with_finbert_tone_scores_from_db,
    get_stock_news_with_finbert_whole_article_scores_from_db,
    get_stock_news_with_finbert_scores_from_db,
)
from sklearn.model_selection import TimeSeriesSplit
import numpy as np
from constants import SHARED_RANDOM_STATE
from sklearn.ensemble import (
    GradientBoostingRegressor,
    AdaBoostRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
)
from sklearn.model_selection import ParameterGrid


ticker = "AAPL"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df_opts = [
    get_stock_news_with_finbert_tone_scores_from_db(ticker),
    get_stock_news_with_finbert_whole_article_scores_from_db(ticker),
    get_stock_news_with_finbert_scores_from_db(ticker),
]
df = df_opts[1]

In [3]:
grouped_sentiments = (
    df.groupby("date", as_index=False)
    .agg({"positive": "mean", "negative": "mean", "neutral": "mean"})
    .sort_values(by="date", ascending=True)
)

In [4]:
price_history = (
    yf.Ticker(ticker).history(start=DATA_START_DATE, end=DATA_END_DATE).reset_index()
)
price_history.columns = ["_".join(x.lower().split(" ")) for x in price_history.columns]

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


In [5]:
price_history["date"] = price_history["date"].dt.date

In [6]:
combo_df = pd.merge(
    price_history, grouped_sentiments, left_on="date", right_on="date", how="left"
)
combo_df["date"] = pd.to_datetime(combo_df["date"])
combo_df = combo_df.sort_values(by="date", ascending=True)
combo_df = combo_df.set_index("date")
combo_df["day_of_month"] = combo_df.index.day
combo_df["day_of_week"] = combo_df.index.dayofweek
combo_df["quarter"] = combo_df.index.quarter
combo_df["month"] = combo_df.index.month
combo_df["year"] = combo_df.index.year
combo_df["month_year"] = combo_df.index.to_period("M")
combo_df["week_year"] = combo_df.index.to_period("W")
combo_df[["positive", "negative", "neutral"]] = combo_df[
    ["positive", "negative", "neutral"]
].ffill()
combo_df[["positive", "negative", "neutral"]] = combo_df[
    ["positive", "negative", "neutral"]
].shift(1)
combo_df[["prev_high", "prev_low", "prev_close", "prev_volume"]] = combo_df[
    ["high", "low", "close", "volume"]
].shift(1)
combo_df = combo_df.iloc[1:]

In [7]:
def time_series_cv_model_search(models, grouped_df, predictive_feats, target_feat):
    model_scores = []
    for model, param_grid in models:
        for params in ParameterGrid(param_grid):
            model.set_params(**params)

            count = 0
            prev_grp = None
            mse_scores = []
            rmse_scores = []
            mae_scores = []

            for _idx, grp in grouped_df:
                if count == 0:
                    prev_grp = grp
                    count += 1
                    continue
                train = prev_grp
                test = grp

                minmax_scaler = MinMaxScaler(feature_range=(-1, 1))

                X_train = train[predictive_feats]
                X_train = minmax_scaler.fit_transform(X_train)
                y_train = train[target_feat]

                X_test = test[predictive_feats]
                X_test = minmax_scaler.transform(X_test)
                y_test = test[target_feat]

                model.fit(X_train, y_train)
                pred = model.predict(X_test)

                mse_scores.append(mean_squared_error(y_test, pred))
                rmse_scores.append(root_mean_squared_error(y_test, pred))
                mae_scores.append(mean_absolute_error(y_test, pred))

                prev_grp = grp

            model_scores.append(
                (
                    model,
                    params,
                    np.mean(mse_scores),
                    np.mean(rmse_scores),
                    np.mean(mae_scores),
                )
            )

    return model_scores

In [8]:
predictive_features = [
    # "open",
    # "prev_high",
    # "prev_low",
    # "prev_close",
    # "prev_volume",
    # "dividends",
    # "stock_splits",
    "positive",
    "negative",
    "neutral",
    # "day_of_month",
    # "day_of_week",
    # "quarter",
    # "month",
    # "year",
]

target_feat = "close"

models_with_param_grids = [
    (
        GradientBoostingRegressor(),
        {
            "random_state": [SHARED_RANDOM_STATE],
            "loss": ["absolute_error", "squared_error"],
        },
    ),
    (
        KNeighborsRegressor(),
        {
            "n_neighbors": [2, 3, 4],
        },
    ),
]

results = time_series_cv_model_search(
    models_with_param_grids,
    combo_df.groupby("week_year"),
    predictive_features,
    target_feat,
)

sorted(results, key=lambda x: x[4])

[(KNeighborsRegressor(n_neighbors=4),
  {'n_neighbors': 3},
  19.68476897247656,
  3.5926739546057322,
  3.2423378962736864),
 (KNeighborsRegressor(n_neighbors=4),
  {'n_neighbors': 4},
  19.88283872848489,
  3.5949743646503616,
  3.2498723064477626),
 (KNeighborsRegressor(n_neighbors=4),
  {'n_neighbors': 2},
  20.39861214420479,
  3.65357756253946,
  3.2715034379408907),
 (GradientBoostingRegressor(random_state=1337),
  {'loss': 'absolute_error', 'random_state': 1337},
  21.240638095584174,
  3.7429363476819377,
  3.3479008781823634),
 (GradientBoostingRegressor(random_state=1337),
  {'loss': 'squared_error', 'random_state': 1337},
  21.710846694260866,
  3.800386792449836,
  3.3641138831900914)]