In [280]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import yfinance as yf
from constants import DATA_END_DATE, DATA_START_DATE
from db_helper_functions import (
    get_stock_news_with_finbert_tone_scores_from_db,
    get_stock_news_with_finbert_whole_article_scores_from_db,
    get_stock_news_with_finbert_scores_from_db,
)
from sklearn.model_selection import TimeSeriesSplit


ticker = "AAPL"

In [281]:
df_opts = [
    get_stock_news_with_finbert_tone_scores_from_db(ticker),
    get_stock_news_with_finbert_whole_article_scores_from_db(ticker),
    get_stock_news_with_finbert_scores_from_db(ticker),
]
df = df_opts[1]

In [282]:
grouped_sentiments = (
    df.groupby("date", as_index=False)
    .agg({"positive": "mean", "negative": "mean", "neutral": "mean"})
    .sort_values(by="date", ascending=True)
)

In [283]:
price_history = (
    yf.Ticker(ticker).history(start=DATA_START_DATE, end=DATA_END_DATE).reset_index()
)
price_history.columns = ["_".join(x.lower().split(" ")) for x in price_history.columns]


The 'unit' keyword in TimedeltaIndex construction is deprecated and will be removed in a future version. Use pd.to_timedelta instead.



In [284]:
price_history["date"] = price_history["date"].dt.date
price_history.head()

Unnamed: 0,date,open,high,low,close,volume,dividends,stock_splits
0,2019-01-04,34.636166,35.599548,34.461225,35.530048,234428400,0.0,0.0
1,2019-01-07,35.635503,35.666658,34.96449,35.450974,219111200,0.0,0.0
2,2019-01-08,35.841593,36.383198,35.592362,36.126774,164101200,0.0,0.0
3,2019-01-09,36.256188,37.032646,35.858377,36.740276,180396400,0.0,0.0
4,2019-01-10,36.546162,36.898444,36.153141,36.857704,143122800,0.0,0.0


In [285]:
combo_df = pd.merge(
    price_history, grouped_sentiments, left_on="date", right_on="date", how="left"
)
combo_df["date"] = pd.to_datetime(combo_df["date"])
combo_df = combo_df.sort_values(by="date", ascending=True)
combo_df = combo_df.set_index("date")

In [286]:
combo_df["day_of_month"] = combo_df.index.day
combo_df["day_of_week"] = combo_df.index.dayofweek
combo_df["quarter"] = combo_df.index.quarter
combo_df["month"] = combo_df.index.month
combo_df["year"] = combo_df.index.year

In [287]:
combo_df[["positive", "negative", "neutral"]] = combo_df[
    ["positive", "negative", "neutral"]
].ffill()

In [288]:
combo_df[["positive", "negative", "neutral"]] = combo_df[
    ["positive", "negative", "neutral"]
].shift(1)

In [289]:
combo_df[["prev_high", "prev_low", "prev_close", "prev_volume"]] = combo_df[
    ["high", "low", "close", "volume"]
].shift(1)
combo_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock_splits,positive,negative,neutral,day_of_month,day_of_week,quarter,month,year,prev_high,prev_low,prev_close,prev_volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2019-01-04,34.636166,35.599548,34.461225,35.530048,234428400,0.0,0.0,,,,4,4,1,1,2019,,,,
2019-01-07,35.635503,35.666658,34.96449,35.450974,219111200,0.0,0.0,0.3583,0.4261,0.2156,7,0,1,1,2019,35.599548,34.461225,35.530048,234428400.0
2019-01-08,35.841593,36.383198,35.592362,36.126774,164101200,0.0,0.0,0.1171,0.6865,0.1964,8,1,1,1,2019,35.666658,34.96449,35.450974,219111200.0
2019-01-09,36.256188,37.032646,35.858377,36.740276,180396400,0.0,0.0,0.2527,0.5316,0.2156,9,2,1,1,2019,36.383198,35.592362,36.126774,164101200.0
2019-01-10,36.546162,36.898444,36.153141,36.857704,143122800,0.0,0.0,0.268725,0.4149,0.3164,10,3,1,1,2019,37.032646,35.858377,36.740276,180396400.0


In [290]:
combo_df = combo_df.iloc[1:]

In [291]:
X_feats = [
    "open",
    "prev_high",
    "prev_low",
    "prev_close",
    "prev_volume",
    "dividends",
    "stock_splits",
    "positive",
    "negative",
    "neutral",
    "day_of_month",
    "day_of_week",
    "quarter",
    "month",
    "year",
]

y_feat = ["close"]

combo_df = combo_df[[*X_feats, *y_feat]]

training_data_len = int(len(combo_df) * 0.9)
train_df, test_df = combo_df[:training_data_len], combo_df[1 + training_data_len :]

X_train, y_train = train_df[X_feats], train_df[y_feat]
X_test, y_test = test_df[X_feats], test_df[y_feat]

In [292]:
minmax_scaler = MinMaxScaler(feature_range=(-1, 1))

X_train = pd.DataFrame(
    minmax_scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)

X_test = pd.DataFrame(
    minmax_scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)

In [293]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    root_mean_squared_error,
)

knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print(f"MSE == {mean_squared_error(y_test, pred)}")
print(f"RMSE == {root_mean_squared_error(y_test, pred)}")
print(f"MAE == {mean_absolute_error(y_test, pred)}")

MSE == 106.21861015052433
RMSE == 10.30624132021584
MAE == 7.650570343017578


In [294]:
import plotly.express as px

In [303]:
test_df["predictions"] = knn.predict(X_test).flatten()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [298]:
px.line(
    test_df, x=test_df.index, y=["predictions", "close"], template="plotly_dark"
).add_vrect(
    x0=X_test.index[0],
    x1=X_test.index[-1],
    fillcolor="grey",
    opacity=0.25,
    line_width=0,
)