In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import yfinance as yf
from constants import DATA_END_DATE, DATA_START_DATE
from db_helper_functions import (
    get_stock_news_with_finbert_tone_scores_from_db,
    get_stock_news_with_finbert_whole_article_scores_from_db,
    get_stock_news_with_finbert_scores_from_db,
)
from sklearn.model_selection import TimeSeriesSplit


# Stock symbol used throughout the notebook: it is passed to the FinBERT news
# loaders and to yfinance when the price history is downloaded.
ticker = "AAPL"

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# The three FinBERT-scored news loaders, in the order they are indexed below:
#   0 -> tone scores, 1 -> whole-article scores, 2 -> default FinBERT scores.
news_loaders = (
    get_stock_news_with_finbert_tone_scores_from_db,
    get_stock_news_with_finbert_whole_article_scores_from_db,
    get_stock_news_with_finbert_scores_from_db,
)

# Every variant is loaded for `ticker`; the rest of the notebook works on a
# single one of them.
df_opts = [load_news(ticker) for load_news in news_loaders]

# Index 1 selects the whole-article scores.
df = df_opts[1]

In [3]:
# Collapse the news rows to one row per calendar date by averaging each of the
# three sentiment score columns; "date" stays a regular column (as_index=False)
# so it can be used as a merge key later.
sentiment_means = dict.fromkeys(("positive", "negative", "neutral"), "mean")
news_by_date = df.groupby("date", as_index=False)
grouped_sentiments = news_by_date.agg(sentiment_means)

In [4]:
# Download the price history for `ticker` over the configured date window.
ticker_handle = yf.Ticker(ticker)
raw_history = ticker_handle.history(start=DATA_START_DATE, end=DATA_END_DATE)

# Move the index into a regular column, then lower-case every column name.
price_history = raw_history.reset_index()
price_history.columns = list(map(str.lower, price_history.columns))

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


In [5]:
# Reduce the "date" column to plain calendar dates (datetime.date objects) so
# it can be joined against the per-day sentiment table on "date".
#
# The column is routed through pd.to_datetime first so that this cell can be
# re-run safely: after the first run the column already holds datetime.date
# objects, and calling `.dt` on it directly would raise AttributeError.
# pd.to_datetime accepts both the original timestamps and the converted dates.
price_history["date"] = pd.to_datetime(price_history["date"]).dt.date
price_history.head()

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits
0,2019-01-04,34.63617,35.599552,34.461229,35.530052,234428400,0.0,0.0
1,2019-01-07,35.635495,35.66665,34.964483,35.450966,219111200,0.0,0.0
2,2019-01-08,35.841593,36.383198,35.592362,36.126774,164101200,0.0,0.0
3,2019-01-09,36.256185,37.032642,35.858374,36.740273,180396400,0.0,0.0
4,2019-01-10,36.546162,36.898444,36.153141,36.857704,143122800,0.0,0.0


In [6]:
# Left-join the per-day sentiment averages onto the price history so every
# price row is kept; days without any news get NaN sentiment scores. The
# result is ordered chronologically and indexed by date.
combo_df = (
    pd.merge(price_history, grouped_sentiments, on="date", how="left")
    .sort_values(by="date", ascending=True)
    .set_index("date")
)

# Chronological 90/10 split point (first 90% of rows -> train).
training_data_len = int(len(combo_df) * 0.9)

# Forward-fill missing sentiment on a copy *before* slicing, so train/test do
# not carry NaN sentiment rows into the scaler. `combo_df` itself is left
# unfilled so the missing-sentiment days can still be inspected afterwards.
# Forward-filling only propagates earlier values, so no later observation
# leaks backwards across the train/test boundary.
sentiment_cols = ["positive", "negative", "neutral"]
split_source = combo_df.copy()
split_source[sentiment_cols] = split_source[sentiment_cols].ffill()

# Positional split with no gap: the test set starts at exactly the row where
# the training set ends (the previous `1 + training_data_len` start silently
# dropped one observation from both sets).
train_df = split_source.iloc[:training_data_len]
test_df = split_source.iloc[training_data_len:]

assert len(train_df) + len(test_df) == len(combo_df), "split must partition all rows"

In [7]:
# Show the rows that have no "positive" sentiment score after the left join.
missing_sentiment_mask = combo_df["positive"].isna()
combo_df.loc[missing_sentiment_mask]

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock splits,positive,negative,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-11,36.637217,36.833726,36.308899,36.495823,108092800,0.0000,0.0,,,
2019-01-14,36.150738,36.251389,35.760112,35.947037,129756800,0.0000,0.0,,,
2019-01-22,37.483179,37.559864,36.574915,36.737877,121576000,0.0000,0.0,,,
2019-02-08,40.671620,41.073547,40.534434,41.013378,95280000,0.1825,0.0,,,
2019-02-12,40.938782,41.155388,40.842510,41.128914,89134000,0.0000,0.0,,,
...,...,...,...,...,...,...,...,...,...,...
2020-04-07,66.027785,66.247233,63.150654,63.255497,202887200,0.0000,0.0,,,
2020-05-07,73.932601,74.408063,73.627820,74.059387,115215200,0.0000,0.0,,,
2020-11-27,114.401007,115.303887,114.057521,114.420631,46691300,0.0000,0.0,,,
2020-12-04,120.318813,120.573977,119.258906,119.975327,78260400,0.0000,0.0,,,


In [8]:
# Keep only the three sentiment columns and forward-fill their gaps, so a day
# without a score inherits the most recent earlier value.
# NOTE(review): this rebinds `combo_df` to a sentiment-only frame, so the price
# columns are no longer available on `combo_df` after this cell. It also does
# not modify `train_df` / `test_df`, which were sliced in an earlier cell and
# are separate objects. Confirm both effects are intended.
sentiment_only = combo_df.loc[:, ["positive", "negative", "neutral"]]
combo_df = sentiment_only.ffill()

In [9]:
def _scaled_like(frame, values):
    """Wrap the scaled array `values` in a DataFrame with `frame`'s index and columns."""
    return pd.DataFrame(values, index=frame.index, columns=frame.columns)


# Scale every column to [-1, 1]. The scaler is fitted on the training rows
# only and then reused unchanged for the test rows.
# Note: `train_df` / `test_df` are rebound to their scaled versions here, so
# re-running this cell would scale already-scaled data.
minmax_scaler = MinMaxScaler(feature_range=(-1, 1))
train_df = _scaled_like(train_df, minmax_scaler.fit_transform(train_df))
test_df = _scaled_like(test_df, minmax_scaler.transform(test_df))

In [10]:
# Preview the first rows of the scaled training frame.
train_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock splits,positive,negative,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-04,-1.0,-1.0,-1.0,-0.998904,0.003494,-1.0,-1.0,-0.244209,-0.122882,-0.566108
2019-01-07,-0.986282,-0.999075,-0.992932,-1.0,-0.075971,-1.0,-1.0,-0.7664,0.432165,-0.610801
2019-01-08,-0.983453,-0.989194,-0.984114,-0.990632,-0.361359,-1.0,-1.0,-0.47283,0.101993,-0.566108
2019-01-09,-0.977762,-0.980239,-0.980379,-0.982127,-0.276821,-1.0,-1.0,-0.438136,-0.146755,-0.331471
2019-01-10,-0.973782,-0.98209,-0.976239,-0.980499,-0.470194,-1.0,-1.0,-0.142022,-0.600554,-0.15433
