In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import yfinance as yf
from constants import DATA_END_DATE, DATA_START_DATE
from db_helper_functions import get_stock_news_with_finbert_tone_scores_from_db
from sklearn.model_selection import TimeSeriesSplit


# Stock symbol analysed by this notebook; later cells pass it to both the news
# loader and the yfinance price download.
ticker = "AAPL"

In [2]:
# Load the news rows for `ticker` through the project DB helper. Per the
# displayed preview, each row carries an article date, title, summary and
# positive / negative / neutral scores (presumably FinBERT tone probabilities,
# going by the helper's name -- confirm against db_helper_functions).
df = get_stock_news_with_finbert_tone_scores_from_db(ticker)
# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,id,ticker,date,title,summary,positive,negative,neutral
0,4235,AAPL,2022-07-25,"Microsoft, Alphabet, Meta, Apple, And Amazon L...",(Monday Market Open) Investors appear optimist...,0.0,1.0,0.0
1,4376,AAPL,2022-08-17,Benzinga Before The Bell: More Amazon Workers ...,CNBCBlackRock To Pledge A$1B In Australian Bat...,0.9988,0.0011,0.0001
2,3,AAPL,2019-01-04,"Market Rebounds On Trade Optimism, Tech Bounce...",A stronger-than-expected government report on ...,0.0016,0.0019,0.9964
3,4,AAPL,2019-01-07,American Shippers And Carriers React To Signs ...,The first week of 2019 saw three of America's ...,0.0,0.0,1.0
4,5,AAPL,2019-01-08,TD Ameritrade IMX Continued Its Dip In Decembe...,"Declining for the third month in a row, TD Ame...",0.0,1.0,0.0


In [3]:
# Collapse the per-article scores to a single row per date by averaging each
# sentiment column; `as_index=False` keeps "date" as a regular column.
sentiment_columns = ("positive", "negative", "neutral")
mean_per_column = {column: "mean" for column in sentiment_columns}
grouped_sentiments = df.groupby("date", as_index=False).agg(mean_per_column)

In [None]:
# Fetch the price history for `ticker` between the configured start and end
# dates, move the index into a regular column, and lower-case every column
# name so later cells can refer to "date", "close", etc.
raw_history = yf.Ticker(ticker).history(start=DATA_START_DATE, end=DATA_END_DATE)
price_history = raw_history.reset_index()
price_history.columns = list(map(str.lower, price_history.columns))

In [5]:
# Reduce the "date" column to plain `datetime.date` values (presumably so it
# lines up with the sentiment frame's dates in the later merge -- confirm).
# Going through `pd.to_datetime` first keeps this cell re-runnable: a bare
# `.dt.date` raises AttributeError on a second run, because by then the column
# holds date objects instead of datetimes.
price_history["date"] = pd.to_datetime(price_history["date"]).dt.date
# Preview the result.
price_history.head()

Unnamed: 0,date,open,high,low,close,volume,dividends,stock splits
0,2019-01-04,34.636166,35.599548,34.461225,35.530048,234428400,0.0,0.0
1,2019-01-07,35.635491,35.666646,34.964479,35.450962,219111200,0.0,0.0
2,2019-01-08,35.841597,36.383202,35.592366,36.126778,164101200,0.0,0.0
3,2019-01-09,36.256181,37.032638,35.85837,36.740269,180396400,0.0,0.0
4,2019-01-10,36.546162,36.898444,36.153141,36.857704,143122800,0.0,0.0


In [6]:
# Fraction of the chronologically ordered rows assigned to the training split.
TRAIN_FRACTION = 0.9

# Keep only the dates present in both frames (inner join), then order the rows
# chronologically and index them by date.
combo_df = (
    pd.merge(price_history, grouped_sentiments, on="date", how="inner")
    .sort_values(by="date", ascending=True)
    .set_index("date")
)

# Chronological split: the first TRAIN_FRACTION of rows train, the rest test.
# The test slice starts exactly at `training_data_len`; starting it at
# `1 + training_data_len` would silently drop one row from both splits.
training_data_len = int(len(combo_df) * TRAIN_FRACTION)
train_df = combo_df.iloc[:training_data_len]
test_df = combo_df.iloc[training_data_len:]

# Every merged row must land in exactly one of the two splits.
assert len(train_df) + len(test_df) == len(combo_df)

In [7]:
# Scale every column into [-1, 1]. The scaler is fitted on the training rows
# only and then applied to both splits, so test values are mapped with the
# training min/max and may fall outside the target range.
# NOTE: this cell overwrites `train_df` / `test_df` with their scaled versions,
# so running it twice rescales already-scaled data; re-run the split cell first.
minmax_scaler = MinMaxScaler(feature_range=(-1, 1))
minmax_scaler.fit(train_df)
train_df, test_df = (
    pd.DataFrame(
        minmax_scaler.transform(frame),
        index=frame.index,
        columns=frame.columns,
    )
    for frame in (train_df, test_df)
)

In [8]:
# Preview the scaled training frame; the fitted columns should lie in [-1, 1].
train_df.head()

Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock splits,positive,negative,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-04,-1.0,-1.0,-1.0,-0.998904,0.003494,-1.0,-1.0,-0.889067,-0.948,0.837
2019-01-07,-0.986282,-0.999075,-0.992932,-1.0,-0.075971,-1.0,-1.0,-1.0,-1.0,1.0
2019-01-08,-0.983453,-0.989194,-0.984114,-0.990631,-0.361359,-1.0,-1.0,-1.0,1.0,-1.0
2019-01-09,-0.977762,-0.980239,-0.980379,-0.982127,-0.276821,-1.0,-1.0,0.10635,-0.6545,-0.4519
2019-01-10,-0.973782,-0.98209,-0.976239,-0.980499,-0.470194,-1.0,-1.0,-0.9998,0.986,-0.986
