In [2]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import yfinance as yf
from constants import DATA_END_DATE, DATA_START_DATE
from db_helper_functions import (
    get_stock_news_with_finbert_tone_scores_from_db,
    get_stock_news_with_finbert_whole_article_scores_from_db,
    get_stock_news_with_finbert_scores_from_db,
)
from sklearn.model_selection import TimeSeriesSplit


ticker = "AAPL"


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
df_opts = [
    get_stock_news_with_finbert_tone_scores_from_db(ticker),
    get_stock_news_with_finbert_whole_article_scores_from_db(ticker),
    get_stock_news_with_finbert_scores_from_db(ticker),
]
df = df_opts[1]
grouped_sentiments = df.groupby("date", as_index=False).agg(
    {"positive": "mean", "negative": "mean", "neutral": "mean"}
)
price_history = (
    yf.Ticker(ticker).history(start=DATA_START_DATE, end=DATA_END_DATE).reset_index()
)
price_history.columns = [x.lower() for x in price_history.columns]
price_history["date"] = price_history["date"].dt.date
price_history.head()
combo_df = pd.merge(
    price_history, grouped_sentiments, left_on="date", right_on="date", how="left"
)
combo_df = combo_df.sort_values(by="date", ascending=True)
combo_df = combo_df.set_index("date")
combo_df

  df.index += _pd.TimedeltaIndex(dst_error_hours, 'h')


Unnamed: 0_level_0,open,high,low,close,volume,dividends,stock splits,positive,negative,neutral
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-04,34.636166,35.599548,34.461225,35.530048,234428400,0.0,0.0,0.358300,0.426100,0.215600
2019-01-07,35.635499,35.666654,34.964487,35.450970,219111200,0.0,0.0,0.117100,0.686500,0.196400
2019-01-08,35.841597,36.383202,35.592366,36.126778,164101200,0.0,0.0,0.252700,0.531600,0.215600
2019-01-09,36.256181,37.032638,35.858370,36.740269,180396400,0.0,0.0,0.268725,0.414900,0.316400
2019-01-10,36.546147,36.898428,36.153127,36.857689,143122800,0.0,0.0,0.405500,0.202000,0.392500
...,...,...,...,...,...,...,...,...,...,...
2022-12-27,130.483247,130.513041,127.841400,129.142456,69007800,0.0,0.0,0.236418,0.375418,0.388145
2022-12-28,128.784900,130.135617,125.010842,125.179680,85438400,0.0,0.0,0.180425,0.423800,0.395763
2022-12-29,127.116381,129.589383,126.858162,128.725327,75703700,0.0,0.0,0.292311,0.475433,0.232256
2022-12-30,127.533507,129.062989,126.560193,129.043121,77034200,0.0,0.0,0.382640,0.364780,0.252600


In [4]:
import pandas as pd
from datetime import datetime
from dateutil.relativedelta import relativedelta
from torch import nn, torch
from torch.nn import LSTM
from torch.nn import MSELoss

In [6]:
sentiments_df = combo_df[['close', 'positive', 'negative', 'neutral']]

# Aggregate data into months, using the average of columns as the values (ignoring NaN)
def aggregate_monthly_data(df):
    df.index = pd.to_datetime(df.index)
    df['date_monthly'] = df.index.to_period('M').strftime('%Y-%m')
    monthly_data = df.groupby('date_monthly').mean()
    monthly_data = monthly_data.sort_index()
    return monthly_data
monthly_df = aggregate_monthly_data(sentiments_df.copy())
pd.set_option('display.max_rows', 10)
monthly_df

Unnamed: 0_level_0,close,positive,negative,neutral
date_monthly,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01,37.049162,0.291351,0.399724,0.308912
2019-02,41.284198,0.305565,0.350038,0.344378
2019-03,44.114185,0.286235,0.269212,0.444551
2019-04,48.259177,0.339818,0.350140,0.310044
2019-05,46.151072,0.275238,0.367214,0.357551
...,...,...,...,...
2022-09,151.706829,0.221570,0.393089,0.385339
2022-10,143.784985,0.244999,0.371511,0.383489
2022-11,144.813804,0.236016,0.361605,0.402381
2022-12,136.935554,0.253168,0.355630,0.391191


In [22]:
train_data = monthly_df
predictors = ['positive', 'negative', 'neutral']

model = LSTM(len(predictors), 64, 2, batch_first=True)  # Input size is the length of predictors
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = MSELoss()

for start_idx in range(len(monthly_df) - 1):
    train_window = monthly_df.iloc[start_idx:start_idx+1][predictors + ['close']]
    test_window = monthly_df.iloc[start_idx+1:start_idx+2][['close']]  # Use only the 'close' value for comparison

    train_features = torch.from_numpy(train_window[predictors].values).float()
    train_close = torch.tensor(train_window['close'].values).float()

    # Train on the window
    output, _ = model(train_features.unsqueeze(0))
    train_close_predicted = train_close + output.squeeze()

    # Compute training loss
    loss = loss_fn(train_close_predicted, train_close)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    print(f"Epoch: {start_idx+1}/{len(monthly_df)-1}, Train Loss: {loss.item():.4f}")
    
    # Print predicted and actual closing prices
    print("Predicted Close:", train_close_predicted[0].item())
    print("Actual Close:", train_close)

    if start_idx < len(monthly_df) - 2:
        # Evaluate on the test set (separate from the training data)
        test_close = torch.tensor(test_window['close'].values).float()

        with torch.no_grad():
            test_output, _ = model(test_close.unsqueeze(0))  # Get the output tensor from the model
            test_close_predicted = test_output.squeeze() + test_window['close'].values  # Predict the next month's closing price
            
            test_loss = loss_fn(test_close_predicted, test_close)
            print(f"Epoch: {start_idx+1}/{len(monthly_df)-1}, Test Loss: {test_loss.item():.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

Epoch: 1/48, Train Loss: 0.0004
Predicted Close: 37.01704025268555
Actual Close: tensor([37.0492])
Epoch: 1/48, Test Loss: 0.0019
Epoch: 2/48, Train Loss: 0.0004
Predicted Close: 41.253173828125
Actual Close: tensor([41.2842])
Epoch: 2/48, Test Loss: 0.0019
Epoch: 3/48, Train Loss: 0.0003
Predicted Close: 44.08424758911133
Actual Close: tensor([44.1142])
Epoch: 3/48, Test Loss: 0.0019
Epoch: 4/48, Train Loss: 0.0003
Predicted Close: 48.23017883300781
Actual Close: tensor([48.2592])
Epoch: 4/48, Test Loss: 0.0018
Epoch: 5/48, Train Loss: 0.0003
Predicted Close: 46.12311553955078
Actual Close: tensor([46.1511])
Epoch: 5/48, Test Loss: 0.0018
Epoch: 6/48, Train Loss: 0.0003
Predicted Close: 46.59474563598633
Actual Close: tensor([46.6216])
Epoch: 6/48, Test Loss: 0.0018
Epoch: 7/48, Train Loss: 0.0002
Predicted Close: 49.554725646972656
Actual Close: tensor([49.5806])
Epoch: 7/48, Test Loss: 0.0018
Epoch: 8/48, Train Loss: 0.0002
Predicted Close: 49.63116455078125
Actual Close: tensor([49

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [None]:
# model = LSTM(len(predictors), 64, 2, batch_first=True)  # Input size is the length of predictors
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# loss_fn = MSELoss()

# for start_idx in range(len(monthly_df) - 1):  # Adjust the range to prevent index out of bounds
#     # Define training and testing windows based on datetime index
#     train_window = monthly_df.iloc[start_idx:start_idx+1][predictors + ['close']]
#     test_window = monthly_df.iloc[start_idx+1:start_idx+2][['close']]  # Use only the 'close' value for comparison

#     # Convert to tensors
#     train_features = torch.from_numpy(train_window[predictors].values).float()
#     train_close = torch.tensor(train_window['close'].values).float()
#     test_close = torch.tensor(test_window['close'].values).float()

#     # Train on the window
#     output, _ = model(train_features.unsqueeze(0))  # Add an extra dimension for batch size
    
#     # Extract the predicted value
#     predicted_value = output.squeeze()  # Remove the batch dimension
#     train_close_predicted = train_close + predicted_value  # Add the predicted value to the train month's close

#     # Compute loss based on the comparison of the predicted close and the test close
#     loss = loss_fn(train_close_predicted, test_close)
#     print(train_close_predicted)
#     print(test_close)

#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()

#     # Evaluate the test loss
#     with torch.no_grad():
#         test_output, _ = model(train_features.unsqueeze(0))  # Assuming you're evaluating on training data for simplicity
#         test_predicted_value = test_output.squeeze()  # Remove the batch dimension
#         test_close_predicted = train_close + test_predicted_value  # Add the predicted value to the train month's close
    
#     # Ensure that the shapes of test_close_predicted and test_close match
#         if test_close_predicted.shape != test_close.shape:
#             # Reshape or broadcast test_close_predicted to match the shape of test_close
#             test_close_predicted = test_close_predicted.squeeze().unsqueeze(0)  # Adjust the shape as needed

#         # Compute the test loss
#         test_loss = loss_fn(test_close_predicted, test_close)

#     # Print training and test progress
#     print(f"Epoch: {start_idx+1}/{len(monthly_df)-1}, Train Loss: {loss.item():.4f}, Test Loss: {test_loss.item():.4f}")