In [2]:
#import everything
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
import numpy as np
import math
import time
import keyboard
import matplotlib.pyplot as plt
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification
import spacy
import pydantic
import requests
from bs4 import BeautifulSoup
from lumibot.brokers import Alpaca
from alpaca_trade_api import REST
from datetime import datetime, timedelta 
import time as Time

In [109]:
# Load the raw news and market tables. Each file carries an 'Unnamed: 0'
# column (presumably a row index written out when the CSV was saved -- confirm),
# which is discarded straight away.
# NOTE(review): 'market_data' has no file extension -- confirm the path.
news_data = pd.read_csv('news_data.csv').drop(columns=['Unnamed: 0'])
market_data = pd.read_csv('market_data').drop(columns=['Unnamed: 0'])


In [110]:
# Reduce each row's timestamp to its calendar date: parse 'timestamp', keep
# only the date part in a new 'date' column, then drop the original column.
market_data = (
    market_data
    .assign(date=lambda frame: pd.to_datetime(frame['timestamp']).dt.date)
    .drop(columns=['timestamp'])
)

market_data.head(1)

Unnamed: 0,close,high,low,trade_count,open,volume,vwap,date
0,201.0192,201.03,198.59,655489,200.49,225903783,200.656423,2016-01-04


In [111]:
# Parse 'date' into datetimes, then discard every row dated in 2015.
# Rows from all other years are kept.
news_data = news_data.assign(date=pd.to_datetime(news_data['date']))
news_data = news_data.loc[news_data['date'].dt.year != 2015]
news_data

Unnamed: 0,text,date
0,Tom Lee Predicts S&P 500 Surge In 2024 Despite...,2023-12-30
1,"2023's Top Dogs: Nvidia, Meta Lead S&P 500 Cha...",2023-12-29
2,"AI Revolution, Fed's Pivot Power US Stock Mark...",2023-12-29
3,Mickey Mouse Goes Public: Disney Icon Enters D...,2023-12-29
4,"Market Clubhouse Morning Memo - December 29th,...",2023-12-29
...,...,...
56962,S&P 500 Index Futures Lower.,2016-01-05
56963,MarketTalk/HammerForum Before-the-Close Imbala...,2016-01-05
56964,Traders Are Bearish For 2016.,2016-01-05
56965,Damodaran: 2015 Could've Been A Lot Worse For ...,2016-01-04


In [112]:
# Index market_data by 'date' and order the rows by ascending date.
market_data = market_data.set_index('date').sort_index()
market_data

Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-01-04,201.0192,201.03,198.59,655489,200.490,225903783,200.656423
2016-01-05,201.3600,201.90,200.05,418709,201.405,112719152,201.084280
2016-01-06,198.8200,200.06,197.60,548386,198.330,153948196,198.956460
2016-01-07,194.0500,197.44,193.59,796451,195.330,216191953,195.345911
2016-01-08,191.9230,195.85,191.58,754102,195.190,216105404,193.644537
...,...,...,...,...,...,...,...
2023-12-22,473.6500,475.38,471.70,485465,473.860,67131807,473.800078
2023-12-26,475.6500,476.58,473.99,348986,474.070,55386952,475.111253
2023-12-27,476.5100,476.66,474.89,425538,475.440,68000811,475.770446
2023-12-28,476.6900,477.55,476.26,374241,476.880,77158117,476.774686


In [113]:
# Index news_data by 'date' and order the rows by ascending date.
# Several headlines can share one date, so the index is not unique.
news_data = news_data.set_index('date').sort_index()
news_data

Unnamed: 0_level_0,text
date,Unnamed: 1_level_1
2016-01-04,S&P 500 Index Futures Sharply Lower.
2016-01-04,Damodaran: 2015 Could've Been A Lot Worse For ...
2016-01-05,MarketTalk/HammerForum Before-the-Close Imbala...
2016-01-05,S&P 500 Index Futures Lower.
2016-01-05,MarketTalk/HammerForum Before-the-Close Imbala...
...,...
2023-12-29,Crypto Trader Foresees Steep Bitcoin Decline A...
2023-12-29,"Nasdaq, S&P 500 Futures Flat Ahead Of 2023's F..."
2023-12-29,"Asia Markets Mixed, Europe Gains, Crude Oil At..."
2023-12-29,"If You Invested $1,000 In Bitcoin When Mark Yu..."


In [114]:
# Build merged_data: one row per market_data date, with that day's news text
# attached in a 'text' column.

# Collapse all headlines that share a date into a single space-joined string.
news_data_s = news_data.groupby(news_data.index).agg({'text': ' '.join})

# Business-day calendar spanning the first to the last market_data date.
complete_date_range = pd.date_range(start=market_data.index.min(), end=market_data.index.max(), freq='B')

# Put the news on that calendar and carry the previous text forward into
# business days that have no headline of their own.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated and
# emits a FutureWarning.
# NOTE: headlines dated on days outside complete_date_range (e.g. weekends)
# are dropped by this reindex.
news_data_ffill = news_data_s.reindex(complete_date_range).ffill()

# Move the dates out of the index into a 'date' column on both frames.
# After reset_index the news column is called 'index' (the date range has no
# name), hence the rename; for market_data the rename is a harmless no-op when
# its index is already named 'date'.
# NOTE: this rebinds market_data to the reset frame, so from here on
# market_data has a plain 0..n-1 index and a 'date' column.
news_data_ffill = news_data_ffill.reset_index().rename(columns={'index': 'date'})
market_data = market_data.reset_index().rename(columns={'index': 'date'})

# Ensure both 'date' columns are datetime64[ns] so the merge keys compare equal.
news_data_ffill['date'] = pd.to_datetime(news_data_ffill['date'])
market_data['date'] = pd.to_datetime(market_data['date'])

# Left merge: keep every market_data row, attach the news text where available.
merged_data = pd.merge(market_data, news_data_ffill, on='date', how='left')

# Use 'date' as the index of the merged frame.
merged_data = merged_data.set_index('date')
merged_data

  news_data_ffill = news_data_s.reindex(complete_date_range).fillna(method='ffill')


Unnamed: 0_level_0,close,high,low,trade_count,open,volume,vwap,text
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-04,201.0192,201.03,198.59,655489,200.490,225903783,200.656423,S&P 500 Index Futures Sharply Lower. Damodara...
2016-01-05,201.3600,201.90,200.05,418709,201.405,112719152,201.084280,MarketTalk/HammerForum Before-the-Close Imbala...
2016-01-06,198.8200,200.06,197.60,548386,198.330,153948196,198.956460,Technical Take: Digging Into Gartman's Bear Ma...
2016-01-07,194.0500,197.44,193.59,796451,195.330,216191953,195.345911,Merrill Lynch's Michael Harnett Says DJ Transp...
2016-01-08,191.9230,195.85,191.58,754102,195.190,216105404,193.644537,RBS Global Macro Team Says 'sell in May and ne...
...,...,...,...,...,...,...,...,...
2023-12-22,473.6500,475.38,471.70,485465,473.860,67131807,473.800078,USA New Home Sales (MoM) For November -12.2% V...
2023-12-26,475.6500,476.58,473.99,348986,474.070,55386952,475.111253,"Market Clubhouse Morning Memo - December 26th,..."
2023-12-27,476.5100,476.66,474.89,425538,475.440,68000811,475.770446,"US Stocks Pause, Gold Rises As Dollar, Treasur..."
2023-12-28,476.6900,477.55,476.26,374241,476.880,77158117,476.774686,USA Crude Oil Inventories A Draw Of 6.911M Vs ...


In [101]:
# Collapse all headlines that share a date into a single space-joined string.
news_data_grouped = news_data.groupby(news_data.index).agg({'text': ' '.join})

# Work out the market dates to align against. The merge cell above rebinds
# market_data to a reset frame (dates in a 'date' column, plain 0..n-1 index),
# so market_data.index is only usable as a date index if that cell has not run.
# Handle both states so this cell also works when the notebook is run top to
# bottom.
if 'date' in market_data.columns:
    trading_days = pd.DatetimeIndex(pd.to_datetime(market_data['date']), name='date')
else:
    trading_days = pd.DatetimeIndex(pd.to_datetime(market_data.index), name='date')

# Align the news to the market dates; a date without its own news takes the
# most recent earlier text (forward fill).
news_data_grouped = news_data_grouped.reindex(trading_days, method='ffill')
news_data_grouped.head(10)
# # Reset index to prepare for merging
# news_data_grouped = news_data_grouped.reset_index().rename(columns={'index': 'date'})
# market_data_reset = market_data.reset_index().rename(columns={'index': 'date'})

# # Merge DataFrames on 'date'
# merged_data = pd.merge(market_data_reset, news_data_grouped, on='date', how='left')

# # Set 'date' back as index if needed
# merged_data = merged_data.set_index('date')
# merged_data

Unnamed: 0_level_0,text
date,Unnamed: 1_level_1
2016-01-04,S&P 500 Index Futures Sharply Lower. Damodara...
2016-01-05,MarketTalk/HammerForum Before-the-Close Imbala...
2016-01-06,Technical Take: Digging Into Gartman's Bear Ma...
2016-01-07,Merrill Lynch's Michael Harnett Says DJ Transp...
2016-01-08,RBS Global Macro Team Says 'sell in May and ne...
2016-01-11,US Says It Is Not Discussing Re-Introducing Nu...
2016-01-12,How Demographic Trends Help Explain Rising Cas...
2016-01-13,Did A Huge Commodity Fund Blow Up At 2:30PM We...
2016-01-14,My 2016 Guesses…and of Course the Crash of 201...
2016-01-15,5 Stocks Ronnie Moas Just Upgraded -- And A Wa...


In [105]:
# Show the combined news text stored at row position 4 of news_data_grouped.
news_data_grouped['text'].iat[4]

"RBS Global Macro Team Says 'sell in May and never come back', Warns Of Coming Weakness And Fundamental Risks.  S&P 500 Index Futures Flat.  Fed's Lacker Says More Than 4 Int. Rate Hikes May Be Needed In 2016 If Inflation Surges-Reuters.  Goldman Sachs Cuts S&P 500 2016 FY EPS From $109 To $103, Cuts FY 2016 EPS From $120 To $117, Cuts 2017 FY EPS From $129 To $126.  Deutsche Bank 'unsure' On How Much To Cut Energy And S&P 500 2016 EPS Estimates, Cuts S&P 500 FY 2015 EPS From $119 To $118.50, Sees Non-GAAP EPS Growth Down 2% YoY.  Deutsche Bank Desk Commentary: Solid Dec Employment Report Maintains Path For March Rate Hike, Says Strong Hiring Alongside Weak GDP Growth Yields Margin Compression.  El-Erian, When Asked 'Is the great bull market is over?' Responds 'Not necessarily.'.  US Labor Force Participation Rate Increases From 62.5% TO 62.6%.  El-Erian Says 'There's no way we should have gotten that reaction from [the news out of China], expect that we were in a phase of suppressed v

In [106]:
# Show the combined news text stored at row position 5 of news_data_grouped.
news_data_grouped['text'].iat[5]

"US Says It Is Not Discussing Re-Introducing Nuclear Weapons To S. Korea, Says Doing So May Spark Arms Race.  MarketTalk/HammerForum Before-the-Close Imbalance Update Jan 11, 2016.  S&P 500 Turns Green In Violent Turn Around During Final Hour Of Trading, ~14 Points In 20 Minutes To 1919.  JP Morgan Downgrades China To Neutral, Is Disturbed By Feedback Loop In 'Weak Developed Equity Markets'.  Deustche Bank Corrects Itself, Says $55 Oil Predicted Weeks Ago Now Seems Unreasonable.  S&P 500 E-Mini Futures Breaks 1900 Level, Now Trading At 1899.50.  S&P 500 Index Futures Clinging To Gains.  RBC Desk Commentary:  Sentiment Around China Sell Off Last Night And Last Monday Hinges On Seller Fatigue And A Reverting Focus To Micro From Macro As Earnings Season Approaches.  Deustche Bank Says $55 Oil Predicted Weeks Ago Now Seems Unreasonable. "

In [103]:
# Show up to 30 individual headlines starting at row position 30
# (the first 30 rows of the 30:100 window).
news_data.iloc[30:60]

Unnamed: 0_level_0,text
date,Unnamed: 1_level_1
2016-01-08,RBS Global Macro Team Says 'sell in May and ne...
2016-01-08,S&P 500 Index Futures Flat.
2016-01-08,Fed's Lacker Says More Than 4 Int. Rate Hikes ...
2016-01-08,Goldman Sachs Cuts S&P 500 2016 FY EPS From $1...
2016-01-08,Deutsche Bank 'unsure' On How Much To Cut Ener...
2016-01-08,Deutsche Bank Desk Commentary: Solid Dec Emplo...
2016-01-08,"El-Erian, When Asked 'Is the great bull market..."
2016-01-08,US Labor Force Participation Rate Increases Fr...
2016-01-08,El-Erian Says 'There's no way we should have g...
2016-01-08,El-Erian on Benzinga's PreMarket Prep: 'With m...


In [115]:
# Print the full merged news text at row position 1 (printed so it is not
# shown as a quoted repr).
print(merged_data['text'].iat[1])

MarketTalk/HammerForum Before-the-Close Imbalance Update Jan. 5, 2016.  S&P 500 Index Futures Lower.  MarketTalk/HammerForum Before-the-Close Imbalance Update Jan 5, 2016.  Explanations are Not Predictions.  What's Next For Stocks After The New Year's Day One Hangover.  Traders Are Bearish For 2016. 


In [116]:
# Show the single headline stored at row position 4 of news_data.
news_data['text'].iat[4]

'MarketTalk/HammerForum Before-the-Close Imbalance Update Jan 5, 2016. '

In [117]:
# Show up to 30 individual headlines starting at row position 30
# (the first 30 rows of the 30:100 window).
news_data.iloc[30:60]

Unnamed: 0_level_0,text
date,Unnamed: 1_level_1
2016-01-08,RBS Global Macro Team Says 'sell in May and ne...
2016-01-08,S&P 500 Index Futures Flat.
2016-01-08,Fed's Lacker Says More Than 4 Int. Rate Hikes ...
2016-01-08,Goldman Sachs Cuts S&P 500 2016 FY EPS From $1...
2016-01-08,Deutsche Bank 'unsure' On How Much To Cut Ener...
2016-01-08,Deutsche Bank Desk Commentary: Solid Dec Emplo...
2016-01-08,"El-Erian, When Asked 'Is the great bull market..."
2016-01-08,US Labor Force Participation Rate Increases Fr...
2016-01-08,El-Erian Says 'There's no way we should have g...
2016-01-08,El-Erian on Benzinga's PreMarket Prep: 'With m...


In [118]:
# Show the first 20 rows of market_data.
market_data.iloc[:20]

Unnamed: 0,date,close,high,low,trade_count,open,volume,vwap
0,2016-01-04,201.0192,201.03,198.59,655489,200.49,225903783,200.656423
1,2016-01-05,201.36,201.9,200.05,418709,201.405,112719152,201.08428
2,2016-01-06,198.82,200.06,197.6,548386,198.33,153948196,198.95646
3,2016-01-07,194.05,197.44,193.59,796451,195.33,216191953,195.345911
4,2016-01-08,191.923,195.85,191.58,754102,195.19,216105404,193.644537
5,2016-01-11,192.11,193.41,189.82,701548,193.01,205368067,191.757659
6,2016-01-12,193.6608,194.55,191.14,635749,193.82,175844276,192.902295
7,2016-01-13,188.83,194.86,188.38,812694,194.45,223632834,191.13459
8,2016-01-14,191.93,193.26,187.66,824320,189.55,243821116,190.77655
9,2016-01-15,187.81,188.76,185.52,1104776,186.77,333774073,187.694544


In [119]:
# Merged news text at row position 4 -- the 2016-01-08 row in the output above.
merged_data['text'].iat[4]

"RBS Global Macro Team Says 'sell in May and never come back', Warns Of Coming Weakness And Fundamental Risks.  S&P 500 Index Futures Flat.  Fed's Lacker Says More Than 4 Int. Rate Hikes May Be Needed In 2016 If Inflation Surges-Reuters.  Goldman Sachs Cuts S&P 500 2016 FY EPS From $109 To $103, Cuts FY 2016 EPS From $120 To $117, Cuts 2017 FY EPS From $129 To $126.  Deutsche Bank 'unsure' On How Much To Cut Energy And S&P 500 2016 EPS Estimates, Cuts S&P 500 FY 2015 EPS From $119 To $118.50, Sees Non-GAAP EPS Growth Down 2% YoY.  Deutsche Bank Desk Commentary: Solid Dec Employment Report Maintains Path For March Rate Hike, Says Strong Hiring Alongside Weak GDP Growth Yields Margin Compression.  El-Erian, When Asked 'Is the great bull market is over?' Responds 'Not necessarily.'.  US Labor Force Participation Rate Increases From 62.5% TO 62.6%.  El-Erian Says 'There's no way we should have gotten that reaction from [the news out of China], expect that we were in a phase of suppressed v

In [120]:
# Merged news text at row position 5 -- the 2016-01-11 row in the output above.
merged_data['text'].iat[5]

"US Says It Is Not Discussing Re-Introducing Nuclear Weapons To S. Korea, Says Doing So May Spark Arms Race.  MarketTalk/HammerForum Before-the-Close Imbalance Update Jan 11, 2016.  S&P 500 Turns Green In Violent Turn Around During Final Hour Of Trading, ~14 Points In 20 Minutes To 1919.  JP Morgan Downgrades China To Neutral, Is Disturbed By Feedback Loop In 'Weak Developed Equity Markets'.  Deustche Bank Corrects Itself, Says $55 Oil Predicted Weeks Ago Now Seems Unreasonable.  S&P 500 E-Mini Futures Breaks 1900 Level, Now Trading At 1899.50.  S&P 500 Index Futures Clinging To Gains.  RBC Desk Commentary:  Sentiment Around China Sell Off Last Night And Last Monday Hinges On Seller Fatigue And A Reverting Focus To Micro From Macro As Earnings Season Approaches.  Deustche Bank Says $55 Oil Predicted Weeks Ago Now Seems Unreasonable. "