In [2]:
#imports

import numpy as np
import pandas as pd
import yfinance as yf

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)



In [32]:
# First pass: the article text is copied in by hand from Seeking Alpha,
# with NVDA as the example ticker.
ticker = "NVDA"
start_date, end_date = "2025-03-10", "2025-03-20"
# Daily price history for the window; yfinance treats `end` as exclusive.
nvda_data = yf.download(ticker, start=start_date, end=end_date)

[*********************100%***********************]  1 of 1 completed


In [33]:
# Show the downloaded NVDA price frame as the cell's output.
nvda_data

Price,Close,High,Low,Open,Volume
Ticker,NVDA,NVDA,NVDA,NVDA,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-03-10,106.970161,111.839709,105.450297,109.889891,366487400
2025-03-11,108.75,112.229676,104.760361,106.980159,354865700
2025-03-12,115.739998,116.760002,112.879997,114.120003,323857500
2025-03-13,115.580002,117.760002,113.790001,117.029999,299033100
2025-03-14,121.669998,121.879997,118.150002,118.610001,277593500
2025-03-17,119.529999,122.889999,118.029999,122.739998,255501500
2025-03-18,115.43,119.019997,114.540001,118.0,299686900
2025-03-19,117.519997,120.449997,115.68,117.269997,273426200


To calculate returns:

$$\text{return} = \frac{\text{close}_{\text{end}} - \text{close}_{\text{start}}}{\text{close}_{\text{start}}}$$

- Returns can be computed over a single day or over multiple days.
- We will look at several metrics, such as volatility, return, and volume.

| Window       | What It Captures           | Good For                                             |
|--------------|----------------------------|------------------------------------------------------|
| `t to t+1`   | Immediate next-day reaction | High-signal breaking news, overnight sentiment       |
| `t to t+3`   | Short-term reaction         | Earnings reports, Fed statements, corporate events   |
| `t to t+5`   | Full reaction cycle         | Captures delayed reactions, investor digestion       |
| `t+1 to t+5` | Post-market sentiment       | Filters out pre-announcement speculation             |

For this example, we will capture the short-term reaction to this article: https://finance.yahoo.com/news/elon-musk-wanted-way-more-214202510.html

Unless otherwise specified, we assume a publication was released after market hours, so `start_date` is set to `t+1`.

We also have to account for weekends, when the market is closed. The pandas time-series documentation covers how to handle this: https://pandas.pydata.org/docs/user_guide/timeseries.html

- We can use `pd.offsets.BDay` to add business days (skipping weekends).

In [58]:
# Event window for the example article: the publication is treated as
# after-hours, so the window opens on the next business day.
ticker = "NVDA"
publication_date = pd.Timestamp('2024-03-12')
start_date = publication_date + pd.offsets.BDay(n=1)
# NOTE(review): BDay skips weekends but not exchange holidays, and
# yfinance treats `end` as exclusive -- confirm the window is as intended.
end_date = publication_date + pd.offsets.BDay(n=4)
end_date

Timestamp('2024-03-18 00:00:00')

In [53]:
# Fetch prices for the event window bounded by start_date / end_date.
ex_data = yf.download(ticker, start=start_date, end=end_date)

[*********************100%***********************]  1 of 1 completed


In [54]:
# Show the downloaded event-window prices as the cell's output.
# NOTE(review): the saved output runs through 2024-03-26, which does not match
# the end_date of 2024-03-18 displayed by the date-setup cell; the execution
# counts are also out of order. Restart the kernel and Run All to refresh.
ex_data

Price,Close,High,Low,Open,Volume
Ticker,NVDA,NVDA,NVDA,NVDA,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2024-03-13,90.858139,91.473934,88.405942,91.025084,635713000
2024-03-14,87.915108,90.616224,86.571548,89.547574,602318000
2024-03-15,87.808144,89.516582,86.228668,86.901444,642086000
2024-03-18,88.425941,92.374641,87.056389,90.358305,668976000
2024-03-19,89.368637,90.514257,84.982078,86.671518,672171000
2024-03-20,90.342316,90.380305,88.19402,89.7675,479063000
2024-03-21,91.404961,92.617567,90.3753,92.269682,480372000
2024-03-22,94.258026,94.746864,90.804159,91.111058,586719000
2024-03-25,94.970787,96.734207,93.47928,93.910139,552136000
2024-03-26,92.530586,96.343336,92.471608,95.819506,513648000
