In [204]:
#imports

import numpy as np
import pandas as pd
import yfinance as yf

import requests 
from bs4 import BeautifulSoup

import matplotlib.pyplot as plt
import seaborn as sns

import re

import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

from sklearn.linear_model import LogisticRegression, LinearRegression

from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    roc_auc_score
)

from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score
)

In [122]:
#gonna start by manually copy and pasting some text from Seeking Alpha
#gonna use NVDA
ticker = "NVDA"
start_date = "2025-03-10"
end_date = "2025-03-20"
nvda_data = yf.download(ticker, start_date, end_date)

[*********************100%***********************]  1 of 1 completed


In [123]:
nvda_data

Price,Close,High,Low,Open,Volume
Ticker,NVDA,NVDA,NVDA,NVDA,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-03-10,106.970161,111.839709,105.450297,109.889891,366487400
2025-03-11,108.75,112.229676,104.760361,106.980159,354865700
2025-03-12,115.739998,116.760002,112.879997,114.120003,323857500
2025-03-13,115.580002,117.760002,113.790001,117.029999,299033100
2025-03-14,121.669998,121.879997,118.150002,118.610001,277593500
2025-03-17,119.529999,122.889999,118.029999,122.739998,255501500
2025-03-18,115.43,119.019997,114.540001,118.0,299686900
2025-03-19,117.519997,120.449997,115.68,117.269997,273426200


To calculate returns:

return = (close_end - close_start) / close_start

- can do multi-day returns or single day returns
- will look at various metrics such as: volatility, return, volume

| Window       | What It Captures           | Good For                                             |
|--------------|----------------------------|------------------------------------------------------|
| `t to t+1`   | Immediate next-day reaction | High-signal breaking news, overnight sentiment       |
| `t to t+3`   | Short-term reaction         | Earnings reports, Fed statements, corporate events   |
| `t to t+5`   | Full reaction cycle         | Captures delayed reactions, investor digestion       |
| `t+1 to t+5` | Post-market sentiment       | Filters out pre-announcement speculation             |

For this example, we are gonna capture short-term reaction, for this article: https://finance.yahoo.com/news/elon-musk-wanted-way-more-214202510.html

We are gonna assume a publication was released after hours, unless otherwise specified, so start_date will be at t+1

Also, we have to worry about weekends (the market is closed on weekends), so what do we do?
Documentation here: https://pandas.pydata.org/docs/user_guide/timeseries.html 

- can use .offsets.BDay to add business days (excluding weekends)

In [124]:
ticker = "NVDA"
publication_date = pd.to_datetime('2024-03-12')
start_date = publication_date + pd.offsets.BDay(1)
end_date = publication_date + pd.offsets.BDay(4)
end_date

Timestamp('2024-03-18 00:00:00')

In [125]:
ex_data = yf.download(ticker, start_date, end_date)

[*********************100%***********************]  1 of 1 completed


In [126]:
ex_data

Price,Close,High,Low,Open,Volume
Ticker,NVDA,NVDA,NVDA,NVDA,NVDA
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2024-03-13,90.858139,91.473934,88.405942,91.025084,635713000
2024-03-14,87.915108,90.616224,86.571548,89.547574,602318000
2024-03-15,87.808144,89.516582,86.228668,86.901444,642086000


In [127]:
def calc_return(ending_price, starting_price):
    return (ending_price - starting_price) / starting_price

In [128]:
#calculate returns
close_price_end = ex_data.iloc[-1][('High', 'NVDA')]
close_price_start = ex_data.iloc[0][('High', 'NVDA')]
calc_return(close_price_end, close_price_start) * 100

np.float64(-2.1397922180365336)

In [222]:
url = "https://finance.yahoo.com/news/elon-musk-wanted-way-more-214202510.html"
response = requests.get(url, headers = {"User-Agent": "Mozilla/5.0"})
soup = BeautifulSoup(response.text, "html.parser")

article_block = soup.find_all("p", class_ = "yf-1090901")

In [228]:
paragraphs = [p.get_text() for p in article_block]

In [231]:
paragraphs

['Nvidia stock has tripled this year as investors bet the AI boom will fuel demand for graphics chips.',
 "Oracle's Larry Ellison said Elon Musk's xAI wanted more Nvidia chips than it could offer last quarter.",
 'Oracle is spending heavily on scaling up to meet intense demand from clients like Musk.',
 "Nvidia stock has more than tripled this year as investors bet the graphics-chip specialist will power the artificial-intelligence revolution. Elon Musk is among those who can't get enough of its semiconductors, Oracle's Larry Ellison said on Monday.",
 "Musk has opted to use Oracle's servers to run his xAI company's recently launched chatbot, Grok. The cloud-infrastructure giant managed to provide enough Nvidia chips to power the first version of Grok, but fell short of Musk's demands, Ellison said during Oracle's latest earnings call, according to a transcript provided by AlphaSense/Sentieo.",
 '"Boy, did they want a lot more GPUs than we gave them," Oracle\'s billionaire cofounder an

In [232]:
text = [re.sub(r"[^A-Za-z0-9']+", ' ', line) for line in paragraphs]
text = list(map(str.lower, text))
text

['nvidia stock has tripled this year as investors bet the ai boom will fuel demand for graphics chips ',
 "oracle's larry ellison said elon musk's xai wanted more nvidia chips than it could offer last quarter ",
 'oracle is spending heavily on scaling up to meet intense demand from clients like musk ',
 "nvidia stock has more than tripled this year as investors bet the graphics chip specialist will power the artificial intelligence revolution elon musk is among those who can't get enough of its semiconductors oracle's larry ellison said on monday ",
 "musk has opted to use oracle's servers to run his xai company's recently launched chatbot grok the cloud infrastructure giant managed to provide enough nvidia chips to power the first version of grok but fell short of musk's demands ellison said during oracle's latest earnings call according to a transcript provided by alphasense sentieo ",
 " boy did they want a lot more gpus than we gave them oracle's billionaire cofounder and tech chie

In [233]:
text= " ".join(text)
text

"nvidia stock has tripled this year as investors bet the ai boom will fuel demand for graphics chips  oracle's larry ellison said elon musk's xai wanted more nvidia chips than it could offer last quarter  oracle is spending heavily on scaling up to meet intense demand from clients like musk  nvidia stock has more than tripled this year as investors bet the graphics chip specialist will power the artificial intelligence revolution elon musk is among those who can't get enough of its semiconductors oracle's larry ellison said on monday  musk has opted to use oracle's servers to run his xai company's recently launched chatbot grok the cloud infrastructure giant managed to provide enough nvidia chips to power the first version of grok but fell short of musk's demands ellison said during oracle's latest earnings call according to a transcript provided by alphasense sentieo   boy did they want a lot more gpus than we gave them oracle's billionaire cofounder and tech chief said we gave them q

In [234]:
model_name = "yiyanghkust/finbert-tone"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

In [235]:
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

Device set to use mps:0


In [236]:
result = classifier(text)

RuntimeError: The size of tensor a (572) must match the size of tensor b (512) at non-singleton dimension 1

In [None]:
print(result)