In [1]:
# notebooks/02_sentiment_analysis.ipynb

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import sys
import os

# Put the parent directory on the import path so project modules (e.g. config) resolve
sys.path.append(os.path.abspath(os.pardir))

# Read the scraped headlines and convert the 'Date' column to datetimes
print("Loading news data...")
df_news = pd.read_csv('../data/raw/scraped_news_dump.csv')
df_news = df_news.assign(Date=pd.to_datetime(df_news['Date']))

# Pick the compute device: GPU when CUDA is available, otherwise CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Fetch the FinBERT tokenizer and classifier, then move the classifier to the device
print("Loading FinBERT model...")
finbert_checkpoint = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(finbert_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(finbert_checkpoint).to(device)

def get_sentiment_score(text):
    """Score one piece of text as P(positive) - P(negative) using FinBERT.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Headline (or other text) to classify. Input longer than 512
            tokens is truncated by the tokenizer.

    Returns:
        float in [-1, 1]: the softmax probability of the "positive" class
        minus that of the "negative" class. Returns NaN when ``text`` is
        missing (``None`` or a float NaN, as pandas uses for empty cells),
        so downstream means can skip it.

    Raises:
        KeyError: if the model config has no "positive" or "negative" label.
    """
    # Missing headline: no text to classify, and the tokenizer rejects non-strings.
    if text is None or (isinstance(text, float) and text != text):
        return float("nan")

    # Look the class indices up in the checkpoint's own label map instead of
    # hard-coding them. Label order differs between FinBERT checkpoints
    # (ProsusAI/finbert is believed to be positive=0, negative=1, neutral=2 --
    # confirm with model.config.id2label), so fixed indices such as 2 and 0
    # silently compute the wrong difference.
    label_to_index = {
        str(label).lower(): int(index)
        for label, index in model.config.label2id.items()
    }
    positive_idx = label_to_index["positive"]
    negative_idx = label_to_index["negative"]

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # Score = Positive - Negative (row 0: a single text yields a batch of one)
    return probs[0][positive_idx].item() - probs[0][negative_idx].item()

# Process headlines (batching recommended for large datasets, loop used here for simplicity)
print("Calculating sentiment scores...")
tqdm.pandas()
df_news['sentiment_score'] = df_news['Headline'].progress_apply(get_sentiment_score)

# Aggregate by calendar day (mean sentiment per day).
# Timestamps are floored to midnight before grouping: grouping on the raw column
# would yield one row per distinct timestamp whenever 'Date' carries a time of day.
# For values that are already pure dates this is a no-op.
news_day = pd.to_datetime(df_news['Date']).dt.normalize()
daily_sentiment = df_news.groupby(news_day)['sentiment_score'].mean().reset_index()

# Save intermediate result; create the output folder first so a fresh checkout
# without data/processed/ does not fail after the long scoring step.
output_path = '../data/processed/daily_sentiment.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
daily_sentiment.to_csv(output_path, index=False)
print("Saved daily sentiment to data/processed/daily_sentiment.csv")


Loading news data...
Loading FinBERT model...
Calculating sentiment scores...


100%|██████████| 4011/4011 [01:56<00:00, 34.45it/s]


Saved daily sentiment to data/processed/daily_sentiment.csv
