In [None]:
import feedparser
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from bs4 import BeautifulSoup
import uuid
from datetime import datetime

class FinancialSentimentPipeline:
    """Fetch news from an RSS feed and score each item with a FinBERT classifier.

    Typical use::

        pipeline = FinancialSentimentPipeline()
        frame = pipeline.fetch_news_rss("https://example.com/feed.xml")

    Attributes:
        tokenizer: Hugging Face tokenizer loaded for ``model_name``.
        model: Sequence-classification model loaded for ``model_name``.
        labels: Class names ordered by the model's output index.
    """

    # Column order of the frame returned by fetch_news_rss. Declared once so an
    # empty result still exposes the columns that callers select on.
    RECORD_COLUMNS = [
        "news_id",
        "published_time",
        "text",
        "sentiment_label",
        "sentiment_score",
        "probs",
    ]

    # Maximum number of characters of cleaned text kept in the "text" preview.
    PREVIEW_CHARS = 100

    def __init__(self, model_name="ProsusAI/finbert"):
        """Load the tokenizer and classification model.

        Args:
            model_name: Hugging Face model id or local path. Defaults to the
                open-source FinBERT checkpoint 'ProsusAI/finbert'.
        """
        print("Loading FinBERT model... (this may take a minute first time)")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Take the index -> label order from the checkpoint itself rather than
        # hard-coding it, so labels can never drift from the logits they name.
        id2label = self.model.config.id2label
        self.labels = [id2label[i] for i in range(len(id2label))]

    def preprocess(self, text):
        """Strip HTML markup and collapse runs of whitespace.

        Args:
            text: Raw (possibly HTML) string. ``None`` or ``""`` is accepted.

        Returns:
            The visible text with single spaces between words; ``""`` when
            ``text`` is empty or ``None``.
        """
        if not text:
            return ""
        soup = BeautifulSoup(text, "html.parser")
        clean_text = soup.get_text(separator=' ')
        return " ".join(clean_text.split())

    def get_sentiment(self, text):
        """Classify a single piece of text.

        Args:
            text: The string to classify. Inputs longer than 512 tokens are
                truncated by the tokenizer.

        Returns:
            dict with keys:
                ``sentiment_label``: name of the highest-probability class,
                ``sentiment_score``: that probability rounded to 4 decimals,
                ``probabilities``: mapping of every label in ``self.labels``
                to its probability rounded to 4 decimals.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)

        # Inference only: no gradients needed.
        with torch.no_grad():
            outputs = self.model(**inputs)

        # Convert logits to probabilities (0 to 1)
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Get the highest probability and its label
        score, idx = torch.max(probs, dim=-1)

        # Build the per-label map from self.labels so it always agrees with
        # the label chosen above (both index the same list).
        probabilities = {
            label: round(prob, 4)
            for label, prob in zip(self.labels, probs[0].tolist())
        }
        return {
            "sentiment_label": self.labels[idx.item()],
            "sentiment_score": round(score.item(), 4),
            "probabilities": probabilities,
        }

    def fetch_news_rss(self, rss_url, limit=5):
        """Download an RSS feed and score its first entries.

        Args:
            rss_url: URL (or anything ``feedparser.parse`` accepts) of the feed.
            limit: Maximum number of entries to process; ``None`` processes
                every entry. Defaults to 5 to keep the demo fast.

        Returns:
            pandas.DataFrame with the columns listed in ``RECORD_COLUMNS``,
            one row per processed entry. The frame is empty (but still has
            those columns) when the feed yields no entries.
        """
        print(f"Fetching news from {rss_url}...")
        feed = feedparser.parse(rss_url)

        # feedparser reports fetch/parse problems through the 'bozo' flag
        # instead of raising; surface that when nothing could be read.
        if feed.get('bozo') and not feed.entries:
            print(f"Warning: no entries could be read from {rss_url}: {feed.get('bozo_exception')}")

        results = []
        for entry in feed.entries[:limit]:
            # Extract content; not every entry carries a title or a summary.
            title = entry.get('title', '')
            summary = entry.get('summary', '')
            raw_text = title + ". " + summary if title else summary
            clean_text = self.preprocess(raw_text)

            # Analyze Sentiment
            sentiment = self.get_sentiment(clean_text)

            # Only mark the preview as truncated when text was actually cut.
            if len(clean_text) > self.PREVIEW_CHARS:
                preview = clean_text[:self.PREVIEW_CHARS] + "..."
            else:
                preview = clean_text

            # Structure the data
            results.append({
                "news_id": entry.get('id', str(uuid.uuid4())),
                # Fallback is a timezone-aware local timestamp so it stays
                # unambiguous next to feed-supplied times.
                "published_time": entry.get('published', datetime.now().astimezone().isoformat()),
                "text": preview,
                "sentiment_label": sentiment['sentiment_label'],
                "sentiment_score": sentiment['sentiment_score'],
                "probs": sentiment['probabilities'],
            })

        return pd.DataFrame(results, columns=self.RECORD_COLUMNS)

# --- RUNNING THE PIPELINE ---
if __name__ == "__main__":
    # Feed to analyse: the Yahoo Finance news RSS index.
    rss_url = "https://finance.yahoo.com/news/rssindex"

    # Build the pipeline (loads the model), then fetch and score the feed.
    pipeline = FinancialSentimentPipeline()
    df = pipeline.fetch_news_rss(rss_url)

    # Lift pandas' column-count and line-width limits so the printed
    # preview is not elided or wrapped.
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 1000)

    # Show only the headline columns of the result.
    print(df.loc[:, ["published_time", "sentiment_label", "sentiment_score", "text"]])

  from .autonotebook import tqdm as notebook_tqdm


Loading FinBERT model... (this may take a minute first time)
Fetching news from https://finance.yahoo.com/news/rssindex...
         published_time sentiment_label  sentiment_score                                               text
0  2025-12-18T10:30:00Z        negative           0.9518  JPMorgan Quants Warn of ‘Extreme Crowding’ in ...
1  2025-12-18T10:30:00Z        negative           0.9666  Micron’s Blowout Results Are Bad News for Anyo...
2  2025-12-18T10:30:49Z        negative           0.8726  Is Norwegian Cruise Line Stock Underperforming...
3  2025-12-18T10:34:25Z        positive           0.5494  Solstice Stock: Is SOLS Outperforming the Mate...
4  2025-12-18T10:44:00Z        positive           0.9061  Micron Crushes Earnings. The Stock Is Rising.....


In [2]:
df

Unnamed: 0,news_id,published_time,text,sentiment_label,sentiment_score,probs
0,jpmorgan-quants-warn-extreme-crowding-10300045...,2025-12-18T10:30:00Z,JPMorgan Quants Warn of ‘Extreme Crowding’ in ...,negative,0.9518,"{'positive': 0.0126, 'negative': 0.9518, 'neut..."
1,microns-blowout-results-are-bad-news-for-anyon...,2025-12-18T10:30:00Z,Micron’s Blowout Results Are Bad News for Anyo...,negative,0.9666,"{'positive': 0.0082, 'negative': 0.9666, 'neut..."
2,norwegian-cruise-line-stock-underperforming-10...,2025-12-18T10:30:49Z,Is Norwegian Cruise Line Stock Underperforming...,negative,0.8726,"{'positive': 0.0482, 'negative': 0.8726, 'neut..."
3,solstice-stock-sols-outperforming-materials-10...,2025-12-18T10:34:25Z,Solstice Stock: Is SOLS Outperforming the Mate...,positive,0.5494,"{'positive': 0.5494, 'negative': 0.3153, 'neut..."
4,micron-technology-earnings-stock-price-ca4679e...,2025-12-18T10:44:00Z,Micron Crushes Earnings. The Stock Is Rising.....,positive,0.9061,"{'positive': 0.9061, 'negative': 0.0443, 'neut..."
