In [8]:
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone

import pandas as pd
import requests
import torch
from tqdm import tqdm
from transformers import BertTokenizer, BertForSequenceClassification

# 📁 Ensure data directory exists
os.makedirs("data", exist_ok=True)

# 📅 Get current UTC date (YYYY-MM-DD)
# datetime.utcnow() is deprecated since Python 3.12; a timezone-aware "now"
# in UTC formats to exactly the same date string.
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")

# 🔍 Keyword list for scanning the news
# Each entry is used as a news search query and as a column name in the results.
keywords = [
    "S&P 500", "NASDAQ", "Dow Jones", "US stock market", "bull market", "stocks rally",
    "economic growth", "interest rate cut", "strong earnings", "SP500 up", "ETF inflows",
    "US market gains", "equities surge", "stock rally", "financial markets rising"
]

# 🧠 Load FinBERT
# The tokenizer and the sequence-classification model are both loaded from the
# same checkpoint name; the tokenizer is loaded first, then the model.
model_name = "yiyanghkust/finbert-tone"
print("🧠 Loading FinBERT...")
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertForSequenceClassification.from_pretrained(model_name),
)

# 🧪 Sentiment classification
def analyze_sentiment(text):
    """Classify one piece of text with the module-level FinBERT ``model``.

    Args:
        text: The string to classify (e.g. a news headline). Input is
            truncated to at most 64 tokens before being scored.

    Returns:
        The lower-cased name of the highest-scoring class, looked up in
        ``model.config.id2label`` (e.g. ``'positive'``).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        outputs = model(**inputs)
    # argmax over the logits picks the same class as argmax over softmax
    # probabilities, so the softmax is not needed to choose the label.
    predicted_idx = int(torch.argmax(outputs.logits, dim=1)[0])
    # Take the index -> label mapping from the checkpoint's own config instead
    # of hard-coding an order: the finbert-tone model card documents
    # 0=Neutral, 1=Positive, 2=Negative, which is NOT negative/neutral/positive.
    return model.config.id2label[predicted_idx].lower()

# 📊 Prepare output dictionary
# One row per run: the run date plus, per keyword, the number of headlines
# classified as positive (or the string "ERROR" if that keyword failed).
results = {"date": timestamp}

# Endpoint for the Google News RSS search feed; the query goes in `params`.
rss_search_url = "https://news.google.com/rss/search"

# 🔁 Loop over keywords
for keyword in tqdm(keywords, desc="🔍 Processing keywords"):
    # Let requests URL-encode the query. Building the URL by hand left the
    # "&" in "S&P 500" unescaped, which cut the search term down to "S".
    rss_params = {"q": keyword, "hl": "en-US", "gl": "US", "ceid": "US:en"}

    try:
        response = requests.get(
            rss_search_url,
            params=rss_params,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=30,  # never hang the whole run on one stalled request
        )
        # Surface HTTP errors explicitly instead of trying to parse an error page.
        response.raise_for_status()
        root = ET.fromstring(response.content)

        # Keep only items that actually carry a title, so a single malformed
        # entry does not discard every headline for this keyword.
        rows = []
        for item in root.findall(".//item"):
            headline = item.findtext("title")
            if headline:
                rows.append({"headline": headline, "url": item.findtext("link")})

        # Explicit columns keep the frame well-formed when the feed is empty.
        df = pd.DataFrame(rows, columns=["headline", "url"])
        df["sentiment"] = df["headline"].apply(analyze_sentiment)

        positive_count = int((df["sentiment"] == "positive").sum())
        results[keyword] = positive_count

    except Exception as e:
        # Best-effort: record the failure for this keyword and keep going.
        print(f"Error processing keyword '{keyword}': {e}")
        results[keyword] = "ERROR"

# 🧾 Save results
# Today's counts become a single-row frame; if a CSV from earlier runs is
# already on disk, its rows are kept and the new row is appended after them.
output_file = "data/news.csv"
results_df = pd.DataFrame([results])

frames = [results_df]
if os.path.exists(output_file):
    existing_df = pd.read_csv(output_file)
    frames.insert(0, existing_df)

# With no history on disk the new row is written as-is.
combined_df = pd.concat(frames, ignore_index=True) if len(frames) > 1 else results_df

combined_df.to_csv(output_file, index=False)
print(f"\n✅ Results saved to {output_file}")


RuntimeError: Failed to import transformers.models.bert.modeling_bert because of the following error (look up to see its traceback):
Failed to import transformers.generation.utils because of the following error (look up to see its traceback):
cannot import name '_contains_nan' from 'scipy._lib._util' (/opt/homebrew/lib/python3.10/site-packages/scipy/_lib/_util.py)

In [7]:
!pip install --upgrade pip

# Stable versions known to work
!pip install numpy==1.24.4 scipy==1.10.1 scikit-learn==1.2.2
!pip install torch transformers
!pip install pandas tqdm

