## Using FinBERT to Analyze NKE, LULU, and ATZ Sentiment

**Load Data (Yahoo Finance)**

In [3]:
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta, UTC
import time

# Ticker symbols whose news items are fetched, one request per symbol.
TICKERS = ["NKE", "LULU", "ATZ.TO"]
# Upper bound on the news items kept per ticker (yfinance often returns ~10).
YF_MAX_ITEMS = 10
# Rows with a known publish time older than this many days are filtered out.
LOOKBACK_DAYS = 365



def extract_article(item: dict, ticker: str) -> dict:
    """Flatten a yfinance news item into a FinBERT-ready row."""
    content = item.get("content", {}) or {}

    # Published time: providerPublishTime (unix) or content pubDate/displayTime
    published_dt = None
    ts = item.get("providerPublishTime")
    if ts:
        try:
            published_dt = datetime.fromtimestamp(ts, tz=UTC)
        except Exception:
            published_dt = None
    if published_dt is None:
        pub_iso = content.get("pubDate") or content.get("displayTime")
        if pub_iso:
            published_dt = pd.to_datetime(pub_iso, errors="coerce", utc=True)

    link = (
        item.get("link")
        or (item.get("canonicalUrl") or {}).get("url")
        or (item.get("clickThroughUrl") or {}).get("url")
        or (content.get("canonicalUrl") or {}).get("url")
        or (content.get("clickThroughUrl") or {}).get("url")
    )

    return {
        "source": "yfinance",
        "ticker": ticker,
        "title": (content.get("title") or item.get("title") or "").strip(),
        "summary": (content.get("summary") or content.get("description") or item.get("summary") or "").strip(),
        "publisher": (item.get("publisher") or (content.get("provider") or {}).get("displayName") or "").strip(),
        "type": (item.get("type") or content.get("contentType") or "").strip(),
        "link": link,
        "published_utc": published_dt,
        "raw_id": item.get("id") or item.get("uuid"),
    }


# Fetch the latest news for every ticker and flatten each item into one row.
rows = []
for tic in TICKERS:
    # `.news` may be None/empty; cap the list at YF_MAX_ITEMS per ticker.
    raw_news = (yf.Ticker(tic).news or [])[:YF_MAX_ITEMS]
    rows.extend(extract_article(item, tic) for item in raw_news)
    print(f"{tic}: fetched {len(raw_news)} raw items")
    time.sleep(1)  # brief pause between ticker requests

# Build dataframe
df = pd.DataFrame(rows)
if not df.empty:
    df["published_utc"] = pd.to_datetime(df["published_utc"], errors="coerce", utc=True)
    cutoff = pd.Timestamp.now(tz="UTC") - pd.Timedelta(days=LOOKBACK_DAYS)
    # Rows without a parseable publish time are kept; only known-old rows go.
    df = df[df["published_utc"].isna() | (df["published_utc"] >= cutoff)]
    # De-duplicate per ticker: the same article can legitimately be returned
    # for several tickers, and dropping it on (title, link) alone would keep
    # it only for whichever ticker was fetched first.
    df = df.drop_duplicates(subset=["ticker", "title", "link"])
    df = df.sort_values(["ticker", "published_utc"], ascending=[True, False]).reset_index(drop=True)

if df.empty:
    # describe() raises ValueError on a frame without columns, so skip the
    # summary when nothing was fetched (or everything was filtered out).
    print("No news items available; nothing to summarize.")
else:
    print(df.head())
    print()
    print(df.describe(include="all"))


NKE: fetched 10 raw items
LULU: fetched 10 raw items
ATZ.TO: fetched 10 raw items
     source  ticker                                              title  \
0  yfinance  ATZ.TO  The investing winners and losers that made or ...   
1  yfinance  ATZ.TO  TSX Value Picks Including Aritzia And Two Othe...   
2  yfinance  ATZ.TO  Stifel Canada Names Gildan, KITS, and Couche-T...   
3  yfinance  ATZ.TO  3 TSX Growth Stocks With Up To 22% Insider Own...   
4  yfinance  ATZ.TO  Tech Tactics: Aritzia Taps Nedap’s RFID Platfo...   

                                             summary         publisher   type  \
0  Despite turmoil from the trade war, most globa...    Financial Post  STORY   
1  As 2025 draws to a close, the Canadian market ...   Simply Wall St.  STORY   
2  Stifel Canada said Friday its best ideas for C...      MT Newswires  STORY   
3  As we approach the end of 2025, Canadian marke...   Simply Wall St.  STORY   
4  Aritzia utilizes Nedap's RFID platform to stre...  Sourcing Jou

In [4]:
from pathlib import Path

OUTPUT_FILE = "./processed_data/financial_news_dataset.csv"  # optional export

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing (a fresh checkout has no such folder).
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(OUTPUT_FILE, index=False)

print(f"Data exported to {OUTPUT_FILE}")

Data exported to ./processed_data/financial_news_dataset.csv


**Load Model**

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import feedparser
import requests


# Model identifier passed to from_pretrained below.
# Alternative noted by the author: "tabularisai/ModernFinBERT".
MODEL_NAME = "ProsusAI/finbert"

# Load the tokenizer and the sequence-classification model for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()  # put the model in evaluation mode before running inference

# Class-index -> label-name mapping taken from the model config.
id2label = model.config.id2label  # e.g. {0: 'negative', 1: 'neutral', 2: 'positive'}

print(f"{MODEL_NAME} loaded")

KeyboardInterrupt: 

**Sanity Inference Check**

In [None]:
# `pipeline` is not among the names imported from transformers elsewhere in
# this notebook, so import it here; otherwise this cell raises NameError.
from transformers import pipeline

# Text-classification pipeline built from the MODEL_NAME checkpoint.
pipe = pipeline("text-classification", model=MODEL_NAME)

**Scoring Function**

In [27]:
def finbert_score(texts, max_length=128):
    """Classify one or more texts with the loaded FinBERT model.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``id2label``.

    Args:
        texts: A single string or any iterable of strings (list, tuple,
            pandas Series, generator, ...).
        max_length: Maximum number of tokens per text; longer inputs are
            truncated. Defaults to 128.

    Returns:
        A ``(labels, probs)`` tuple. ``labels`` is a list with one label name
        per input text, looked up in ``id2label`` from the highest-probability
        class. ``probs`` is a NumPy array of softmax probabilities with one
        row per text and one column per class. For empty input the result is
        ``([], array of shape (0, len(id2label)))``.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        # Materialize so non-list iterables work and emptiness can be checked.
        texts = list(texts)

    if not texts:
        # Nothing to tokenize; return empty results with a consistent shape.
        return [], torch.empty((0, len(id2label))).numpy()

    enc = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # Keep inputs on the same device as the model so this also works when the
    # model has been moved off the CPU.
    enc = enc.to(model.device)
    with torch.no_grad():
        logits = model(**enc).logits
        # .cpu() is a no-op on CPU tensors and required before .numpy() otherwise.
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
    labels = [id2label[int(i)] for i in probs.argmax(axis=1)]
    return labels, probs
