Imports & Setup

In [None]:
import os
import json
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Download the sentence-tokenizer data if necessary.
# Older NLTK releases load the "punkt" resource in sent_tokenize; newer ones
# look for "punkt_tab" instead (NOTE(review): switch happened around NLTK
# 3.8.2 -- confirm against the installed version). Requesting both keeps the
# notebook runnable on either; a resource unknown to the installed NLTK index
# only prints a message and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')


Load Frozen FinBERT

In [None]:
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# eval() switches off training-only behaviour such as dropout; it does NOT
# disable gradient tracking -- that is done with torch.no_grad() at call time.
model.eval()

# Build the class-index -> short-label map from the checkpoint's own config
# instead of hard-coding an index order. A hard-coded order silently
# mislabels every prediction if it disagrees with the checkpoint.
# NOTE(review): the published ProsusAI/finbert config is believed to list
# positive/negative/neutral as ids 0/1/2 -- verify against the model card.
_SHORT_LABELS = {"positive": "POS", "negative": "NEG", "neutral": "NEU"}
# A KeyError here means the checkpoint uses label names other than
# positive/negative/neutral and _SHORT_LABELS must be extended deliberately.
label_map = {
    int(idx): _SHORT_LABELS[str(name).lower()]
    for idx, name in model.config.id2label.items()
}

def frozen_sentiment_predict(text: str):
    """Classify one piece of text with the frozen model.

    The text is tokenized (truncated to at most 512 tokens), passed through
    ``model`` with gradient tracking disabled, and the logits are converted
    to softmax probabilities.

    Args:
        text: The string to classify.

    Returns:
        A ``(label, confidence)`` tuple: ``label`` is the ``label_map`` entry
        for the highest-probability class and ``confidence`` is that class's
        probability as a Python float.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
        class_probs = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()
    top_idx = int(class_probs.argmax())
    top_prob = float(class_probs[top_idx])
    return label_map[top_idx], top_prob


Chunk & Predict for All Articles

In [None]:
# Imported once for the whole cell rather than on every loop iteration.
from nltk.tokenize import sent_tokenize

SAMPLE_IN = "../data/sampled/10k_sample.jsonl"
CHUNK_OUT = "../data/sampled/chunk_sentiments_frozen.jsonl"
SENTS_PER_CHUNK = 3  # number of consecutive sentences joined into one model input

# Ensure the folder we actually write to exists (derived from CHUNK_OUT so the
# two can never drift apart).
os.makedirs(os.path.dirname(CHUNK_OUT), exist_ok=True)


def _chunk_sentences(text, size):
    """Split ``text`` into sentences and join every ``size`` consecutive ones.

    Returns the list of non-empty, whitespace-stripped chunks in document order.
    """
    sents = sent_tokenize(text)
    joined = (" ".join(sents[i : i + size]).strip() for i in range(0, len(sents), size))
    return [chunk for chunk in joined if chunk]


# Count the records up front so the progress bar reflects the real file size
# instead of an assumed 10,000 lines.
with open(SAMPLE_IN, "r", encoding="utf-8", errors="ignore") as fin:
    n_articles = sum(1 for line in fin if line.strip())

with open(SAMPLE_IN, "r", encoding="utf-8", errors="ignore") as fin, \
     open(CHUNK_OUT, "w", encoding="utf-8") as fout:
    # Blank lines are skipped: json.loads would raise on them.
    non_blank = (line for line in fin if line.strip())
    for line in tqdm(non_blank, total=n_articles):
        art = json.loads(line)
        art_id = art.get("id", None)
        # `or ""` also covers keys that are present but null in the JSON,
        # which .get(key, "") would return as None and break concatenation.
        headline = art.get("headline_summary") or ""
        body = art.get("body") or ""
        full_text = headline + " " + body

        # Break into 3-sentence chunks
        chunks = _chunk_sentences(full_text, SENTS_PER_CHUNK)

        # Predict sentiment for each chunk (on CPU)
        chunk_results = []
        for chunk in chunks:
            lab, conf = frozen_sentiment_predict(chunk)
            chunk_results.append({"text": chunk, "label": lab, "confidence": conf})

        # One JSON object per article, one article per line.
        fout.write(json.dumps({
            "article_id": art_id,
            "chunks": chunk_results
        }) + "\n")

print("✔ Saved chunk‐level sentiments to", CHUNK_OUT)


Aggregate Chunk‐Level to Article‐Level

In [None]:
def aggregate_article_sentiment(chunk_results):
    """Collapse chunk-level predictions into one article-level prediction.

    The article label is the label carried by the most chunks. When several
    labels share the highest count (two-way or three-way tie), the one whose
    chunks have the highest mean confidence wins; if that is also equal, the
    label seen first in ``chunk_results`` wins.

    Args:
        chunk_results: Iterable of dicts, each with a ``"label"`` and a
            ``"confidence"`` entry.

    Returns:
        ``(article_label, article_conf)`` where ``article_conf`` is the mean
        confidence (Python float) of the chunks carrying ``article_label``.
        For an empty ``chunk_results`` (an article that produced no chunks)
        returns ``(None, 0.0)`` rather than raising.
    """
    if not chunk_results:
        return None, 0.0

    # Group confidences by label; dict insertion order keeps first-seen order,
    # which is what breaks any remaining tie below.
    conf_by_label = {}
    for c in chunk_results:
        conf_by_label.setdefault(c["label"], []).append(c["confidence"])

    # Consider EVERY label that reaches the top count, not just the first two.
    max_count = max(len(confs) for confs in conf_by_label.values())
    tied = [lab for lab, confs in conf_by_label.items() if len(confs) == max_count]

    if len(tied) == 1:
        article_label = tied[0]
    else:
        avg_conf = {lab: np.mean(conf_by_label[lab]) for lab in tied}
        article_label = max(avg_conf, key=avg_conf.get)

    article_conf = float(np.mean(conf_by_label[article_label]))
    return article_label, article_conf

AGG_OUT = "../data/sampled/article_sentiments_frozen.csv"

# Read the chunk-level JSONL back in, reduce each article's chunks to a single
# (label, confidence) pair, and collect one output row per article.
rows = []
with open("../data/sampled/chunk_sentiments_frozen.jsonl", "r", encoding="utf-8") as fin:
    for rec in map(json.loads, fin):
        art_id = rec["article_id"]
        art_lbl, art_conf = aggregate_article_sentiment(rec["chunks"])
        rows.append(dict(article_id=art_id, label=art_lbl, confidence=art_conf))

# Write all rows in one go as a header-bearing CSV without the index column.
article_frame = pd.DataFrame(rows)
article_frame.to_csv(AGG_OUT, index=False, encoding="utf-8")
print("✔ Saved article‐level sentiments to", AGG_OUT)


Quick Sanity Check

# Re-load the article-level CSV from the location the aggregation step wrote
# it to ("../data/sampled/", not "data/sampled/") and summarise it.
df = pd.read_csv("../data/sampled/article_sentiments_frozen.csv")
print("Label distribution:")
print(df["label"].value_counts())
print("\nConfidence stats:")
print(df["confidence"].describe())
