Imports & Setup

In [None]:
# Cell 2 (code)
import os
import json
import nltk
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import Counter

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Download the Punkt sentence-tokenizer data needed by sent_tokenize.
# Older NLTK releases load the pickled "punkt" resource, while NLTK >= 3.8.2
# loads "punkt_tab" instead; fetching both keeps the notebook working on either.
for _punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(_punkt_resource)


Load Frozen FinBERT

In [None]:
# Cell 3 (code)

# 3.1 Choose device (T4 GPU in Colab if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)  # Should print “cuda” when GPU is attached

# 3.2 Load FinBERT model + tokenizer, move model to the selected device
MODEL_NAME = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.to(device)
# eval() switches off dropout; gradients are disabled separately (torch.no_grad)
# at inference time.
model.eval()

# 3.3 Label mapping for FinBERT
# Build the id → short-code map from the checkpoint's own config instead of
# hard-coding class indices: ProsusAI/finbert orders its classes as
# 0=positive, 1=negative, 2=neutral, so a hand-written {0: NEG, 1: NEU, 2: POS}
# table mislabels every prediction.
_SHORT_LABELS = {"positive": "POS", "negative": "NEG", "neutral": "NEU"}
label_map = {
    int(class_id): _SHORT_LABELS.get(str(class_name).lower(), str(class_name))
    for class_id, class_name in model.config.id2label.items()
}

# 3.4 Helper function: run one‐chunk inference on the selected device
def frozen_sentiment_predict(text: str):
    """
    Classify one piece of text with the frozen FinBERT model.

    Args:
        text: The text chunk to classify. Input longer than 512 tokens is
            truncated, so anything past that limit does not influence the
            prediction.

    Returns:
        A ``(label, confidence)`` tuple: ``label`` is the ``label_map`` entry
        for the highest-probability class and ``confidence`` is that class's
        softmax probability as a Python float.
    """
    # A single sequence needs no padding. Padding every call to 512 tokens
    # only adds masked positions for the model to process and slows inference.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Batch size is always 1 here; index it explicitly rather than squeeze().
    probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
    label_id = int(np.argmax(probs))
    return label_map[label_id], float(probs[label_id])

# Smoke test: classify one short sentence and print the predicted label and its
# softmax confidence, to confirm the model, tokenizer and device are wired up.
test_label, test_conf = frozen_sentiment_predict("The market is great today.")
print(f"Test → {test_label} (confidence={test_conf:.3f})")


Chunk & Predict for All Articles

In [None]:
# Cell 4 (code)

# # 4.1 Ensure the sampled folder exists (should already exist from Cell 1)
# os.makedirs("data/sampled", exist_ok=True)

# 4.2 Define input/output paths for chunk‐level JSONL
# Imported once here rather than on every loop iteration.
from nltk.tokenize import sent_tokenize

INPUT_PATH = "../data/sampled/10k_sample.jsonl"
CHUNK_OUT  = "../data/sampled/chunk_sentiments_frozen.jsonl"

# 4.3 Iterate over each article, split into 3‐sentence chunks, run inference.
# NOTE: CHUNK_OUT is opened in "w" mode, so re-running this cell (e.g. after an
# interrupt) starts the output file from scratch.
with open(INPUT_PATH, "r", encoding="utf-8", errors="ignore") as fin, \
     open(CHUNK_OUT, "w", encoding="utf-8") as fout:

    # total= only sizes the progress bar; it does not limit how many lines are read.
    for line in tqdm(fin, total=10000):
        # Tolerate blank lines (e.g. a trailing newline) instead of crashing
        # in json.loads.
        if not line.strip():
            continue

        art = json.loads(line)
        art_id = art.get("id")
        # `or ""` also covers fields that are present but null in the JSON,
        # which would otherwise raise TypeError on concatenation.
        headline = art.get("headline_summary") or ""
        body = art.get("body") or ""
        full_text = headline + " " + body

        # 4.3.1 Sentence‐split (CPU)
        sents = sent_tokenize(full_text)

        # 4.3.2 Build 3‐sentence chunks, dropping any that are empty after stripping
        joined = (" ".join(sents[i : i + 3]).strip() for i in range(0, len(sents), 3))
        chunks = [chunk for chunk in joined if chunk]

        # 4.3.3 Predict sentiment for each chunk
        chunk_results = []
        for chunk in chunks:
            lab, conf = frozen_sentiment_predict(chunk)
            chunk_results.append({"text": chunk, "label": lab, "confidence": conf})

        # 4.3.4 Write out one JSONL line per article (an article with no text
        # gets an empty "chunks" list)
        fout.write(json.dumps({
            "article_id": art_id,
            "chunks":     chunk_results
        }) + "\n")

print("✔ Saved chunk‐level sentiments to", CHUNK_OUT)


 13%|█▎        | 1330/10000 [57:37<6:15:35,  2.60s/it] 


KeyboardInterrupt: 

Aggregate Chunk‐Level to Article‐Level

In [None]:
# Cell 5 (code)

def aggregate_article_sentiment(chunk_results, default_label="NEU"):
    """
    Collapse chunk-level predictions into one article-level prediction.

    The article label is the label that occurs most often among the chunks.
    When several labels share the highest count, the one with the higher mean
    confidence wins; if that is also equal, the label seen first wins.

    Args:
        chunk_results: List of dicts, each with a 'label' and a 'confidence' key.
        default_label: Label returned when `chunk_results` is empty (an article
            that produced no chunks).

    Returns:
        A ``(label, confidence)`` tuple, where ``confidence`` is the mean
        confidence of the chunks carrying the winning label, or ``0.0`` when
        there are no chunks at all.
    """
    # An article with no text yields no chunks; return the fallback instead of
    # failing on an empty tally.
    if not chunk_results:
        return default_label, 0.0

    # Group confidences by label (dict insertion order = first-seen order).
    confs_by_label = {}
    for chunk in chunk_results:
        confs_by_label.setdefault(chunk["label"], []).append(chunk["confidence"])

    mean_conf = {
        label: float(np.mean(confs)) for label, confs in confs_by_label.items()
    }

    # Rank by (vote count, mean confidence); max() keeps the first-seen label
    # when both components tie.
    top_label = max(
        confs_by_label,
        key=lambda label: (len(confs_by_label[label]), mean_conf[label]),
    )
    return top_label, mean_conf[top_label]

# 5.1 Where the chunk-level JSONL is read from and the article-level CSV goes
AGG_IN  = "../data/sampled/chunk_sentiments_frozen.jsonl"
AGG_OUT = "../data/sampled/article_sentiments_frozen.csv"

# One output row per JSONL record: the article id plus its aggregated
# label and confidence.
rows = []
with open(AGG_IN, "r", encoding="utf-8") as fin:
    for rec in map(json.loads, fin):
        art_id, art_lbl, art_conf = (
            rec["article_id"],
            *aggregate_article_sentiment(rec["chunks"]),
        )
        rows.append(
            {
                "article_id": art_id,
                "label": art_lbl,
                "confidence": art_conf,
            }
        )

article_frame = pd.DataFrame(rows)
article_frame.to_csv(AGG_OUT, index=False, encoding="utf-8")
print("✔ Saved article‐level sentiments to", AGG_OUT)


In [None]:
# Cell 6 (code)

# Reload the article-level CSV and print a quick sanity summary:
# how many articles got each label, and descriptive statistics of the
# confidence column.
df = pd.read_csv("../data/sampled/article_sentiments_frozen.csv")

print("Label distribution:")
label_counts = df["label"].value_counts()
print(label_counts)

print("\nConfidence stats:")
confidence_summary = df["confidence"].describe()
print(confidence_summary)
