In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from utils import score_texts

In [None]:
# === Config ===
# Input / output CSV locations (relative paths).
DATA_PATH_IN  = "outputs/preprocessed_data.csv"
DATA_PATH_OUT = "outputs/finbert_sentiment.csv"

# Column names expected in the input file.
# NOTE(review): ID_COL is defined here but not referenced in the visible cells.
ID_COL        = "id"
TEXT_COL      = "sentiment_ready_text"

# === Load data ===
df = pd.read_csv(DATA_PATH_IN)

# Fail fast with a readable message if the text column is missing, instead of
# hitting a bare KeyError later, after the model has already been loaded.
if TEXT_COL not in df.columns:
    raise KeyError(
        f"Column {TEXT_COL!r} not found in {DATA_PATH_IN}; "
        f"available columns: {list(df.columns)}"
    )

print("Input shape:", df.shape)
df.head()


In [None]:
# === Load FinBERT model ===
MODEL_NAME = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Pipeline device index: 0 selects the first CUDA GPU, -1 selects the CPU.
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

# === Build sentiment pipeline ===
sentiment_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    # `top_k=None` returns the score of every label for each input. It replaces
    # the deprecated `return_all_scores=True` flag.
    top_k=None,
    truncation=True,
    max_length=128            # can increase to 256 if your texts are longer
)


In [None]:
def score_texts(texts, pipe=None):
    """
    Run a sentiment pipeline on a batch of texts and return structured results.

    NOTE(review): the imports cell also does `from utils import score_texts`;
    once this cell runs, this definition replaces that imported binding.

    Parameters
    ----------
    texts : str or iterable of str
        The input texts to classify. A single string is treated as a
        one-element batch; any other iterable is materialised into a list.
    pipe : callable, optional
        Callable that takes a list of texts and returns, for each text, a list
        of ``{"label": ..., "score": ...}`` dicts (one per class). Defaults to
        the notebook-level ``sentiment_pipe``.

    Returns
    -------
    results : list of dict
        One dict per input text, in input order, with keys:
        - sentiment_label : str   (lower-cased label with the highest score)
        - sentiment_score : float (p_pos - p_neg)
        - p_pos, p_neu, p_neg : float (scores of the 'positive', 'neutral' and
          'negative' labels; 0.0 if a label is absent from the output)
        An empty input yields an empty list without calling the pipeline.
    """
    # A bare string must be wrapped: list("abc") would split it into characters.
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to score: skip the model call entirely.
    if not texts:
        return []

    # Resolve the default lazily so the global is only needed when it is used.
    if pipe is None:
        pipe = sentiment_pipe

    # Single pipeline call for the whole list of texts.
    outputs = pipe(texts)

    results = []
    for label_scores in outputs:
        # label_scores looks like:
        # [{'label': 'positive', 'score': 0.7}, {'label': 'neutral', 'score': 0.2}, ...]
        # Lower-case the labels so lookups do not depend on the model's casing.
        probs = {d["label"].lower(): float(d["score"]) for d in label_scores}

        p_pos = probs.get("positive", 0.0)
        p_neg = probs.get("negative", 0.0)
        p_neu = probs.get("neutral", 0.0)

        results.append({
            # Discrete label = argmax over all returned labels.
            "sentiment_label": max(probs, key=probs.get),
            # Continuous score: positive minus negative probability.
            "sentiment_score": p_pos - p_neg,
            "p_pos": p_pos,
            "p_neu": p_neu,
            "p_neg": p_neg
        })

    return results

In [None]:
# === Batch configuration ===
# Number of texts handed to score_texts per call; adjust if needed.
BATCH_SIZE = 32

sentiment_labels = []
sentiment_scores = []
p_pos_list = []
p_neu_list = []
p_neg_list = []

# Replace NA with empty strings and coerce remaining values to str so that
# only strings are passed on to score_texts.
texts = df[TEXT_COL].fillna("").astype(str).tolist()

n_texts = len(texts)
print("Number of texts to process:", n_texts)

for start in range(0, n_texts, BATCH_SIZE):
    end = start + BATCH_SIZE
    batch = texts[start:end]

    scored = score_texts(batch)

    # Extend our result lists (one entry per text, in input order)
    for r in scored:
        sentiment_labels.append(r["sentiment_label"])
        sentiment_scores.append(r["sentiment_score"])
        p_pos_list.append(r["p_pos"])
        p_neu_list.append(r["p_neu"])
        p_neg_list.append(r["p_neg"])

    # progress print: every 50th batch
    if (start // BATCH_SIZE) % 50 == 0:
        print(f"Processed {min(end, n_texts)} / {n_texts} texts")

print("Scores computed:", len(sentiment_labels), "rows in df:", len(df))

# Refuse to attach misaligned results: every row must have exactly one score.
if len(sentiment_labels) != len(df):
    raise ValueError(
        f"Got {len(sentiment_labels)} sentiment results for {len(df)} rows"
    )

# Attach the results to df; without this the scores live only in the lists
# above and never become columns of the DataFrame.
df["sentiment_label"] = sentiment_labels
df["sentiment_score"] = sentiment_scores
df["p_pos"] = p_pos_list
df["p_neu"] = p_neu_list
df["p_neg"] = p_neg_list


In [None]:
# Write df to DATA_PATH_OUT as CSV, leaving out the row index, then report the path.
df.to_csv(path_or_buf=DATA_PATH_OUT, index=False)
print("Saved FinBERT sentiment data to:", DATA_PATH_OUT)
