In [37]:
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from utils import score_texts, get_oracle_connection

In [38]:
df = pd.read_csv("preprocessed_data.csv")
DATA_PATH_OUT = "outputs/finbert_sentiment.csv"

ID_COL   = "id"
TEXT_COL = "sentiment_ready_text"


tohle potom dat dolu

In [39]:
df['mentioned_tickers'] = df['mentioned_tickers'].apply(lambda x: x if isinstance(x, list) else [t.strip() for t in str(x).split(',') if t.strip()])
df = df.explode('mentioned_tickers').reset_index(drop=True)

Load dat je tady zatím manuální. Mám blbej connection k Oraclu, takže to dole bude potom official

In [40]:
df.columns

Index(['id', 'sentiment_ready_text', 'type', 'subreddit', 'created_utc',
       'normalized_score', 'mentioned_tickers', 'n_tickers', 'text_length',
       'word_count', 'date', 'hour', 'day_of_week'],
      dtype='object')

Starting off with the first of the models - a base FINBERT.

In [41]:
MODEL_NAME = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Use GPU if available; otherwise fall back to CPU
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

# === Build sentiment pipeline ===
sentiment_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,   # we want probabilities for all labels
    truncation=True,
    max_length=128            # can increase to 256 if your texts are longer
)


Device set to use cpu


Using device: CPU




Tohle se taky schová potom do utils

In [42]:
def score_texts(texts):
    """
    Run FinBERT on a list of texts and return structured sentiment info.

    Parameters
    ----------
    texts : list of str
        The input texts to classify.

    Returns
    -------
    results : list of dict
        Each dict has:
        - sentiment_label : str       ('positive', 'neutral', 'negative')
        - sentiment_score : float     (p_pos - p_neg in [-1, 1])
        - p_pos, p_neu, p_neg : float (probabilities)
    """
    # This calls the HF pipeline once for the whole batch
    outputs = sentiment_pipe(texts)

    results = []
    for out in outputs:
        # out is a list like:
        # [{'label': 'positive', 'score': 0.7}, {'label': 'neutral', 'score': 0.2}, {'label': 'negative', 'score': 0.1}]
        # Normalize label names to lowercase to be robust to variations
        probs = {d["label"].lower(): float(d["score"]) for d in out}

        p_pos = probs.get("positive", 0.0)
        p_neg = probs.get("negative", 0.0)
        p_neu = probs.get("neutral", 0.0)

        # Continuous sentiment score in [-1, 1]
        sentiment_score = p_pos - p_neg

        # Discrete label = argmax over the three probabilities
        sentiment_label = max(probs, key=probs.get)

        results.append({
            "sentiment_label": sentiment_label,
            "sentiment_score": sentiment_score,
            "p_pos": p_pos,
            "p_neu": p_neu,
            "p_neg": p_neg
        })

    return results


In [43]:
# === Batch configuration ===
# Larger batch_size => faster but more memory usage.
# we can change the batch size as we wish...
BATCH_SIZE = 32

sentiment_labels = []
sentiment_scores = []
p_pos_list = []
p_neu_list = []
p_neg_list = []

# Replace NaNs with empty strings so the model doesn't crash
texts = df[TEXT_COL].fillna("").tolist()

n_texts = len(texts)
print("Number of texts to process:", n_texts)

for start in range(0, n_texts, BATCH_SIZE):
    end = start + BATCH_SIZE
    batch = texts[start:end]

    scored = score_texts(batch)

    # Extend our result lists
    for r in scored:
        sentiment_labels.append(r["sentiment_label"])
        sentiment_scores.append(r["sentiment_score"])
        p_pos_list.append(r["p_pos"])
        p_neu_list.append(r["p_neu"])
        p_neg_list.append(r["p_neg"])

    # Optional: simple progress print
    if (start // BATCH_SIZE) % 50 == 0:
        print(f"Processed {min(end, n_texts)} / {n_texts} texts")

# Sanity check: number of scores should match number of rows
print("Scores computed:", len(sentiment_labels), "rows in df:", len(df))


Number of texts to process: 29624
Processed 32 / 29624 texts
Processed 1632 / 29624 texts
Processed 3232 / 29624 texts
Processed 4832 / 29624 texts
Processed 6432 / 29624 texts
Processed 8032 / 29624 texts
Processed 9632 / 29624 texts
Processed 11232 / 29624 texts
Processed 12832 / 29624 texts
Processed 14432 / 29624 texts
Processed 16032 / 29624 texts
Processed 17632 / 29624 texts
Processed 19232 / 29624 texts
Processed 20832 / 29624 texts
Processed 22432 / 29624 texts
Processed 24032 / 29624 texts
Processed 25632 / 29624 texts
Processed 27232 / 29624 texts
Processed 28832 / 29624 texts
Scores computed: 29624 rows in df: 29624


In [44]:
df.head()

Unnamed: 0,id,sentiment_ready_text,type,subreddit,created_utc,normalized_score,mentioned_tickers,n_tickers,text_length,word_count,date,hour,day_of_week
0,1hqr72t,What is a good ROE? (Return on Equity) - Quick...,post,investing,2024-12-31 23:56:46,0.019045,TOP,1,1072,171,2024-12-31,23,1
1,1hqqmq2,Soundhound ($SOUN) now has a market cap 75x it...,post,stocks,2024-12-31 23:24:29,0.039443,SOUN,1,476,82,2024-12-31,23,1
2,1hqqgv5,Is $SMCI beginning to be a play?. P/E is looki...,post,stocks,2024-12-31 23:15:10,0.011074,SMCI,1,419,75,2024-12-31,23,1
3,1hqpxkb,Any thoughts on RDDT feedback appreciated.,post,investing,2024-12-31 22:45:36,0.018076,RDDT,1,42,6,2024-12-31,22,1
4,1hqpw3r,Is D.R.Horton (DHI) good value ?. The stock is...,post,ValueInvesting,2024-12-31 22:43:25,0.051121,DHI,1,391,75,2024-12-31,22,1


Make it for every ticker in the data

Do teď manual output, dokud nebudu mit fixed oracle

In [None]:
df.to_csv(DATA_PATH_OUT, index=False)
print("Saved FinBERT sentiment data to:", DATA_PATH_OUT)