In [1]:
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
from utils import score_texts, get_oracle_connection

  from .autonotebook import tqdm as notebook_tqdm
INFO:utils:spaCy model loaded successfully


In [3]:
df = pd.read_csv("preprocessed_data.csv")

Load dat je tady zatím manuální. Mám blbej connection k Oraclu, takže to dole bude potom official

In [None]:

# === Load data ===
# === Config ===
DATA_PATH_OUT = "outputs/finbert_sentiment.csv"

ID_COL   = "id"
TEXT_COL = "sentiment_ready_text"

try:
    import oracledb
    ORACLE_AVAILABLE = True
except ImportError:
    ORACLE_AVAILABLE = False
    print("oracledb not installed. Install with: pip install oracledb")

# === Load data from Oracle preprocessed_data table ===
print("Loading preprocessed data from Oracle...")

df = pd.DataFrame()

if ORACLE_AVAILABLE:
    conn = get_oracle_connection()   # uses your .env: db-username, db-password, db-dsn

    if conn:
        try:
            query_preprocessed = """
                SELECT
                    id,
                    sentiment_ready_text,
                    type,
                    subreddit,
                    created_utc,
                    normalized_score,
                    mentioned_tickers,
                    n_tickers,
                    text_length,
                    word_count,
                    date_col,
                    hour,
                    day_of_week
                FROM preprocessed_data
            """

            df = pd.read_sql_query(query_preprocessed, conn)

            # close connection as soon as possible
            conn.close()

            print("Preprocessed rows loaded:", len(df))
            print("Columns:", list(df.columns))

            # Optional: convert types back to something nice for modelling
            # created_utc was exported as epoch seconds in your preprocessing script
            if "created_utc" in df.columns:
                df["created_utc"] = pd.to_datetime(df["created_utc"], unit="s", errors="coerce")

            # date_col is the Oracle name; rename back to 'date' if you prefer
            if "date_col" in df.columns:
                df.rename(columns={"date_col": "date"}, inplace=True)

        except Exception as e:
            print("Error while loading from Oracle:", e)
            try:
                conn.close()
            except:
                pass
    else:
        print("Could not obtain Oracle connection (check get_oracle_connection / .env).")
else:
    print("Oracle not available in this environment.")

print("Input shape:", df.shape)
df.head()


Loading preprocessed data from Oracle...
Error connecting to Oracle DB: DPY-6005: cannot connect to database (CONNECTION_ID=Pm2LZyv+xAu4LY2i1P6hGw==).
DPY-6000: Listener refused connection. (Similar to ORA-12506)
Could not obtain Oracle connection (check get_oracle_connection / .env).
Input shape: (0, 0)


In [29]:
df.columns

Index(['id', 'sentiment_ready_text', 'type', 'subreddit', 'created_utc',
       'normalized_score', 'mentioned_tickers', 'n_tickers', 'text_length',
       'word_count', 'date', 'hour', 'day_of_week'],
      dtype='object')

Starting off with the first of the models - a base FINBERT.

In [23]:
MODEL_NAME = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Use GPU if available; otherwise fall back to CPU
device = 0 if torch.cuda.is_available() else -1
print("Using device:", "GPU" if device == 0 else "CPU")

# === Build sentiment pipeline ===
sentiment_pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_all_scores=True,   # we want probabilities for all labels
    truncation=True,
    max_length=128            # can increase to 256 if your texts are longer
)


Device set to use cpu


Using device: CPU


Tohle se taky schová potom do utils

In [27]:
def score_texts(texts):
    """
    Run FinBERT on a list of texts and return structured sentiment info.

    Parameters
    ----------
    texts : list of str
        The input texts to classify.

    Returns
    -------
    results : list of dict
        Each dict has:
        - sentiment_label : str       ('positive', 'neutral', 'negative')
        - sentiment_score : float     (p_pos - p_neg in [-1, 1])
        - p_pos, p_neu, p_neg : float (probabilities)
    """
    # This calls the HF pipeline once for the whole batch
    outputs = sentiment_pipe(texts)

    results = []
    for out in outputs:
        # out is a list like:
        # [{'label': 'positive', 'score': 0.7}, {'label': 'neutral', 'score': 0.2}, {'label': 'negative', 'score': 0.1}]
        # Normalize label names to lowercase to be robust to variations
        probs = {d["label"].lower(): float(d["score"]) for d in out}

        p_pos = probs.get("positive", 0.0)
        p_neg = probs.get("negative", 0.0)
        p_neu = probs.get("neutral", 0.0)

        # Continuous sentiment score in [-1, 1]
        sentiment_score = p_pos - p_neg

        # Discrete label = argmax over the three probabilities
        sentiment_label = max(probs, key=probs.get)

        results.append({
            "sentiment_label": sentiment_label,
            "sentiment_score": sentiment_score,
            "p_pos": p_pos,
            "p_neu": p_neu,
            "p_neg": p_neg
        })

    return results


In [None]:
# === Batch configuration ===
# Larger batch_size => faster but more memory usage.
# we can change the batch size as we wish...
BATCH_SIZE = 32

sentiment_labels = []
sentiment_scores = []
p_pos_list = []
p_neu_list = []
p_neg_list = []

# Replace NaNs with empty strings so the model doesn't crash
texts = df[TEXT_COL].fillna("").tolist()

n_texts = len(texts)
print("Number of texts to process:", n_texts)

for start in range(0, n_texts, BATCH_SIZE):
    end = start + BATCH_SIZE
    batch = texts[start:end]

    scored = score_texts(batch)

    # Extend our result lists
    for r in scored:
        sentiment_labels.append(r["sentiment_label"])
        sentiment_scores.append(r["sentiment_score"])
        p_pos_list.append(r["p_pos"])
        p_neu_list.append(r["p_neu"])
        p_neg_list.append(r["p_neg"])

    # Optional: simple progress print
    if (start // BATCH_SIZE) % 50 == 0:
        print(f"Processed {min(end, n_texts)} / {n_texts} texts")

# Sanity check: number of scores should match number of rows
print("Scores computed:", len(sentiment_labels), "rows in df:", len(df))


Number of texts to process: 29624
Processed 32 / 29624 texts
Processed 1632 / 29624 texts
Processed 3232 / 29624 texts
Processed 4832 / 29624 texts
Processed 6432 / 29624 texts
Processed 8032 / 29624 texts
Processed 9632 / 29624 texts
Processed 11232 / 29624 texts
Processed 12832 / 29624 texts
Processed 14432 / 29624 texts
Processed 16032 / 29624 texts
Processed 17632 / 29624 texts
Processed 19232 / 29624 texts
Processed 20832 / 29624 texts
Processed 22432 / 29624 texts
Processed 24032 / 29624 texts
Processed 25632 / 29624 texts
Processed 27232 / 29624 texts
Processed 28832 / 29624 texts
Scores computed: 29624 rows in df: 29624


Make it for every ticker in the data

In [None]:
df['mentioned_tickers'] = df['mentioned_tickers'].apply(lambda x: x if isinstance(x, list) else [t.strip() for t in str(x).split(',') if t.strip()])
df = df.explode('mentioned_tickers').reset_index(drop=True)

In [None]:
export_df = df.copy()
conn = get_oracle_connection()
if conn:
    cursor = conn.cursor()

    # Truncate existing data (full refresh)
    cursor.execute("TRUNCATE TABLE preprocessed_data")
    print("Existing data in 'preprocessed_data' truncated")

    # We now use 'date' directly (no renaming to date_col)
    db_export_df = export_df.copy()

    # Insert statement including sentiment columns
    insert_sql = """
    INSERT INTO preprocessed_data (
        id,
        sentiment_ready_text,
        type,
        subreddit,
        created_utc,
        normalized_score,
        mentioned_tickers,
        n_tickers,
        text_length,
        word_count,
        date,
        hour,
        day_of_week,
        finbert_label,
        finbert_score,
        finbert_p_pos,
        finbert_p_neu,
        finbert_p_neg
    ) VALUES (
        :1, :2, :3, :4, :5, :6, :7, :8, :9,
        :10, :11, :12, :13, :14, :15, :16, :17, :18
    )
    """

    # Prepare data for batch insert
    insert_data = []
    for _, row in db_export_df.iterrows():
        insert_data.append((
            # base columns (you specified these names)
            str(row["id"]),
            str(row["sentiment_ready_text"]),
            str(row["type"]),
            str(row["subreddit"]),
            # created_utc: stored as epoch seconds (NUMBER) like your preprocessing step
            row["created_utc"].timestamp() if pd.notna(row.get("created_utc")) else None,
            float(row["normalized_score"]) if pd.notna(row.get("normalized_score")) else None,
            str(row["mentioned_tickers"]) if pd.notna(row.get("mentioned_tickers")) else "",
            int(row["n_tickers"]) if pd.notna(row.get("n_tickers")) else 0,
            int(row["text_length"]) if pd.notna(row.get("text_length")) else 0,
            int(row["word_count"]) if pd.notna(row.get("word_count")) else 0,
            row.get("date") if pd.notna(row.get("date")) else None,
            int(row["hour"]) if pd.notna(row.get("hour")) else None,
            int(row["day_of_week"]) if pd.notna(row.get("day_of_week")) else None,

            # FinBERT sentiment columns
            str(row["finbert_label"]),
            float(row["finbert_score"]) if pd.notna(row.get("finbert_score")) else None,
            float(row["finbert_p_pos"]) if pd.notna(row.get("finbert_p_pos")) else None,
            float(row["finbert_p_neu"]) if pd.notna(row.get("finbert_p_neu")) else None,
            float(row["finbert_p_neg"]) if pd.notna(row.get("finbert_p_neg")) else None,
        ))

    # Execute batch insert
    cursor.executemany(insert_sql, insert_data)
    conn.commit()

    print(f"Successfully exported {len(insert_data)} rows to Oracle table 'preprocessed_data'")

    cursor.close()
    conn.close()
    print("Database export complete!")

else:
    print("Failed to connect to Oracle database")
    try:
        if conn:
            conn.close()
    except:
        pass

print("\nStep 11 complete: Database export finished")

Do teď manual output, dokud nebudu mit fixed oracle

In [None]:
df.to_csv(DATA_PATH_OUT, index=False)
print("Saved FinBERT sentiment data to:", DATA_PATH_OUT)