In [16]:
from dotenv import load_dotenv
from utils import get_oracle_connection
import sys
print(sys.version)
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
Torch version: 2.7.1+cu118
CUDA available: True


In [17]:
conn = get_oracle_connection()

query = """
SELECT
    ID,
    DBMS_LOB.SUBSTR(SENTIMENT_READY_TEXT, 20000, 1) as SENTIMENT_READY_TEXT,
    TYPE,
    SUBREDDIT,
    CREATED_UTC,
    NORMALIZED_UPVOTES,
    DBMS_LOB.SUBSTR(MENTIONED_TICKERS, 100, 1) as MENTIONED_TICKERS,
    N_TICKERS,
    TEXT_LENGTH,
    WORD_COUNT,
    DATE_COL,
    HOUR,
    DAY_OF_WEEK
FROM preprocessed_data
FETCH FIRST 1000 ROWS ONLY
"""

df = pd.read_sql_query(query, conn)
conn.close()

df.columns = df.columns.str.lower()
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
if 'date_col' in df.columns:
    df['date'] = pd.to_datetime(df['date_col'])
    df.drop(columns=['date_col'], inplace=True)

print(f"Loaded {len(df)} rows")
df.head()

Oracle connection successful!


  df = pd.read_sql_query(query, conn)


Loaded 1000 rows


Unnamed: 0,id,sentiment_ready_text,type,subreddit,created_utc,normalized_upvotes,mentioned_tickers,n_tickers,text_length,word_count,hour,day_of_week,date
0,m48hiu4,I'd considering splitting it across two banks ...,comment,investing,2024-12-28 19:22:19,0.018399,UBS,1,241,47,19,5,2024-12-28
1,m48hedi,I had the opportunity for the IPO price thing ...,comment,stocks,2024-12-28 19:21:38,0.011074,AMD,1,75,15,19,5,2024-12-28
2,m48heet,"Looks like It's down, but the 6 month chart sh...",comment,investing,2024-12-28 19:21:38,0.018076,KO,1,286,50,19,5,2024-12-28
3,m48gxg1,Just look at 52 week lows. Stock screeners wil...,comment,ValueInvesting,2024-12-28 19:19:04,0.056502,HSY,1,278,51,19,5,2024-12-28
4,m48guco,Are you looking to index? Wealthfront has a ne...,comment,investing,2024-12-28 19:18:36,0.018076,UBS,1,82,15,19,5,2024-12-28


In [18]:
"""
HYBRID SENTIMENT PIPELINE (Notebook B)

Assumes:
- df is ALREADY loaded in a previous cell from Oracle
- df columns are already lowercase

Pipeline:
1) Twitter RoBERTa sentiment per (text, ticker) for ALL rows.
2) Local instruction LLM (Qwen) only for uncertain cases where
   tw_score ∈ [-0.1, +0.1].
3) Produces:
   tw_score, tw_label
   llm_sentiment_score, llm_sentiment_label
"""

import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM
)
from tqdm import tqdm
import ast
import time
import numpy as np
import json
import re

# ============================================================================
# CONFIG
# ============================================================================

OUTPUT_FILE = "sentiment_hybrid_twitter_llm.csv"

TEXT_COL   = "sentiment_ready_text"
TICKER_COL = "mentioned_tickers"

TW_MODEL_NAME  = "cardiffnlp/twitter-roberta-base-sentiment-latest"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

BATCH_SIZE_CLS = 64
BATCH_SIZE_LLM = 8

MAX_INPUT_TOKENS = 512
MAX_NEW_TOKENS   = 64
USE_FP16_LLM = True

print("="*80)
print("HYBRID SENTIMENT (Notebook B)")
print("="*80)

# ============================================================================
# DEVICE
# ============================================================================

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ============================================================================
# BASIC PREP (df already exists)
# ============================================================================

def parse_tickers(x):
    if pd.isna(x) or x == "":
        return []
    try:
        return [str(t).strip() for t in ast.literal_eval(str(x)) if t]
    except:
        return [t.strip() for t in str(x).split(",") if t.strip()]

df["tickers_list"] = df[TICKER_COL].apply(parse_tickers)
df["n_tickers"] = df["tickers_list"].apply(len)

df = df[df["n_tickers"] > 0].copy()

# Explode to per-ticker rows
df = df.explode("tickers_list").reset_index(drop=True)
df = df.rename(columns={"tickers_list": "ticker"})

print(f"Per-ticker rows: {len(df):,}")

# ============================================================================
# LOAD TWITTER ROBERTA
# ============================================================================

tw_tokenizer = AutoTokenizer.from_pretrained(TW_MODEL_NAME)
tw_model     = AutoModelForSequenceClassification.from_pretrained(TW_MODEL_NAME)
tw_model.to(device)
tw_model.eval()

# ============================================================================
# TWITTER ROBERTA FUNCTION
# ============================================================================

def twitter_batch(texts, tickers):
    texts = [f"{tic}: {txt}" for txt, tic in zip(texts, tickers)]

    enc = tw_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        probs = torch.softmax(tw_model(**enc).logits, dim=-1).cpu().numpy()

    id2label = tw_model.config.id2label
    results = []

    for prob in probs:
        p_pos = prob[list(id2label.values()).index("positive")]
        p_neg = prob[list(id2label.values()).index("negative")]

        score = p_pos - p_neg

        if score > 0.15:
            label = "positive"
        elif score < -0.15:
            label = "negative"
        else:
            label = "neutral"

        results.append((score, label))

    return results

# ============================================================================
# RUN TWITTER ROBERTA
# ============================================================================

tw_scores = []
tw_labels = []

texts = df[TEXT_COL].fillna("").tolist()
tickers = df["ticker"].tolist()

for i in tqdm(range(0, len(texts), BATCH_SIZE_CLS), desc="Twitter RoBERTa"):
    batch_texts = texts[i:i+BATCH_SIZE_CLS]
    batch_tickers = tickers[i:i+BATCH_SIZE_CLS]
    res = twitter_batch(batch_texts, batch_tickers)
    for s,l in res:
        tw_scores.append(s)
        tw_labels.append(l)

df["tw_score"] = tw_scores
df["tw_label"] = tw_labels

# ============================================================================
# SELECT UNCERTAIN FOR LLM (tw_score between -0.1 and 0.1)
# ============================================================================

df_llm = df[df["tw_score"].between(-0.1, 0.1)].copy()
print(f"Sent to LLM: {len(df_llm):,} rows")

# ============================================================================
# LOAD QWEN LLM
# ============================================================================

llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=torch.float16 if device=="cuda" else None,
    device_map="auto" if device=="cuda" else None
)

llm_model.eval()

if llm_tokenizer.pad_token is None:
    llm_tokenizer.pad_token = llm_tokenizer.eos_token

# ============================================================================
# FEW-SHOT PROMPT
# ============================================================================

def build_prompt(text, ticker):
    return f"""
You are a financial sentiment analyst.

Examples:
Comment: NVDA is a disaster, terrible performance.
Result: {{"ticker":"NVDA","label":"negative","score":-0.8}}

Comment: TSLA looks okay, maybe sideways.
Result: {{"ticker":"TSLA","label":"neutral","score":0.0}}

Comment: AAPL to the moon!!!
Result: {{"ticker":"AAPL","label":"positive","score":0.8}}

Now analyze:

Ticker: {ticker}
Comment: {text}

Return ONLY valid JSON.
"""

def parse_llm(text):
    m = re.search(r"\{.*\}", text)
    if not m:
        return "neutral", 0.0
    try:
        d = json.loads(m.group())
        return d.get("label","neutral"), float(d.get("score",0.0))
    except:
        return "neutral", 0.0

def run_llm_batch(texts, tickers):
    prompts = [build_prompt(t, tic) for t, tic in zip(texts, tickers)]

    inputs = llm_tokenizer(prompts, return_tensors="pt",
                           padding=True, truncation=True,
                           max_length=MAX_INPUT_TOKENS).to(device)

    outputs = llm_model.generate(**inputs, max_new_tokens=MAX_NEW_TOKENS)
    decoded = llm_tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return [parse_llm(o) for o in decoded]

# ============================================================================
# RUN LLM
# ============================================================================

llm_labels = []
llm_scores = []

texts_llm = df_llm[TEXT_COL].tolist()
tickers_llm = df_llm["ticker"].tolist()

for i in tqdm(range(0, len(texts_llm), BATCH_SIZE_LLM), desc="LLM"):
    batch_texts = texts_llm[i:i+BATCH_SIZE_LLM]
    batch_tickers = tickers_llm[i:i+BATCH_SIZE_LLM]
    res = run_llm_batch(batch_texts, batch_tickers)

    for l,s in res:
        llm_labels.append(l)
        llm_scores.append(s)

df_llm["llm_sentiment_label"] = llm_labels
df_llm["llm_sentiment_score"] = llm_scores

# Merge back
df["llm_sentiment_label"] = np.nan
df["llm_sentiment_score"] = np.nan

df.loc[df_llm.index, "llm_sentiment_label"] = df_llm["llm_sentiment_label"]
df.loc[df_llm.index, "llm_sentiment_score"] = df_llm["llm_sentiment_score"]

# ============================================================================
# SAVE
# ============================================================================

df.to_csv(OUTPUT_FILE, index=False)
print("✅ Saved to", OUTPUT_FILE)


HYBRID SENTIMENT (Notebook B)
Device: cuda
Per-ticker rows: 1,950


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Twitter RoBERTa: 100%|██████████| 31/31 [04:49<00:00,  9.33s/it]


Sent to LLM: 488 rows


LLM:   0%|          | 0/61 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:   2%|▏         | 1/61 [00:40<40:28, 40.47s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:   3%|▎         | 2/61 [01:17<37:41, 38.33s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:   5%|▍         | 3/61 [01:54<36:35, 37.85s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:   7%|▋         | 4/61 [02:29<34:54, 36.75s/it]A decoder-only architecture is being used, but right-padding 

✅ Saved to sentiment_hybrid_twitter_llm.csv



 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'negative' 'nega

In [4]:
df.head()

Unnamed: 0,id,sentiment_ready_text,type,subreddit,created_utc,normalized_score,mentioned_tickers,n_tickers,text_length,word_count,...,hour,day_of_week,ticker,tw_score,tw_label,tw_p_positive,tw_p_neutral,tw_p_negative,llm_sentiment_label,llm_sentiment_score
0,1hqr72t,What is a good ROE? (Return on Equity) - Quick...,post,investing,2024-12-31 23:56:46,0.019045,TOP,1,1072,171,...,23,1,TOP,0.100538,neutral,0.133485,0.833568,0.032947,,
1,1hqqmq2,Soundhound ($SOUN) now has a market cap 75x it...,post,stocks,2024-12-31 23:24:29,0.039443,SOUN,1,476,82,...,23,1,SOUN,-0.344324,negative,0.083458,0.488761,0.427781,,
2,1hqqgv5,Is $SMCI beginning to be a play?. P/E is looki...,post,stocks,2024-12-31 23:15:10,0.011074,SMCI,1,419,75,...,23,1,SMCI,0.704077,positive,0.727324,0.249428,0.023247,,
3,1hqpxkb,Any thoughts on RDDT feedback appreciated.,post,investing,2024-12-31 22:45:36,0.018076,RDDT,1,42,6,...,22,1,RDDT,0.080196,neutral,0.100133,0.879929,0.019937,,
4,1hqpw3r,Is D.R.Horton (DHI) good value ?. The stock is...,post,ValueInvesting,2024-12-31 22:43:25,0.051121,DHI,1,391,75,...,22,1,DHI,0.659816,positive,0.686326,0.287164,0.02651,,


In [1]:
import sys
print(sys.version)
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
Torch version: 2.7.1+cu118
CUDA available: True
