In [None]:
import pandas as pd
from dotenv import load_dotenv
from utils import get_oracle_connection
import sys
print(sys.version)
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM
)
from tqdm import tqdm
import ast
import numpy as np
import json
import re

ModuleNotFoundError: No module named 'utils'

In [None]:
conn = get_oracle_connection()

query = """
SELECT
    ID,
    DBMS_LOB.SUBSTR(SENTIMENT_READY_TEXT, 20000, 1) as SENTIMENT_READY_TEXT,
    TYPE,
    SUBREDDIT,
    CREATED_UTC,
    NORMALIZED_UPVOTES,
    DBMS_LOB.SUBSTR(MENTIONED_TICKERS, 100, 1) as MENTIONED_TICKERS,
    N_TICKERS,
    TEXT_LENGTH,
    WORD_COUNT,
    DATE_COL,
    HOUR,
    DAY_OF_WEEK
FROM preprocessed_data
FETCH FIRST 2000 ROWS ONLY
"""

df = pd.read_sql_query(query, conn)
conn.close()

df.columns = df.columns.str.lower()
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
if 'date_col' in df.columns:
    df['date'] = pd.to_datetime(df['date_col'])
    df.drop(columns=['date_col'], inplace=True)

print(f"Loaded {len(df)} rows")
df.head()

Oracle connection successful!


  df = pd.read_sql_query(query, conn)


Loaded 2000 rows


Unnamed: 0,id,sentiment_ready_text,type,subreddit,created_utc,normalized_upvotes,mentioned_tickers,n_tickers,text_length,word_count,hour,day_of_week,date
0,lqm91bb,POOR THING BUT THIS DOESN t SOUND GOOD ANY IDE...,comment,investing,2024-10-06 14:09:50,0.123091,IBKR,1,1141,215,14,6,2024-10-06
1,lqm90lu,ROKU THEY VE DONE nothing BUT expand market SH...,comment,stocks,2024-10-06 14:09:43,0.050911,ASML,1,483,81,14,6,2024-10-06
2,lqm8vkv,BMBL LOTS OF people KNOW IT AND HAVE USED IT B...,comment,stocks,2024-10-06 14:08:53,0.050911,"ASML,BMBL,MTCH",3,439,81,14,6,2024-10-06
3,lqm8tfg,i VE BEEN BAG holding chinese stocks SINCE EAR...,comment,ValueInvesting,2024-10-06 14:08:31,0.054083,"BABA,PDD",2,107,19,14,6,2024-10-06
4,lqm8pgy,please DON t assume anyone CAN JUST WORK PART ...,comment,financialindependence,2024-10-06 14:07:53,0.278094,CC,1,320,56,14,6,2024-10-06


In [None]:
"""
OPTIMIZED HYBRID SENTIMENT PIPELINE
- RoBERTa for scoring
- Qwen 1.5B for reasoning
- With filtering logic to minimize LLM calls and drop noise
"""

import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM
)
from tqdm import tqdm
import ast
import numpy as np
import json
import re

# ============================================================================
# CONFIG
# ============================================================================

OUTPUT_FILE = "sentiment_hybrid_twitter_llm.csv"

TEXT_COL   = "sentiment_ready_text"
TICKER_COL = "mentioned_tickers"

TW_MODEL_NAME  = "cardiffnlp/twitter-roberta-base-sentiment-latest"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

BATCH_SIZE_CLS = 64
BATCH_SIZE_LLM = 64

# DECOUPLED TOKEN LIMITS (Crucial Fix)
ROBERTA_MAX_TOKENS = 512  # Model limit
LLM_MAX_TOKENS     = 1500 # Expanded context for reasoning
MAX_NEW_TOKENS     = 128  # Output length for JSON

# LOGIC THRESHOLDS
DROP_THRESHOLD = 0.50  # Drop rows below this
LLM_THRESHOLD  = 0.65  # Send rows below but aboive DROP to LLM

print("="*80)
print(f"HYBRID PIPELINE: Drop<{DROP_THRESHOLD} | LLM {DROP_THRESHOLD}-{LLM_THRESHOLD} | RoBERTa >{LLM_THRESHOLD}")
print("="*80)

# ============================================================================
# DEVICE - to use GPU
# ============================================================================

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ============================================================================
# DATA PREPARATION
# ============================================================================

def parse_tickers(x):
    if pd.isna(x) or x == "":
        return []
    try:
        return [str(t).strip() for t in ast.literal_eval(str(x)) if t]
    except:
        return [t.strip() for t in str(x).split(",") if t.strip()]

# Ensure we start clean
if "tickers_list" not in df.columns:
    df["tickers_list"] = df[TICKER_COL].apply(parse_tickers)
    df["n_tickers"] = df["tickers_list"].apply(len)
    df = df[df["n_tickers"] > 0].copy()
    df = df.explode("tickers_list").reset_index(drop=True)
    df = df.rename(columns={"tickers_list": "ticker"})

print(f"Total rows to process: {len(df):,}")

# ============================================================================
# 1.RoBERTa scoring
# ============================================================================

print("\nLoading Twitter-RoBERTa...")
tw_tokenizer = AutoTokenizer.from_pretrained(TW_MODEL_NAME)
tw_model     = AutoModelForSequenceClassification.from_pretrained(TW_MODEL_NAME)
tw_model.to(device)
tw_model.eval()
print("✓ Loaded")

def twitter_batch(texts, tickers):
    inputs = [f"{tic}: {txt}" for txt, tic in zip(texts, tickers)]

    enc = tw_tokenizer(
        inputs,
        padding=True,
        truncation=True,
        max_length=ROBERTA_MAX_TOKENS, # Fixed 512 limit
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        probs = torch.softmax(tw_model(**enc).logits, dim=-1).cpu().numpy()

    results = []
    for prob in probs:
        p_neg, p_neu, p_pos = prob[0], prob[1], prob[2]
        
        # Weighted Score
        score = (p_pos * 1.0) + (p_neu * 0.0) + (p_neg * -1.0)
        
        # Confidence = Max probability
        confidence = max(p_pos, p_neu, p_neg)
        
        if p_pos > p_neg and p_pos > p_neu: label = "positive"
        elif p_neg > p_pos and p_neg > p_neu: label = "negative"
        else: label = "neutral"

        results.append({
            "score": score,
            "label": label,
            "confidence": confidence,
            "p_pos": p_pos, "p_neg": p_neg, "p_neu": p_neu
        })

    return results

# Run RoBERTa
tw_results = []
texts = df[TEXT_COL].fillna("").tolist()
tickers = df["ticker"].tolist()

print("Running RoBERTa...")
for i in tqdm(range(0, len(texts), BATCH_SIZE_CLS), desc="RoBERTa"):
    batch_texts = texts[i:i+BATCH_SIZE_CLS]
    batch_tickers = tickers[i:i+BATCH_SIZE_CLS]
    tw_results.extend(twitter_batch(batch_texts, batch_tickers))

# Attach results
df["tw_score"] = [r["score"] for r in tw_results]
df["tw_label"] = [r["label"] for r in tw_results]
df["tw_confidence"] = [r["confidence"] for r in tw_results]
df["tw_p_pos"] = [r["p_pos"] for r in tw_results]
df["tw_p_neg"] = [r["p_neg"] for r in tw_results]
df["tw_p_neu"] = [r["p_neu"] for r in tw_results]

# ============================================================================
# 2. Filtering RoBERTa results
# ============================================================================

print("\n" + "="*80)
print("APPLYING SMART FILTERS")
print("="*80)

initial_count = len(df)

# drop below DROP_THRESHOLD
df_dropped = df[df["tw_confidence"] < DROP_THRESHOLD]
df = df[df["tw_confidence"] >= DROP_THRESHOLD].copy()

print(f"1. Dropped (Confidence < {DROP_THRESHOLD}): {len(df_dropped):,} rows")

# Set llm subset for reevaluation
df_uncertain = df[df["tw_confidence"] < LLM_THRESHOLD].copy()
print(f"2. Sent to LLM ({DROP_THRESHOLD} <= Conf < {LLM_THRESHOLD}): {len(df_uncertain):,} rows")

# Keep confident above LLM_THRESHOLD
df_confident = df[df["tw_confidence"] >= LLM_THRESHOLD].copy()
print(f"3. Kept RoBERTa (Confidence >= {LLM_THRESHOLD}): {len(df_confident):,} rows")

print(f"   -> New Dataset Size: {len(df):,} (was {initial_count:,})")

if len(df_uncertain) == 0:
    print("No uncertain cases found. Saving...")
    df.to_csv(OUTPUT_FILE, index=False)
    exit()

# ============================================================================
# 3. Qwen LLM for uncertain cases
# ============================================================================

print("\n" + "="*80)
print("LOADING QWEN LLM")
print("="*80)

llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=torch.float16 if device=="cuda" else None,
    device_map="auto" if device=="cuda" else None
)
llm_model.eval()
if llm_tokenizer.pad_token is None: llm_tokenizer.pad_token = llm_tokenizer.eos_token
print("✓ Qwen loaded")

def build_prompt(text, ticker):
    # Increased context window here to 1500
    return f"""You are a financial sentiment expert.
Analyze the sentiment of the text from a comment or post below regarding the ticker: {ticker}.

Return a JSON object with:
1. "reasoning": A brief explanation (max 15 words).
2. "sentiment": "Positive", "Negative", or "Neutral".
3. "score": A float between -1.0 (Very Negative) and 1.0 (Very Positive).

Text: "{text[:1500]}"
Ticker: {ticker}

JSON Response:"""

def parse_llm_response(text):
    try:
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match:
            data = json.loads(match.group(0))
            return data.get("sentiment", "Neutral").lower(), float(data.get("score", 0.0))
    except:
        pass
    
    # Fallback regex
    match = re.search(r"score\":\s*(-?0\.\d+|1\.0|-1\.0|-?\d+)", text)
    if match:
        val = float(match.group(1))
        label = "positive" if val > 0.1 else ("negative" if val < -0.1 else "neutral")
        return label, val
    return "neutral", 0.0

def run_llm_batch(texts, tickers):
    prompts = [build_prompt(t, tic) for t, tic in zip(texts, tickers)]
    inputs = llm_tokenizer(
        prompts, return_tensors="pt", padding=True, truncation=True, 
        max_length=LLM_MAX_TOKENS # 1500 limit for LLM
    ).to(device)

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs, max_new_tokens=MAX_NEW_TOKENS, do_sample=False,
            pad_token_id=llm_tokenizer.pad_token_id
        )
    decoded = llm_tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return [parse_llm_response(d) for d in decoded]

# Run LLM Loop
llm_labels = []
llm_scores = []
texts_llm = df_uncertain[TEXT_COL].tolist()
tickers_llm = df_uncertain["ticker"].tolist()

for i in tqdm(range(0, len(texts_llm), BATCH_SIZE_LLM), desc="LLM Inference"):
    b_texts = texts_llm[i:i+BATCH_SIZE_LLM]
    b_tickers = tickers_llm[i:i+BATCH_SIZE_LLM]
    try:
        results = run_llm_batch(b_texts, b_tickers)
        for l, s in results:
            llm_labels.append(l)
            llm_scores.append(s)
    except Exception as e:
        print(f"Batch {i} error: {e}")
        llm_labels.extend(["neutral"] * len(b_texts))
        llm_scores.extend([0.0] * len(b_texts))

# ============================================================================
# 4. MERGE & SAVE
# ============================================================================

# Default all to RoBERTa
df["final_sentiment_label"] = df["tw_label"]
df["final_sentiment_score"] = df["tw_score"]
df["source_model"] = "RoBERTa"

# Overwrite LLM rows
df.loc[df_uncertain.index, "final_sentiment_label"] = llm_labels
df.loc[df_uncertain.index, "final_sentiment_score"] = llm_scores
df.loc[df_uncertain.index, "source_model"] = "LLM"

print("\n" + "="*80)
print("SAVING")
print("="*80)

df.to_csv(OUTPUT_FILE, index=False)
print(f"✓ Saved to {OUTPUT_FILE}")
print(f"  - RoBERTa rows: {len(df[df['source_model']=='RoBERTa']):,}")
print(f"  - LLM rows:     {len(df[df['source_model']=='LLM']):,}")
print(f"  - Dropped rows: {len(df_dropped):,}")

  from .autonotebook import tqdm as notebook_tqdm


HYBRID PIPELINE: Drop<0.4 | LLM 0.4-0.65 | RoBERTa >0.65
Device: cuda
Total rows to process: 3,993

Loading Twitter-RoBERTa...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✓ Loaded
Running RoBERTa...


RoBERTa: 100%|██████████| 63/63 [01:25<00:00,  1.35s/it]



APPLYING SMART FILTERS
1. Dropped (Confidence < 0.4): 4 rows
2. Sent to LLM (0.4 <= Conf < 0.65): 1,427 rows
3. Kept RoBERTa (Confidence >= 0.65): 2,562 rows
   -> New Dataset Size: 3,989 (was 3,993)

LOADING QWEN LLM


`torch_dtype` is deprecated! Use `dtype` instead!
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


✓ Qwen loaded


LLM Inference:   0%|          | 0/23 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM Inference:   4%|▍         | 1/23 [03:47<1:23:33, 227.88s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM Inference:   9%|▊         | 2/23 [07:23<1:17:15, 220.76s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM Inference:  13%|█▎        | 3/23 [11:20<1:15:58, 227.95s/it]A decoder-only architecture is being used, but right-padding was detected! For corre


SAVING
✓ Saved to sentiment_hybrid_twitter_llm.csv
  - RoBERTa rows: 2,562
  - LLM rows:     1,427
  - Dropped rows: 4


In [14]:
"""
FIXED HYBRID SENTIMENT PIPELINE (RoBERTa + Qwen 1.5B)
"""

import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM
)
from tqdm import tqdm
import ast
import time
import numpy as np
import json
import re

# ============================================================================
# CONFIG
# ============================================================================

OUTPUT_FILE = "sentiment_hybrid_twitter_llm.csv"

TEXT_COL   = "sentiment_ready_text"
TICKER_COL = "mentioned_tickers"

TW_MODEL_NAME  = "cardiffnlp/twitter-roberta-base-sentiment-latest"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

BATCH_SIZE_CLS = 64
BATCH_SIZE_LLM = 64

MAX_INPUT_TOKENS = 512 # cant really change (for roberta)

MAX_NEW_TOKENS   = 128 # Increased for JSON reasoning
CONFIDENCE_THRESHOLD = 0.5 # Confidence below this triggers LLM!!!!

print("="*80)
print("FIXED HYBRID SENTIMENT PIPELINE")
print("="*80)

# ============================================================================
# DEVICE
# ============================================================================

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ============================================================================
# DATA PREPARATION
# ============================================================================
# Processing rows for tickers
def parse_tickers(x):
    if pd.isna(x) or x == "":
        return []
    try:
        return [str(t).strip() for t in ast.literal_eval(str(x)) if t]
    except:
        return [t.strip() for t in str(x).split(",") if t.strip()]

df["tickers_list"] = df[TICKER_COL].apply(parse_tickers)
df["n_tickers"] = df["tickers_list"].apply(len)
df = df[df["n_tickers"] > 0].copy()

# Explode to per-ticker rows
df = df.explode("tickers_list").reset_index(drop=True)
df = df.rename(columns={"tickers_list": "ticker"})

print(f"Per-ticker rows: {len(df):,}")

# ============================================================================
# LOAD TWITTER ROBERTA
# ============================================================================

print("\nLoading Twitter-RoBERTa...")
tw_tokenizer = AutoTokenizer.from_pretrained(TW_MODEL_NAME)
tw_model     = AutoModelForSequenceClassification.from_pretrained(TW_MODEL_NAME)
tw_model.to(device)
tw_model.eval()
print("✓ Loaded")

# ============================================================================
# TWITTER ROBERTA FUNCTION
# ============================================================================

def twitter_batch(texts, tickers):
    # Context-aware input: "TICKER: Text"
    inputs = [f"{tic}: {txt}" for txt, tic in zip(texts, tickers)]

    enc = tw_tokenizer(
        inputs,
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        probs = torch.softmax(tw_model(**enc).logits, dim=-1).cpu().numpy()

    results = []
    for prob in probs:
        # cardiffnlp mapping: 0 -> Negative, 1 -> Neutral, 2 -> Positive
        p_neg = prob[0]
        p_neu = prob[1]
        p_pos = prob[2]
        
        # Weighted Score (-1 to 1)
        score = (p_pos * 1.0) + (p_neu * 0.0) + (p_neg * -1.0)
        
        # Confidence: The highest probability of the three classes
        confidence = max(p_pos, p_neu, p_neg)
        
        # Determine Label
        if p_pos > p_neg and p_pos > p_neu:
            label = "positive"
        elif p_neg > p_pos and p_neg > p_neu:
            label = "negative"
        else:
            label = "neutral"

        results.append({
            "score": score,
            "label": label,
            "confidence": confidence,
            "p_pos": p_pos, 
            "p_neg": p_neg,
            "p_neu": p_neu
        })

    return results

# ============================================================================
# RUN TWITTER ROBERTA
# ============================================================================

print("\n" + "="*80)
print("RUNNING TWITTER ROBERTA")
print("="*80)

# Storage lists
tw_scores = []
tw_labels = []
tw_confs  = []
tw_pos_probs = []
tw_neg_probs = []
tw_neu_probs = []

texts = df[TEXT_COL].fillna("").tolist()
tickers = df["ticker"].tolist()

for i in tqdm(range(0, len(texts), BATCH_SIZE_CLS), desc="Twitter RoBERTa"):
    batch_texts = texts[i:i+BATCH_SIZE_CLS]
    batch_tickers = tickers[i:i+BATCH_SIZE_CLS]
    
    results = twitter_batch(batch_texts, batch_tickers)
    
    # UNPACKING FIXED HERE
    for res in results:
        tw_scores.append(res["score"])
        tw_labels.append(res["label"])
        tw_confs.append(res["confidence"])
        tw_pos_probs.append(res["p_pos"])
        tw_neg_probs.append(res["p_neg"])
        tw_neu_probs.append(res["p_neu"])

# Save all metrics to DataFrame
df["tw_score"] = tw_scores
df["tw_label"] = tw_labels
df["tw_confidence"] = tw_confs
df["tw_prob_pos"] = tw_pos_probs # Useful for visualization
df["tw_prob_neg"] = tw_neg_probs # Useful for visualization
df["tw_prob_neu"] = tw_neu_probs

print(f"\nTwitter-RoBERTa Results:")
print(f"  Label distribution:\n{df['tw_label'].value_counts()}")
print(f"  Avg Confidence: {df['tw_confidence'].mean():.3f}")

# ============================================================================
# SELECT UNCERTAIN FOR LLM
# ============================================================================

print("\n" + "="*80)
print("SELECTING UNCERTAIN CASES FOR LLM")
print("="*80)

# Filter: Send to LLM if RoBERTa isn't at least 65% sure of its answer
df_uncertain = df[df["tw_confidence"] < CONFIDENCE_THRESHOLD].copy()

print(f"Total rows: {len(df)}")
print(f"Confident rows: {len(df) - len(df_uncertain)}")
print(f"Uncertain rows (to LLM): {len(df_uncertain)} ({len(df_uncertain)/len(df)*100:.1f}%)")

if len(df_uncertain) == 0:
    print("No uncertain cases found. Saving immediately.")
    df.to_csv(OUTPUT_FILE, index=False)
    exit()

# ============================================================================
# LOAD QWEN LLM
# ============================================================================

print("\n" + "="*80)
print("LOADING QWEN LLM")
print("="*80)

llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
llm_model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    torch_dtype=torch.float16 if device=="cuda" else None,
    device_map="auto" if device=="cuda" else None
)

llm_model.eval()

if llm_tokenizer.pad_token is None:
    llm_tokenizer.pad_token = llm_tokenizer.eos_token

print("✓ Qwen loaded")

# ============================================================================
# LLM PROMPTING & PARSING (JSON MODE)
# ============================================================================

def build_prompt(text, ticker):
    # Force JSON output for easier parsing with small models
    return f"""You are a financial sentiment expert.
Analyze the sentiment of the text below regarding the ticker: {ticker}.

Return a JSON object with:
1. "reasoning": A brief explanation (max 15 words).
2. "sentiment": "Positive", "Negative", or "Neutral".
3. "score": A float between -1.0 (Very Negative) and 1.0 (Very Positive).

Text: "{text[:300]}"
Ticker: {ticker}

JSON Response:"""

def parse_llm_response(text):
    """Robust parsing of pseudo-JSON output."""
    try:
        # Attempt to find JSON-like structure
        match = re.search(r"\{.*\}", text, re.DOTALL)
        if match:
            json_str = match.group(0)
            data = json.loads(json_str)
            return data.get("sentiment", "Neutral").lower(), float(data.get("score", 0.0))
    except:
        pass
    
    # Fallback: Regex for score if JSON fails
    score_match = re.search(r"score\":\s*(-?0\.\d+|1\.0|-1\.0|-?\d+)", text)
    if score_match:
        val = float(score_match.group(1))
        label = "positive" if val > 0.1 else ("negative" if val < -0.1 else "neutral")
        return label, max(-1.0, min(1.0, val))

    return "neutral", 0.0  # complete failure fallback

# ============================================================================
# RUN LLM
# ============================================================================

print("\n" + "="*80)
print("RUNNING LLM ON UNCERTAIN CASES")
print("="*80)

llm_labels = []
llm_scores = []

texts_llm = df_uncertain[TEXT_COL].tolist()
tickers_llm = df_uncertain["ticker"].tolist()

# Define batch function for LLM
def run_llm_batch(texts, tickers):
    prompts = [build_prompt(t, tic) for t, tic in zip(texts, tickers)]
    
    inputs = llm_tokenizer(
        prompts, 
        return_tensors="pt",
        padding=True, 
        truncation=True,
        max_length=MAX_INPUT_TOKENS
    ).to(device)

    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs, 
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False, # Deterministic
            pad_token_id=llm_tokenizer.pad_token_id
        )
    
    decoded = llm_tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
    return [parse_llm_response(d) for d in decoded]

# Loop
for i in tqdm(range(0, len(texts_llm), BATCH_SIZE_LLM), desc="LLM Inference"):
    b_texts = texts_llm[i:i+BATCH_SIZE_LLM]
    b_tickers = tickers_llm[i:i+BATCH_SIZE_LLM]
    
    try:
        results = run_llm_batch(b_texts, b_tickers)
        for lbl, scr in results:
            llm_labels.append(lbl)
            llm_scores.append(scr)
    except Exception as e:
        print(f"Error in batch {i}: {e}")
        # Error fallback
        for _ in range(len(b_texts)):
            llm_labels.append("neutral")
            llm_scores.append(0.0)

# ============================================================================
# MERGE & SAVE
# ============================================================================

# Initialize columns with RoBERTa values
df["final_sentiment_label"] = df["tw_label"]
df["final_sentiment_score"] = df["tw_score"]
df["source_model"] = "RoBERTa"

# Update with LLM values
df.loc[df_uncertain.index, "final_sentiment_label"] = llm_labels
df.loc[df_uncertain.index, "final_sentiment_score"] = llm_scores
df.loc[df_uncertain.index, "source_model"] = "LLM"

print("\n" + "="*80)
print("SAVING FINAL RESULTS")
print("="*80)

df.to_csv(OUTPUT_FILE, index=False)
print(f"✓ Saved to {OUTPUT_FILE}")
print(f"  - RoBERTa rows: {len(df[df['source_model']=='RoBERTa'])}")
print(f"  - LLM rows:     {len(df[df['source_model']=='LLM'])}")

FIXED HYBRID SENTIMENT PIPELINE
Device: cuda
Per-ticker rows: 10,326

Loading Twitter-RoBERTa...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✓ Loaded

RUNNING TWITTER ROBERTA


Twitter RoBERTa: 100%|██████████| 162/162 [23:51<00:00,  8.84s/it]



Twitter-RoBERTa Results:
  Label distribution:
tw_label
neutral     5461
negative    2546
positive    2319
Name: count, dtype: int64
  Avg Confidence: 0.719

SELECTING UNCERTAIN CASES FOR LLM
Total rows: 10326
Confident rows: 9662
Uncertain rows (to LLM): 664 (6.4%)

LOADING QWEN LLM
✓ Qwen loaded

RUNNING LLM ON UNCERTAIN CASES


LLM Inference:   0%|          | 0/11 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM Inference:   9%|▉         | 1/11 [05:09<51:35, 309.54s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM Inference:  18%|█▊        | 2/11 [09:13<40:39, 271.02s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM Inference:  27%|██▋       | 3/11 [13:44<36:08, 271.12s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM Inference:  36%|███▋      | 4/11 [19:29<35:02, 300.34s/it]A decod


SAVING FINAL RESULTS
✓ Saved to sentiment_hybrid_twitter_llm.csv
  - RoBERTa rows: 9662
  - LLM rows:     664
