In [3]:
import pandas as pd
from dotenv import load_dotenv
from utils import get_oracle_connection
import sys
print(sys.version)
import torch
print("Torch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())

3.13.5 (tags/v3.13.5:6cb20a2, Jun 11 2025, 16:15:46) [MSC v.1943 64 bit (AMD64)]
Torch version: 2.7.1+cu118
CUDA available: True


In [4]:
conn = get_oracle_connection()

query = """
SELECT
    ID,
    DBMS_LOB.SUBSTR(SENTIMENT_READY_TEXT, 20000, 1) as SENTIMENT_READY_TEXT,
    TYPE,
    SUBREDDIT,
    CREATED_UTC,
    NORMALIZED_UPVOTES,
    DBMS_LOB.SUBSTR(MENTIONED_TICKERS, 100, 1) as MENTIONED_TICKERS,
    N_TICKERS,
    TEXT_LENGTH,
    WORD_COUNT,
    DATE_COL,
    HOUR,
    DAY_OF_WEEK
FROM preprocessed_data
FETCH FIRST 1000 ROWS ONLY
"""

df = pd.read_sql_query(query, conn)
conn.close()

df.columns = df.columns.str.lower()
df['created_utc'] = pd.to_datetime(df['created_utc'], unit='s')
if 'date_col' in df.columns:
    df['date'] = pd.to_datetime(df['date_col'])
    df.drop(columns=['date_col'], inplace=True)

print(f"Loaded {len(df)} rows")
df.head()

Oracle connection successful!


  df = pd.read_sql_query(query, conn)


Loaded 1000 rows


Unnamed: 0,id,sentiment_ready_text,type,subreddit,created_utc,normalized_upvotes,mentioned_tickers,n_tickers,text_length,word_count,hour,day_of_week,date
0,m48hiu4,I'd considering splitting it across two banks ...,comment,investing,2024-12-28 19:22:19,0.018399,UBS,1,241,47,19,5,2024-12-28
1,m48hedi,I had the opportunity for the IPO price thing ...,comment,stocks,2024-12-28 19:21:38,0.011074,AMD,1,75,15,19,5,2024-12-28
2,m48heet,"Looks like It's down, but the 6 month chart sh...",comment,investing,2024-12-28 19:21:38,0.018076,KO,1,286,50,19,5,2024-12-28
3,m48gxg1,Just look at 52 week lows. Stock screeners wil...,comment,ValueInvesting,2024-12-28 19:19:04,0.056502,HSY,1,278,51,19,5,2024-12-28
4,m48guco,Are you looking to index? Wealthfront has a ne...,comment,investing,2024-12-28 19:18:36,0.018076,UBS,1,82,15,19,5,2024-12-28


In [5]:
"""
FIXED HYBRID SENTIMENT PIPELINE

Improvements:
1. Better LLM prompt with more diverse score examples
2. Asks for nuanced scores (not just -0.8, 0, 0.8)
3. Better parsing with fallback strategies
4. Validation to catch bad outputs
"""

import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM
)
from tqdm import tqdm
import ast
import time
import numpy as np
import json
import re

# ============================================================================
# CONFIG
# ============================================================================

OUTPUT_FILE = "sentiment_hybrid_twitter_llm.csv"

TEXT_COL   = "sentiment_ready_text"
TICKER_COL = "mentioned_tickers"

TW_MODEL_NAME  = "cardiffnlp/twitter-roberta-base-sentiment-latest"
LLM_MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

BATCH_SIZE_CLS = 64
BATCH_SIZE_LLM = 8

MAX_INPUT_TOKENS = 512
MAX_NEW_TOKENS   = 64
USE_FP16_LLM = True

print("="*80)
print("FIXED HYBRID SENTIMENT PIPELINE")
print("="*80)

# ============================================================================
# DEVICE
# ============================================================================

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# ============================================================================
# BASIC PREP
# ============================================================================

def parse_tickers(x):
    if pd.isna(x) or x == "":
        return []
    try:
        return [str(t).strip() for t in ast.literal_eval(str(x)) if t]
    except:
        return [t.strip() for t in str(x).split(",") if t.strip()]

df["tickers_list"] = df[TICKER_COL].apply(parse_tickers)
df["n_tickers"] = df["tickers_list"].apply(len)

df = df[df["n_tickers"] > 0].copy()

# Explode to per-ticker rows
df = df.explode("tickers_list").reset_index(drop=True)
df = df.rename(columns={"tickers_list": "ticker"})

print(f"Per-ticker rows: {len(df):,}")

# ============================================================================
# LOAD TWITTER ROBERTA
# ============================================================================

print("\nLoading Twitter-RoBERTa...")
tw_tokenizer = AutoTokenizer.from_pretrained(TW_MODEL_NAME)
tw_model     = AutoModelForSequenceClassification.from_pretrained(TW_MODEL_NAME)
tw_model.to(device)
tw_model.eval()
print("✓ Loaded")

# ============================================================================
# TWITTER ROBERTA FUNCTION
# ============================================================================

def twitter_batch(texts, tickers):
    texts = [f"{tic}: {txt}" for txt, tic in zip(texts, tickers)]

    enc = tw_tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        probs = torch.softmax(tw_model(**enc).logits, dim=-1).cpu().numpy()

    id2label = tw_model.config.id2label
    results = []

    for prob in probs:
        label_map = {id2label[i].lower(): prob[i] for i in range(len(prob))}
        
        p_pos = label_map.get("positive", 0.0)
        p_neg = label_map.get("negative", 0.0)
        p_neu = label_map.get("neutral", 0.0)

        # Weighted score for better distribution
        score = p_pos * 1.0 + p_neu * 0.0 + p_neg * (-1.0)

        if score > 0.15:
            label = "positive"
        elif score < -0.15:
            label = "negative"
        else:
            label = "neutral"

        results.append((score, label))

    return results

# ============================================================================
# RUN TWITTER ROBERTA
# ============================================================================

print("\n" + "="*80)
print("RUNNING TWITTER ROBERTA")
print("="*80)

tw_scores = []
tw_labels = []

texts = df[TEXT_COL].fillna("").tolist()
tickers = df["ticker"].tolist()

for i in tqdm(range(0, len(texts), BATCH_SIZE_CLS), desc="Twitter RoBERTa"):
    batch_texts = texts[i:i+BATCH_SIZE_CLS]
    batch_tickers = tickers[i:i+BATCH_SIZE_CLS]
    res = twitter_batch(batch_texts, batch_tickers)
    for s,l in res:
        tw_scores.append(s)
        tw_labels.append(l)

df["tw_score"] = tw_scores
df["tw_label"] = tw_labels

print(f"\nTwitter-RoBERTa Results:")
print(f"  Label distribution:\n{df['tw_label'].value_counts()}")
print(f"  Score stats: mean={df['tw_score'].mean():.3f}, std={df['tw_score'].std():.3f}")

# ============================================================================
# SELECT UNCERTAIN FOR LLM
# ============================================================================

print("\n" + "="*80)
print("SELECTING UNCERTAIN CASES FOR LLM")
print("="*80)

# Uncertain: tw_score between -0.1 and 0.1
df_llm = df[df["tw_score"].between(-0.1, 0.1)].copy()
print(f"Uncertain cases (tw_score ∈ [-0.1, 0.1]): {len(df_llm):,} rows ({len(df_llm)/len(df)*100:.1f}%)")

if len(df_llm) == 0:
    print("⚠️ No uncertain cases! Skipping LLM.")
    df["llm_sentiment_label"] = np.nan
    df["llm_sentiment_score"] = np.nan
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"✓ Saved to {OUTPUT_FILE}")
else:
    # ============================================================================
    # LOAD QWEN LLM
    # ============================================================================
    
    print("\n" + "="*80)
    print("LOADING QWEN LLM")
    print("="*80)
    
    llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
    llm_model = AutoModelForCausalLM.from_pretrained(
        LLM_MODEL_NAME,
        torch_dtype=torch.float16 if device=="cuda" else None,
        device_map="auto" if device=="cuda" else None
    )
    
    llm_model.eval()
    
    if llm_tokenizer.pad_token is None:
        llm_tokenizer.pad_token = llm_tokenizer.eos_token
    
    print("✓ Qwen loaded")
    
    # ============================================================================
    # IMPROVED PROMPT (KEY FIX!)
    # ============================================================================
    
    def build_prompt(text, ticker):
        """
        Better prompt with:
        1. More diverse score examples (not just -0.8, 0, 0.8)
        2. Clear instructions about continuous scale
        3. Emphasis on nuance
        """
        return f"""You are analyzing stock market sentiment. Rate sentiment from -1.0 (very negative) to +1.0 (very positive).

Examples with various scores:

Text: "NVDA absolutely crushing it, best quarter ever!"
Ticker: NVDA
Score: 0.95 (very positive)

Text: "TSLA down but might recover, who knows"
Ticker: TSLA
Score: -0.2 (slightly negative, uncertain)

Text: "AAPL sideways, nothing happening"
Ticker: AAPL
Score: 0.05 (neutral, slightly positive)

Text: "MSFT looks concerning, sales dropping"
Ticker: MSFT
Score: -0.6 (negative)

Text: "AMD decent earnings, pretty good results"
Ticker: AMD
Score: 0.5 (positive)

Now analyze this text. Give a score from -1.0 to +1.0:

Text: "{text[:200]}"
Ticker: {ticker}

Respond ONLY with a number from -1.0 to +1.0, nothing else."""

    # ============================================================================
    # IMPROVED PARSING (KEY FIX!)
    # ============================================================================
    
    def parse_llm_response(text):
        """
        Better parsing with multiple strategies and validation.
        """
        text = str(text).strip()
        
        # Strategy 1: Find any decimal number
        number_match = re.search(r'(-?\d*\.?\d+)', text)
        if number_match:
            try:
                score = float(number_match.group(1))
                
                # Clamp to [-1, 1]
                score = max(-1.0, min(1.0, score))
                
                # Determine label
                if score > 0.15:
                    label = "positive"
                elif score < -0.15:
                    label = "negative"
                else:
                    label = "neutral"
                
                return label, score
            except ValueError:
                pass
        
        # Strategy 2: Look for sentiment words as fallback
        text_lower = text.lower()
        
        if any(word in text_lower for word in ['very positive', 'extremely positive', 'bullish']):
            return "positive", 0.8
        elif any(word in text_lower for word in ['positive', 'good']):
            return "positive", 0.5
        elif any(word in text_lower for word in ['very negative', 'extremely negative', 'bearish']):
            return "negative", -0.8
        elif any(word in text_lower for word in ['negative', 'bad']):
            return "negative", -0.5
        
        # Strategy 3: Default to slight neutral with randomness
        # (avoids clustering at exactly 0)
        import random
        return "neutral", random.uniform(-0.05, 0.05)
    
    # ============================================================================
    # RUN LLM WITH BETTER GENERATION PARAMS
    # ============================================================================
    
    def run_llm_batch(texts, tickers):
        """Run LLM with improved generation parameters."""
        prompts = [build_prompt(t, tic) for t, tic in zip(texts, tickers)]
    
        inputs = llm_tokenizer(
            prompts, 
            return_tensors="pt",
            padding=True, 
            truncation=True,
            max_length=MAX_INPUT_TOKENS
        ).to(device)
    
        with torch.no_grad():
            outputs = llm_model.generate(
                **inputs, 
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,  # Greedy for consistency
                temperature=1.0,
                top_p=1.0,
                pad_token_id=llm_tokenizer.pad_token_id
            )
        
        # Decode only NEW tokens (skip the prompt)
        decoded_outputs = []
        for i, output in enumerate(outputs):
            prompt_length = inputs['input_ids'][i].shape[0]
            new_tokens = output[prompt_length:]
            decoded = llm_tokenizer.decode(new_tokens, skip_special_tokens=True)
            decoded_outputs.append(decoded)
        
        return [parse_llm_response(o) for o in decoded_outputs]
    
    # ============================================================================
    # RUN LLM
    # ============================================================================
    
    print("\n" + "="*80)
    print("RUNNING LLM ON UNCERTAIN CASES")
    print("="*80)
    
    llm_labels = []
    llm_scores = []
    llm_raw_responses = []
    
    texts_llm = df_llm[TEXT_COL].tolist()
    tickers_llm = df_llm["ticker"].tolist()
    
    for i in tqdm(range(0, len(texts_llm), BATCH_SIZE_LLM), desc="LLM"):
        batch_texts = texts_llm[i:i+BATCH_SIZE_LLM]
        batch_tickers = tickers_llm[i:i+BATCH_SIZE_LLM]
        
        try:
            res = run_llm_batch(batch_texts, batch_tickers)
            
            for label, score in res:
                llm_labels.append(label)
                llm_scores.append(score)
        
        except Exception as e:
            print(f"\nError at batch {i}: {e}")
            # Fill with slight neutral
            import random
            for _ in range(len(batch_texts)):
                llm_labels.append("neutral")
                llm_scores.append(random.uniform(-0.05, 0.05))
    
    df_llm["llm_sentiment_label"] = llm_labels
    df_llm["llm_sentiment_score"] = llm_scores
    
    # ============================================================================
    # MERGE BACK
    # ============================================================================
    
    df["llm_sentiment_label"] = np.nan
    df["llm_sentiment_score"] = np.nan
    
    df.loc[df_llm.index, "llm_sentiment_label"] = df_llm["llm_sentiment_label"]
    df.loc[df_llm.index, "llm_sentiment_score"] = df_llm["llm_sentiment_score"]
    
    # ============================================================================
    # ANALYSIS
    # ============================================================================
    
    print("\n" + "="*80)
    print("LLM RESULTS ANALYSIS")
    print("="*80)
    
    print(f"\nLLM Label Distribution:")
    print(df_llm["llm_sentiment_label"].value_counts())
    
    print(f"\nLLM Score Statistics:")
    print(df_llm["llm_sentiment_score"].describe())
    
    print(f"\nLLM Score Diversity:")
    unique_scores = df_llm["llm_sentiment_score"].nunique()
    print(f"  Unique values: {unique_scores} out of {len(df_llm)}")
    
    if unique_scores < 10:
        print("  ⚠️ Low diversity - showing most common scores:")
        print(df_llm["llm_sentiment_score"].value_counts().head(10))
    else:
        print("  ✓ Good diversity!")
    
    # Show examples
    print("\n" + "="*80)
    print("LLM EXAMPLES")
    print("="*80)
    
    print("\nMost positive by LLM:")
    for _, row in df_llm.nlargest(3, "llm_sentiment_score").iterrows():
        print(f"\n  Score: {row['llm_sentiment_score']:+.2f} | Ticker: {row['ticker']}")
        print(f"  Twitter: {row['tw_score']:+.2f} (uncertain)")
        print(f"  Text: {row[TEXT_COL][:100]}...")
    
    print("\nMost negative by LLM:")
    for _, row in df_llm.nsmallest(3, "llm_sentiment_score").iterrows():
        print(f"\n  Score: {row['llm_sentiment_score']:+.2f} | Ticker: {row['ticker']}")
        print(f"  Twitter: {row['tw_score']:+.2f} (uncertain)")
        print(f"  Text: {row[TEXT_COL][:100]}...")
    
    # ============================================================================
    # SAVE
    # ============================================================================
    
    print("\n" + "="*80)
    print("SAVING")
    print("="*80)
    
    df.to_csv(OUTPUT_FILE, index=False)
    print(f"✓ Saved to {OUTPUT_FILE}")
    
    # ============================================================================
    # SUMMARY
    # ============================================================================
    
    print("\n" + "="*80)
    print("PIPELINE SUMMARY")
    print("="*80)
    
    print(f"\nTotal rows: {len(df):,}")
    print(f"\nTwitter-RoBERTa (all rows):")
    print(f"  Positive: {(df['tw_label'] == 'positive').sum():,}")
    print(f"  Neutral:  {(df['tw_label'] == 'neutral').sum():,}")
    print(f"  Negative: {(df['tw_label'] == 'negative').sum():,}")
    
    print(f"\nLLM (uncertain cases only):")
    print(f"  Processed: {len(df_llm):,}")
    print(f"  Positive: {(df_llm['llm_sentiment_label'] == 'positive').sum():,}")
    print(f"  Neutral:  {(df_llm['llm_sentiment_label'] == 'neutral').sum():,}")
    print(f"  Negative: {(df_llm['llm_sentiment_label'] == 'negative').sum():,}")
    
    print("\n✓ Complete!")

  from .autonotebook import tqdm as notebook_tqdm


FIXED HYBRID SENTIMENT PIPELINE
Device: cuda
Per-ticker rows: 1,950

Loading Twitter-RoBERTa...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✓ Loaded

RUNNING TWITTER ROBERTA


Twitter RoBERTa: 100%|██████████| 31/31 [00:46<00:00,  1.49s/it]



Twitter-RoBERTa Results:
  Label distribution:
tw_label
positive    784
neutral     648
negative    518
Name: count, dtype: int64
  Score stats: mean=0.095, std=0.473

SELECTING UNCERTAIN CASES FOR LLM
Uncertain cases (tw_score ∈ [-0.1, 0.1]): 488 rows (25.0%)

LOADING QWEN LLM


`torch_dtype` is deprecated! Use `dtype` instead!
INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


✓ Qwen loaded

RUNNING LLM ON UNCERTAIN CASES


LLM:   0%|          | 0/61 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:   2%|▏         | 1/61 [00:18<18:10, 18.17s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:   3%|▎         | 2/61 [00:34<16:51, 17.15s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:   5%|▍         | 3/61 [00:52<16:47, 17.38s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing


Error at batch 208: 'NoneType' object is not subscriptable


LLM:  46%|████▌     | 28/61 [07:41<07:23, 13.45s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:  48%|████▊     | 29/61 [07:58<07:38, 14.34s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:  49%|████▉     | 30/61 [08:15<07:46, 15.04s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:  51%|█████     | 31/61 [08:33<07:51, 15.72s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
LLM:  52%|█████▏    | 32/61 [08:50<07:50, 16.22s/it]A decoder-only architecture is being used, but r


LLM RESULTS ANALYSIS

LLM Label Distribution:
llm_sentiment_label
negative    292
neutral     106
positive     90
Name: count, dtype: int64

LLM Score Statistics:
count    488.000000
mean      -0.123590
std        0.470279
min       -1.000000
25%       -0.300000
50%       -0.300000
75%        0.000000
max        1.000000
Name: llm_sentiment_score, dtype: float64

LLM Score Diversity:
  Unique values: 75 out of 488
  ✓ Good diversity!

LLM EXAMPLES

Most positive by LLM:

  Score: +1.00 | Ticker: HIVE
  Twitter: +0.06 (uncertain)
  Text: They have no debt. Trades at a discount to book value. SG&A is minimal. They have an atm just in cas...

  Score: +1.00 | Ticker: AMD
  Twitter: +0.04 (uncertain)
  Text: You seriously think it’s more likely for NVDA to increase to 6T versus AMD going to 300-350B from 20...

  Score: +1.00 | Ticker: WVE
  Twitter: +0.05 (uncertain)
  Text: I sold half of my PLTR position after it doubled to recover the initial investment- and put it into ...

Most nega


 'neutral' 'neutral' 'negative' 'neutral' 'negative' 'negative' 'positive'
 'positive' 'positive' 'positive' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'neutral' 'negative' 'negative'
 'positive' 'negative' 'negative' 'positive' 'positive' 'positive'
 'positive' 'negative' 'negative' 'negative' 'negative' 'neutral'
 'positive' 'positive' 'negative' 'negative' 'negative' 'negative'
 'negative' 'negative' 'negative' 'negative' 'positive' 'negative'
 'positive' 'positive' 'positive' 'positive' 'negative' 'neutral'
 'negative' 'negative' 'negative' 'negative' 'negative' 'negative'
 'negative' 'neutral' 'neutral' 'negative' 'positive' 'positive'
 'positive' 'negative' 'negative' 'negative' 'neutral' 'negative'
 'negative' 'negative' 'negative' 'negative' 'positive' 'positive'
 'negative' 'neutral' 'negative' 'positive' 'positive' 'negative'
 'negative' 'negative' 'negative' 'neutral' 'negative' 'neutral'
 'negative' 'neutral' 'negative' 'negative' 'positive' 'negati

In [2]:
import pandas as pd
df=pd.read_csv('sentiment_hybrid_twitter_llm.csv')

In [None]:
df.head()