In [None]:
!pip install emoji
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "stock_tweets.csv" # Example: Replace with the actual file name from the dataset

# Load the latest version
df = kagglehub.load_dataset(
  KaggleDatasetAdapter.PANDAS,
  "equinxx/stock-tweets-for-sentiment-analysis-and-prediction",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documenation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

print("First 5 records:", df.head())



  df = kagglehub.load_dataset(


Using Colab cache for faster access to the 'stock-tweets-for-sentiment-analysis-and-prediction' dataset.
First 5 records:                         Date  \
0  2022-09-29 23:41:16+00:00   
1  2022-09-29 23:24:43+00:00   
2  2022-09-29 23:18:08+00:00   
3  2022-09-29 22:40:07+00:00   
4  2022-09-29 22:27:05+00:00   

                                               Tweet Stock Name Company Name  
0  Mainstream media has done an amazing job at br...       TSLA  Tesla, Inc.  
1  Tesla delivery estimates are at around 364k fr...       TSLA  Tesla, Inc.  
2  3/ Even if I include 63.0M unvested RSUs as of...       TSLA  Tesla, Inc.  
3  @RealDanODowd @WholeMarsBlog @Tesla Hahaha why...       TSLA  Tesla, Inc.  
4  @RealDanODowd @Tesla Stop trying to kill kids,...       TSLA  Tesla, Inc.  


In [None]:
# ---- Install Requirements ----
!pip install transformers torch pandas tqdm emoji

import html
import math
import os
import re

import emoji
import numpy as np
import pandas as pd
import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ---- CONFIG ----
# Data: file label, text column to score, and where results are written
INPUT_CSV = "stock_tweets.csv"   # change to your file
TEXT_COL = "Tweet"                # change if your column name differs
OUTPUT_CSV = "kaggle_tweets_with_sentiment.csv"

# Inference: model checkpoint, batch size, and target device
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment"  # strong baseline for tweets
BATCH_SIZE = 32                   # reduce if you run out of memory
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# ----------------

print("Using device:", DEVICE)

# ---- utilities: basic tweet cleaning (optional but helps) ----
def clean_tweet_text(text: str) -> str:
    """Normalize a raw tweet before it is passed to the sentiment model.

    Steps, in order: decode HTML entities, replace emojis with their textual
    ``:alias:`` form, drop ``@handles`` and URLs, strip a leading ``RT``
    marker, and collapse runs of whitespace.

    Args:
        text: Raw tweet text. Non-string values are tolerated.

    Returns:
        The cleaned text, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Tweets in this dataset are HTML-escaped ("&amp;", "&lt;", ...); decode
    # first so the model sees the real characters rather than entity names
    text = html.unescape(text)
    # convert emojis to text aliases so they survive as ordinary tokens
    text = emoji.demojize(text)
    # remove user handles
    text = re.sub(r"@\w+", "", text)
    # remove URLs
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # remove RT prefix (handles are already gone, so "RT @user:" is now "RT :")
    text = re.sub(r"^RT\s+:?", "", text)
    # collapse whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ---- load tokenizer and model ----
print("Loading model and tokenizer:", MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
# Place the weights on the selected device and switch to evaluation mode
model = model.to(DEVICE).eval()

# The cardiffnlp model has labels: 0 -> negative, 1 -> neutral, 2 -> positive

# ---- scoring function: map model outputs to continuous [-1, +1] ----
import torch.nn.functional as F

def predict_batch_scores(texts):
    """
    Score a batch of tweets on a continuous negative-to-positive scale.

    Input: list of tweet texts
    Output: numpy array of shape (len(texts),) with one score per text,
            computed as P(pos) - P(neg) from the model's softmax output and
            clipped to [-1, +1].
    """
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    model_inputs = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    with torch.no_grad():
        class_probs = F.softmax(model(**model_inputs).logits, dim=-1).cpu().numpy()
    # column 0 holds P(neg) and column 2 holds P(pos)
    negative, positive = class_probs[:, 0], class_probs[:, 2]
    # the difference already lies within [-1, 1]; the clip only guards the bounds
    return np.clip(positive - negative, -1.0, 1.0)


# ---- main loop: read CSV, compute scores, save ----
print("Loading data:", INPUT_CSV)
# `df` is not re-read here: it is the DataFrame that the earlier kagglehub
# loading cell left in the notebook's global scope.

if TEXT_COL not in df.columns:
    raise ValueError(f"Column '{TEXT_COL}' not found in CSV. Available columns: {df.columns.tolist()}")

# Cleaned copy of the text column; this is what gets fed to the model
cleaned_texts = df[TEXT_COL].fillna("").astype(str).map(clean_tweet_text)
df["_cleaned_text"] = cleaned_texts

total_rows = len(df)
num_batches = math.ceil(total_rows / BATCH_SIZE)
scores = np.zeros(total_rows, dtype=float)

# Walk the frame in BATCH_SIZE-row windows, filling the score buffer in place
for start in tqdm(range(0, total_rows, BATCH_SIZE), desc="Inferencing"):
    end = min(start + BATCH_SIZE, total_rows)
    scores[start:end] = predict_batch_scores(df["_cleaned_text"].iloc[start:end].tolist())

# continuous score in [-1, 1], where 1 = very positive, -1 = very negative
df["sentiment_score"] = scores

# Optional: add component probs/columns if you want more info:
def predict_probs_for_batch(texts):
    """Return the model's softmax class probabilities for a batch of texts.

    Args:
        texts: list of strings to classify.

    Returns:
        numpy array with one row per input text and one column per model
        output class.
    """
    encoded = tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    model_inputs = {name: tensor.to(DEVICE) for name, tensor in encoded.items()}
    with torch.no_grad():
        batch_logits = model(**model_inputs).logits
        return F.softmax(batch_logits, dim=-1).cpu().numpy()

# Optional second pass that stores the per-class probabilities as columns.
# It repeats inference over every row, so it is switched off by default.
SAVE_RAW_PROBS = False
if SAVE_RAW_PROBS:
    row_count = len(df)
    negs, neuts, poss = (np.zeros(row_count) for _ in range(3))
    for start in tqdm(range(0, row_count, BATCH_SIZE), desc="Infer probs"):
        end = min(start + BATCH_SIZE, row_count)
        p = predict_probs_for_batch(df["_cleaned_text"].iloc[start:end].tolist())
        # columns 0 / 1 / 2 go to the negative / neutral / positive buffers
        negs[start:end], neuts[start:end], poss[start:end] = p[:, 0], p[:, 1], p[:, 2]
    df["prob_neg"] = negs
    df["prob_neutral"] = neuts
    df["prob_pos"] = poss

# Save the scored frame, then print a short preview of the key columns
print("Saving output to:", OUTPUT_CSV)
df.to_csv(OUTPUT_CSV, index=False)
print("Done. Example rows:")
preview_columns = [TEXT_COL, "_cleaned_text", "sentiment_score"]
preview = df[preview_columns].head(10)
print(preview.to_string(index=False))

Using device: cuda
Loading model and tokenizer: cardiffnlp/twitter-roberta-base-sentiment


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading data: stock_tweets.csv


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Inferencing:   0%|          | 0/2525 [00:00<?, ?it/s]

Saving output to: kaggle_tweets_with_sentiment.csv
Done. Example rows:
                                                                                                                                                                                                                                                                                                       Tweet                                                                                                                                                                                                                                                                             _cleaned_text  sentiment_score
            Mainstream media has done an amazing job at brainwashing people. Today at work, we were asked what companies we believe in &amp; I said @Tesla because they make the safest cars &amp; EVERYONE disagreed with me because they heard“they catch on fire &amp; the batteries cost 20k to replace” Mainstream media has done an

In [None]:
# Test a few example sentences
# Hand-written sample sentences that are passed to get_sentiment_details
# further down in this cell as a quick sanity check of the model.
custom_texts = [
    "Nvidia stock is really shit!",
    "I'm really worried about the market crash tomorrow.",
    "The company reported earnings in line with expectations.",
    "Buy the dip, it's going to the moon!",
    "This stock is performing terribly."
]

def get_sentiment_details(texts):
    """Return per-class probabilities and the sentiment score for each text.

    Each text is cleaned with ``clean_tweet_text`` and scored through
    ``predict_probs_for_batch``, so this uses the same tokenization and
    inference path as the bulk scoring code.

    Args:
        texts: Iterable of raw strings. It is materialized into a list up
            front, so generators and pandas Series are safe to pass.

    Returns:
        DataFrame with one row per input, in input order, with columns
        ``input_text``, ``prob_neg``, ``prob_neutral``, ``prob_pos`` and
        ``sentiment_score`` (``prob_pos - prob_neg``). Empty input yields an
        empty frame with the same columns.
    """
    # Materialize once: the inputs are iterated for cleaning and then stored
    # again as a column, which would fail for a one-shot iterator.
    texts = list(texts)
    cleaned = [clean_tweet_text(t) for t in texts]
    if cleaned:
        probs = predict_probs_for_batch(cleaned)
    else:
        # Skip the model call for an empty batch and return an empty result
        probs = np.empty((0, 3), dtype=np.float32)
    # Convert to dataframe for clarity
    df_preds = pd.DataFrame(probs, columns=["prob_neg", "prob_neutral", "prob_pos"])
    df_preds["sentiment_score"] = df_preds["prob_pos"] - df_preds["prob_neg"]
    df_preds["input_text"] = texts
    return df_preds[["input_text", "prob_neg", "prob_neutral", "prob_pos", "sentiment_score"]]

# Score the sample sentences and print the full table without the index column
results = get_sentiment_details(custom_texts)
rendered_table = results.to_string(index=False)
print(rendered_table)


                                              input_text  prob_neg  prob_neutral  prob_pos  sentiment_score
                            Nvidia stock is really shit!  0.959400      0.036368  0.004232        -0.955167
     I'm really worried about the market crash tomorrow.  0.929944      0.064946  0.005109        -0.924835
The company reported earnings in line with expectations.  0.001847      0.144266  0.853887         0.852040
                    Buy the dip, it's going to the moon!  0.075255      0.517282  0.407463         0.332208
                      This stock is performing terribly.  0.948704      0.048136  0.003159        -0.945545


In [None]:
import math


def log_ratio_sentiment_score(positive_count, negative_count):
    """Return the smoothed log-ratio of positive to negative counts.

    Computes ``log((1 + positive_count) / (1 + negative_count))``. The +1 on
    both sides keeps the ratio finite when either count is zero.

    Args:
        positive_count: Number of positive items; must be >= 0.
        negative_count: Number of negative items; must be >= 0.

    Returns:
        A float: 0.0 when the counts are equal, positive when positives
        dominate, negative when negatives dominate.

    Raises:
        ValueError: If either count is negative.
    """
    if positive_count < 0 or negative_count < 0:
        raise ValueError("positive_count and negative_count must be non-negative")
    return math.log((1 + positive_count) / (1 + negative_count))


def sentiment_investment_amount(positive_count, negative_count, base_budget):
    """Scale a budget by the tanh of the log-ratio sentiment score.

    Args:
        positive_count: Number of positive items; must be >= 0.
        negative_count: Number of negative items; must be >= 0.
        base_budget: Budget to scale. tanh maps the score into (-1, 1), so
            the result's magnitude stays below ``abs(base_budget)``.

    Returns:
        ``base_budget * tanh(score)``; its sign follows the sign of the score.

    Raises:
        ValueError: If either count is negative.
    """
    sentiment_score = log_ratio_sentiment_score(positive_count, negative_count)
    return base_budget * np.tanh(sentiment_score)


NameError: name 'positive_count' is not defined