In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split
import os
import asyncio
import numpy as np

# Fetch the NLTK data packages this script relies on (stopword list and
# the punkt tokenizer tables); quiet=True suppresses the download log.
for _nltk_resource in ("stopwords", "punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)

# --- 1. Data Loading and Preparation ---
def load_and_prepare_data(csv_file):
    """Load the sentiment CSV and derive the text/label columns used for training.

    Reads ``csv_file`` with pandas, builds ``raw_text`` by joining the
    ``title`` and ``text`` columns with a single space (missing values are
    treated as empty strings), and maps the ``sentiment`` strings to integer
    class ids in a new ``label`` column (NEGATIVE=0, Neutral=1, POSITIVE=2).
    Rows whose sentiment is not one of those three strings are dropped.

    Args:
        csv_file: Path to a CSV file containing ``title``, ``text`` and
            ``sentiment`` columns.

    Returns:
        The prepared DataFrame with an integer ``label`` column, or ``None``
        when ``csv_file`` does not exist (an error message is printed).
    """
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        print(f"Error: CSV file not found at {csv_file}")
        return None

    # Combine 'title' and 'text' into a single 'raw_text' column; fillna('')
    # keeps a row usable when only one of the two fields is present.
    df['raw_text'] = df['title'].fillna('') + ' ' + df['text'].fillna('')

    # Map sentiment strings to numerical labels 0, 1, 2. The lookup is
    # case-sensitive; anything not listed here becomes NaN.
    sentiment_map = {'NEGATIVE': 0, 'Neutral': 1, 'POSITIVE': 2}
    df['label'] = df['sentiment'].map(sentiment_map)

    # Drop rows where sentiment couldn't be mapped (e.g., NaN sentiment)
    df.dropna(subset=['label', 'raw_text'], inplace=True)

    # Series.map produces float64 as soon as one value is unmapped, and the
    # column stays float after dropna. Class ids must be integers, otherwise
    # downstream tensors are built as floats and the classifier treats them
    # as regression/multi-label targets.
    df['label'] = df['label'].astype('int64')

    print(f"Data loaded from {csv_file} with {len(df)} valid rows after preparation.")
    return df

# --- 2. Text Cleaning (same as before) ---
# Lazily-built set of English stopwords shared by every clean_text() call.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopword set, loading it from NLTK on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def clean_text(text):
    """Normalize a piece of text for NLP processing.

    The input is converted to ``str``, lower-cased, stripped of anything
    starting with ``http`` up to the next whitespace, reduced to ASCII
    letters and whitespace, tokenized with NLTK's ``word_tokenize`` and
    filtered against the English stopword list.

    Args:
        text: Any value; it is passed through ``str()`` first, so NaN/None
            become the strings ``"nan"``/``"None"`` rather than raising.

    Returns:
        The remaining tokens joined by single spaces (possibly ``""``).
    """
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Look the stopword collection up once per call instead of once per
    # token, and use a set so each membership test is O(1).
    stop_words = _english_stopwords()
    words = [word for word in word_tokenize(text) if word not in stop_words]
    return " ".join(words)

# --- 3. Sentiment Analysis Model Training (with index reset and correct labels) ---
def train_sentiment_model(df, model_name="cardiffnlp/twitter-roberta-base-sentiment-latest", output_dir="./results", test_size=0.2, save_dir="sentiment-analysis-model"):
    """Fine-tune a pre-trained 3-class sentiment model and save it to disk.

    The rows of ``df`` are split (stratified by label, ``random_state=42``)
    into a training part and a held-out part. The held-out part is evaluated
    after every epoch and its loss decides which checkpoint is reloaded at
    the end of training (``load_best_model_at_end=True``).

    Args:
        df: DataFrame with a ``cleaned_text`` column (model input) and a
            ``label`` column holding class ids 0, 1 or 2.
        model_name: Hub id or local path of the checkpoint to start from.
        output_dir: Directory where the Trainer writes its checkpoints.
        test_size: Fraction of rows held out for evaluation.
        save_dir: Directory the final model and tokenizer are saved to.

    Returns:
        The ``Trainer`` instance after training.

    Raises:
        ValueError: If ``df`` lacks the ``label`` or ``cleaned_text`` column,
            or a label cannot be converted to an integer.
    """
    if 'label' not in df.columns:
        raise ValueError("❌ DataFrame must contain a 'label' column for sentiment training.")
    if 'cleaned_text' not in df.columns:
        raise ValueError("❌ DataFrame must contain a 'cleaned_text' column for sentiment training.")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # 3 labels: negative, neutral, positive
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

    # Split data - use cleaned text and labels
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df["cleaned_text"], df["label"], test_size=test_size, random_state=42, stratify=df["label"]
    )

    # Convert to plain Python ints. This gives positional (0..n-1) indexing
    # regardless of the DataFrame index, and guarantees integer class ids
    # even when the 'label' column arrived as float64.
    train_labels = [int(label) for label in train_labels]
    test_labels = [int(label) for label in test_labels]

    train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(list(test_texts), truncation=True, padding=True, max_length=512)

    class SentimentDataset(torch.utils.data.Dataset):
        """Pairs tokenizer encodings with their integer class labels."""

        def __init__(self, encodings, labels):
            self.encodings = encodings
            self.labels = labels

        def __len__(self):
            return len(self.labels)

        def __getitem__(self, idx):
            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
            # Class indices must be int64 so the model applies a
            # classification (cross-entropy) loss.
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
            return item

    train_dataset = SentimentDataset(train_encodings, train_labels)
    test_dataset = SentimentDataset(test_encodings, test_labels)

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        # Keep the checkpoint with the lowest evaluation loss.
        load_best_model_at_end=True,
        metric_for_best_model='eval_loss',
        greater_is_better=False,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Save the fine-tuned model and tokenizer
    model.save_pretrained(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"✅ Fine-tuned sentiment analysis model saved to `{save_dir}`")
    return trainer

# --- 4. Sentiment Prediction Function (using fine-tuned model, outputs -1 to +1) ---
# Pipelines already loaded in this session, keyed by model path. Loading a
# model from disk is expensive, so it is done once per path instead of on
# every prediction. Call _SENTIMENT_PIPELINES.clear() after re-training to a
# path that has already been used, otherwise the old weights stay in use.
_SENTIMENT_PIPELINES = {}


def _get_sentiment_pipeline(model_path):
    """Return the sentiment pipeline for ``model_path``, loading it on first use."""
    cached = _SENTIMENT_PIPELINES.get(model_path)
    if cached is None:
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        # return_all_scores=True makes the pipeline report a score for every label.
        cached = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, return_all_scores=True)
        _SENTIMENT_PIPELINES[model_path] = cached
    return cached


def predict_sentiment(text, model_path="sentiment-analysis-model"):
    """Score the sentiment of ``text`` with the fine-tuned model.

    The score is ``P(positive) - P(negative)``, so it lies in [-1, 1].

    NOTE(review): train_sentiment_model fits on the ``cleaned_text`` column,
    while this function scores ``text`` exactly as given. Confirm whether
    callers are expected to pass text through clean_text() first.

    Args:
        text: The text to score.
        model_path: Directory (or hub id) of the saved model and tokenizer.

    Returns:
        A float in [-1, 1]; ``0.0`` if the prediction itself fails (the
        error is printed). Failures while loading the model propagate.
    """
    sentiment_pipeline = _get_sentiment_pipeline(model_path)

    try:
        # Truncate to the same 512-token limit used during training so long
        # inputs are scored instead of raising inside the model.
        result = sentiment_pipeline(text, truncation=True, max_length=512)

        # Get probabilities for each sentiment label; lower-casing the names
        # makes the lookup independent of the label capitalisation.
        probs = {str(item['label']).lower(): item['score'] for item in result[0]}

        # Fall back to the generic LABEL_<id> names, using the ids assigned
        # at training time (0 = negative, 2 = positive).
        positive_prob = probs.get('positive', probs.get('label_2', 0.0))
        negative_prob = probs.get('negative', probs.get('label_0', 0.0))

        # Simple score from -1 to +1
        return positive_prob - negative_prob

    except Exception as e:
        # Deliberate best-effort: report the problem and return a neutral score.
        print(f"⚠️ Sentiment prediction error: {e}")
        return 0.0


# --- Main execution ---
if __name__ == "__main__":
    # Step 1: read the labelled comments and build the raw_text/label columns.
    csv_file_path = "reddit_comments_with_sentiment.csv"
    df = load_and_prepare_data(csv_file_path)

    # load_and_prepare_data returns None when the CSV is missing; in that
    # case there is nothing further to do.
    if df is not None:
        # Step 2: normalize the text that the model will be trained on.
        df["cleaned_text"] = df["raw_text"].apply(clean_text)

        try:
            # Step 3: fine-tune and save the model.
            trainer_output = train_sentiment_model(df)
            print(f"✅ Training completed. Trainer output: {trainer_output}")

            # Step 4: score one upbeat and one downbeat headline as a smoke test.
            demo_messages = (
                "Exciting new AI chip announced by NVIDIA! Stock price expected to surge.",
                "Tech stocks are crashing after disappointing earnings reports.",
            )
            for message in demo_messages:
                score = predict_sentiment(message)
                print(f"Example sentiment for: '{message}' - Score: {score:.2f}")

        except ValueError as ve:
            print(f"❌ Data Preparation Error: {ve}")
        except Exception as e:
            print(f"❌ Error during model training or prediction: {e}")

Data loaded from reddit_comments_with_sentiment.csv with 2482 valid rows after preparation.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mrazan1412siraj[0m ([33mrazan1412siraj-nanyang-technological-university-singapore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,0.4383,0.447054
2,0.3153,0.434469
3,0.2823,0.589049


✅ Fine-tuned sentiment analysis model saved to `sentiment-analysis-model`
✅ Training completed. Trainer output: <transformers.trainer.Trainer object at 0x78d7c585f1d0>


Device set to use cuda:0
Device set to use cuda:0


Example sentiment for: 'Exciting new AI chip announced by NVIDIA! Stock price expected to surge.' - Score: 0.99
Example sentiment for: 'Tech stocks are crashing after disappointing earnings reports.' - Score: -0.40
