In [4]:
# --- 4.1 Setup and Model Loading ⚙️ ---

# 1. Ensure SentencePiece is installed (required by XLMRobertaTokenizer).
# Run the line below once in a notebook cell if you haven't already
# (in an Anaconda/VS Code terminal, drop the leading "!"):
# !pip install sentencepiece

# Install other required libraries (if needed)
# !pip install transformers pandas torch

import pandas as pd
import torch
# We use the specific slow tokenizer class to avoid the previous error
from transformers import AutoModelForSequenceClassification, XLMRobertaTokenizer
import numpy as np
import os
import time

# --- FILE PATHS ---
# Relative paths: resolved against the current working directory.
INPUT_FILE = '../data/cleaned_multilingual_data.csv'
OUTPUT_FILE = '../data/labeled_multilingual_data.csv'

# --- MODEL DEFINITIONS ---
# 1. Sentiment Model (Multilingual RoBERTa fine-tuned on Twitter data)
SENTIMENT_MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
# Emotion Model is intentionally excluded

# --- DEVICE CONFIGURATION ---
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


# Load Model and Tokenizer
print("\nLoading Sentiment Model...")
# Use the stable XLMRobertaTokenizer class
sentiment_tokenizer = XLMRobertaTokenizer.from_pretrained(SENTIMENT_MODEL)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(SENTIMENT_MODEL).to(device)

# Load the cleaned data from Step 3.
# On any load failure `df` becomes an empty DataFrame; the execution step
# checks `df.empty` and skips the analysis instead of crashing.
try:
    df = pd.read_csv(INPUT_FILE, encoding='utf-8')
except FileNotFoundError:
    print(f"❌ Error: Input file not found at {INPUT_FILE}. Please ensure Step 3 ran correctly.")
    df = pd.DataFrame()
except pd.errors.EmptyDataError:
    # read_csv raises this when the file exists but contains no columns/rows to parse.
    print(f"❌ Error: Input file at {INPUT_FILE} is empty. Please ensure Step 3 ran correctly.")
    df = pd.DataFrame()
else:
    # Only report success when the read itself succeeded.
    print(f"\nData loaded successfully. Total records: {len(df)}")
    print("\nInitial DataFrame Head:")
    print(df.head())


# ----------------------------------------------------------------------
# --- 4.2 Analysis Function (Sentiment Only) ---
# ----------------------------------------------------------------------

def analyze_sentiment(text):
    """
    Classify the sentiment of a single text string.

    Uses the module-level ``sentiment_tokenizer`` and ``sentiment_model`` on
    ``device``. Inputs longer than 512 tokens are truncated.

    Args:
        text: The text to classify. Anything that is not a non-blank string
            (e.g. ``None``, NaN, numbers, ``""``, whitespace-only) is treated
            as missing.

    Returns:
        The label that ``sentiment_model.config.id2label`` maps the
        highest-scoring class to; ``np.nan`` when ``text`` is missing;
        the string ``"PROCESSING_ERROR"`` when tokenization or inference raises.
    """
    # Nothing to classify: non-string cells and blank/whitespace-only strings.
    if not isinstance(text, str) or not text.strip():
        return np.nan
    try:
        # Tokenize and move to the appropriate device
        inputs = sentiment_tokenizer(
            text, return_tensors="pt", padding=True, truncation=True, max_length=512
        ).to(device)

        # Inference (no gradients needed)
        with torch.no_grad():
            logits = sentiment_model(**inputs).logits

        # Softmax is monotonic, so the argmax of the raw logits is the same
        # class; .item() yields a plain int, matching the id2label key type.
        predicted_id = int(logits.argmax(dim=-1).item())
        return sentiment_model.config.id2label[predicted_id]

    except Exception as exc:
        # Best-effort: keep processing the remaining rows, but record why this
        # one failed instead of swallowing the error silently.
        import logging
        logging.getLogger(__name__).warning(
            "Sentiment analysis failed for text %r: %s: %s",
            text[:80], type(exc).__name__, exc,
        )
        return "PROCESSING_ERROR"


# ----------------------------------------------------------------------
# --- 4.3 Execution and Data Saving 💾 ---
# ----------------------------------------------------------------------

# Label every row of `df` with a sentiment and write the result to OUTPUT_FILE.
# Skipped (with a message) when there is no data or no 'text' column to analyze.
if df.empty:
    print("\nCannot proceed with Step 4 as the input DataFrame is empty.")
elif 'text' not in df.columns:
    # Fail with a readable message instead of a raw KeyError.
    print(f"\nCannot proceed with Step 4: required column 'text' not found. "
          f"Columns present: {list(df.columns)}")
else:
    print("\n--- Starting Sentiment Analysis ---")
    start_time = time.time()

    # Apply Sentiment Analysis (one model call per row)
    df['sentiment'] = df['text'].apply(analyze_sentiment)

    elapsed_time = time.time() - start_time
    print("✅ Sentiment analysis complete.")
    print(f"\nTotal analysis time: {elapsed_time:.2f} seconds.")

    # Surface rows where analyze_sentiment returned its failure sentinel so
    # they are not mistaken for a real label.
    failed_count = int((df['sentiment'] == "PROCESSING_ERROR").sum())
    if failed_count:
        print(f"Warning: {failed_count} record(s) could not be processed "
              f"and are labeled 'PROCESSING_ERROR'.")

    # Drop the temporary 'raw_timestamp' column if it's no longer needed
    df.drop(columns=['raw_timestamp'], inplace=True, errors='ignore')
    # Add a placeholder/empty emotion column for consistency if needed later
    df['emotion'] = 'N/A (Excluded)'

    # --- Final Save ---
    # dirname() is '' for a bare filename, and os.makedirs('') raises,
    # so only create the directory when there is one to create.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Ensure correct encoding for multilingual data
    df.to_csv(OUTPUT_FILE, index=False, encoding='utf-8')

    print("\n" + "="*60)
    print("        🎉 Model Implementation Complete! 🎉")
    print("="*60)
    print(f"Final Labeled Data saved to: {os.path.abspath(OUTPUT_FILE)}")

    print("\nSentiment Distribution:")
    print(df['sentiment'].value_counts())

    print("\nReady for Step 5: Visualization Dashboard.")

Using device: cpu

Loading Sentiment Model...

Data loaded successfully. Total records: 2792

Initial DataFrame Head:
                                                text language  raw_timestamp  \
0         Grok is openly rebelling against its owner       en   1.743083e+09   
1  Graphic designers panicking about losing their...       en   1.743171e+09   
2                              He s absolutely right       en   1.760363e+09   
3  Elon Musk s AI chatbot estimates 75-85 likelih...       en   1.741306e+09   
4  UAE deposited 2 billion in Trump s crypto firm...       en   1.758013e+09   

        source_type source_name  
0  Reddit_Subreddit  artificial  
1  Reddit_Subreddit  artificial  
2  Reddit_Subreddit  artificial  
3  Reddit_Subreddit  artificial  
4  Reddit_Subreddit  artificial  

--- Starting Sentiment Analysis ---


KeyboardInterrupt: 