In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from transformers import RobertaTokenizer, RobertaModel, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.nn.utils.rnn import pad_sequence

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fetch the VADER lexicon (the call is repeated on every run; the captured
# output shows NLTK reporting the package as already up-to-date) and build the
# module-level analyzer that get_word_sentiment_scores() reads as `sia`.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# ---------------------------
# Function to Extract Word-Level Sentiment Features
# ---------------------------
def get_word_sentiment_scores(text, tokenizer):
    """Score every tokenizer token of ``text`` with the module-level VADER analyzer.

    Args:
        text: Input string to tokenize.
        tokenizer: Object exposing ``tokenize(text) -> list[str]``.

    Returns:
        A float32 tensor of shape ``(num_tokens, 4)`` whose columns are the
        ``pos``, ``neg``, ``neu`` and ``compound`` scores returned by
        ``sia.polarity_scores`` for each token. A text that yields no tokens
        gives an empty ``(0, 4)`` tensor, so the feature dimension stays
        intact for padding and averaging downstream.
    """
    # NOTE(review): tokens are scored exactly as the tokenizer emits them.
    # RoBERTa's BPE presumably yields sub-word pieces with a word-start marker,
    # which likely do not match VADER lexicon entries -- confirm, and consider
    # scoring whole words instead (the inference-side copy would need the same
    # change to keep features consistent).
    tokens = tokenizer.tokenize(text)
    if not tokens:
        # torch.tensor([]) would have shape (0,), losing the feature axis.
        return torch.zeros((0, 4), dtype=torch.float32)

    scores = [
        [polarity['pos'], polarity['neg'], polarity['neu'], polarity['compound']]
        for polarity in (sia.polarity_scores(token) for token in tokens)
    ]
    return torch.tensor(scores, dtype=torch.float32)


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nives\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# ---------------------------
# Dataset Class with Padded Sentiment Features
# ---------------------------
class IMDBDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-token lexicon features.

    Each item is a dict with ``input_ids`` and ``attention_mask`` (rows of the
    batch encoding produced once in ``__init__``), ``sentiment_features`` (the
    variable-length tensor returned by ``get_word_sentiment_scores`` for that
    text) and ``labels`` (a scalar ``torch.long`` tensor).
    """

    def __init__(self, texts, labels, tokenizer):
        """Tokenize all texts up front and compute their lexicon features.

        Args:
            texts: pandas Series or list; every element is converted with ``str``.
            labels: Integer class labels as a pandas Series, NumPy array or
                plain sequence, one per text.
            tokenizer: Callable tokenizer, also passed to
                ``get_word_sentiment_scores``.

        Raises:
            ValueError: If ``texts`` is neither a Series nor a list, or if the
                number of labels differs from the number of texts.
        """
        # Ensure texts are in list format and of type string
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        elif not isinstance(texts, list):
            raise ValueError("Expected 'texts' to be a pandas Series or list of strings.")
        texts = [str(text) for text in texts]

        # np.asarray accepts Series, arrays and lists alike, whereas the
        # previous `labels.values` only worked for pandas objects.
        label_array = np.asarray(labels)
        if label_array.ndim == 0 or label_array.shape[0] != len(texts):
            raise ValueError(
                f"Expected one label per text, got {len(texts)} texts and "
                f"labels of shape {label_array.shape}."
            )

        # Debug: Check the type of texts and sample text data
        print(f"Texts type: {type(texts)}")
        print(f"Sample texts: {texts[:5]}")  # Print first 5 texts

        # Tokenize the texts (padded/truncated to at most 256 tokens)
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=256, return_tensors='pt')

        # Debug: Check the tokenized output
        print(f"Tokenized encodings: {self.encodings.keys()}")

        # get_word_sentiment_scores already returns a float32 tensor; wrapping
        # it in torch.tensor() again only copied it and raised a UserWarning.
        # NOTE(review): these features come from the full, untruncated text, so
        # their length need not match the (truncated) encoding length.
        self.sentiment_features = [
            get_word_sentiment_scores(text, tokenizer) for text in texts
        ]
        self.labels = torch.tensor(label_array, dtype=torch.long)

    def __len__(self):
        """Return the number of labelled examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoding row, lexicon features and label at ``idx``."""
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'sentiment_features': self.sentiment_features[idx],
            'labels': self.labels[idx]
        }

In [4]:
# ---------------------------
# Collate Function for Dataloader
# ---------------------------
def collate_fn(batch):
    """Merge dataset items into one batch, zero-padding the lexicon features.

    Args:
        batch: List of dicts with ``input_ids``, ``attention_mask``,
            ``sentiment_features`` (``(num_tokens, feat_dim)`` tensor, possibly
            empty) and ``labels``.

    Returns:
        Dict with stacked ``input_ids`` / ``attention_mask``, a 1-D ``labels``
        tensor and ``sentiment_features`` padded with zeros to shape
        ``(batch, max_tokens, feat_dim)``, where ``max_tokens`` is at least 1.
    """
    input_ids = torch.stack([item['input_ids'] for item in batch])
    attention_mask = torch.stack([item['attention_mask'] for item in batch])
    labels = torch.stack([torch.as_tensor(item['labels']) for item in batch])

    features = [item['sentiment_features'] for item in batch]
    # Feature width is taken from the first well-formed sample; 4 matches the
    # pos/neg/neu/compound columns when every sample in the batch is empty.
    feat_dim = next((f.shape[-1] for f in features if f.dim() == 2), 4)
    # A sample without tokens may arrive as a shape-(0,) tensor, which
    # pad_sequence cannot combine with (n, feat_dim) tensors.
    features = [
        f if f.numel() else f.new_zeros((0, feat_dim))
        for f in features
    ]
    padded_sentiment_features = pad_sequence(features, batch_first=True, padding_value=0.0)
    if padded_sentiment_features.shape[1] == 0:
        # Every sample was empty: keep one all-zero step so that a mean over
        # the token axis downstream stays finite.
        padded_sentiment_features = padded_sentiment_features.new_zeros((len(batch), 1, feat_dim))

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'sentiment_features': padded_sentiment_features,
        'labels': labels
    }

In [5]:
# ---------------------------
# Load Preprocessed Data
# ---------------------------
# Read the preprocessed tweets; the relative path assumes the notebook is run
# from its own directory.
df = pd.read_csv("../cleaned_data/preprocessed_sentiment140.csv")
# Provenance: an earlier version of this cell loaded the raw Sentiment140 dump
# (columns: labels, id, date, query, user, tweet; ISO-8859-1 encoded) and
# dropped id/date/query/user. Only 'labels' and 'tweet' are used below.

# Sanity check on the label set (the captured run printed [0 1]).
print(df['labels'].unique())
# 80/20 split with a fixed seed. The split is not stratified.
# NOTE(review): rows whose 'tweet' is empty in the CSV would presumably be read
# as NaN and later stringified by the dataset class -- confirm whether such
# rows exist and should be dropped first.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['tweet'], df['labels'], test_size=0.2, random_state=42
)

# Initialize Tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Create Dataset and DataLoader. Each dataset tokenizes and scores its whole
# split eagerly in the constructor, and prints debug samples while doing so.
train_dataset = IMDBDataset(train_texts, train_labels, tokenizer)
val_dataset = IMDBDataset(val_texts, val_labels, tokenizer)

# Only the training loader shuffles; both use collate_fn to pad the
# variable-length sentiment features per batch.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, collate_fn=collate_fn)

[0 1]
Texts type: <class 'list'>
Sample texts: ['ya quot like palm pre touchston charger readynow ye sound good beer readi prelaunch', 'felt earthquak afternoon seem epicent', 'ruffl shirt like likey', 'pretti bad night crappi morn fml buttfac didnt say could go work today', 'yeah clear view']
Tokenized encodings: dict_keys(['input_ids', 'attention_mask'])


  torch.tensor(get_word_sentiment_scores(str(text), tokenizer), dtype=torch.float32) for text in texts


Texts type: <class 'list'>
Sample texts: ['ahhh hope ok', 'cool tweet app razr 2', 'know famili drama lame hey next time u hang kim n u guy like sleepov whatev ill call u', 'school email open geographi stuff revis stupid school', 'upper airway problem']
Tokenized encodings: dict_keys(['input_ids', 'attention_mask'])


In [6]:
# ---------------------------
# Hybrid Model Definition
# ---------------------------
class HybridSentimentModel(nn.Module):
    """Two-class classifier over a RoBERTa sentence vector plus lexicon features.

    The encoder's first-position hidden state is concatenated with a
    16-dimensional projection of the token-averaged lexicon scores and fed to
    a single linear output layer.
    """

    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        # Projects the 4 lexicon scores (pos/neg/neu/compound) to 16 dims.
        self.lexicon_fc = nn.Linear(4, 16)
        # Encoder hidden size + 16 lexicon dims -> 2 class logits.
        self.fc = nn.Linear(self.roberta.config.hidden_size + 16, 2)

    def forward(self, input_ids, attention_mask, sentiment_features):
        """Return class logits of shape ``(batch, 2)``.

        ``sentiment_features`` is averaged over dim 1 before projection; the
        average runs over every position along that axis, padded or not.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 (the CLS slot).
        sentence_vec = encoded.last_hidden_state[:, 0, :]

        averaged_scores = sentiment_features.mean(dim=1)
        lexicon_vec = self.lexicon_fc(averaged_scores)

        fused = torch.cat((sentence_vec, lexicon_vec), dim=1)
        return self.fc(fused)

In [None]:
# ---------------------------
# Initialize Model
# ---------------------------
# Seed torch before any weights are created so head initialisation and the
# training loader's shuffling are repeatable; 42 matches the split's
# random_state.
SEED = 42
torch.manual_seed(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridSentimentModel().to(device)

# Freeze RoBERTa layers initially; train_model() unfreezes them later.
for param in model.roberta.parameters():
    param.requires_grad = False

criterion = nn.CrossEntropyLoss()
# The optimizer is given every parameter (including the frozen encoder ones) so
# that the same optimizer keeps working after the encoder is unfrozen.
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

# Learning Rate Scheduler: linear decay to zero over the whole run, no warmup.
# NUM_EPOCHS must equal the `epochs` value passed to train_model(); otherwise
# the schedule finishes too early or too late.
NUM_EPOCHS = 4
total_steps = len(train_loader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# ---------------------------
# Training Function
# ---------------------------
def train_model(model, train_loader, val_loader, epochs=4):
    """Train ``model`` for ``epochs`` epochs, then evaluate once on ``val_loader``.

    Uses the module-level ``optimizer``, ``criterion``, ``scheduler`` and
    ``device``. At the start of the third epoch (index 2) every parameter of
    ``model.roberta`` is made trainable. The mean training loss is printed per
    epoch; after the last epoch the validation accuracy and a classification
    report are printed. Returns ``None``.
    """
    batch_keys = ('input_ids', 'attention_mask', 'sentiment_features', 'labels')

    def _on_device(batch):
        # Move the four tensors of a collated batch to the target device.
        return [batch[key].to(device) for key in batch_keys]

    for epoch_idx in range(epochs):
        if epoch_idx == 2:  # Unfreeze RoBERTa after 2 epochs
            for weight in model.roberta.parameters():
                weight.requires_grad = True

        model.train()
        running_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()
            ids, mask, lexicon, targets = _on_device(batch)

            batch_loss = criterion(model(ids, mask, lexicon), targets)
            batch_loss.backward()
            optimizer.step()
            scheduler.step()  # one learning-rate update per batch

            running_loss += batch_loss.item()

        mean_loss = running_loss / len(train_loader)
        print(f"Epoch {epoch_idx + 1}, Loss: {mean_loss:.4f}")

    # Single evaluation pass once all epochs are done.
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in val_loader:
            ids, mask, lexicon, targets = _on_device(batch)
            logits = model(ids, mask, lexicon)

            predicted.extend(torch.argmax(logits, dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())

    val_accuracy = accuracy_score(expected, predicted)
    print("Validation Accuracy:", val_accuracy)
    print(classification_report(expected, predicted))

# ---------------------------
# Train the Model
# ---------------------------
# Runs the full training loop followed by a single validation pass. The
# `epochs` value should match the epoch count used to size the LR scheduler.
train_model(model, train_loader, val_loader, epochs=4)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ---------------------------
# Save the Model
# ---------------------------
# Persist the trained weights together with the tokenizer object in one file.
# NOTE(review): the tokenizer is stored as a pickled Python object, so loading
# this checkpoint requires full (non weights-only) unpickling and a compatible
# transformers install. The 'tokenizer' key is read back by the inference cell.
torch.save({
    'model_state_dict': model.state_dict(),
    'tokenizer': tokenizer
}, 'sentiment140model_hybrid.pth')
print("Model saved successfully!")

Model saved successfully!


In [None]:
import torch
import torch.nn.functional as F
from transformers import RobertaTokenizer, RobertaModel
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Load VADER Sentiment Analyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

# Load the trained model and tokenizer.
# The checkpoint stores the tokenizer as a pickled Python object, so it cannot
# be read in weights-only mode (the default since PyTorch 2.6); request full
# unpickling explicitly.
# SECURITY: full unpickling can execute arbitrary code -- only load checkpoint
# files that were produced locally by this notebook or by a trusted source.
checkpoint = torch.load(
    'sentiment140model_hybrid.pth',
    map_location=torch.device('cpu'),
    weights_only=False,
)
tokenizer = checkpoint['tokenizer']

class HybridSentimentModel(torch.nn.Module):
    """Inference-side copy of the hybrid RoBERTa + lexicon classifier.

    Attribute names and layer sizes must stay identical to the class used for
    training, because the saved ``state_dict`` is loaded into this module.
    Layers are referenced through ``torch.nn`` because this cell imports
    ``torch`` but not the ``nn`` alias.
    """

    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained('roberta-base')
        self.lexicon_fc = torch.nn.Linear(4, 16)  # 4 lexicon features -> 16 dimensions
        self.fc = torch.nn.Linear(self.roberta.config.hidden_size + 16, 2)  # Combined -> 2 classes

    def forward(self, input_ids, attention_mask, sentiment_features):
        """Return class logits of shape ``(batch, 2)``.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            sentiment_features: Lexicon scores, averaged over dim 1 before
                being projected by ``lexicon_fc``.
        """
        roberta_output = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        roberta_cls = roberta_output.last_hidden_state[:, 0, :]  # CLS token

        lexicon_out = self.lexicon_fc(sentiment_features.mean(dim=1))  # Mean word-level features
        combined = torch.cat((roberta_cls, lexicon_out), dim=1)

        return self.fc(combined)

# Initialize model and load weights
# Build a fresh model on the available device and copy the checkpoint weights
# (loaded onto the CPU above) into it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridSentimentModel().to(device)
model.load_state_dict(checkpoint['model_state_dict'])
# Switch to evaluation mode for inference.
model.eval()

# Function to extract word sentiment scores
def get_word_sentiment_scores(text):
    """Return per-token VADER scores for ``text`` as a float32 tensor.

    Tokens come from the module-level ``tokenizer`` and are scored with the
    module-level ``sia``. Each row holds ``pos``, ``neg``, ``neu`` and
    ``compound``, in that order.
    """
    rows = []
    for piece in tokenizer.tokenize(text):
        polarity = sia.polarity_scores(piece)
        rows.append([polarity[key] for key in ('pos', 'neg', 'neu', 'compound')])
    return torch.tensor(rows, dtype=torch.float32)

# Function to predict sentiment
def predict_sentiment(text):
    """Classify ``text`` as Negative/Positive with the loaded hybrid model.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Raw input string.

    Returns:
        Dict with ``text`` (the input), ``predicted_class`` (``"Negative"`` or
        ``"Positive"``) and ``confidence`` (softmax probability of the
        predicted class, as a percentage).
    """
    # Tokenize input text (padded to a fixed length of 256 tokens)
    encoding = tokenizer(text, truncation=True, padding='max_length', max_length=256, return_tensors='pt')

    # Extract word-level sentiment features
    sentiment_features = get_word_sentiment_scores(text)
    if sentiment_features.numel() == 0:
        # Text without any tokens (e.g. an empty string): substitute a single
        # all-zero row so the model's mean over tokens is defined and the
        # 4-wide feature axis expected by the lexicon layer is present.
        sentiment_features = torch.zeros((1, 4), dtype=torch.float32)
    sentiment_features = sentiment_features.unsqueeze(0)  # Add batch dimension

    # Move to device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    sentiment_features = sentiment_features.to(device)

    # Make prediction
    with torch.no_grad():
        output = model(input_ids, attention_mask, sentiment_features)
        probabilities = F.softmax(output, dim=1)
        confidence, predicted_class = torch.max(probabilities, dim=1)

    # Define class labels (index order matches the 0/1 training labels)
    class_labels = {0: "Negative", 1: "Positive"}

    return {
        "text": text,
        "predicted_class": class_labels[predicted_class.item()],
        "confidence": confidence.item() * 100  # Convert to percentage
    }

# Example inference
# Example inference: four hand-written sentences are classified in turn and
# reported in the order below.
def _show_prediction(outcome, blank_line_first=False):
    # Print one prediction; optionally start with a blank separator line.
    heading = "\nInput Text:" if blank_line_first else "Input Text:"
    print(heading, outcome['text'])
    print("Predicted Sentiment:", outcome['predicted_class'])
    print("Confidence Score:", outcome['confidence'])


sample_text = "I absolutely loved the movie! It was a masterpiece."
result = predict_sentiment(sample_text)
_show_prediction(result)

sample_text3 = "Oh wow, another software update. I can't wait to lose all my settings again!"
result3 = predict_sentiment(sample_text3)
_show_prediction(result3)

sample_text2 = "The ending was so bad that it ruined the entire movie for me."
result2 = predict_sentiment(sample_text2)
_show_prediction(result2, blank_line_first=True)

sample_text4 = "Oh great, my flight got delayed. More time to enjoy these uncomfortable airport chairs!"
result4 = predict_sentiment(sample_text4)
_show_prediction(result4, blank_line_first=True)


# Handling low-confidence predictions
# if result2['confidence'] < 50:
#     print("This prediction has low confidence and might be unreliable.")
