### RoBERTa

In [1]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
X_train = pd.read_csv('../Data/X_train_smote.csv')
y_train = pd.read_csv('../Data/y_train_smote.csv')
X_test = pd.read_csv('../Data/X_test.csv')
y_test = pd.read_csv('../Data/y_test.csv')


def build_text_frame(features, targets, split_name):
    """Pair the raw text with its sentiment label for one data split.

    Args:
        features: DataFrame expected to hold a 'text' column.
        targets: DataFrame expected to hold a 'sentiment' column.
        split_name: Human-readable split name, used only in error messages.

    Returns:
        DataFrame with exactly two columns, 'text' and 'sentiment'.

    Raises:
        KeyError: if a required column is missing; the message lists the
            columns that are actually present.
        ValueError: if `features` and `targets` have different row counts.
    """
    for frame, column, role in ((features, 'text', 'features'), (targets, 'sentiment', 'labels')):
        if column not in frame.columns:
            raise KeyError(
                f"{split_name} {role} file has no '{column}' column; "
                f"available columns: {list(frame.columns)}"
            )
    # Building a DataFrame from two Series aligns them on the index, so a
    # length mismatch would be padded with NaN instead of raising.
    if len(features) != len(targets):
        raise ValueError(
            f"{split_name} split has {len(features)} feature rows "
            f"but {len(targets)} label rows"
        )
    return pd.DataFrame({'text': features['text'], 'sentiment': targets['sentiment']})


# Combine text and labels into a dataset for RoBERTa
# NOTE(review): the recorded output of this cell is `KeyError: 'text'`, so at
# least one of the CSVs above does not expose a raw 'text' column. Presumably
# the SMOTE-resampled file stores vectorised features rather than text --
# confirm how the files were exported; RoBERTa needs the original strings.
train_data = build_text_frame(X_train, y_train, 'train')
test_data = build_text_frame(X_test, y_test, 'test')

# Define the RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Tokenize the data
def tokenize_data(data, max_length=128):
    """Tokenize the 'text' column of `data` with the notebook-level `tokenizer`.

    Args:
        data: DataFrame with a 'text' column.
        max_length: Maximum number of tokens kept per example; longer inputs
            are truncated. Defaults to 128.

    Returns:
        Whatever the tokenizer returns for the list of texts, called with
        padding=True and truncation=True.
    """
    # Missing values (e.g. empty strings that round-tripped through CSV and
    # came back as NaN) are not strings; normalise them to '' so every element
    # handed to the tokenizer is a str.
    texts = data['text'].fillna('').astype(str).tolist()
    return tokenizer(texts, padding=True, truncation=True, max_length=max_length)

train_encodings = tokenize_data(train_data)
test_encodings = tokenize_data(test_data)

# Convert labels to a tensor
# The classification head is trained with cross-entropy over class ids
# 0..num_labels-1, so raw sentiment values are mapped to contiguous ids in
# sorted order. If the values are already 0, 1, 2 this is an identity mapping.
# NOTE(review): the confusion-matrix tick labels elsewhere in this notebook
# suggest the raw values are -1/0/1 -- confirm against the label files.
label_values = sorted(train_data['sentiment'].unique())
label2id = {value: class_id for class_id, value in enumerate(label_values)}

unseen_labels = set(test_data['sentiment'].unique()) - set(label2id)
if unseen_labels:
    raise ValueError(f"Test labels not present in the training data: {sorted(unseen_labels)}")

train_labels = torch.tensor(train_data['sentiment'].map(label2id).tolist(), dtype=torch.long)
test_labels = torch.tensor(test_data['sentiment'].map(label2id).tolist(), dtype=torch.long)





KeyError: 'text'

In [None]:
# Create Dataset Class for RoBERTa

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    `encodings` is a mapping from field name to a per-example sequence;
    `labels` is an indexable collection with one entry per example.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors plus its 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = self.labels[idx]
        return sample

# Wrap each split's encodings and labels in a SentimentDataset
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels)


In [None]:
# Train RoBERTa model

# Load the pretrained "roberta-base" checkpoint with a 3-class classification head
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)

# Hyperparameters gathered in one mapping, then expanded into TrainingArguments
training_config = dict(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)
training_args = TrainingArguments(**training_config)

# Wire the model, arguments and both datasets into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=test_dataset)

# Run fine-tuning
trainer.train()

In [None]:
# Evaluate Model

# Class ids the 3-label model can emit, and the tick label shown for each.
# NOTE(review): the tick labels assume class ids 0/1/2 stand for sentiments
# -1/0/1 in that order -- confirm against the label encoding used for training.
class_ids = [0, 1, 2]
class_names = ['-1', '0', '1']

# Make predictions on the test dataset: argmax over the last axis of the
# prediction array picks the highest-scoring class id per example
y_pred_roberta = trainer.predict(test_dataset).predictions.argmax(axis=-1)
y_true_roberta = test_labels.numpy()

# Performance metrics
accuracy_roberta = accuracy_score(y_true_roberta, y_pred_roberta)
print("🔹 RoBERTa Performance:")
print(f"Accuracy: {accuracy_roberta:.4f}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_true_roberta, y_pred_roberta))

# Confusion Matrix
# Pin the label set so the matrix is always 3x3; without `labels=` a class
# absent from both y_true and y_pred is dropped and the three tick labels
# below would no longer line up with the rows/columns.
conf_matrix = confusion_matrix(y_true_roberta, y_pred_roberta, labels=class_ids)
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('RoBERTa Confusion Matrix')
plt.show()
