In [1]:
!pip install transformers
!pip install lime
!pip install scikit-learn

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=5e18a3fd1474a2dd4360d361628251a7d47a6f69325ef9841ff9518367c5f716
  Stored in directory: /root/.cache/pip/wheels/85/fa/a3/9c2d44c9f3cd77cf4e533b58900b2bf4487f2a17e8ec212a3d
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import lime
from lime.lime_text import LimeTextExplainer

In [3]:
import pandas as pd

# Read the merged sentiment CSV from the Colab working directory
df = pd.read_csv(filepath_or_buffer='/content/dataset.csv')

# Show the column names, then preview the first five rows as the cell output
print(df.columns)
df.head(n=5)


Index(['sentiment', 'text', 'user'], dtype='object')


Unnamed: 0,sentiment,text,user
0,neutral,Michael Parenti. \To Kill a Nation: The Attack...,juristnaprovode
1,positive,RT @VanberghenEU: 100 days of war in #Ukraine ...,VanberghenEU
2,neutral,RT @EU_today: Meet the crew of the Spanish sub...,EU_today
3,neutral,RT @radio3mondo: Il podcast della rassegna sta...,CostanzaSpocci
4,positive,RT @jensstoltenberg: Great to meet with Prime ...,jonathankrico


In [4]:
# Normalise the schema: downstream cells expect a 'content' text column and a
# 'sentiment' label column. Renaming is a no-op for columns that are not present.
df = df.rename(columns={'text': 'content', 'category': 'sentiment'})

# Map Sentiment Labels
label2id = {'Positive': 0, 'Negative': 1, 'Neutral': 2}
# Derive the inverse mapping so the two dictionaries can never drift apart
id2label = {class_id: name for name, class_id in label2id.items()}

# Keep only 'content' and 'sentiment'. The explicit copy means the column
# assignments below write to an independent frame (no SettingWithCopyWarning).
df = df[['content', 'sentiment']].copy()

# Normalise spelling ('positive', ' Positive ' -> 'Positive') before mapping
df['sentiment'] = df['sentiment'].str.strip().str.capitalize()
df['label'] = df['sentiment'].map(label2id)

# Drop rows without text or with a sentiment that is not in label2id
rows_before = len(df)
df = (
    df
    .dropna(subset=['content', 'label'])
    # map() produces float64 as soon as one row is unmapped (NaN); the classifier
    # needs integer class ids, so cast back once the NaN rows are gone.
    .astype({'label': int})
)
rows_dropped = rows_before - len(df)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing text or unknown sentiment.")

# Get final texts and labels
texts = df['content'].astype(str).tolist()
labels = df['label'].tolist()

print(f"Dataset ready: {len(texts)} samples.")


Dataset ready: 1188 samples.


In [5]:
from transformers import RobertaTokenizer

# Fetch the pretrained 'roberta-base' tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Encode every text: truncate to at most 256 tokens and pad to a common length
encodings = tokenizer(
    texts,
    max_length=256,
    padding=True,
    truncation=True,
)
print("Tokenization done.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenization done.


In [6]:
import torch
from torch.utils.data import Dataset

class NewsDataset(Dataset):
    """Torch dataset pairing tokenized texts with integer class labels.

    Args:
        texts: Sequence of raw strings to tokenize.
        labels: Sequence of class ids, one per text. Values are converted to
            ``int`` when an item is fetched, so ``1.0`` is accepted as ``1``.
        encoder: Optional callable used to tokenize ``texts``; it is called as
            ``encoder(texts, truncation=True, padding=True, max_length=...)`` and
            must return a mapping of field name -> per-sample sequences.
            Defaults to the notebook-level ``tokenizer``.
        max_length: Maximum number of tokens per sample passed to the encoder.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, encoder=None, max_length=256):
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        if encoder is None:
            # Fall back to the tokenizer created earlier in the notebook
            encoder = tokenizer
        self.encodings = encoder(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force an integer (long) tensor: a float label would otherwise produce a
        # float tensor, which a classification head does not treat as a class id.
        item['labels'] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

print("Dataset class ready.")

Dataset class ready.


In [8]:
import os
# Set WANDB_DISABLED=true in this process's environment before any training runs
os.environ.update({"WANDB_DISABLED": "true"})

In [None]:
from transformers import RobertaForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(pred):
    """Score one evaluation pass.

    Reads the gold class ids from ``pred.label_ids`` and takes the arg-max over
    the last axis of ``pred.predictions`` as the predicted class.

    Returns:
        dict with 'accuracy' plus macro-averaged 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(axis=-1)

    # The fourth element (support) is not reported, so only the first three are kept
    macro_precision, macro_recall, macro_f1 = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )[:3]

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': macro_f1,
        'precision': macro_precision,
        'recall': macro_recall,
    }

# 5-Fold CV: stratified so each fold keeps the overall class proportions;
# the fixed random_state makes the split reproducible.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold validation scores, averaged in a later cell
accuracies = []
f1_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels)):
    print(f'🔵 Fold {fold+1}')

    train_texts = [texts[i] for i in train_idx]
    train_labels = [labels[i] for i in train_idx]
    val_texts = [texts[i] for i in val_idx]
    val_labels = [labels[i] for i in val_idx]

    train_dataset = NewsDataset(train_texts, train_labels)
    val_dataset = NewsDataset(val_texts, val_labels)

    # Start every fold from the pretrained checkpoint so folds do not share
    # fine-tuned weights. The label maps are stored in the model config so that
    # predictions carry readable class names instead of LABEL_0/1/2.
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=len(label2id),
        id2label=id2label,
        label2id=label2id,
    )

    training_args = TrainingArguments(
        output_dir=f'./results_fold_{fold+1}',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        # One log directory per fold so runs do not overwrite each other
        logging_dir=f'./logs/fold_{fold+1}',
        logging_steps=10,
        learning_rate=2e-5,
        load_best_model_at_end=False,
        # Supported way to switch off experiment trackers (W&B etc.); the
        # WANDB_DISABLED environment variable is deprecated.
        report_to='none',
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_result = trainer.evaluate()

    accuracies.append(eval_result['eval_accuracy'])
    f1_scores.append(eval_result['eval_f1'])

# NOTE: after the loop, `model`, `trainer`, `val_texts` and `val_labels` refer to
# the LAST fold; the confusion-matrix and LIME cells below rely on that.
print("Training complete.")


🔵 Fold 1


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,0.9428
20,0.7897
30,0.8402
40,0.6903
50,0.7936
60,0.7485
70,0.5845
80,0.6523
90,0.4864
100,0.5886


🔵 Fold 2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,0.9624


Step,Training Loss
10,0.9624
20,0.8861
30,0.7291
40,0.7177
50,0.6361
60,0.6611
70,0.5674
80,0.4934
90,0.5145
100,0.4455


🔵 Fold 3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss
10,0.9569


In [None]:
# Report the mean validation accuracy and macro-F1 over all completed folds
print(
    f'Average Accuracy across folds: {np.mean(accuracies):.4f}',
    f'Average F1-score across folds: {np.mean(f1_scores):.4f}',
    sep='\n',
)

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Predict on last fold validation set (`trainer`, `val_texts` and `val_labels`
# are the objects left over from the final cross-validation iteration)
val_dataset = NewsDataset(val_texts, val_labels)
preds_output = trainer.predict(val_dataset)
preds = np.argmax(preds_output.predictions, axis=1)

# Create confusion matrix.
# Pin the class order explicitly: without `labels=`, a class that appears in
# neither the true nor the predicted labels is left out, which shrinks the matrix
# and shifts the tick labels onto the wrong rows/columns.
class_ids = sorted(id2label)
class_names = [id2label[class_id] for class_id in class_ids]
cm = confusion_matrix(val_labels, preds, labels=class_ids)

# Plot
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix - Last Fold')
plt.show()


In [None]:
!pip install lime

from lime.lime_text import LimeTextExplainer

# Define prediction function for LIME
def predict_proba(texts, batch_size=32):
    """Return class probabilities for a collection of raw texts.

    Uses the notebook-level ``tokenizer`` and ``model``. LIME calls this with the
    full set of perturbed samples at once, so the texts are processed in
    mini-batches under ``torch.no_grad()`` to keep memory bounded.

    Args:
        texts: Iterable of strings to classify.
        batch_size: Number of texts sent through the model per forward pass.

    Returns:
        numpy array of shape (len(texts), num_classes); each row is the softmax
        over the model's logits.

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    texts = list(texts)
    if not texts:
        return np.empty((0, model.config.num_labels))

    # The model may live on GPU after training; inputs must be on the same device
    device = model.device
    # Evaluation mode disables dropout so repeated calls give the same output
    model.eval()

    batch_probs = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=256)
            inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
            # Move back to CPU before converting; .numpy() fails on CUDA tensors
            batch_probs.append(probs.cpu().numpy())

    return np.concatenate(batch_probs, axis=0)

# Build the LIME text explainer; names are listed in label-id order (0, 1, 2)
explainer = LimeTextExplainer(class_names=['Positive', 'Negative', 'Neutral'])

# Position in the last fold's validation texts of the example to explain
# (fixed at the first one; edit to inspect a different example)
idx = 0

# Explain the chosen text, keeping the 6 most influential words
exp = explainer.explain_instance(
    text_instance=val_texts[idx],
    classifier_fn=predict_proba,
    num_features=6,
)

# Render the explanation inline, including the highlighted source text
exp.show_in_notebook(text=True)
