### Load and Prepare Raw Dataset

In [1]:
import pandas as pd
from collections import Counter
import re

# Load the dataset
file_path = "batch_400K.csv"
df = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip", low_memory=False)

# Keep only necessary columns and drop rows with missing 'tweet' or 'classes'
df = df[['tweet', 'classes']].dropna(subset=['tweet', 'classes'])

# Process the 'classes' column: split by comma and clean each label
df['label_list'] = df['classes'].apply(lambda x: [label.strip().lower() for label in x.split(",") if label.strip() != ""])

# Compute label distribution for the entire dataset
all_labels = []
for labels in df['label_list']:
    all_labels.extend(labels)
label_counts = Counter(all_labels)

# Convert the Counter to a DataFrame and sort by count descending
df_label_distribution = pd.DataFrame(list(label_counts.items()), columns=["Label", "Count"])
df_label_distribution = df_label_distribution.sort_values(by="Count", ascending=False).reset_index(drop=True)

print("Initial Label Distribution:")
display(df_label_distribution)

Initial Label Distribution:


Unnamed: 0,Label,Count
0,negative,124351
1,positive,119483
2,sarcastic,66213
3,ironic,58302
4,mixed,49040
5,neutral,46244
6,unclear,795


### Remove 'Unclear' Labels

In [3]:
from collections import Counter
import pandas as pd

# Drop rows containing "unclear"
df = df[~df['label_list'].apply(lambda labels: 'unclear' in labels)]

# Compute label distribution
all_labels_cleaned = []
for labels in df['label_list']:
    all_labels_cleaned.extend(labels)

label_counts_cleaned = Counter(all_labels_cleaned)

# Convert to DataFrame and sort
df_label_distribution_cleaned = pd.DataFrame(list(label_counts_cleaned.items()), columns=["Label", "Count"])
df_label_distribution_cleaned = df_label_distribution_cleaned.sort_values(by="Count", ascending=False).reset_index(drop=True)

# Display label distribution
print("Label Distribution After Dropping 'unclear':")
display(df_label_distribution_cleaned)

Label Distribution After Dropping 'unclear':


Unnamed: 0,Label,Count
0,negative,124348
1,positive,119480
2,sarcastic,66209
3,ironic,58300
4,mixed,49013
5,neutral,46238


Clean Dataset: Remove Duplicates and Links

In [5]:
import pandas as pd
from collections import Counter
import re

# Load the dataset
file_path = "batch_400K.csv"
df = pd.read_csv(file_path, encoding="utf-8", on_bad_lines="skip", low_memory=False)

# Keep only necessary columns and drop missing values
df = df[['tweet', 'classes']].dropna(subset=['tweet', 'classes'])

# Convert 'classes' string to a list of labels
df['label_list'] = df['classes'].apply(lambda x: [label.strip().lower() for label in x.split(",") if label.strip() != ""])

# Drop rows with 'unclear' in the labels
df = df[~df['label_list'].apply(lambda labels: 'unclear' in labels)]

# Drop exact duplicate tweets
df = df.drop_duplicates(subset='tweet').reset_index(drop=True)
print(f"After removing duplicates, dataset size: {df.shape[0]}")

# Remove just links (leave everything else untouched)
df['tweet'] = df['tweet'].apply(lambda text: re.sub(r'http\S+|www\S+', '', text))

# Compute label distribution
all_labels = []
for labels in df['label_list']:
    all_labels.extend(labels)

label_counts = Counter(all_labels)
df_label_distribution = pd.DataFrame(label_counts.items(), columns=["Label", "Count"])
df_label_distribution = df_label_distribution.sort_values(by="Count", ascending=False).reset_index(drop=True)

# Display preview and label distribution
print("First few rows of the cleaned dataset:")
display(df.head())

print("Label distribution (after dropping 'unclear' & duplicates):")
display(df_label_distribution)

After removing duplicates, dataset size: 344984
First few rows of the cleaned dataset:


Unnamed: 0,tweet,classes,label_list
0,congrats modi for launching youtube streaming ...,"positive, sarcastic","[positive, sarcastic]"
1,course willing free show tickets are not his p...,mixed,[mixed]
2,Going to the gym is like going to a party wher...,mixed,[mixed]
3,@odarling @Magpie_Guy - Socialise with real hu...,positive,[positive]
4,Spent 3 hours watching tutorial videos. Still ...,"sarcastic, neutral","[sarcastic, neutral]"


Label distribution (after dropping 'unclear' & duplicates):


Unnamed: 0,Label,Count
0,negative,110053
1,positive,94529
2,sarcastic,61810
3,ironic,50494
4,mixed,45435
5,neutral,43823


### Save Cleaned Dataset

In [6]:
# Save the cleaned dataset to a new CSV file
output_path = "batch.400K(no_dupe).csv"
df.to_csv(output_path, index=False, encoding='utf-8')

print(f"Cleaned dataset saved as: {output_path}")

Cleaned dataset saved as: batch.400K(no_dupe).csv


### Multi-Label Stratified Train/Test Split

In [7]:
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
import numpy as np

# List of all possible labels (e.g., sarcastic, ironic, etc.)
all_possible_labels = list({label for labels in df['label_list'] for label in labels})
print("All possible labels:", all_possible_labels)

# Create binary matrix for multi-label stratification
def create_binary_matrix(label_lists, all_labels):
    return [[1 if label in labels else 0 for label in all_labels] for labels in label_lists]

Y = np.array(create_binary_matrix(df['label_list'], all_possible_labels))

# Stratified split (80% train, 20% test)
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(msss.split(df, Y))

train_df = df.iloc[train_idx].reset_index(drop=True)
test_df = df.iloc[test_idx].reset_index(drop=True)

print(f"Training set size: {train_df.shape[0]}")
print(f"Test set size: {test_df.shape[0]}")

All possible labels: ['negative', 'sarcastic', 'positive', 'ironic', 'mixed', 'neutral']
Training set size: 275986
Test set size: 68998


### Tokenization + Dataset Preparation (Multi-Label, RoBERTa)

In [8]:
from transformers import AutoTokenizer
from torch.utils.data import Dataset
import torch
import numpy as np

# Load the RoBERTa tokenizer
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create a binary matrix for labels
def encode_labels(label_lists, all_labels):
    return np.array([
        [1 if label in row else 0 for label in all_labels]
        for row in label_lists
    ])

# Encode train and test labels
train_labels = encode_labels(train_df['label_list'], all_possible_labels)
test_labels = encode_labels(test_df['label_list'], all_possible_labels)

# Define a PyTorch Dataset class for HuggingFace Trainer
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.float)

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Return inputs in Trainer-compatible format
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': label
        }

# Create train and test datasets
train_dataset = TweetDataset(train_df['tweet'].tolist(), train_labels, tokenizer)
test_dataset = TweetDataset(test_df['tweet'].tolist(), test_labels, tokenizer)

print("Tokenization complete. Datasets are ready.")

Tokenization complete. Datasets are ready.


### Load, Train & Save RoBERTa Model (Multi-Label)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, precision_score, recall_score
import torch

# Automatically detect GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load RoBERTa with multi-label setup
num_labels = len(all_possible_labels)
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
    problem_type="multi_label_classification"
).to(device)

# Metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs >= 0.5).int().numpy()
    labels = labels.astype(int)

    return {
        "f1_macro": f1_score(labels, preds, average='macro', zero_division=0),
        "precision_macro": precision_score(labels, preds, average='macro', zero_division=0),
        "recall_macro": recall_score(labels, preds, average='macro', zero_division=0),
    }

# Training configuration
training_args = TrainingArguments(
    output_dir="./results_multilabel_full",       # Model will be saved here
    evaluation_strategy="epoch",                  # Evaluate at end of each epoch
    save_strategy="epoch",                        # Save after every epoch
    load_best_model_at_end=True,                  # Use best version automatically
    metric_for_best_model="f1_macro",             # Based on F1 score
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_steps=100,
    fp16=torch.cuda.is_available(),               # Mixed precision on GPU (if available)
)

# Create Trainer object
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

🚀 Using device: cuda


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/51750 [00:00<?, ?it/s]

{'loss': 0.4421, 'grad_norm': 0.8763728737831116, 'learning_rate': 4.990338164251208e-05, 'epoch': 0.01}
{'loss': 0.3973, 'grad_norm': 1.6078428030014038, 'learning_rate': 4.980676328502415e-05, 'epoch': 0.01}
{'loss': 0.3773, 'grad_norm': 1.6943782567977905, 'learning_rate': 4.9710144927536237e-05, 'epoch': 0.02}
{'loss': 0.3579, 'grad_norm': 1.9690569639205933, 'learning_rate': 4.9613526570048315e-05, 'epoch': 0.02}
{'loss': 0.3364, 'grad_norm': 2.6708874702453613, 'learning_rate': 4.9516908212560386e-05, 'epoch': 0.03}
{'loss': 0.3281, 'grad_norm': 1.966283917427063, 'learning_rate': 4.9420289855072464e-05, 'epoch': 0.03}
{'loss': 0.3207, 'grad_norm': 3.538959503173828, 'learning_rate': 4.932367149758454e-05, 'epoch': 0.04}
{'loss': 0.3266, 'grad_norm': 3.614055633544922, 'learning_rate': 4.922705314009662e-05, 'epoch': 0.05}
{'loss': 0.3165, 'grad_norm': 2.1703553199768066, 'learning_rate': 4.91304347826087e-05, 'epoch': 0.05}
{'loss': 0.3043, 'grad_norm': 2.213656425476074, 'learn

  0%|          | 0/4313 [00:00<?, ?it/s]

{'eval_loss': 0.2647472321987152, 'eval_f1_macro': 0.6167814019651642, 'eval_precision_macro': 0.7520799647979256, 'eval_recall_macro': 0.5457464308240857, 'eval_runtime': 104.9066, 'eval_samples_per_second': 657.709, 'eval_steps_per_second': 41.113, 'epoch': 1.0}
{'loss': 0.2683, 'grad_norm': 2.0122339725494385, 'learning_rate': 3.329371980676329e-05, 'epoch': 1.0}
{'loss': 0.2413, 'grad_norm': 1.813323974609375, 'learning_rate': 3.3197101449275366e-05, 'epoch': 1.01}
{'loss': 0.2528, 'grad_norm': 2.1719465255737305, 'learning_rate': 3.3100483091787444e-05, 'epoch': 1.01}
{'loss': 0.2624, 'grad_norm': 1.3266047239303589, 'learning_rate': 3.3003864734299515e-05, 'epoch': 1.02}
{'loss': 0.2573, 'grad_norm': 1.803391933441162, 'learning_rate': 3.2907246376811593e-05, 'epoch': 1.03}
{'loss': 0.2544, 'grad_norm': 5.096257209777832, 'learning_rate': 3.281062801932367e-05, 'epoch': 1.03}
{'loss': 0.242, 'grad_norm': 2.73459792137146, 'learning_rate': 3.271400966183575e-05, 'epoch': 1.04}
{'l

  0%|          | 0/4313 [00:00<?, ?it/s]

{'eval_loss': 0.2619418501853943, 'eval_f1_macro': 0.6397526773872966, 'eval_precision_macro': 0.7510739819414312, 'eval_recall_macro': 0.5771059114551075, 'eval_runtime': 104.8764, 'eval_samples_per_second': 657.898, 'eval_steps_per_second': 41.125, 'epoch': 2.0}
{'loss': 0.2282, 'grad_norm': 1.5204733610153198, 'learning_rate': 1.6582608695652176e-05, 'epoch': 2.01}
{'loss': 0.2213, 'grad_norm': 2.410201072692871, 'learning_rate': 1.6485990338164254e-05, 'epoch': 2.01}
{'loss': 0.2233, 'grad_norm': 2.4890148639678955, 'learning_rate': 1.639033816425121e-05, 'epoch': 2.02}
{'loss': 0.2276, 'grad_norm': 3.089738130569458, 'learning_rate': 1.6293719806763285e-05, 'epoch': 2.02}
{'loss': 0.2264, 'grad_norm': 1.6722990274429321, 'learning_rate': 1.619806763285024e-05, 'epoch': 2.03}
{'loss': 0.2252, 'grad_norm': 2.4561803340911865, 'learning_rate': 1.6101449275362322e-05, 'epoch': 2.03}
{'loss': 0.2258, 'grad_norm': 1.9559632539749146, 'learning_rate': 1.6004830917874397e-05, 'epoch': 2.0

  0%|          | 0/4313 [00:00<?, ?it/s]

{'eval_loss': 0.26113641262054443, 'eval_f1_macro': 0.6510836303865438, 'eval_precision_macro': 0.7452413551932254, 'eval_recall_macro': 0.5877528001475311, 'eval_runtime': 106.2236, 'eval_samples_per_second': 649.554, 'eval_steps_per_second': 40.603, 'epoch': 3.0}
{'train_runtime': 6391.1981, 'train_samples_per_second': 129.547, 'train_steps_per_second': 8.097, 'train_loss': 0.2528356359546311, 'epoch': 3.0}


TrainOutput(global_step=51750, training_loss=0.2528356359546311, metrics={'train_runtime': 6391.1981, 'train_samples_per_second': 129.547, 'train_steps_per_second': 8.097, 'total_flos': 5.446318173492326e+16, 'train_loss': 0.2528356359546311, 'epoch': 3.0})

### Save the Trained Model

In [None]:
# Save the model and tokenizer to a new directory
save_path = "./roberta-multilabel-full"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model and tokenizer saved to: {save_path}")

### Generate a Detailed Classification Report

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Evaluate the model on the test dataset
predictions = trainer.predict(test_dataset)

# Apply sigmoid and threshold
probs = torch.sigmoid(torch.tensor(predictions.predictions))
y_pred = (probs >= 0.5).int().numpy()
y_true = predictions.label_ids.astype(int)

# Classification report
report = classification_report(
    y_true, y_pred, target_names=all_possible_labels, zero_division=0
)
print("Classification Report (Multi-Label):")
print(report)

  0%|          | 0/4313 [00:00<?, ?it/s]

📋 Classification Report (Multi-Label):
              precision    recall  f1-score   support

     neutral       0.70      0.44      0.54      8765
    positive       0.83      0.82      0.83     18906
       mixed       0.64      0.44      0.52      9087
   sarcastic       0.82      0.50      0.62     12362
    negative       0.80      0.78      0.79     22010
      ironic       0.69      0.54      0.61     10099

   micro avg       0.77      0.64      0.70     81229
   macro avg       0.75      0.59      0.65     81229
weighted avg       0.77      0.64      0.69     81229
 samples avg       0.72      0.68      0.69     81229

