In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaModel, XLMRobertaTokenizer
from sklearn.metrics import classification_report
import pandas as pd

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
class FauxHateDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

        # Map labels to integers, treating blank as "N/A"
        self.target_map = {'O': 0, 'I': 1, 'R': 2, 'N/A': 3}
        self.severity_map = {'H': 0, 'M': 1, 'L': 2, 'N/A': 3}

        # Fill blanks with 'N/A' and map to integers
        self.data['Target'] = self.data['Target'].fillna('N/A').map(self.target_map)
        self.data['Severity'] = self.data['Severity'].fillna('N/A').map(self.severity_map)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        text = row['Tweet']
        label_target = row['Target']
        label_severity = row['Severity']

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels_target': torch.tensor(label_target, dtype=torch.long),
            'labels_severity': torch.tensor(label_severity, dtype=torch.long)
        }

In [13]:
class AttentionLayer(nn.Module):
    def __init__(self, hidden_dim):
        super(AttentionLayer, self).__init__()
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        attn_weights = torch.softmax(self.attn(features), dim=1)
        weighted_representation = torch.sum(features * attn_weights, dim=1)
        return weighted_representation

In [14]:
class MultiTaskAttentionModel(nn.Module):
    def __init__(self, num_classes_target=4, num_classes_severity=4, hidden_dim=768):
        super(MultiTaskAttentionModel, self).__init__()
        self.xlm_roberta = XLMRobertaModel.from_pretrained('xlm-roberta-base')

        # Task-specific attention layers
        self.target_attention = AttentionLayer(hidden_dim)
        self.severity_attention = AttentionLayer(hidden_dim)

        # Final classification layers
        self.target_classifier = nn.Linear(hidden_dim, num_classes_target)
        self.severity_classifier = nn.Linear(hidden_dim, num_classes_severity)

    def forward(self, input_ids, attention_mask):
        outputs = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)
        all_hidden_states = outputs.last_hidden_state  # Get token-level representations

        target_representation = self.target_attention(all_hidden_states)
        severity_representation = self.severity_attention(all_hidden_states)

        target_logits = self.target_classifier(target_representation)
        severity_logits = self.severity_classifier(severity_representation)

        return target_logits, severity_logits


In [15]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, patience=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            target_labels = batch['labels_target'].to(device)
            severity_labels = batch['labels_severity'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            optimizer.zero_grad()
            target_logits, severity_logits = model(input_ids, attention_mask)

            target_loss = criterion(target_logits, target_labels)
            severity_loss = criterion(severity_logits, severity_labels)
            loss = target_loss + severity_loss

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        train_loss /= len(train_loader)
        val_loss, val_target_preds, val_severity_preds, val_target_labels, val_severity_labels = evaluate_model(model, val_loader, criterion, device)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
        print("Validation Target Classification Report:")
        print(classification_report(val_target_labels, val_target_preds))
        print("Validation Severity Classification Report:")
        print(classification_report(val_severity_labels, val_severity_preds))
        print("--------------------------------------------------")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), '/content/drive/MyDrive/Icon Conference/multitaskXLMRoBERTa/best_attention_model_task_b.pth')  # Save the best model
            print("Model improved, saving current model.")
        else:
            patience_counter += 1
            print(f"No improvement. Early stopping counter: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("Early stopping triggered. Training terminated.")
            break

In [16]:
def evaluate_model(model, val_loader, criterion, device):
    model.eval()
    val_loss = 0.0
    val_target_preds, val_severity_preds = [], []
    val_target_labels, val_severity_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            target_labels = batch['labels_target'].to(device)
            severity_labels = batch['labels_severity'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            target_logits, severity_logits = model(input_ids, attention_mask)
            target_loss = criterion(target_logits, target_labels)
            severity_loss = criterion(severity_logits, severity_labels)
            loss = target_loss + severity_loss
            val_loss += loss.item()

            val_target_preds.extend(torch.argmax(target_logits, dim=1).cpu().numpy())
            val_severity_preds.extend(torch.argmax(severity_logits, dim=1).cpu().numpy())
            val_target_labels.extend(target_labels.cpu().numpy())
            val_severity_labels.extend(severity_labels.cpu().numpy())

    val_loss /= len(val_loader)
    return val_loss, val_target_preds, val_severity_preds, val_target_labels, val_severity_labels


In [17]:
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
train_data = pd.read_csv("/content/drive/MyDrive/Icon Conference/Data/cleaned_train_b.csv").dropna(subset=['Tweet'])
val_data = pd.read_csv("/content/drive/MyDrive/Icon Conference/Data/cleaned_val_b.csv").dropna(subset=['Tweet'])
MAX_LEN = 180

train_dataset = FauxHateDataset(train_data, tokenizer, MAX_LEN)
val_dataset = FauxHateDataset(val_data, tokenizer, MAX_LEN)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

model = MultiTaskAttentionModel()
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

In [18]:
train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=30, patience=3)

Epoch 1/30, Train Loss: 2.3238, Val Loss: 2.1500
Validation Target Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.71      0.73       274
           1       0.39      0.42      0.41       140
           2       0.40      0.70      0.51        99
           3       0.68      0.52      0.59       286

    accuracy                           0.59       799
   macro avg       0.56      0.59      0.56       799
weighted avg       0.62      0.59      0.59       799

Validation Severity Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.22      0.34        74
           1       0.53      0.64      0.58       182
           2       0.41      0.58      0.48       257
           3       0.68      0.48      0.57       286

    accuracy                           0.52       799
   macro avg       0.61      0.48      0.49       799
weighted avg       0.57      0.52      0.52       79

In [20]:
model = MultiTaskAttentionModel()
model.load_state_dict(torch.load("/content/drive/MyDrive/Icon Conference/multitaskXLMRoBERTa/best_attention_model_task_b.pth"))
model.eval()
target_map_inv = {0: 'O', 1: 'I', 2: 'R', 3: 'N/A'}
severity_map_inv = {0: 'H', 1: 'M', 2: 'L', 3: 'N/A'}


  model.load_state_dict(torch.load("/content/drive/MyDrive/Icon Conference/multitaskXLMRoBERTa/best_attention_model_task_b.pth"))


In [21]:
class TestDataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data.iloc[index]
        text = row['Tweet']

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'Id': row['Id']  # Keep track of the Id for output
        }

In [22]:
# Load the test data
test_data = pd.read_csv("/content/drive/MyDrive/Icon Conference/Data/cleaned_test_b.csv")  # This should have 'Id' and 'Tweet' columns
test_dataset = TestDataset(test_data, tokenizer, MAX_LEN)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [23]:
def generate_predictions(model, test_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    predictions = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            ids = batch['Id']  # Ids to keep track of the original tweet

            target_logits, severity_logits = model(input_ids, attention_mask)
            target_preds = torch.argmax(target_logits, dim=1)
            severity_preds = torch.argmax(severity_logits, dim=1)

            for i in range(len(ids)):
                predictions.append({
                    'Id': ids[i],
                    'Target': target_map_inv[target_preds[i].item()],
                    'Severity': severity_map_inv[severity_preds[i].item()]
                })

    return pd.DataFrame(predictions)

In [24]:
predictions_df = generate_predictions(model, test_loader)

In [25]:
predictions_df.to_csv("/content/drive/MyDrive/Icon Conference/Data/predictions_task_b.csv", index=False)
