# Siamese Network for Cross-Similarity

In [None]:
# Colab-only: mount Google Drive at /content/drive so the CSV files and the
# checkpoint path referenced later in this notebook are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Libraries

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaModel, XLMRobertaTokenizer

## Dataset Class

In [None]:
class SiameseFauxHateDataset(Dataset):
    """Pairs every tweet with a randomly drawn partner tweet from the same frame.

    Item ``index`` is the pair (row ``index``, partner row). The targets are
    *agreement* labels rather than the raw class labels: ``label_fake`` is 1 when
    both rows carry the same ``Fake`` value and 0 otherwise, and ``label_hate``
    is the same comparison on ``Hate``. The partner is drawn uniformly over all
    rows, so a row can occasionally be paired with itself.

    Args:
        data: pandas DataFrame with ``Tweet``, ``Fake`` and ``Hate`` columns.
        tokenizer: Hugging Face style tokenizer; it is *called* with the text and
            must return a mapping holding ``input_ids`` and ``attention_mask``
            tensors.
        max_len: length every text is padded / truncated to.
        pair_seed: optional int. When ``None`` (default) the partner is drawn
            from torch's global RNG, so pairs change on every access and follow
            ``torch.manual_seed`` and the per-worker seeding done by
            ``DataLoader``. When set, the partner of a given ``index`` is always
            the same, which keeps an evaluation set identical from epoch to epoch.
    """

    def __init__(self, data, tokenizer, max_len, pair_seed=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pair_seed = pair_seed

    def __len__(self):
        # One pair is produced per row of the frame.
        return len(self.data)

    def _partner_position(self, index):
        """Return the positional index of the partner row for ``index``."""
        if self.pair_seed is None:
            generator = None  # fall back to torch's global RNG
        else:
            # A private generator seeded per index makes the draw repeatable
            # without touching the global RNG state.
            generator = torch.Generator().manual_seed(self.pair_seed + int(index))
        return int(torch.randint(len(self.data), (1,), generator=generator).item())

    def _encode(self, text):
        """Tokenize one text and return flat ``(input_ids, attention_mask)`` tensors."""
        # Calling the tokenizer directly replaces the deprecated ``encode_plus``.
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='pt'
        )
        # return_tensors='pt' yields a leading batch axis of size 1; drop it so
        # the DataLoader can stack items into (batch, max_len).
        return encoding['input_ids'].flatten(), encoding['attention_mask'].flatten()

    def __getitem__(self, index):
        row1 = self.data.iloc[index]
        row2 = self.data.iloc[self._partner_position(index)]

        # 1 = the two tweets agree on the label, 0 = they differ.
        label_fake = 1 if row1['Fake'] == row2['Fake'] else 0
        label_hate = 1 if row1['Hate'] == row2['Hate'] else 0

        input_ids1, attention_mask1 = self._encode(row1['Tweet'])
        input_ids2, attention_mask2 = self._encode(row2['Tweet'])

        return {
            'input_ids1': input_ids1,
            'attention_mask1': attention_mask1,
            'input_ids2': input_ids2,
            'attention_mask2': attention_mask2,
            'label_fake': torch.tensor(label_fake, dtype=torch.long),
            'label_hate': torch.tensor(label_hate, dtype=torch.long)
        }

## Model architecture

In [None]:
class SiameseNetwork(nn.Module):
    """Twin-tower pair classifier on top of a shared XLM-RoBERTa encoder.

    Both texts of a pair run through the same encoder and the same projection
    head. The two projected vectors are concatenated and fed to two independent
    linear heads, each emitting 2-way logits: one head for the "fake" target and
    one for the "hate" target.

    Args:
        hidden_dim: output width of the projection head applied to each text.
        dropout_rate: dropout probability used inside the projection head.
    """

    def __init__(self, hidden_dim=256, dropout_rate=0.3):
        super().__init__()
        self.xlm_roberta = XLMRobertaModel.from_pretrained('xlm-roberta-base')
        encoder_width = self.xlm_roberta.config.hidden_size

        # Projection applied to the first-token vector of each text.
        self.fc = nn.Sequential(
            nn.Linear(encoder_width, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
        )

        # The heads see both projected texts side by side, hence twice the width.
        pair_width = hidden_dim * 2
        self.fake_classifier = nn.Linear(pair_width, 2)
        self.hate_classifier = nn.Linear(pair_width, 2)

    def forward_once(self, input_ids, attention_mask):
        """Encode one side of the pair and return its projected embedding."""
        encoded = self.xlm_roberta(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0, :]
        return self.fc(first_token)

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        """Return ``(fake_logits, hate_logits)`` for a batch of text pairs."""
        towers = [
            self.forward_once(input_ids1, attention_mask1),
            self.forward_once(input_ids2, attention_mask2),
        ]
        pair_features = torch.cat(towers, dim=1)

        fake_logits = self.fake_classifier(pair_features)
        hate_logits = self.hate_classifier(pair_features)
        return fake_logits, hate_logits

In [None]:
# --- Configuration: the values a reader is most likely to tune ---------------
SEED = 42
DATA_DIR = "/content/drive/MyDrive/Icon Conference"
TOKENIZER_NAME = 'xlm-roberta-base'  # must match the checkpoint loaded inside SiameseNetwork
MAX_LEN = 180
BATCH_SIZE = 32
LEARNING_RATE = 1e-5

# Seed NumPy and PyTorch before anything random happens, so that random draws
# made through either library (pair sampling, batch shuffling, dropout and the
# initialisation of the new linear layers) repeat across Restart & Run All.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Rows without tweet text cannot be tokenized, so they are dropped up front.
train_data = pd.read_csv(f"{DATA_DIR}/cleaned_train.csv").dropna(subset=['Tweet'])
val_data = pd.read_csv(f"{DATA_DIR}/cleaned_val.csv").dropna(subset=['Tweet'])
tokenizer = XLMRobertaTokenizer.from_pretrained(TOKENIZER_NAME)

train_dataset = SiameseFauxHateDataset(train_data, tokenizer, MAX_LEN)
val_dataset = SiameseFauxHateDataset(val_data, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The same criterion instance is used for both heads by the training and
# evaluation functions.
model = SiameseNetwork().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

## Training function

In [None]:
def train_siamese_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=30, patience=3,
                        save_path='/content/drive/MyDrive/Icon Conference/best_siamese_model.pth'):
    """Train the pair classifier with early stopping on validation loss.

    Each epoch runs one pass over ``train_loader``, then evaluates on
    ``val_loader`` via ``evaluate_siamese_model`` and prints the losses plus a
    classification report per head. Whenever the validation loss reaches a new
    minimum the model's ``state_dict`` is written to ``save_path``.

    Args:
        model: network returning ``(fake_logits, hate_logits)`` for a batch.
        train_loader: iterable of batches with keys ``input_ids1``,
            ``attention_mask1``, ``input_ids2``, ``attention_mask2``,
            ``label_fake`` and ``label_hate``.
        val_loader: same batch format, used for validation.
        criterion: loss applied to each head; the two losses are summed.
        optimizer: optimizer over ``model``'s parameters.
        num_epochs: maximum number of epochs.
        patience: number of consecutive epochs without a new best validation
            loss after which training stops.
        save_path: destination of the best checkpoint. Defaults to the Drive
            location this notebook has always used.

    Returns:
        None. The model keeps the weights of the last epoch that ran; load
        ``save_path`` to get the best-validation weights.
    """
    torch.cuda.empty_cache()
    best_val_loss = float('inf')
    patience_counter = 0
    # Batches are moved to wherever the model's parameters live.
    device = next(model.parameters()).device

    for epoch in range(num_epochs):
        # Evaluation at the end of the previous epoch left the model in eval mode.
        model.train()
        train_loss = 0.0

        for batch in train_loader:
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            label_fake = batch['label_fake'].to(device)
            label_hate = batch['label_hate'].to(device)

            optimizer.zero_grad()

            fake_logits, hate_logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2)

            # Both tasks contribute equally to the objective.
            fake_loss = criterion(fake_logits, label_fake)
            hate_loss = criterion(hate_logits, label_hate)
            loss = fake_loss + hate_loss

            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Mean of the per-batch losses.
        train_loss /= len(train_loader)

        val_loss, val_fake_preds, val_hate_preds, val_fake_labels, val_hate_labels = evaluate_siamese_model(
            model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")
        print("Validation Fake Classification Report:")
        print(classification_report(val_fake_labels, val_fake_preds, zero_division=0))
        print("Validation Hate Classification Report:")
        print(classification_report(val_hate_labels, val_hate_preds, zero_division=0))
        print("--------------------------------------------------")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), save_path)  # Save the best model
            print("Model improved, saving current model.")
        else:
            patience_counter += 1
            print(f"No improvement. Early stopping counter: {patience_counter}/{patience}")

        if patience_counter >= patience:
            print("Early stopping triggered. Training terminated.")
            break

## Evaluation function

In [None]:
def evaluate_siamese_model(model, val_loader, criterion):
    """Run one gradient-free pass over ``val_loader`` and collect loss and predictions.

    The model is switched to eval mode and is not switched back. For every batch
    the per-head losses are summed, and the arg-max class of each head is
    recorded alongside the true labels.

    Args:
        model: network returning ``(fake_logits, hate_logits)`` for a batch.
        val_loader: iterable of batches with keys ``input_ids1``,
            ``attention_mask1``, ``input_ids2``, ``attention_mask2``,
            ``label_fake`` and ``label_hate``.
        criterion: loss applied to each head.

    Returns:
        Tuple ``(val_loss, fake_preds, hate_preds, fake_labels, hate_labels)``
        where ``val_loss`` is the mean of the per-batch summed losses and the
        remaining four entries are flat lists with one element per example.
    """
    model.eval()
    device = next(model.parameters()).device
    batch_keys = (
        'input_ids1', 'attention_mask1',
        'input_ids2', 'attention_mask2',
        'label_fake', 'label_hate',
    )

    running_loss = 0.0
    fake_pred_log, hate_pred_log = [], []
    fake_true_log, hate_true_log = [], []

    with torch.no_grad():
        for batch in val_loader:
            # Move exactly the tensors the model and the loss need.
            on_device = {key: batch[key].to(device) for key in batch_keys}

            fake_logits, hate_logits = model(
                on_device['input_ids1'],
                on_device['attention_mask1'],
                on_device['input_ids2'],
                on_device['attention_mask2'],
            )

            batch_loss = (
                criterion(fake_logits, on_device['label_fake'])
                + criterion(hate_logits, on_device['label_hate'])
            )
            running_loss += batch_loss.item()

            fake_pred_log.extend(fake_logits.argmax(dim=1).cpu().numpy())
            hate_pred_log.extend(hate_logits.argmax(dim=1).cpu().numpy())
            fake_true_log.extend(on_device['label_fake'].cpu().numpy())
            hate_true_log.extend(on_device['label_hate'].cpu().numpy())

    mean_loss = running_loss / len(val_loader)
    return mean_loss, fake_pred_log, hate_pred_log, fake_true_log, hate_true_log

## Training the model

In [None]:
# Train for at most 30 epochs. `patience` is left at the function default (3),
# so training stops after 3 consecutive epochs without a lower validation loss.
train_siamese_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=30)

Epoch 1/30, Train Loss: 1.3891, Val Loss: 1.3827
Validation Fake Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.76      0.60       398
           1       0.50      0.24      0.32       401

    accuracy                           0.50       799
   macro avg       0.50      0.50      0.46       799
weighted avg       0.50      0.50      0.46       799

Validation Hate Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       374
           1       0.53      1.00      0.69       425

    accuracy                           0.53       799
   macro avg       0.27      0.50      0.35       799
weighted avg       0.28      0.53      0.37       799

--------------------------------------------------
Model improved, saving current model.
Epoch 2/30, Train Loss: 1.3903, Val Loss: 1.3844
Validation Fake Classification Report:
              precision    recall  f1-scor