In [1]:
!git clone https://github.com/PC0907/Urdu_Sarcasm

Cloning into 'Urdu_Sarcasm'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 19 (delta 2), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (19/19), 2.79 MiB | 9.42 MiB/s, done.
Resolving deltas: 100% (2/2), done.


In [2]:
import os
os.getcwd()

'/kaggle/working'

In [3]:
cd Urdu_Sarcasm

/kaggle/working/Urdu_Sarcasm


In [4]:
cd Data

/kaggle/working/Urdu_Sarcasm/Data


## Model Architecture

In [5]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [6]:
import torch
from torch import nn
from transformers import BertTokenizer, BertModel

class SarcasmDetector(nn.Module):
    """Binary sarcasm classifier: BERT -> BiGRU -> self-attention -> MLP.

    The token representations produced by ``bert_model`` are fed through a
    two-layer bidirectional GRU, refined with multi-head self-attention
    (residual connection + layer normalisation), averaged over the real
    (non-padded) tokens and classified by a small feed-forward head that
    emits two logits.

    Args:
        bert_model: Encoder module whose forward accepts ``input_ids`` and
            ``attention_mask`` keyword arguments and returns an object with a
            ``last_hidden_state`` attribute.
        hidden_size: Hidden size of each GRU direction. The attention and
            layer-norm width is ``2 * hidden_size``.
        num_attention_heads: Number of attention heads; must divide
            ``2 * hidden_size``.
    """

    def __init__(self, bert_model, hidden_size=256, num_attention_heads=8):
        super().__init__()
        self.bert = bert_model
        # Take the encoder width from the model config when it is exposed so
        # encoders of other sizes work; 768 (the previously hard-coded value)
        # remains the fallback.
        self.bert_hidden_size = getattr(
            getattr(bert_model, 'config', None), 'hidden_size', 768
        )
        self.hidden_size = hidden_size

        # BiGRU layer
        self.bigru = nn.GRU(
            input_size=self.bert_hidden_size,
            hidden_size=hidden_size,
            num_layers=2,
            batch_first=True,
            bidirectional=True
        )

        # Multi-head attention layer (width = both GRU directions concatenated)
        self.multihead_attn = nn.MultiheadAttention(
            embed_dim=hidden_size * 2,
            num_heads=num_attention_heads,
            batch_first=True
        )

        # Layer normalization applied after the residual connection
        self.layer_norm = nn.LayerNorm(hidden_size * 2)

        # NOTE: this dropout module is not used in forward(); it is kept so
        # the attribute stays available to existing callers.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 2)
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits of shape ``(batch, 2)``.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Mask with non-zero entries for real tokens and
                zeros for padding. It is forwarded to the encoder and also
                used to exclude padding from self-attention and pooling.
                ``None`` disables the masking.
        """
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        bert_sequence = bert_outputs.last_hidden_state

        # NOTE: the GRU still runs over padded positions; only the attention
        # and pooling stages below are padding-aware.
        gru_output, _ = self.bigru(bert_sequence)

        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = attention_mask == 0
            # A row with every key masked would make the attention softmax
            # produce NaN, so fully padded rows are left unmasked here (they
            # pool to a zero vector below anyway).
            fully_padded = key_padding_mask.all(dim=1, keepdim=True)
            key_padding_mask = key_padding_mask & ~fully_padded

        # Self-attention: query, key and value are all the GRU output.
        # The attention weights are never used, so skip computing them.
        attn_output, _ = self.multihead_attn(
            gru_output, gru_output, gru_output,
            key_padding_mask=key_padding_mask,
            need_weights=False
        )

        # Residual connection and layer normalization
        attn_output = self.layer_norm(attn_output + gru_output)

        if attention_mask is None:
            pooled_output = attn_output.mean(dim=1)
        else:
            # Mean over real tokens only, so the amount of padding does not
            # dilute the sentence representation.
            token_weights = attention_mask.unsqueeze(-1).to(attn_output.dtype)
            token_counts = token_weights.sum(dim=1).clamp(min=1.0)
            pooled_output = (attn_output * token_weights).sum(dim=1) / token_counts

        return self.classifier(pooled_output)

class UrduSarcasmDataset(Dataset):
    """Map-style dataset yielding tokenised texts with their labels.

    Args:
        texts: Sequence of raw texts (list, NumPy array, pandas Series, ...).
        labels: Sequence of integer class labels, same length as ``texts``.
        tokenizer: Callable tokenizer invoked once per item; it must return a
            mapping with ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Length every item is padded / truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise as lists so that ``idx`` in __getitem__ is always
        # positional. A pandas Series with a shuffled index (e.g. straight out
        # of train_test_split) would otherwise be indexed by label.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label``."""
        # str() coerces non-string entries (numbers, missing values) to text.
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # The tokenizer returns a leading batch dimension of size one;
        # flatten() drops it so the DataLoader can stack items itself.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [7]:
def prepare_data(texts, labels, tokenizer, max_length=128, test_size=0.2, random_state=42,
                 batch_size=32, stratify=False):
    """
    Split the data into train and test sets and wrap each in a DataLoader.

    Args:
        texts: Sequence of raw texts.
        labels: Sequence of class labels aligned with ``texts``.
        tokenizer: Tokenizer handed to ``UrduSarcasmDataset``.
        max_length: Token length each sample is padded / truncated to.
        test_size: Share of the data held out for testing (0.2 gives an
            80:20 split).
        random_state: Seed for the split, so it is repeatable.
        batch_size: Batch size used by both loaders.
        stratify: When True, the split preserves the class proportions of
            ``labels``. The default (False) keeps the plain random split.

    Returns:
        Tuple ``(train_loader, test_loader)``. The training loader shuffles
        its samples; the test loader keeps them in order.
    """
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels if stratify else None
    )

    train_dataset = UrduSarcasmDataset(train_texts, train_labels, tokenizer, max_length)
    test_dataset = UrduSarcasmDataset(test_texts, test_labels, tokenizer, max_length)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader


def calculate_metrics(true_labels, predictions):
    """
    Compute accuracy plus binary precision, recall and F1 in a single call.

    Precision, recall and F1 are reported as 0 (without a warning) when they
    are undefined. Returns a dict with the keys 'accuracy', 'precision',
    'recall' and 'f1'.
    """
    scores = {'accuracy': accuracy_score(true_labels, predictions)}

    # The fourth element returned (support) is not needed here.
    prf_values = precision_recall_fscore_support(
        true_labels,
        predictions,
        average='binary',
        zero_division=0,
    )[:3]
    for key, value in zip(('precision', 'recall', 'f1'), prf_values):
        scores[key] = value

    return scores

In [8]:
def train_model(model, train_loader, device, num_epochs=20, patience=3, lr=2e-5):
    """Train ``model`` with early stopping and return it with its best weights.

    Early stopping monitors the average *training* loss of each epoch (no
    validation data is passed to this function): training stops once the loss
    has failed to improve for ``patience`` consecutive epochs.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns class logits.
        train_loader: Iterable of batches with the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'label'``.
        device: Device the batches are moved to; the model is expected to be
            on it already.
        num_epochs: Maximum number of epochs.
        patience: Number of non-improving epochs tolerated before stopping.
        lr: Learning rate for AdamW.

    Returns:
        The same ``model`` object, loaded with the weights from the epoch
        that had the lowest average training loss.
    """
    # torch.optim.AdamW is used instead of the deprecated AdamW class shipped
    # with transformers; eps and weight_decay are set explicitly to the
    # defaults of that class so the optimisation setup stays the same.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)
    criterion = nn.CrossEntropyLoss()
    best_loss = float('inf')
    patience_counter = 0
    best_state = None

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        predictions = []
        true_labels = []

        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            predictions.extend(outputs.argmax(dim=1).tolist())
            true_labels.extend(labels.tolist())

        avg_loss = total_loss / len(train_loader)
        metrics = calculate_metrics(true_labels, predictions)

        print(f'\nEpoch {epoch + 1}/{num_epochs}:')
        print(f'Training - Loss: {avg_loss:.4f}, Accuracy: {metrics["accuracy"]:.4f}, '
              f'F1: {metrics["f1"]:.4f}, Precision: {metrics["precision"]:.4f}, '
              f'Recall: {metrics["recall"]:.4f}')

        if avg_loss < best_loss:
            best_loss = avg_loss
            patience_counter = 0
            # state_dict() hands out tensors that share storage with the live
            # parameters, and dict.copy() is shallow, so each tensor must be
            # copied or the snapshot would keep changing as training goes on.
            # Copies are kept on the CPU to avoid holding a second model in
            # accelerator memory.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f'\nEarly stopping triggered after epoch {epoch + 1}')
                break

    # Restore the best snapshot whether training stopped early or ran out of
    # epochs; best_state stays None if no epoch ever produced a finite,
    # improving loss (or num_epochs is 0).
    if best_state is not None:
        model.load_state_dict(best_state)

    return model

def evaluate_model(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and return its evaluation metrics.

    The model is switched to eval mode and gradients are disabled. The result
    is the dict from ``calculate_metrics`` with the text of
    ``classification_report`` added under ``'detailed_report'``.
    """
    model.eval()
    all_predictions, all_targets = [], []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Evaluating'):
            batch_ids = batch['input_ids'].to(device)
            batch_mask = batch['attention_mask'].to(device)
            batch_targets = batch['label'].to(device)

            logits = model(batch_ids, batch_mask)
            # Index of the largest logit is the predicted class.
            batch_predictions = torch.max(logits, 1).indices

            all_predictions.extend(batch_predictions.cpu().numpy())
            all_targets.extend(batch_targets.cpu().numpy())

    results = calculate_metrics(all_targets, all_predictions)
    results['detailed_report'] = classification_report(all_targets, all_predictions)
    return results

def calculate_metrics(true_labels, predictions):
    """Compute accuracy and binary precision, recall and F1.

    This definition runs after an earlier ``calculate_metrics`` in the
    notebook and therefore replaces it, so it uses the same
    ``zero_division=0`` setting: when a metric is undefined (for example no
    positive predictions in a batch) it is reported as 0 without emitting a
    warning.

    Returns:
        Dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    return {
        'accuracy': accuracy_score(true_labels, predictions),
        'precision': precision_score(
            true_labels, predictions, average='binary', zero_division=0
        ),
        'recall': recall_score(
            true_labels, predictions, average='binary', zero_division=0
        ),
        'f1': f1_score(
            true_labels, predictions, average='binary', zero_division=0
        )
    }

In [9]:
def main():
    """Train and evaluate the sarcasm detector end to end.

    Reads 'preprocessed_data.csv' from the current working directory, trains
    the model on 80% of the rows, evaluates it on the remaining 20%, prints
    the test metrics, and writes the model weights to 'sarcasm_detector.pth'
    and the metrics to 'test_metrics.txt' (both in the working directory).
    """
    # Seed the random number generators so weight initialisation, dropout and
    # batch shuffling are repeatable from run to run.
    seed = 42
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Load Preprocessed Data
    df = pd.read_csv('preprocessed_data.csv')

    # Initialize Tokenizer and BERT Model
    tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
    bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')

    # .values hands plain NumPy arrays to the split / dataset code
    train_loader, test_loader = prepare_data(
        texts=df['Preprocessed'].values,
        labels=df['is_sarcastic'].values,
        tokenizer=tokenizer,
        max_length=128,
        test_size=0.2,
        random_state=seed
    )

    # Initialize Model and Device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SarcasmDetector(bert_model).to(device)

    # Train the Model
    print("Training the model...")
    trained_model = train_model(
        model=model,
        train_loader=train_loader,
        device=device,
        num_epochs=30,
        patience=3
    )

    # Evaluate on test set
    print("\nEvaluating on test set...")
    test_metrics = evaluate_model(trained_model, test_loader, device)

    # Build the report once; the same text is printed and written to disk.
    report = (
        "Test Set Metrics:\n"
        f"Accuracy: {test_metrics['accuracy']:.4f}\n"
        f"Precision: {test_metrics['precision']:.4f}\n"
        f"Recall: {test_metrics['recall']:.4f}\n"
        f"F1 Score: {test_metrics['f1']:.4f}\n"
        "\nDetailed Classification Report:\n"
        f"{test_metrics['detailed_report']}"
    )
    print("\n" + report)

    # Save the model
    print("\nSaving the model...")
    torch.save(trained_model.state_dict(), 'sarcasm_detector.pth')

    # Save the metrics
    print("Saving the metrics...")
    with open('test_metrics.txt', 'w', encoding='utf-8') as f:
        f.write(report)

    print("Training and evaluation completed!")

In [None]:
# Run the full training / evaluation pipeline when this code is executed
# directly (as a notebook cell or a script) rather than imported.
if __name__ == "__main__":
    main()