In [None]:
import pandas as pd
import re
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer

# Read the three CSV splits into DataFrames (same order as before: test, dev, train)
test_df, dev_df, train_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/sarcasm_tam_test_without_labels.csv',
        '/content/sarcasm_tam_dev.csv',
        '/content/sarcasm_tam_train.csv',
    )
)

# Preprocess the data
def preprocess_text(text):
    """Normalize one raw text value before tokenization.

    The value is lower-cased, every character other than ASCII letters,
    digits, whitespace and ``! ? . ,`` is removed, and runs of whitespace are
    collapsed to a single space with leading/trailing whitespace trimmed.

    Args:
        text: The raw cell value. Normally a ``str``; missing values
            (``None`` / ``NaN`` / ``pd.NA``) and other non-string scalars are
            tolerated.

    Returns:
        str: The cleaned text. Missing values yield ``''``; other non-string
        scalars are converted with ``str()`` first.
    """
    if not isinstance(text, str):
        # pandas hands empty CSV cells to .apply() as float NaN; calling
        # .lower() on those would raise AttributeError and abort the whole apply.
        if text is None or pd.isna(text):
            return ''
        text = str(text)
    text = text.lower()
    # NOTE(review): the whitelist below is ASCII-only, so any characters from
    # non-Latin scripts are removed as well — confirm that is intended for this data.
    text = re.sub(r'[^a-zA-Z0-9\s!?.,]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add the normalized text column to every split
for frame in (train_df, dev_df, test_df):
    frame['cleaned_text'] = frame['Text'].apply(preprocess_text)

# Map labels to binary values: 'Sarcastic' -> 1, anything else -> 0
# (only the train and dev frames have their 'labels' column rewritten here)
for frame in (train_df, dev_df):
    frame['labels'] = frame['labels'].apply(lambda label: 1 if label == 'Sarcastic' else 0)

# Load the tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the data
def tokenize_function(texts):
    """Run the module-level ``tokenizer`` over ``texts``.

    Args:
        texts: Text input forwarded unchanged to the tokenizer.

    Returns:
        The tokenizer's output, requested as PyTorch tensors
        (``return_tensors="pt"``) with ``padding='max_length'`` and
        truncation enabled.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'return_tensors': "pt",
    }
    return tokenizer(texts, **tokenizer_options)

class SarcasmDataset(Dataset):
    """Map-style dataset that tokenizes one cleaned text per lookup.

    Built from a DataFrame that has a ``cleaned_text`` column and, optionally,
    a ``labels`` column. Tokenization happens lazily inside ``__getitem__``.
    """

    def __init__(self, df):
        """Copy the text column (and the label column, when present) into lists."""
        has_labels = 'labels' in df
        self.texts = df['cleaned_text'].tolist()
        # Frames without a 'labels' column produce label-free items.
        self.labels = df['labels'].tolist() if has_labels else None

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return it as a dict of tensors.

        Each tensor returned by ``tokenize_function`` has its size-1
        dimensions squeezed away. A ``labels`` tensor is added when the
        dataset was built from a frame that had labels.
        """
        encoded = tokenize_function(self.texts[idx])
        sample = {}
        for name, tensor in encoded.items():
            sample[name] = torch.squeeze(tensor)
        if self.labels is not None:
            sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split in a SarcasmDataset (train, dev, test — in that order)
train_dataset, dev_dataset, test_dataset = (
    SarcasmDataset(split_df) for split_df in (train_df, dev_df, test_df)
)



In [None]:
# Define the GRU model
class GRUClassifier(nn.Module):
    """Embedding -> (bi)GRU -> dropout -> linear classifier over token ids.

    Args:
        input_dim: Size of each token embedding (also the GRU input size).
        hidden_dim: GRU hidden size per direction.
        output_dim: Number of output logits.
        n_layers: Number of stacked GRU layers.
        bidirectional: Whether the GRU runs in both directions.
        dropout: Dropout probability, used between stacked GRU layers and on
            the final hidden state before the linear layer.
        vocab_size: Number of rows in the embedding table. When omitted, the
            module-level ``tokenizer.vocab_size`` is used.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, vocab_size=None):
        super().__init__()
        if vocab_size is None:
            vocab_size = tokenizer.vocab_size
        self.embedding = nn.Embedding(vocab_size, input_dim)
        self.gru = nn.GRU(
            input_dim,
            hidden_dim,
            num_layers=n_layers,
            bidirectional=bidirectional,
            batch_first=True,
            # nn.GRU only applies dropout between stacked layers and warns
            # when it is non-zero for a single-layer network.
            dropout=dropout if n_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_ids, attention_mask=None):
        """Return logits of shape ``(batch, output_dim)``.

        Args:
            input_ids: Integer token ids, ``(batch, seq_len)``.
            attention_mask: Optional ``(batch, seq_len)`` mask with 1 for real
                tokens and 0 for padding. When given, padded positions are
                excluded from the recurrence, so the final hidden state is the
                one after the last real token rather than after the padding.
                When ``None``, the whole sequence is processed.
        """
        embedded = self.embedding(input_ids)
        if attention_mask is not None:
            # Packing assumes the real tokens form a prefix of each row
            # (right padding) — NOTE(review): confirm if the tokenizer's
            # padding side is ever changed.
            # Lengths must live on the CPU and be >= 1 for pack_padded_sequence.
            lengths = attention_mask.long().sum(dim=1).clamp(min=1).cpu()
            embedded = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, batch_first=True, enforce_sorted=False
            )
        _, hidden = self.gru(embedded)
        if self.gru.bidirectional:
            # The last two slices of h_n are the top layer's forward and
            # backward final states.
            hidden = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        else:
            hidden = self.dropout(hidden[-1, :, :])
        output = self.fc(hidden)
        return output

# Set hyperparameters
input_dim = 128
hidden_dim = 256
output_dim = 2
n_layers = 2
bidirectional = True
dropout = 0.5
batch_size = 8
learning_rate = 0.001
seed = 42

# Seed PyTorch's global RNG before anything random happens (weight init,
# DataLoader shuffling, dropout) so a Restart & Run All starts from the same
# state. Some GPU kernels may still introduce small run-to-run differences.
torch.manual_seed(seed)

model = GRUClassifier(input_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout)

# Training settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Only the training loader shuffles; dev/test keep file order so predictions
# line up with the rows of their DataFrames.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop: one optimisation pass over train_loader per epoch, followed
# by an evaluation pass over dev_loader.
num_epochs = 3
for epoch in range(num_epochs):
    # ---- train ----
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Reported loss is the mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader):.4f}')

    # ---- evaluate on the development split ----
    model.eval()
    eval_preds, eval_labels = [], []
    with torch.no_grad():
        for batch in dev_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
            )
            outputs = model(input_ids, attention_mask)
            # Predicted class = index of the largest logit in each row.
            preds = torch.max(outputs, dim=1).indices
            eval_preds.extend(preds.cpu().numpy())
            eval_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(eval_labels, eval_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(eval_labels, eval_preds, average='weighted')
    print(f"Development Accuracy: {acc:.2f}")
    print(f"Development F1 Score: {f1:.2f}")

# Predict on the test set (no labels are read from these batches)
model.eval()
test_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask')
        )
        outputs = model(input_ids, attention_mask)
        # Predicted class = index of the largest logit in each row.
        preds = torch.max(outputs, dim=1).indices
        test_preds.extend(preds.cpu().numpy())

# Turn class ids back into label strings: 1 -> 'Sarcastic', otherwise 'Non-sarcastic'
test_df['predicted_labels'] = [
    'Sarcastic' if class_id == 1 else 'Non-sarcastic' for class_id in test_preds
]

# Save test set predictions to CSV
submission = test_df[['ID', 'predicted_labels']]
submission.to_csv('test_predictions.csv', index=False)

Epoch 1/3, Loss: 0.5415
Development Accuracy: 0.78
Development F1 Score: 0.76
Epoch 2/3, Loss: 0.4693
Development Accuracy: 0.79
Development F1 Score: 0.77
Epoch 3/3, Loss: 0.4183
Development Accuracy: 0.79
Development F1 Score: 0.79
