In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset, load_from_disk
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

### Loading Datasets

In [70]:
final_train_dataset = load_from_disk("final_train_dataset")
train_dataset = final_train_dataset.remove_columns(["Sentence_A", "Sentence_B","embeddings_B", "embeddings_A"])

In [71]:
train_dataset = train_dataset.train_test_split(test_size=0.1)['train']
val_dataset = train_dataset.train_test_split(test_size=0.1)['test']

In [111]:
final_test_dataset = load_from_disk("final_test_dataset")
test_dataset = final_test_dataset.remove_columns(["Sentence_A", "Sentence_B","embeddings_B", "embeddings_A"])

In [73]:
class PrecomputedDifferenceDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.differences = dataset["difference"]
        self.labels = dataset["labels"]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "difference": torch.tensor(self.differences[idx], dtype=torch.float32),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long),
        }
        
class PrecomputedTestDifferenceDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.differences = dataset["difference"]

    def __len__(self):
        return len(self.differences)

    def __getitem__(self, idx):
        return {
            "difference": torch.tensor(self.differences[idx], dtype=torch.float32),
        }

In [112]:
train_dataset = PrecomputedDifferenceDataset(train_dataset)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataset = PrecomputedDifferenceDataset(val_dataset)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_dataset = PrecomputedTestDifferenceDataset(test_dataset)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

### Bidirectional LSTM Model

In [86]:
class BiLSTMForDifferences(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(BiLSTMForDifferences, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.5)

    def save_pretrained(self, save_directory):
        self.base_model.save_pretrained(save_directory)
        torch.save(self.classifier.state_dict(), f"{save_directory}/classifier.pth")

    def forward(self, differences, labels=None):
        differences = differences.unsqueeze(1)
        lstm_out, _ = self.lstm(differences)
        lstm_out_cat = torch.cat((lstm_out[:, -1, :lstm_out.shape[2] // 2], lstm_out[:, 0, lstm_out.shape[2] // 2:]), dim=1)

        logits = self.fc(self.dropout(lstm_out_cat))

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


In [77]:
def evaluate(model, dataloader, device):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["difference"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(inputs)
            logits = outputs["logits"]  # Pass the inputs directly
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary"
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [82]:
def train(model, train_dataloader, val_dataloader, epochs, device):
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()

        total_loss = 0
        for batch in train_dataloader:
            inputs = batch["difference"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()

            outputs = model(inputs)  # Pass the embeddings (differences)
            logits = outputs["logits"]
            loss = loss_fn(logits, labels)

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}/{epochs}, Training Loss: {avg_loss:.4f}")
        scores = evaluate(model, val_dataloader, device)
        print(f"Epoch {epoch + 1}/{epochs}, Validation Accuracy: {scores['accuracy']:.4f}")
        print(f"Epoch {epoch + 1}/{epochs}, Precision: {scores['precision']:.4f}")
        print(f"Epoch {epoch + 1}/{epochs}, Recall: {scores['recall']:.4f}")
        print(f"Epoch {epoch + 1}/{epochs}, F1 Score: {scores['f1']:.4f}")


input_dim = train_dataset[0]["difference"].shape[0]  # Dimensionality of difference embeddings
hidden_dim = 256
output_dim = 2

bilstm_model = BiLSTMForDifferences(input_dim, hidden_dim, output_dim).to("cuda")

In [96]:
train(bilstm_model, train_dataloader, val_dataloader, epochs=3, device="cuda")

Epoch 1/3, Training Loss: 0.1143
Epoch 1/3, Validation Accuracy: 0.9622
Epoch 1/3, Precision: 0.9632
Epoch 1/3, Recall: 0.9614
Epoch 1/3, F1 Score: 0.9623
Epoch 2/3, Training Loss: 0.1124
Epoch 2/3, Validation Accuracy: 0.9626
Epoch 2/3, Precision: 0.9634
Epoch 2/3, Recall: 0.9619
Epoch 2/3, F1 Score: 0.9626
Epoch 3/3, Training Loss: 0.1108
Epoch 3/3, Validation Accuracy: 0.9638
Epoch 3/3, Precision: 0.9670
Epoch 3/3, Recall: 0.9606
Epoch 3/3, F1 Score: 0.9638


### Generating Predictions

In [113]:
def generate_predictions(model, dataloader, device):
    model.eval()
    all_preds = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch["difference"].to(device)

            outputs = model(inputs)
            logits = outputs["logits"]
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())

    return all_preds

In [None]:
test_preds = generate_predictions(bilstm_model, test_dataloader, device="cuda")
output_df = pd.DataFrame({
    "Predicted": test_preds
})
output_df['Predicted'] = = output_df['Predicted'].replace({0:"A",1:"B"})
output_df["Predicted"].to_csv("part1.txt", index=False, header=False)