In [None]:
# Dependencies


In [None]:
import re
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from model import SimilarityClassifierDualEncoder

In [None]:
# Read the labelled data and carve it into train / dev / test partitions.
df = pd.read_csv("data/train.csv")

# Hold out 25% of the rows; the remaining 75% becomes the training set.
# scikit-learn's reproducibility keyword is `random_state` (`seed` is not an
# accepted argument and raises TypeError).
train_df, holdout_df = train_test_split(df, test_size=0.25, random_state=42)

# Split the 25% hold-out so the overall ratio is 75/15/10 train/dev/test:
# test_size=0.4 of the hold-out -> 10% of all rows for test, 15% for dev.
dev_df, test_df = train_test_split(holdout_df, test_size=0.4, random_state=42)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): `device` is computed but neither the model nor the batches are
# moved to it in this cell — confirm whether SimilarityClassifierDualEncoder
# handles device placement itself before adding `.to(device)` calls.

# Initialize model and optimizer (the loss is computed by the model itself).
model = SimilarityClassifierDualEncoder()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

NUM_EPOCHS = 5
BATCH_SIZE = 64

# Work on an explicit copy so adding columns below does not write into a frame
# derived from another DataFrame (avoids SettingWithCopyWarning).
train_df = train_df.copy()

# Apply model.tokenize to every "text" entry and unzip the resulting 4-tuples
# into one column per element.
train_df["input_ids1"], train_df["attention_masks1"], \
train_df["input_ids2"], train_df["attention_masks2"] = zip(*train_df["text"]
                                                            .apply(model.tokenize))

for epoch in range(NUM_EPOCHS):
    model.train()

    # Accumulate per-batch losses so the epoch summary reflects the whole
    # epoch rather than only the final batch.
    epoch_loss = 0.0
    num_batches = 0

    for start in range(0, len(train_df), BATCH_SIZE):
        # iloc slicing clamps at the end of the frame, so no explicit min() is
        # needed; the previous `len(train_df) - 1` bound silently dropped the
        # last row and could yield an empty final batch.
        batch = train_df.iloc[start:start + BATCH_SIZE]

        # Forward pass for a batch of pairs
        embeddings1, embeddings2 = model(batch["input_ids1"], batch["attention_masks1"],
                                         batch["input_ids2"], batch["attention_masks2"])

        # Calculate contrastive loss
        loss = model.compute_contrastive_loss(embeddings1, embeddings2)

        # Backward pass and optimization step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # max(..., 1) keeps the summary well-defined even if no batch was run.
    mean_loss = epoch_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}, Loss: {mean_loss}")

In [None]:
# Dev Eval

# Put the model in evaluation mode before predicting.
model.eval()

# Gradients are not needed for evaluation; disabling tracking saves memory.
with torch.no_grad():
    predictions = model.predict(dev_df["text"])

# Compute the score outside the f-string: nesting double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
dev_f1 = f1_score(dev_df["label"], predictions)
print(f"F1: {dev_f1}")