In [None]:
import datasets
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
from utils import remove_extra_brackets, CLASSIFICATION_PROMPT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from torch.nn import Linear, CrossEntropyLoss
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import Muon, AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.nn.utils import clip_grad_norm_ as CLIP_GRADIENTS
from tqdm import tqdm


In [None]:
# Read the train and test CSV files into a single DatasetDict keyed by split name.
df = datasets.load_dataset(
    "csv",
    data_files=dict(train="./data/train.csv", test="./data/test.csv"),
)

In [None]:
# One checkpoint name drives both the tokenizer and the classifier so they cannot drift apart.
MODEL_CHECKPOINT = "answerdotai/ModernBERT-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
print(tokenizer.model_max_length)

# Three output labels: one per entry of the `winner` vector built later
# (model A wins, model B wins, tie).
# NOTE(review): the weights themselves are cast to bfloat16 here, so the optimizer
# created later updates bf16 parameters directly -- confirm this is intended rather
# than keeping fp32 weights and relying on autocast alone.
model_classification = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=3,
).to("cuda", torch.bfloat16)
# model_maskedLM = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base")

8192


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Show the loaded splits with their column names and row counts.
df

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 57477
    })
    test: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 3
    })
})

In [None]:
def fix_dataset(row):
    """Turn one raw comparison row into a prompt string plus a label vector.

    The `prompt`, `response_a` and `response_b` fields are each passed through
    `remove_extra_brackets` (helper imported from `utils`; its exact cleaning
    rules are not visible in this notebook) and substituted into
    `CLASSIFICATION_PROMPT`.

    Args:
        row: Mapping with the keys `prompt`, `response_a`, `response_b`,
            `winner_model_a`, `winner_model_b` and `winner_tie`.

    Returns:
        A dict with:
            "final_prompt": the formatted classification prompt.
            "winner": `[winner_model_a, winner_model_b, winner_tie]`, in that order.
    """
    text_fields = ("prompt", "response_a", "response_b")
    label_fields = ("winner_model_a", "winner_model_b", "winner_tie")

    # Template placeholders share their names with the source columns.
    cleaned = {field: remove_extra_brackets(row[field]) for field in text_fields}

    return {
        "final_prompt": CLASSIFICATION_PROMPT.format(**cleaned),
        "winner": [row[field] for field in label_fields],
    }
    
def tokenize_dataset(batch):
    """Tokenize the `final_prompt` column of a batch with the module-level `tokenizer`.

    Every sequence is truncated and padded to `tokenizer.model_max_length`, so
    all rows come back with the same length regardless of their text length.

    Args:
        batch: Mapping whose "final_prompt" entry holds the prompt strings.

    Returns:
        The tokenizer output, with no tensor conversion (`return_tensors=None`).
    """
    encode_options = dict(
        truncation=True,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        return_tensors=None,
    )
    return tokenizer(batch["final_prompt"], **encode_options)

In [None]:
# Step 1: build `final_prompt` / `winner` row by row, then drop the raw CSV columns.
df = (
    df
    .map(fix_dataset, batched=False)
    .remove_columns([
        'id', 'model_a', 'model_b',
        'prompt', 'response_a', 'response_b',
        'winner_model_a', 'winner_model_b', 'winner_tie',
    ])
)

# Step 2: tokenize in batches across worker processes; the text column is no
# longer needed once it has been turned into model inputs.
df = (
    df
    .map(tokenize_dataset, batched=True, num_proc=13)
    .remove_columns(['final_prompt'])
)

num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


In [None]:
# Check the columns that remain after preprocessing and tokenization.
df

DatasetDict({
    train: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 57477
    })
    test: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
})

In [None]:
# Hold out 10% of the training data for validation.
# The guard makes this cell safe to re-run: without it, a second execution would
# split the already-reduced train set again, silently shrinking it and replacing
# the validation set with different rows.
if "validation" not in df:
    train_val_split = df['train'].train_test_split(test_size=0.1, seed=42)
    df['train'] = train_val_split['train']
    df['validation'] = train_val_split['test']

# Return torch tensors from the datasets so the default DataLoader collation can stack them.
df = df.with_format("torch")

# A dedicated, seeded generator makes the training shuffle order reproducible
# across kernel restarts (the split above is already seeded).
shuffle_generator = torch.Generator().manual_seed(42)
train_dataloader = DataLoader(df["train"], batch_size=4, shuffle=True, generator=shuffle_generator)
val_dataloader = DataLoader(df["validation"], batch_size=4, shuffle=False)

In [None]:
# Confirm the row counts of the train / validation / test splits.
df

DatasetDict({
    train: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 51729
    })
    test: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
    validation: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 5748
    })
})

In [None]:
# next(iter(train_dataloader))

In [None]:
# Wrap the classifier with torch.compile. The same name is rebound to the
# compiled wrapper, so re-running this cell would wrap the wrapper again.
model_classification = torch.compile(model_classification)

In [None]:
# --- Training configuration ---------------------------------------------------
optimizer = AdamW(model_classification.parameters(), lr=1e-4, weight_decay=0.01)
EPOCHS = 10
# The scheduler is stepped once per epoch (see the end of the epoch loop),
# so T_max is expressed in epochs.
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-6)
loss_fn = CrossEntropyLoss()
GRADIENT_ACCUMULATION_STEPS = 32  # micro-batches accumulated per optimizer step
VALIDATION_INTERVAL = 400         # run a validation pass every this many micro-batches
MAX_GRAD_NORM = 1.0               # gradient-norm clipping threshold


def _evaluate_accuracy(model, dataloader, description):
    """Return the accuracy (in percent) of `model` over `dataloader`.

    The model is switched to eval mode and left there; the caller is
    responsible for switching back to train mode. A fresh progress bar is
    created on every call, because a tqdm bar closes (and disables itself)
    once it has been iterated to the end and would show nothing on reuse.

    Args:
        model: Classifier whose output exposes `.logits`.
        dataloader: Yields dicts with "input_ids", "attention_mask" and
            "winner"; the label is taken as the argmax of "winner" along dim 1
            (presumably a one-hot vector -- confirm against the CSV schema).
        description: Label shown on the progress bar.

    Returns:
        Accuracy as a float in [0, 100], or NaN when the dataloader is empty.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for val_data in tqdm(dataloader, desc=description, leave=False, position=1):
            val_data = {key: value.to("cuda") for key, value in val_data.items()}
            # Same autocast context as the training forward pass, so the metric
            # is computed under the numerics the model is trained with.
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                logits = model(val_data["input_ids"], attention_mask=val_data["attention_mask"]).logits
            predicted = logits.argmax(dim=1)
            true_labels = val_data["winner"].argmax(dim=1)
            total += true_labels.size(0)
            correct += (predicted == true_labels).sum().item()
    # Guard against an empty validation set instead of raising ZeroDivisionError.
    return 100 * (correct / total) if total else float("nan")


def _apply_optimizer_step():
    """Clip the accumulated gradients, apply one optimizer update, and reset the gradients."""
    torch.nn.utils.clip_grad_norm_(model_classification.parameters(), max_norm=MAX_GRAD_NORM)
    optimizer.step()
    optimizer.zero_grad()


# --- Training loop ------------------------------------------------------------
num_batches = len(train_dataloader)
# Size of the incomplete accumulation window at the end of each epoch (0 if the
# epoch length is an exact multiple of GRADIENT_ACCUMULATION_STEPS).
trailing_batches = num_batches % GRADIENT_ACCUMULATION_STEPS
first_trailing_step = num_batches - trailing_batches

for epoch in range(EPOCHS):
    model_classification.train()
    total_loss = 0
    optimizer.zero_grad()

    train_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=True, position=0)
    for step, data in enumerate(train_bar):
        data = {key: value.to("cuda") for key, value in data.items()}
        # Class index = position of the maximum in the `winner` vector.
        true_labels = data["winner"].argmax(dim=1)

        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            outputs = model_classification(data["input_ids"], attention_mask=data["attention_mask"]).logits
            loss = loss_fn(outputs, true_labels)

        # Average over the number of micro-batches that actually contribute to
        # the upcoming optimizer step; the trailing window is shorter than a
        # full one and would otherwise be under-weighted.
        in_trailing_window = step >= first_trailing_step
        window_size = trailing_batches if in_trailing_window else GRADIENT_ACCUMULATION_STEPS
        (loss / window_size).backward()

        loss_value = loss.item()
        total_loss += loss_value

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            _apply_optimizer_step()

        if step % GRADIENT_ACCUMULATION_STEPS == 0:
            train_bar.set_postfix({'loss': f"{loss_value:.4f}"})

        if step % VALIDATION_INTERVAL == 0 and step != 0:
            # Runs under no_grad, so gradients accumulated so far are untouched.
            accuracy = _evaluate_accuracy(
                model_classification, val_dataloader, f"Epoch {epoch+1}/{EPOCHS}"
            )
            print(f"Validation Accuracy: {accuracy:.2f}%")
            model_classification.train()

    # Flush gradients left over from the incomplete final accumulation window.
    if trailing_batches != 0:
        _apply_optimizer_step()

    scheduler.step()
    avg_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1}/{EPOCHS}, Avg Loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.2e}")

Epoch 1/10:   3%|▎         | 401/12933 [16:08<512:35:46, 147.25s/it, loss=0.9883]

Validation Accuracy: 35.35%


Epoch 1/10:   6%|▌         | 800/12933 [23:54<4:03:51,  1.21s/it, loss=1.2949]   