In [1]:
import datasets
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
from utils import remove_extra_brackets, CLASSIFICATION_PROMPT
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from torch.nn import Linear, BCEWithLogitsLoss
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import Muon, AdamW
from tqdm import tqdm

In [2]:
def remove_extra_brackets(input: str) -> str:
    text = input[2:-2]
    text = text.strip()
    return text

In [3]:
# Load multiple CSV files
df = datasets.load_dataset('csv', data_files={
    'train': './data/train.csv',
    'test': './data/test.csv'
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
print(tokenizer.model_max_length)
model_classification = AutoModelForSequenceClassification.from_pretrained("allenai/longformer-base-4096", num_labels=3)
model_classification = model_classification.to("cuda", torch.bfloat16)
# model_maskedLM = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base")

1000000000000000019884624838656


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
for param in model_classification.longformer.parameters():
    param.requires_grad = False

In [19]:
df

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 57477
    })
    test: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 3
    })
})

In [20]:
def fix_dataset(row):
    cleaned_prompt = remove_extra_brackets(row['prompt'])
    cleaned_response_a = remove_extra_brackets(row['response_a'])
    cleaned_response_b = remove_extra_brackets(row['response_b'])
    prompt = CLASSIFICATION_PROMPT.format(
        prompt=cleaned_prompt,
        response_a=cleaned_response_a,
        response_b=cleaned_response_b
    )
    winner = [row['winner_model_a'], row['winner_model_b'], row['winner_tie']]
    return {
        "final_prompt": prompt,
        "winner": winner
    }
    
def tokenize_dataset(batch):
    tokenized = tokenizer(
        batch["final_prompt"],
        padding="max_length",
        max_length=4096,
        truncation=True,
        return_tensors=None
    )
    return tokenized

In [21]:
df = df.map(fix_dataset, batched=False).remove_columns(['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b','winner_model_a', 'winner_model_b', 'winner_tie'])
df = df.map(tokenize_dataset, batched=True, num_proc=6).remove_columns(['final_prompt'])

num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


In [22]:
df

DatasetDict({
    train: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 57477
    })
    test: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
})

In [26]:
df = df.with_format("torch")
train_dataloader = DataLoader(df["train"], batch_size=32, shuffle=True)

In [29]:
# next(iter(train_dataloader))

In [30]:
model_classification = torch.compile(model_classification)

In [None]:
# optimizer = Muon(model_classification.parameters(), lr=5e-5)
optimizer = AdamW(model_classification.parameters(), lr=5e-5)
loss_fn = BCEWithLogitsLoss()
EPOCHS = 10

for epoch in range(EPOCHS):
    model_classification.train()
    total_loss = 0
    for step, data in enumerate(tqdm(train_dataloader)):
        data = {key: value.to("cuda") for key, value in data.items()}
        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            outputs = model_classification(data["input_ids"], attention_mask=data["attention_mask"]).logits
            loss = loss_fn(outputs, data["winner"].float())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if step % 50 == 0:
            print(f"Step {step}, Loss: {loss.item()}")
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {avg_loss}")

  0%|          | 0/1797 [00:00<?, ?it/s]W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0] Graph break from `Tensor.item()`, consider setting:
W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0]     torch._dynamo.config.capture_scalar_outputs = True
W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0] or:
W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0]     env TORCHDYNAMO_CAPTURE_SCALAR_OUTPUTS=1
W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0] to include these operations in the captured graph.
W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0] 
W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0] Graph break: from user code at:
W1227 20:12:13.452000 6943 site-packages/torch/_dynamo/variables/tensor.py:1048] [1/0]   File "/venv/main/lib/p

Step 0, Loss: 0.6610779166221619


  3%|▎         | 51/1797 [01:39<40:34,  1.39s/it]  

Step 50, Loss: 0.6378701329231262


  6%|▌         | 101/1797 [02:49<39:30,  1.40s/it]

Step 100, Loss: 0.6400316953659058


  8%|▊         | 151/1797 [03:59<38:30,  1.40s/it]

Step 150, Loss: 0.6345478892326355


  9%|▊         | 157/1797 [04:08<38:18,  1.40s/it]

In [14]:
data = next(iter(train_dataloader))

In [24]:
with torch.no_grad():
    data["input_ids"] = data["input_ids"].to("cuda")
    outputs = model_classification(data["input_ids"]).logits
    # predictions = torch.sigmoid(outputs)
    # print("Predictions:", predictions)
    # print("Winners:", winners)

In [25]:
outputs

tensor([[ 0.1114, -0.1926, -0.0627],
        [ 0.1262, -0.1778, -0.0078],
        [ 0.1090, -0.1850, -0.0451],
        [ 0.1148, -0.1850, -0.0530]], device='cuda:0')