In [1]:
import datasets
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForMaskedLM
from torch.nn import Linear, CrossEntropyLoss
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import Muon, AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm
import sys
from pathlib import Path
sys.path.append(str(Path.cwd().parent))
from utils import remove_extra_brackets, CLASSIFICATION_PROMPT

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the train and test CSV files into one dataset dictionary keyed by split name.
df = datasets.load_dataset(
    "csv",
    data_files=dict(train="../data/train.csv", test="../data/test.csv"),
)

In [3]:
# keep only a small portion for quick testing
# df['train'] = df['train'].shuffle(seed=42).select(range(1000))

In [4]:
# The tokenizer and the classifier are both loaded from the ModernBERT-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")
# NOTE(review): model_max_length is passed to the tokenizer as `max_length` elsewhere in this
# notebook, i.e. it is a token budget; the word "characters" in the message is misleading.
print(f"Model max length is {tokenizer.model_max_length} characters.")

# Three output labels, one per winner_* column (model A wins, model B wins, tie).
# The weights are moved to the GPU and cast to bfloat16 in the same call.
# NOTE(review): the optimizer therefore updates bfloat16 weights directly — confirm this is intended.
model_classification = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=3,
).to("cuda", torch.bfloat16)

# NOTE(review): this RoBERTa masked-LM model is not referenced by any cell visible here — confirm it is needed.
model_maskedLM = AutoModelForMaskedLM.from_pretrained("FacebookAI/roberta-base")

Model max length is 8192 characters.


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Display the loaded splits with their column names and row counts.
df

DatasetDict({
    train: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 57477
    })
    test: Dataset({
        features: ['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie'],
        num_rows: 3
    })
})

In [6]:
# Example of the final sentence fed into the model, built from the first training row.
row = df["train"][0]
cleaned_prompt, cleaned_response_a, cleaned_response_b = (
    remove_extra_brackets(row[column])
    for column in ("prompt", "response_a", "response_b")
)
prompt = CLASSIFICATION_PROMPT.format(
    prompt=cleaned_prompt,
    response_a=cleaned_response_a,
    response_b=cleaned_response_b,
    # Keyword spelling kept as-is; presumably it matches the placeholder name inside CLASSIFICATION_PROMPT.
    seperator=tokenizer.sep_token,
)
# Unlike tokenize_dataset, this preview truncates to the model limit so it always fits.
encoded_prompt = tokenizer(
    prompt,
    padding="max_length",
    truncation=True,
    max_length=tokenizer.model_max_length,
    return_tensors=None,
)
print(tokenizer.decode(encoded_prompt["input_ids"]))

[CLS]Is it morally right to try to have a certain percentage of females on managerial positions?","OK, does pineapple belong on a pizza? Relax and give me fun answer.

[SEP]

The question of whether it is morally right to aim for a certain percentage of females in managerial positions is a complex ethical issue that involves considerations of fairness, equality, diversity, and discrimination.\n\nHere are some arguments in favor of and against such policies:\n\n**Arguments in favor:**\n\n1. **Correcting Historical Inequities:** Women have historically been underrepresented in leadership roles due to various cultural, institutional, and social barriers. Aiming for a specific percentage can be seen as a corrective measure to address past and ongoing discrimination.\n\n2. **Promoting Diversity:** Diverse leadership teams can enhance decision-making and represent a broader range of perspectives. This can lead to better outcomes for organizations and society as a whole.\n\n3. **Equality of O

In [7]:
def fix_dataset(row):
    """Convert one raw comparison row into the classifier's text input and label vector.

    Args:
        row: Mapping with the keys 'prompt', 'response_a', 'response_b',
            'winner_model_a', 'winner_model_b' and 'winner_tie'.

    Returns:
        dict with
          - "final_prompt": CLASSIFICATION_PROMPT filled with the three text
            fields (each passed through remove_extra_brackets) and the
            tokenizer's separator token;
          - "winner": ``[winner_model_a, winner_model_b, winner_tie]``.
    """
    text_fields = {
        name: remove_extra_brackets(row[name])
        for name in ("prompt", "response_a", "response_b")
    }
    return {
        "final_prompt": CLASSIFICATION_PROMPT.format(
            seperator=tokenizer.sep_token,
            **text_fields,
        ),
        "winner": [
            row[column]
            for column in ("winner_model_a", "winner_model_b", "winner_tie")
        ],
    }
    
def tokenize_dataset(batch):
    """Tokenize a batch of prompts and record each encoded sequence's length.

    Sequences are padded up to ``tokenizer.model_max_length`` but never
    truncated, so "length" is measured on the padded ``input_ids``.
    Presumably it only exceeds the model limit for prompts that were too
    long to begin with — verify against the tokenizer's padding behaviour.

    Args:
        batch: Mapping whose "final_prompt" entry is a list of strings.

    Returns:
        dict containing every tokenizer output field plus "length", a list
        with ``len(input_ids)`` for each example in the batch.
    """
    encoded = tokenizer(
        batch["final_prompt"],
        truncation=False,
        padding="max_length",
        max_length=tokenizer.model_max_length,
        return_tensors=None,
    )
    token_counts = list(map(len, encoded["input_ids"]))
    return {**encoded, "length": token_counts}

In [8]:
# Build "final_prompt" and "winner" for every row, then drop the raw columns they were derived from.
df = df.map(fix_dataset, batched=False).remove_columns(
    [
        "id",
        "model_a",
        "model_b",
        "prompt",
        "response_a",
        "response_b",
        "winner_model_a",
        "winner_model_b",
        "winner_tie",
    ]
)

In [9]:
# Tokenize in batches using up to 16 worker processes; the raw text column is dropped once encoded.
df = (
    df.map(tokenize_dataset, batched=True, num_proc=16)
    .remove_columns(["final_prompt"])
)

num_proc must be <= 3. Reducing num_proc to 3 for dataset of size 3.


In [10]:
# Keep only examples whose encoded length fits the model's context window, then drop the
# helper column. The limit is read from the tokenizer (the same value tokenize_dataset pads
# to) instead of a hard-coded 8192, so the filter cannot drift from the tokenizer in use.
# The batched predicate returns one boolean per example.
df = df.filter(
    lambda batch: [length <= tokenizer.model_max_length for length in batch["length"]],
    batched=True,
).remove_columns(["length"])

In [12]:
# Hold out 5% of the training split for validation (fixed seed so the split is repeatable).
train_val_split = df["train"].train_test_split(test_size=0.05, seed=42)
df["train"], df["validation"] = train_val_split["train"], train_val_split["test"]

# Return torch tensors from every split so batches can be moved to the GPU directly.
df = df.with_format("torch")

# Only the training loader shuffles; validation order stays fixed.
train_dataloader, val_dataloader = (
    DataLoader(df[split_name], batch_size=4, shuffle=(split_name == "train"))
    for split_name in ("train", "validation")
)

In [13]:
# Display the processed splits: only 'winner', 'input_ids' and 'attention_mask' remain.
df

DatasetDict({
    train: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 54524
    })
    test: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 3
    })
    validation: Dataset({
        features: ['winner', 'input_ids', 'attention_mask'],
        num_rows: 2870
    })
})

In [14]:
# next(iter(train_dataloader))

In [15]:
# Wrap the classifier with torch.compile; later cells use the compiled module under the same name.
# NOTE(review): re-running this cell would wrap the already-compiled module again — prefer a kernel restart.
model_classification = torch.compile(model_classification)

In [None]:
EPOCHS = 10
GRADIENT_ACCUMULATION_STEPS = 32
VALIDATION_EVERY_STEPS = 4000  # a mid-epoch validation pass runs every this many training batches

optimizer = AdamW(model_classification.parameters(), lr=1e-4, weight_decay=0.01)
# The scheduler is stepped once per epoch, so the cosine period is expressed in epochs.
scheduler = CosineAnnealingLR(optimizer, T_max=EPOCHS, eta_min=1e-6)
loss_fn = CrossEntropyLoss()

torch.set_float32_matmul_precision("medium")


def evaluate_accuracy(model, dataloader, desc):
    """Return the accuracy (in percent) of ``model`` over ``dataloader``.

    The model is switched to eval mode and left there; the caller is
    responsible for calling ``model.train()`` before resuming training.

    Args:
        model: Classifier whose forward output exposes ``.logits``.
        dataloader: Yields dicts with "input_ids", "attention_mask" and
            "winner" tensors; the label is the argmax of "winner" along dim 1.
        desc: Label shown on the progress bar.

    Returns:
        Accuracy as a float in [0, 100], or NaN when the dataloader is empty.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        # A fresh, transient progress bar is created for every validation pass so that
        # no idle bar is displayed while training is running.
        for val_data in tqdm(dataloader, desc=desc, leave=False):
            val_data = {key: value.to("cuda") for key, value in val_data.items()}
            logits = model(val_data["input_ids"], attention_mask=val_data["attention_mask"]).logits
            predicted = logits.argmax(dim=1)
            true_labels = val_data["winner"].argmax(dim=1)
            total += true_labels.size(0)
            correct += (predicted == true_labels).sum().item()
    return 100 * (correct / total) if total else float("nan")


num_train_batches = len(train_dataloader)

# Running accuracy over the current gradient-accumulation window.
grad_steps_corrects = 0
grad_steps_count = 0

for epoch in range(EPOCHS):
    model_classification.train()
    total_loss = 0
    total_correct = 0
    total_count = 0
    optimizer.zero_grad()

    train_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=True, position=0)
    for step, data in enumerate(train_bar):
        data = {key: value.to("cuda") for key, value in data.items()}
        # "winner" holds one indicator per class; its argmax is the class index.
        true_labels = data["winner"].argmax(dim=1)

        with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
            outputs = model_classification(data["input_ids"], attention_mask=data["attention_mask"]).logits
            loss = loss_fn(outputs, true_labels)

        # Scale so that gradients summed over one accumulation window average the per-batch losses.
        (loss / GRADIENT_ACCUMULATION_STEPS).backward()
        total_loss += loss.item()

        # Bookkeeping only: no gradients are needed for the accuracy counters.
        with torch.no_grad():
            predicted = outputs.argmax(dim=1)
            examples_count = data["input_ids"].size(0)
            correct_count = (predicted == true_labels).sum().item()
        grad_steps_count += examples_count
        grad_steps_corrects += correct_count
        total_count += examples_count
        total_correct += correct_count
        if (step + 1) % 10 == 0:
            train_bar.set_postfix({'Prediction': f"{predicted.cpu().tolist()} | {true_labels.cpu().tolist()}"})

        # Apply an optimizer step at the end of every accumulation window, and also on the
        # last batch of the epoch so a trailing partial window is not thrown away by the
        # zero_grad() at the start of the next epoch. (The partial window keeps the same
        # 1/GRADIENT_ACCUMULATION_STEPS scaling, so its update is slightly smaller.)
        is_last_batch = (step + 1) == num_train_batches
        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0 or is_last_batch:
            accuracy = 100 * (grad_steps_corrects / grad_steps_count)
            grad_steps_corrects = 0
            grad_steps_count = 0

            train_bar.set_postfix({'accuracy': f"{(accuracy):.2f}%"})
            # torch.nn.utils.clip_grad_norm_(model_classification.parameters(), max_norm=1.0)
            optimizer.step()
            optimizer.zero_grad()

        # Periodic mid-epoch validation (skipped at step 0).
        if step % VALIDATION_EVERY_STEPS == 0 and step != 0:
            accuracy = evaluate_accuracy(
                model_classification, val_dataloader, desc=f"Validation {epoch+1}/{EPOCHS}"
            )
            print(f"Validation Accuracy: {accuracy:.2f}%")
            model_classification.train()

    scheduler.step()
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{EPOCHS}, Avg Loss: {avg_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.2e}")
    print(f"Epoch {epoch+1}/{EPOCHS}, Training Accuracy: {100 * (total_correct / total_count):.2f}%")

    # Always run a fresh validation pass at the end of the epoch, so the reported number
    # reflects the current weights and does not depend on a mid-epoch pass having happened.
    # The next epoch switches the model back to train mode.
    accuracy = evaluate_accuracy(
        model_classification, val_dataloader, desc=f"Validation {epoch+1}/{EPOCHS}"
    )
    print(f"Validation Accuracy: {accuracy:.2f}%")

Validation 1/10:   0%|          | 0/718 [00:00<?, ?it/s]