In [1]:
import os
import random
from dataclasses import dataclass

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split


def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


  from .autonotebook import tqdm as notebook_tqdm


In [10]:
@dataclass
class TrainCfg:
    model_name: str = "roberta-base"

    max_length: int = 512
    train_batch_size: int = 4
    valid_batch_size: int = 4
    learning_rate: float = 2e-5
    epochs: int = 10
    seed: int = 42

    # Paths (Kaggle-compatible structure)
    train_path: str = "data/lmsys-chatbot-arena/train.csv"
    train_path_extend: str = "data/lmsys-33k-deduplicated.csv"

    test_path: str = "data/lmsys-chatbot-arena/test.csv"
    sub_path: str = "data/ours_submission.csv"

    device: torch.device = torch.device(
        "cuda" if torch.cuda.is_available() else "cpu"
    )

cfg = TrainCfg()
set_seed(cfg.seed)
print("Using device:", cfg.device)


Using device: cuda


In [11]:
from transformers import DataCollatorWithPadding
import ast

def process_text(text):
    """
    Robustly convert the CSV field to plain text.
    Handles:
      - None / NaN
      - actual Python lists
      - stringified lists like '["hello", "world"]'
      - malformed '[...]' that can't be parsed (falls back gracefully)
    """
    # 1. Missing values
    if text is None:
        return ""
    if isinstance(text, float) and np.isnan(text):
        return ""

    if isinstance(text, list):
        return " ".join("" if t is None else str(t) for t in text)

    s = str(text)

    if s.startswith("[") and s.endswith("]"):
        try:
            parsed = ast.literal_eval(s)
        except Exception:
            return s.strip("[]")

        if isinstance(parsed, list):
            return " ".join("" if t is None else str(t) for t in parsed)
        else:
            return str(parsed)

    return s



class LMSYSDataset(Dataset):
    def __init__(self, df, tokenizer, max_length, is_test=False):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_test = is_test
        
        # Convert to records for speed
        self.data = self.df[["prompt", "response_a", "response_b"]].to_dict("records")

        # Pre-compute labels
        if not self.is_test:
            self.labels = []
            for _, row in self.df.iterrows():
                if row["winner_model_a"] == 1:
                    label = 0
                elif row["winner_model_b"] == 1:
                    label = 1
                else:
                    label = 2
                self.labels.append(label)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        prompt = row["prompt"]
        response_a = row["response_a"]
        response_b = row["response_b"]

        if not self.is_test:
            if row["winner_model_a"] == 1:
                label = 0   # A wins
            elif row["winner_model_b"] == 1:
                label = 1   # B wins
            else:
                label = 2   # tie
        else:
            label = -1  # unused

        # ----- Data augmentation: random swap A/B -----
        if not self.is_test:
            # 50% chance to swap
            if random.random() < 0.5:
                response_a, response_b = response_b, response_a
                if label == 0:
                    label = 1
                elif label == 1:
                    label = 0
        
        encoded = self.tokenizer(
            prompt,
            response_a + self.tokenizer.eos_token + response_b,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        item = {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        }

        if not self.is_test:
            item["labels"] = torch.tensor(label, dtype=torch.long)

        return item


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = F.softmax(torch.tensor(logits), dim=-1).numpy()

    num_classes = probs.shape[1]
    labels = np.array(labels, dtype=int)
    labels_oh = np.eye(num_classes)[labels]

    return {"log_loss": log_loss(labels_oh, probs)}

In [13]:
import ast
import numpy as np
import pandas as pd
import re

def strip_surrogates(text: str) -> str:
    """Remove characters that cannot be encoded in UTF-8 (surrogates etc)."""
    if not isinstance(text, str):
        text = str(text)
    return text.encode("utf-8", "replace").decode("utf-8")

def strip_control_chars(s: str) -> str:
    """Remove ASCII control chars except \n and \t."""
    return re.sub(r"[\x00-\x08\x0B-\x1F\x7F]", "", s)

def safe_literal_list(text: str):
    """Safely parse stringified list like '["a", "b"]', else return None."""
    if text.startswith("[") and text.endswith("]"):
        try:
            parsed = ast.literal_eval(text)
            if isinstance(parsed, list):
                return parsed
        except Exception:
            return None
    return None

def clean_text(x):
    """Complete cleaning pipeline."""
    # None / NaN
    if x is None:
        return ""
    if isinstance(x, float) and np.isnan(x):
        return ""

    if isinstance(x, list):
        return " ".join(strip_surrogates(strip_control_chars(str(t))) for t in x)

    s = str(x).strip()

    parsed_list = safe_literal_list(s)
    if parsed_list is not None:
        return " ".join(strip_surrogates(strip_control_chars(str(t))) for t in parsed_list)

    # Strip unicode surrogates and control characters
    s = strip_surrogates(s)
    s = strip_control_chars(s)

    return s

print("Loading data...")

train_df = pd.read_csv(cfg.train_path)
train_ext_df = pd.read_csv(cfg.train_path_extend)
test_df = pd.read_csv(cfg.test_path)

print("Original train shape:", train_df.shape)
print("Original extended train shape :", train_ext_df.shape)
print("Original test shape:", test_df.shape)

dfs = {"train": train_df, "test": test_df, "train_ext": train_ext_df}

for df_name, df in dfs.items():
    df.dropna(subset=["prompt", "response_a", "response_b"], inplace=True)
    for col in ["prompt", "response_a", "response_b"]:
        df[col] = df[col].apply(clean_text)

    mask_empty = (
        df["prompt"].str.strip().eq("") |
        df["response_a"].str.strip().eq("") |
        df["response_b"].str.strip().eq("")
    )
    to_drop = mask_empty.sum()
    if to_drop > 0:
        print(f"{df_name}: Dropping {to_drop} empty rows")
        df.drop(df[mask_empty].index, inplace=True)

    df.reset_index(drop=True, inplace=True)

    print(f"After cleaning, {df_name} shape:", df.shape)

print("Concatenating main + extended train...")
train_df = pd.concat([train_df, train_ext_df], ignore_index=True)
print("Final combined train shape:", train_df.shape)

Loading data...
Original train shape: (57477, 9)
Original extended train shape : (21187, 9)
Original test shape: (3, 4)
train: Dropping 26 empty rows
After cleaning, train shape: (57451, 9)
After cleaning, test shape: (3, 4)
train_ext: Dropping 21 empty rows
After cleaning, train_ext shape: (21166, 9)
Concatenating main + extended train...
Final combined train shape: (78617, 9)


In [14]:
train_df.head()

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,Is it morally right to try to have a certain p...,The question of whether it is morally right to...,"As an AI, I don't have personal beliefs or opi...",1,0,0
1,53567,koala-13b,gpt-4-0613,What is the difference between marriage licens...,A marriage license is a legal document that al...,A marriage license and a marriage certificate ...,0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,explain function calling. how would you call a...,Function calling is the process of invoking or...,Function calling is the process of invoking a ...,0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,How can I create a test set for a very rare ca...,Creating a test set for a very rare category c...,When building a classifier for a very rare cat...,1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,What is the best way to travel from Tel-Aviv t...,The best way to travel from Tel Aviv to Jerusa...,The best way to travel from Tel-Aviv to Jerusa...,0,1,0


In [None]:
print("Loading tokenizer & model...")
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# Ensure pad_token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Split data
train_split, val_split = train_test_split(
    train_df,
    test_size=0.1,
    random_state=cfg.seed,
    shuffle=True,
)

# Initialize Datasets (Now fast because we don't tokenize yet)
train_dataset = LMSYSDataset(train_split, tokenizer, cfg.max_length, is_test=False)
valid_dataset = LMSYSDataset(val_split, tokenizer, cfg.max_length, is_test=False)

print(f"Train samples: {len(train_dataset)}, Val samples: {len(valid_dataset)}")

model = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name,
    num_labels=3,
)

if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Use DataCollator to pad batches dynamically to the longest sequence in the BATCH
# (instead of padding everything to max_length=1024, which wastes GPU memory)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(
    output_dir=f"./outputs/{cfg.model_name.replace('/', '_')}_finetuned",
    num_train_epochs=cfg.epochs,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.valid_batch_size,
    learning_rate=cfg.learning_rate,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="log_loss",
    greater_is_better=False,
    logging_steps=100,
    save_total_limit=1,
    report_to="none",
    fp16=True, # Recommended for T4/P100 GPUs (Kaggle) to save memory/speed up
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator
)

print("Starting training...")
trainer.train()
print("Training done.")

Loading tokenizer & model...
Train samples: 51705, Val samples: 5746


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Log Loss,Runtime,Samples Per Second,Steps Per Second
1,1.0959,1.097005,1.097114,37.673,152.523,38.144
2,1.0949,1.096594,1.09655,55.7784,103.015,25.763
3,1.0994,1.09717,1.097041,54.686,105.073,26.277


Training done.


In [None]:
cfg = TrainCfg(
    model_name="distilroberta-base", 
    epochs=10,
    learning_rate=3e-5, 
    train_batch_size=64, 
    valid_batch_size=64
)

In [9]:
print("Loading tokenizer & model...")
tokenizer = AutoTokenizer.from_pretrained(cfg.model_name)

# Ensure pad_token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Split data
train_split, val_split = train_test_split(
    train_df,
    test_size=0.1,
    random_state=cfg.seed,
    shuffle=True,
)

# Initialize Datasets (Now fast because we don't tokenize yet)
train_dataset = LMSYSDataset(train_split, tokenizer, cfg.max_length, is_test=False)
valid_dataset = LMSYSDataset(val_split, tokenizer, cfg.max_length, is_test=False)

print(f"Train samples: {len(train_dataset)}, Val samples: {len(valid_dataset)}")

model = AutoModelForSequenceClassification.from_pretrained(
    cfg.model_name,
    num_labels=3,
)

if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# Use DataCollator to pad batches dynamically to the longest sequence in the BATCH
# (instead of padding everything to max_length=1024, which wastes GPU memory)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

args = TrainingArguments(
    output_dir=f"./outputs/{cfg.model_name.replace('/', '_')}_finetuned",
    num_train_epochs=cfg.epochs,
    per_device_train_batch_size=cfg.train_batch_size,
    per_device_eval_batch_size=cfg.valid_batch_size,
    learning_rate=cfg.learning_rate,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="log_loss",
    greater_is_better=False,
    logging_steps=100,
    save_total_limit=1,
    report_to="none",
    fp16=True, # Recommended for T4/P100 GPUs (Kaggle) to save memory/speed up
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=data_collator, # <--- Added this
)

print("Starting training...")
trainer.train()
print("Training done.")

Loading tokenizer & model...
Train samples: 51705, Val samples: 5746


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Starting training...


Epoch,Training Loss,Validation Loss,Log Loss,Runtime,Samples Per Second,Steps Per Second
1,1.0849,1.07851,1.078511,23.5411,244.084,3.823
2,1.0723,1.075383,1.075381,25.5923,224.52,3.517
3,1.0416,1.060189,1.060189,23.0309,249.49,3.908
4,1.0114,1.06365,1.063648,24.5329,234.216,3.669
5,0.942,1.158816,1.158819,24.1267,238.159,3.73
6,0.8937,1.256071,1.256077,24.0683,238.737,3.739


KeyboardInterrupt: 

In [None]:
cfg = TrainCfg(
    model_name="distilroberta-base", 
    max_length=1024,
    learning_rate=3e-5, 
    train_batch_size=16, 
    valid_batch_size=16
)
trainer.train(resume_from_checkpoint=True)

In [101]:
print("Starting inference with A/B swap TTA...")

# 1. Normal test dataset: (prompt, A, B)
test_ds_normal = LMSYSDataset(test_df, tokenizer, cfg.max_length, is_test=True)

# 2. Swapped test dataset: (prompt, B, A)
test_df_swapped = test_df.copy()
test_df_swapped[["response_a", "response_b"]] = test_df_swapped[["response_b", "response_a"]]
test_ds_swapped = LMSYSDataset(test_df_swapped, tokenizer, cfg.max_length, is_test=True)

# Predict
preds_normal = trainer.predict(test_ds_normal).predictions
preds_swapped = trainer.predict(test_ds_swapped).predictions

probs_normal = F.softmax(torch.tensor(preds_normal), dim=-1).numpy()
probs_swapped = F.softmax(torch.tensor(preds_swapped), dim=-1).numpy()

# For swapped, model's "class 0" means "first response wins" which is B in original,
# and "class 1" means "second response wins" which is A in original.
# So we need to swap back the first two probability columns.
probs_swapped_fixed = np.zeros_like(probs_swapped)
probs_swapped_fixed[:, 0] = probs_swapped[:, 1]  # B->A
probs_swapped_fixed[:, 1] = probs_swapped[:, 0]  # A->B
probs_swapped_fixed[:, 2] = probs_swapped[:, 2]  # tie unchanged

# Ensemble
final_probs = (probs_normal + probs_swapped_fixed) / 2.0

submission = pd.DataFrame(
    {
        "id": test_df["id"].values,
        "winner_model_a": final_probs[:, 0],
        "winner_model_b": final_probs[:, 1],
        "winner_tie": final_probs[:, 2],
    }
)

os.makedirs(os.path.dirname(cfg.sub_path), exist_ok=True)
submission.to_csv(cfg.sub_path, index=False)
print("Saved submission to:", cfg.sub_path)
submission.head()


Starting inference with A/B swap TTA...


Saved submission to: data/ours_submission.csv


Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
0,136060,0.347363,0.347363,0.305274
1,211333,0.347366,0.347366,0.305269
2,1233961,0.347362,0.347362,0.305276
