In [None]:
import wandb

from dataclasses import dataclass, field
from datasets import Dataset, load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardConfig, RewardTrainer
from tqdm import tqdm

import pandas as pd

In [None]:
# Register tqdm's pandas integration so pandas operations can report progress
# (e.g. via `progress_apply`).
# NOTE(review): nothing in the visible cells uses it — confirm it is still needed.
tqdm.pandas()

In [None]:
# Log in to Weights and Biases so the training run can report metrics there
# (the reward config below sets report_to="wandb").
# NOTE(review): presumably prompts for an API key when no credentials are cached —
# confirm behaviour for non-interactive runs.
wandb.login()

In [None]:
# TODO: Replace this with correct data split
DATASET_PATH = "../mini_codenet/en_mini_codenet.ftr"

# Read the submissions table from the Feather file
DATASET = pd.read_feather(DATASET_PATH)

# Show the first rows as this cell's output
DATASET.head()

In [None]:
# Sample up to 1000 accepted solutions at random. The draw is seeded so that a
# fresh kernel run reproduces the same subset.
# TODO: In practice, to use all of our data, we would consider all accepted solutions
SAMPLE_SEED = 42
NUM_ACCEPTED_SAMPLES = 1000

# Only the two identifier columns are needed downstream (in this order).
accepted_pool = DATASET.loc[DATASET["status"] == "Accepted", ["submission_id", "problem_id"]]
subset_accepted = accepted_pool.sample(
    # Cap at the pool size: sampling more rows than exist (without replacement) raises ValueError.
    n=min(NUM_ACCEPTED_SAMPLES, len(accepted_pool)),
    random_state=SAMPLE_SEED,
)

# The first count is the number of sampled submissions; the second is the
# number of distinct problems those submissions belong to.
print("Total Submissions:", len(subset_accepted["submission_id"]))
print("Unique Problem IDs:", len(subset_accepted["problem_id"].unique()))

In [None]:
# For each accepted solution, choose a contrasting rejected solution at random
# TODO: In practice, to use all of our data, we would consider all accepted-rejected pairs per problem id
data = { "accepted": [], "rejected": [] }

# Index the frame once up front instead of re-scanning it on every iteration.
# submission_id -> solution text (first row per id, i.e. what `.iloc[0]` on a
# per-id filter would return).
solution_by_submission = (
    DATASET[["submission_id", "solution"]]
    .drop_duplicates("submission_id")
    .set_index("submission_id")["solution"]
)
# problem_id -> Series holding every non-accepted solution for that problem.
rejected_by_problem = {
    problem_key: solutions
    for problem_key, solutions in DATASET.loc[DATASET["status"] != "Accepted"].groupby(
        "problem_id", observed=True
    )["solution"]
}

skipped_without_rejection = 0
for submission_id, problem_id in tqdm(subset_accepted.values):
    rejected_pool = rejected_by_problem.get(problem_id)
    if rejected_pool is None or rejected_pool.empty:
        # No failing submission exists for this problem, so no contrast pair can
        # be formed; sampling from an empty pool would raise ValueError.
        skipped_without_rejection += 1
        continue

    data["accepted"].append(solution_by_submission.loc[submission_id])
    data["rejected"].append(rejected_pool.sample(1).iloc[0])

if skipped_without_rejection:
    print(f"Skipped {skipped_without_rejection} accepted solutions with no rejected counterpart")

In [None]:
def _default_reward_config():
    """Build the baseline RewardConfig used as the default for ScriptArguments."""
    return RewardConfig(
        output_dir="output",
        per_device_train_batch_size=64,
        num_train_epochs=1,
        gradient_accumulation_steps=16,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        learning_rate=1.41e-5,
        report_to="wandb", # log training progress to Weights and Biases
        remove_unused_columns=False,
        optim="adamw_torch",
        logging_steps=500,
        evaluation_strategy="no",
        max_length=256, # TODO: NEED TO CHANGE THIS!
    )


@dataclass
class ScriptArguments:
    """Settings for the reward-model training run in this notebook."""

    # Path or name of the model to load.
    # TODO: Change path to correct SFT model
    model_name: str = "../hf_model/"
    # Whether an evaluation split is used; False (the default) means no evaluation.
    eval_split: bool = False
    # Training configuration passed to the trainer.
    reward_config: RewardConfig = field(default_factory=_default_reward_config)


args = ScriptArguments()

# Evaluate on a step schedule only when an evaluation split was requested.
if args.eval_split:
    args.reward_config.evaluation_strategy = "steps"
else:
    args.reward_config.evaluation_strategy = "no"

In [None]:
# Step 1: Load the tokenizer and wrap the accepted/rejected pairs in a Dataset
tokenizer = AutoTokenizer.from_pretrained(args.model_name)
if tokenizer.pad_token is None:
    # The model's pad_token_id is copied from this tokenizer, so it must not be
    # left unset. Some tokenizers ship without a pad token; reuse EOS for them.
    tokenizer.pad_token = tokenizer.eos_token
train_dataset = Dataset.from_dict(data)

In [None]:
# Step 2: Load the model with a single output label (one scalar per sequence)
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    num_labels=1,
)
# Keep the model's padding id in sync with the tokenizer's
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Tokenize chosen/rejected pairs of inputs
def preprocess_function(examples):
    """Tokenize a batch of accepted/rejected solution pairs.

    Args:
        examples: Mapping with parallel sequences under "accepted" and
            "rejected"; items are paired positionally.

    Returns:
        Dict with "input_ids_chosen", "attention_mask_chosen",
        "input_ids_rejected" and "attention_mask_rejected", each a list with
        one entry per pair.
    """
    # Encode each pair in order: the accepted text first, then the rejected one.
    encoded_pairs = [
        (tokenizer(good), tokenizer(bad))
        for good, bad in zip(examples["accepted"], examples["rejected"])
    ]

    return {
        "input_ids_chosen": [enc_good["input_ids"] for enc_good, _ in encoded_pairs],
        "attention_mask_chosen": [enc_good["attention_mask"] for enc_good, _ in encoded_pairs],
        "input_ids_rejected": [enc_bad["input_ids"] for _, enc_bad in encoded_pairs],
        "attention_mask_rejected": [enc_bad["attention_mask"] for _, enc_bad in encoded_pairs],
    }


def _within_max_length(example):
    """Return True when both tokenized sides fit in args.reward_config.max_length."""
    limit = args.reward_config.max_length
    return len(example["input_ids_chosen"]) <= limit and len(example["input_ids_rejected"]) <= limit


# Tokenize every pair, then drop pairs in which either side has more tokens
# than args.reward_config.max_length
train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
).filter(_within_max_length)

In [None]:
# Evaluation data is optional: without an eval split the trainer gets no eval set.
eval_dataset = None
if args.eval_split:
    # TODO: Load evaluation split following a similar process to loading the training split

    # eval_dataset = ...

    # eval_dataset = eval_dataset.map(
    #     preprocess_function,
    #     batched=True,
    #     num_proc=4,
    # )

    # eval_dataset = eval_dataset.filter(
    #     lambda x: len(x["input_ids_chosen"]) <= args.reward_config.max_length
    #     and len(x["input_ids_rejected"]) <= args.reward_config.max_length
    # )

    # Fail here with a clear message rather than leaving the evaluation data
    # missing for the trainer cell; remove this once the split is loaded above.
    raise NotImplementedError(
        "args.eval_split is True, but loading the evaluation split is not implemented yet"
    )

In [None]:
# Step 4: Define the Trainer
# Collect the trainer inputs first so each one is easy to inspect or swap.
trainer_kwargs = dict(
    model=model,
    tokenizer=tokenizer,
    args=args.reward_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer = RewardTrainer(**trainer_kwargs)

# Run training; the return value is shown as the cell output
trainer.train()