In [1]:
import wandb

from dataclasses import dataclass, field
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardConfig, RewardTrainer
from peft import LoraConfig, TaskType # Parameter Efficient Fine Tuning
from tqdm import tqdm

import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'




In [2]:
# Register tqdm's pandas integration so pandas objects gain `progress_apply`-style methods.
# NOTE(review): no `progress_*` call is visible in this notebook — confirm this is still needed.
tqdm.pandas()

In [3]:
# Log in to Weights and Biases for training logging.
# The reward config defined further down sets report_to="wandb", so training
# metrics are sent there; logging in up front surfaces credential problems early.
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mfaustotnc[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# Load the pre-split reward-model data (Feather files) into pandas DataFrames.
# Paths are relative to the notebook's working directory.
DATASET_TRAIN = pd.read_feather("../mini_codenet/data/split/reward_train.ftr")
DATASET_EVAL = pd.read_feather("../mini_codenet/data/split/reward_val.ftr")

In [5]:
# Preview the first rows of the training split to check the available columns.
DATASET_TRAIN.head()

Unnamed: 0,level_0,index,submission_id,problem_id,language,filename_ext,status,cpu_time,memory,code_size,accuracy,solution,problem_statement
0,489131,3150,s521116537,p03440,C++,cpp,Accepted,45.0,6528.0,1067,,#include<bits/stdc++.h>\nusing namespace std;\...,Score : 600 points \n Problem Statement You ar...
1,1591382,13068,s902754234,p03835,C++,cpp,Time Limit Exceeded,2103.0,256.0,270,,#include <iostream>\nusing namespace std;\n\ni...,Score : 200 points \n Problem Statement You ar...
2,2735943,2045,s058172052,p02345,C++,cpp,Wrong Answer,0.0,19400.0,1853,0/20,#include <iostream>\n#include <string>\n#inclu...,Range Minimum Query (RMQ) \nWrite a program wh...
3,1085890,29535,s447764573,p02658,Python,py,Wrong Answer,55.0,21656.0,322,,import sys\n \nread = sys.stdin.buffer.read\nr...,Score : 200 points \n Problem Statement Given ...
4,392827,6790,s702283400,p02824,C++,cpp,Wrong Answer,50.0,640.0,1415,,#include <bits/stdc++.h>\nusing namespace std;...,Score : 700 points \n Problem Statement N prob...


In [6]:
# Total number of training rows, then the number of non-null solutions per judge status.
print(len(DATASET_TRAIN))
DATASET_TRAIN.groupby("status")["solution"].count()

666324


status
Accepted                  359086
Compile Error              25221
Memory Limit Exceeded        531
Output Limit Exceeded         41
Query Limit Exceeded           5
Runtime Error              41273
Time Limit Exceeded        36310
WA: Presentation Error      2693
Wrong Answer              201164
Name: solution, dtype: int64

In [7]:
# Preview the first rows of the evaluation split.
DATASET_EVAL.head()

Unnamed: 0,level_0,index,submission_id,problem_id,language,filename_ext,status,cpu_time,memory,code_size,accuracy,solution,problem_statement
0,1606622,4831,s049565156,p02726,Python,py,Accepted,463.0,48220.0,850,,"from collections import deque\n\nn, x, y = [in...",Score : 400 points \n Problem Statement \n We ...
1,2671865,40710,s156125045,p02623,C++,cpp,Accepted,49.0,6776.0,1635,,#include<bits/stdc++.h>\n#include <ext/pb_ds/a...,Score : 300 points \n Problem Statement We hav...
2,544740,17793,s484568710,p02400,Python,py,Runtime Error,0.0,0.0,74,0/5,"import math\nr = int(input())\n\nprint(""%f %f""...",Circle \nWrite a program which calculates the ...
3,1992812,3370,s260772730,p03012,C++,cpp,Accepted,2.0,256.0,2275,,#define _GLIBCXX_DEBUG\n// header {{{\n#includ...,Score : 200 points \n Problem Statement We hav...
4,2123135,17567,s239779632,p02657,C++,cpp,Accepted,2.0,3636.0,1489,,"#include ""bits/stdc++.h""\nusing namespace std;...",Score : 100 points \n Problem Statement Comput...


In [8]:
# Total number of evaluation rows, then the number of non-null solutions per judge status.
print(len(DATASET_EVAL))
DATASET_EVAL.groupby("status")["solution"].count()

190379


status
Accepted                  102703
Compile Error               7130
Memory Limit Exceeded        159
Output Limit Exceeded         13
Runtime Error              11867
Time Limit Exceeded        10076
WA: Presentation Error       774
Wrong Answer               57657
Name: solution, dtype: int64

In [9]:
# Split both datasets into accepted and rejected submissions, keeping only the
# columns used to build contrastive pairs. No sampling happens here: every row
# ends up in exactly one of the two frames of its split.
# The column ORDER matters: get_contrastive_pairs unpacks rows positionally.
pair_columns = ["submission_id", "problem_id", "language", "solution"]

accepted_train = DATASET_TRAIN.loc[DATASET_TRAIN["status"] == "Accepted", pair_columns]
rejected_train = DATASET_TRAIN.loc[DATASET_TRAIN["status"] != "Accepted", pair_columns]
accepted_eval = DATASET_EVAL.loc[DATASET_EVAL["status"] == "Accepted", pair_columns]
rejected_eval = DATASET_EVAL.loc[DATASET_EVAL["status"] != "Accepted", pair_columns]

# Report how many submissions and how many distinct problems each subset covers.
print("Total Accepted Problems in TRAIN:", len(accepted_train))
print("Total Rejected Problems in TRAIN:", len(rejected_train))
print("Unique IDs in Accepted TRAIN:", len(pd.unique(accepted_train["problem_id"])))
print("Unique IDs in Rejected TRAIN:", len(pd.unique(rejected_train["problem_id"])))
print("------------")
print("Total Accepted Problems in EVAL:", len(accepted_eval))
print("Total Rejected Problems in EVAL:", len(rejected_eval))
print("Unique IDs in Accepted EVAL:", len(pd.unique(accepted_eval["problem_id"])))
print("Unique IDs in Rejected EVAL:", len(pd.unique(rejected_eval["problem_id"])))

Total Accepted Problems in TRAIN: 359086
Total Rejected Problems in TRAIN: 307238
Unique IDs in Accepted TRAIN: 2449
Unique IDs in Rejected TRAIN: 2368
------------
Total Accepted Problems in EVAL: 102703
Total Rejected Problems in EVAL: 87676
Unique IDs in Accepted EVAL: 2198
Unique IDs in Rejected EVAL: 2120


In [10]:
def get_contrastive_pairs(data_accepted, data_rejected, n=3, seed=None):
    """Pair each accepted solution with up to `n` distinct rejected solutions.

    For every row of `data_accepted`, the rejected solutions for the same
    problem and language are looked up, and up to `n` of them are drawn at
    random WITHOUT replacement, so one accepted solution is never paired twice
    with the same rejected entry. Accepted solutions that have no rejected
    counterpart in the same language are skipped.

    Args:
        data_accepted: DataFrame whose columns are, in this order,
            (submission_id, problem_id, language, solution). Rows are unpacked
            positionally, so the order matters.
        data_rejected: DataFrame with at least the columns `problem_id`,
            `language` and `solution`.
        n: Maximum number of rejected solutions paired with each accepted one.
        seed: Optional seed for the random generator. Pass an int to make the
            pairing reproducible; `None` (default) draws fresh entropy.

    Returns:
        A `datasets.Dataset` with two aligned columns, "accepted" and
        "rejected", holding one contrastive pair per row.
    """
    data = {"accepted": [], "rejected": []}
    rng = np.random.default_rng(seed)

    # Group rejected answers by (problem_id, language) once, so the main loop
    # does a dictionary lookup instead of filtering the whole frame per row.
    grouped_rejected = data_rejected.groupby(["problem_id", "language"])["solution"].apply(list).to_dict()

    for _, accepted_pid, accepted_lang, accepted_sol in tqdm(data_accepted.values):
        rejected_filtered = grouped_rejected.get((accepted_pid, accepted_lang))
        if not rejected_filtered:
            # The problem only has correct solutions in this language. Skip it.
            continue

        # Draw distinct indices; sampling with replacement would duplicate pairs.
        size = min(len(rejected_filtered), n)
        for idx in rng.choice(len(rejected_filtered), size=size, replace=False):
            data["accepted"].append(accepted_sol)
            data["rejected"].append(rejected_filtered[idx])

    return Dataset.from_dict(data)

def preprocess_function(examples, tokenizer):
    """Tokenize a batch of accepted/rejected solution pairs.

    Args:
        examples: Mapping with the aligned lists "accepted" and "rejected".
        tokenizer: Callable that takes one text and returns a mapping with the
            keys "input_ids" and "attention_mask".

    Returns:
        Dict of four aligned lists: the token ids and attention masks of the
        chosen (accepted) and rejected texts, one entry per input pair.
    """
    tokenized = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }

    pairs = zip(examples["accepted"], examples["rejected"])
    for accepted_text, rejected_text in pairs:
        # Tokenize the accepted text first, then its rejected counterpart.
        chosen_encoding = tokenizer(accepted_text)
        rejected_encoding = tokenizer(rejected_text)

        for column, values in (
            ("input_ids_chosen", chosen_encoding["input_ids"]),
            ("attention_mask_chosen", chosen_encoding["attention_mask"]),
            ("input_ids_rejected", rejected_encoding["input_ids"]),
            ("attention_mask_rejected", rejected_encoding["attention_mask"]),
        ):
            tokenized[column].append(values)

    return tokenized

def process_data(accepted, rejected, tokenizer, args, n=3, num_proc=4):
    """Build, tokenize and length-filter the contrastive-pair dataset.

    Args:
        accepted: DataFrame of accepted submissions, passed to
            `get_contrastive_pairs`.
        rejected: DataFrame of rejected submissions, passed to
            `get_contrastive_pairs`.
        tokenizer: Tokenizer forwarded to `preprocess_function`.
        args: Object exposing `reward_config.max_length`, the maximum number of
            tokens allowed for either side of a pair.
        n: Maximum number of rejected solutions paired with each accepted one.
        num_proc: Number of worker processes used for tokenization. Pass `None`
            to tokenize in the current process, which is easier to debug and
            to interrupt from a notebook.

    Returns:
        The tokenized dataset, keeping only pairs where both the chosen and the
        rejected token sequences are at most `max_length` long.
    """
    dataset = get_contrastive_pairs(accepted, rejected, n=n)

    # Pass the tokenizer through fn_kwargs rather than capturing it in a lambda.
    dataset = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=num_proc,
        fn_kwargs={"tokenizer": tokenizer},
    )

    # Read the limit once instead of on every row inside the predicate.
    max_length = args.reward_config.max_length
    dataset = dataset.filter(
        lambda x: len(x["input_ids_chosen"]) <= max_length
        and len(x["input_ids_rejected"]) <= max_length
    )

    return dataset

In [11]:
def _build_default_reward_config():
    """Create the RewardConfig used when ScriptArguments is built without one."""
    return RewardConfig(
        output_dir="output",
        per_device_train_batch_size=64,
        num_train_epochs=10,
        gradient_accumulation_steps=16,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={"use_reentrant": False},
        learning_rate=1.41e-5,
        report_to="wandb",  # log training progress to Weights and Biases
        remove_unused_columns=False,
        optim="adamw_torch",
        logging_steps=500,
        evaluation_strategy="no",
        max_length=256,  # TODO: NEED TO CHANGE THIS!
    )


@dataclass
class ScriptArguments:
    """Settings for this reward-model training run."""

    # Path (or name) of the model and tokenizer to load.
    # TODO: Change path to correct SFT model
    model_name: str = "../hf_model/"
    # Whether to evaluate during training; False (default) means no evaluation.
    eval_split: bool = False
    # Training configuration handed to the reward trainer.
    reward_config: RewardConfig = field(default_factory=_build_default_reward_config)


args = ScriptArguments()
# Switch evaluation on (every N steps) only when an evaluation split is requested.
if args.eval_split:
    args.reward_config.evaluation_strategy = "steps"
else:
    args.reward_config.evaluation_strategy = "no"

In [12]:
# Step 1: Load the tokenizer that belongs to the model checkpoint.
# It is used to pre-process (tokenize) the datasets and is later handed to the trainer.
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [17]:
# Build the tokenized, length-filtered contrastive-pair datasets for both splits.
# NOTE(review): the execution counter of this cell (17) is higher than that of the
# trainer cell (16), and the captured run was interrupted. On a fresh kernel this
# cell must finish before the trainer cell, which reads train_dataset/eval_dataset.
print("Training Data:")
train_dataset = process_data(accepted_train, rejected_train, tokenizer, args)

print("\nEvaluation Data:")
eval_dataset = process_data(accepted_eval, rejected_eval, tokenizer, args)

Training Data:


100%|██████████| 359086/359086 [00:02<00:00, 177351.66it/s]
Map (num_proc=4):   4%|▎         | 40000/1069584 [00:35<11:13, 1528.27 examples/s]Exception ignored in: <generator object iflatmap_unordered at 0x2bc3019e0>
Traceback (most recent call last):
  File "/Users/faustogerman/miniforge3/envs/ML2Project/lib/python3.9/site-packages/datasets/utils/py_utils.py", line 1394, in iflatmap_unordered
    [async_result.get(timeout=0.05) for async_result in async_results]
  File "/Users/faustogerman/miniforge3/envs/ML2Project/lib/python3.9/site-packages/datasets/utils/py_utils.py", line 1394, in <listcomp>
    [async_result.get(timeout=0.05) for async_result in async_results]
  File "/Users/faustogerman/miniforge3/envs/ML2Project/lib/python3.9/site-packages/multiprocess/pool.py", line 767, in get
    raise TimeoutError
multiprocess.context.TimeoutError: 
Map (num_proc=4):   4%|▎         | 40000/1069584 [00:35<15:21, 1117.14 examples/s]


KeyboardInterrupt: 

In [14]:
# Step 2: Load the model
# num_labels=1 gives a single scalar output per sequence, used as the reward score.
model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=1)
# Tell the model which token id is padding, so it matches the tokenizer.
# NOTE(review): presumably the tokenizer defines a pad token; if pad_token_id is None
# this assignment sets None and batched inputs may fail — confirm.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ../hf_model/ and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Step 3: Configure LoRA (parameter-efficient fine-tuning) for sequence classification.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,  # the adapter will be trained, not only used for inference
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,  # scaling factor applied to the LoRA update
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
)

In [16]:
# Step 4: Define the Trainer
# Requires model, tokenizer, args, peft_config and both datasets from the cells above.
# NOTE(review): eval_dataset is passed here, but evaluation_strategy is "no" while
# args.eval_split is False, so evaluation presumably never runs — confirm intent.
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args.reward_config,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config
)

# Start training; progress is reported to Weights and Biases (report_to="wandb").
trainer.train()

Problem at: /Users/faustogerman/miniforge3/envs/ML2Project/lib/python3.9/site-packages/transformers/integrations/integration_utils.py 740 setup


KeyboardInterrupt: 