In [2]:
import sys

# caution: path[0] is reserved for script path (or '' in REPL)
# Drop any earlier copy first so that re-running this cell keeps exactly one
# entry for the project root (a bare insert would stack duplicates), then put
# it back at index 1 as before.
sys.path[:] = [entry for entry in sys.path if entry != "/root/AISMicroOrg"]
sys.path.insert(1, "/root/AISMicroOrg")


from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import numpy as np
import torch
import torch.nn as nn
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
    PreTrainedTokenizerBase,
    Trainer,
    TrainerCallback,
    TrainingArguments,
)
from dataset_utils import build_reward_dataset, RewardDataCollatorWithPadding
from reward_utils import compute_metrics, RewardTrainer

In [3]:
@dataclass
class ScriptArguments:
    """
    Configuration for the reward-model training run in this notebook.

    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
    Every field carries a ``help`` entry in its metadata so that argument
    parsers reading the dataclass can describe all options, not just some.
    """

    local_rank: Optional[int] = field(
        default=-1, metadata={"help": "Used for multi-gpu"}
    )
    resume_from_checkpoint: Optional[bool] = field(
        default=False,
        metadata={"help": "If you want to resume training where it left off."},
    )
    deepspeed: Optional[str] = field(
        default=None,
        metadata={
            "help": "Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU."
        },
    )
    per_device_train_batch_size: Optional[int] = field(
        default=2,
        metadata={
            "help": "Training batch size per device, forwarded to TrainingArguments."
        },
    )
    per_device_eval_batch_size: Optional[int] = field(
        default=1,
        metadata={
            "help": "Evaluation batch size per device, forwarded to TrainingArguments."
        },
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=1,
        metadata={
            "help": "Number of gradient accumulation steps, forwarded to TrainingArguments."
        },
    )
    learning_rate: Optional[float] = field(
        default=2e-5,
        metadata={"help": "Learning rate, forwarded to TrainingArguments."},
    )
    weight_decay: Optional[float] = field(
        default=0.001,
        metadata={"help": "Weight decay, forwarded to TrainingArguments."},
    )
    model_name: Optional[str] = field(
        default="gpt2",
        metadata={
            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
        },
    )
    data_folder: Optional[str] = field(
        default="/root/AISMicroOrg/stack-exchange-paired_micro",
        metadata={"help": "The path to the data folder."},
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "The tokenizer for your model, if left empty will use the default for your model",
        },
    )
    bf16: Optional[bool] = field(
        default=True,
        metadata={
            "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU."
        },
    )
    num_train_epochs: Optional[int] = field(
        default=1,
        metadata={"help": "The number of training epochs for the reward model."},
    )
    train_subset: Optional[int] = field(
        default=20000,
        metadata={"help": "The size of the subset of the training data to use"},
    )
    eval_subset: Optional[int] = field(
        default=5000,
        metadata={"help": "The size of the subset of the eval data to use"},
    )
    gradient_checkpointing: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables gradient checkpointing."},
    )
    optim: Optional[str] = field(
        default="adamw_hf",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: Optional[str] = field(
        default="linear",
        metadata={"help": "The lr scheduler"},
    )
    max_length: Optional[int] = field(
        default=512,
        metadata={
            "help": "Maximum sequence length handed to the reward data collator."
        },
    )
    eval_first_step: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether to run eval after the first step"},
    )

In [4]:
# Start from the dataclass defaults and override only the dataset location and
# the base model, which is read from a local Hugging Face cache snapshot.
script_args = ScriptArguments(
    model_name="/root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
    data_folder="/root/AISMicroOrg/stack-exchange-paired_micro",
)

In [5]:
# The run/output name is built from the last path component of the model name
# plus the training-subset size and the learning rate.
model_name_split = script_args.model_name.rsplit("/", 1)[-1]
output_name = (
    f"{model_name_split}_peft_stack-exchange--paired_micro_rmts__"
    f"{script_args.train_subset}_{script_args.learning_rate}"
)

In [6]:
# Trainer configuration: most values come straight from script_args; the
# evaluation / checkpoint / logging cadence is fixed in this cell.
training_args = TrainingArguments(
    output_dir=output_name,
    # Optimisation settings.
    optim=script_args.optim,
    learning_rate=script_args.learning_rate,
    lr_scheduler_type=script_args.lr_scheduler_type,
    weight_decay=script_args.weight_decay,
    num_train_epochs=script_args.num_train_epochs,
    # Batching and memory.
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    gradient_checkpointing=script_args.gradient_checkpointing,
    bf16=script_args.bf16,
    # Distributed execution.
    deepspeed=script_args.deepspeed,
    local_rank=script_args.local_rank,
    # Evaluate and checkpoint every 500 steps; log every 10 steps.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    logging_strategy="steps",
    logging_steps=10,
    # Dataset columns are kept as-is and no label columns are declared.
    remove_unused_columns=False,
    label_names=[],
)
# Load the tokenizer: an explicitly configured tokenizer name takes precedence,
# otherwise the tokenizer that belongs to the model name is used.
if script_args.tokenizer_name is not None:
    tokenizer_name = script_args.tokenizer_name
else:
    tokenizer_name = script_args.model_name
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_auth_token=True)
# The end-of-sequence token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token



In [7]:
# Build the train and eval datasets with the project helper imported from
# dataset_utils; it receives the tokenizer and the script arguments and its
# result is unpacked into the two splits used below.
train_dataset, eval_dataset = build_reward_dataset(tokenizer, script_args)

  table = cls._concat_blocks(blocks, axis=0)


Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [12]:
# Sanity check: decode the "input_ids_k" entry of the first training example
# back to text.
# NOTE(review): presumably "_k" is the rejected answer of the pair — confirm in dataset_utils.
tokenizer.decode(train_dataset[0]["input_ids_k"])

"<s> Question: I am having trouble precisely telling the difference between a fact and an opinion. For example, let's say there is a man that weighs 500 pounds. Would it be a fact to refer to him as fat because that could proven. Another example would be saying Donald Trump is a bad president. Could that proven or would that just be an opinion?\n\nAnswer: The man weighs 500 pounds. That's a fact.\n\nThe man is fat. That's an opinion, although coming close to a common definition.\n\nFacts can be proved by experts in the respective field with (almost) always the same result. Facts always remain the same.\n\nOpinions can be differing. They can change from person to person or even from time to time. Here is an example: \n\nWhen I was young, I knew a lot of elder persons in Germany whom I considered fat. When I visited the USA for the first time, I saw a lot of persons whom I considered really as fat as I never had seen before. I remember a sheriff ordering a Coca Cola and pouring a pound o

In [13]:
# Sanity check: decode the "input_ids_j" entry of the first training example
# back to text.
# NOTE(review): presumably "_j" is the preferred answer of the pair — confirm in dataset_utils.
tokenizer.decode(train_dataset[0]["input_ids_j"])

'<s> Question: I am having trouble precisely telling the difference between a fact and an opinion. For example, let\'s say there is a man that weighs 500 pounds. Would it be a fact to refer to him as fat because that could proven. Another example would be saying Donald Trump is a bad president. Could that proven or would that just be an opinion?\n\nAnswer: I\'d suggest that a proposition like your "this man is fat" can only be a "fact" if it\'s operationally verifiable, as per, e.g., <https://plato.stanford.edu/entries/operationalism/> and <https://en.wikipedia.org/wiki/Verificationism>\n\nSo first you\'d have to define "fat" as, say, "ratio of weight-in-pounds divided by height-in-feet greater than 35". And then you\'d have to specify the construction of "experimental apparatus" like scales and rulers for your weight and height measurements.\n\nAnd now your statement, "this man is fat", is a true-or-false fact, at least with respect to your verifiable definition and measurement proced

In [8]:
# NOTE(review): pickle is not referenced anywhere in this cell.
import pickle

# NOTE(review): `load_dataset` is not imported in any visible cell, so this cell
# raises NameError on a fresh kernel. It presumably comes from the Hugging Face
# `datasets` package (`from datasets import load_dataset`) rather than from
# `transformers` as the commented-out line below suggests — confirm and import it.
# from transformers import load_dataset

# Print the number of rows in the "train" split of each data sub-directory.
dataset_name = "/root/AISMicroOrg/stack-exchange-paired_micro/"
for n in ["evaluation", "finetune", "reward", "rl"]:
    print(
        n,
        ": ",
        load_dataset(dataset_name, data_dir=f"data/{n}", split="train").num_rows,
    )

evaluation :  8354
finetune :  25152
reward :  25704
rl :  25488


In [7]:
# NOTE(review): `d` is not assigned in any visible cell, so this display relies
# on leftover kernel state and fails on a fresh Restart & Run All — define it
# here or remove the cell.
d

Dataset({
    features: ['qid', 'question', 'date', 'metadata', 'response_j', 'response_k'],
    num_rows: 25488
})

In [None]:
def load_pickled_file(path):
    """Load and return the Python object stored in a pickle file.

    Args:
        path: Filesystem path of the pickle file to read.

    Returns:
        The object that was unpickled from the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # Imported locally so the function works regardless of cell execution order.
    import pickle

    # WARNING: unpickling can execute arbitrary code; only use this on files
    # that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)

In [7]:
# LoRA adapter settings for a sequence-classification task.
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
)

# Load the base model in bfloat16 with num_labels=1 and wrap it with the LoRA
# adapters in a single expression.
model = get_peft_model(
    AutoModelForSequenceClassification.from_pretrained(
        script_args.model_name,
        torch_dtype=torch.bfloat16,
        num_labels=1,
    ),
    peft_config,
)
model.print_trainable_parameters()

# EOS doubles as the pad token, on both the tokenizer and the model config
# (the original note: gpt2 needs this because it has no official pad token).
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id
# use_cache is turned off exactly when gradient checkpointing is enabled.
model.config.use_cache = not script_args.gradient_checkpointing
num_proc = 24  # Can adjust to be higher if you have more processors.
original_columns = train_dataset.column_names

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 4,198,400 || all params: 6,611,546,112 || trainable%: 0.0635010318143267


In [8]:
# accuracy = evaluate.load("accuracy")


# Set up the trainer: the collator is created first and receives the tokenizer
# and the configured max_length; the trainer then gets the model, both dataset
# splits and the metric function.
reward_collator = RewardDataCollatorWithPadding(
    tokenizer=tokenizer,
    max_length=script_args.max_length,
)
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    data_collator=reward_collator,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
if script_args.eval_first_step:

    class EvaluateFirstStepCallback(TrainerCallback):
        """Ask the trainer to evaluate once the first global step has finished."""

        def on_step_end(self, args, state, control, **kwargs):
            # Every step other than the first leaves the control flags untouched.
            if state.global_step != 1:
                return
            control.should_evaluate = True

    trainer.add_callback(EvaluateFirstStepCallback())

# Run training; the resume flag from script_args is passed through as-is.
trainer.train(script_args.resume_from_checkpoint)

# Save the final model weights next to the run's output directory.
print("Saving last checkpoint of the model")
model.save_pretrained(f"{output_name}_peft_last_checkpoint")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mtomas-t[0m ([33mda-zealots[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
500,1.4551,1.275517,0.482716
1000,0.8075,0.986979,0.562346
1500,0.7705,0.944661,0.594444
2000,1.0254,0.92954,0.623457
2500,1.3983,0.91166,0.625926
3000,0.9532,0.917703,0.62963




Saving last checkpoint of the model
