In [1]:
import sys

# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, "/root/AISMicroOrg")

from dataclasses import dataclass, field
from typing import Optional

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler

from transformers import AutoConfig



In [2]:
@dataclass
class ScriptArguments:
    """
    Settings for fine-tuning a causal LM with PPO against a reward model.

    Every field has a default, so ``ScriptArguments()`` is valid on its own and
    individual values can be overridden via keyword arguments or attribute
    assignment. The ``help`` string in each field's metadata describes what the
    value controls.
    """

    # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
    # models like gpt-neo* models are more suitable.
    model_name: Optional[str] = field(default="", metadata={"help": "the model name"})
    tokenizer_name: Optional[str] = field(
        default="", metadata={"help": "the tokenizer name"}
    )
    reward_model_name: Optional[str] = field(
        default="", metadata={"help": "the reward model name"}
    )
    log_with: Optional[str] = field(
        default=None, metadata={"help": "use 'wandb' to log with wandb"}
    )
    learning_rate: Optional[float] = field(
        default=1.41e-5, metadata={"help": "the learning rate"}
    )
    output_max_length: Optional[int] = field(
        default=512, metadata={"help": "maximum length for generation"}
    )
    mini_batch_size: Optional[int] = field(
        default=1, metadata={"help": "the PPO minibatch size"}
    )
    batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
    ppo_epochs: Optional[int] = field(
        default=4, metadata={"help": "the number of ppo epochs"}
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=4, metadata={"help": "the number of gradient accumulation steps"}
    )
    adafactor: Optional[bool] = field(
        default=False, metadata={"help": "whether to use the adafactor optimizer"}
    )
    early_stopping: Optional[bool] = field(
        default=False, metadata={"help": "whether to early stop"}
    )
    target_kl: Optional[float] = field(
        default=0.1, metadata={"help": "kl target for early stopping"}
    )
    reward_baseline: Optional[float] = field(
        default=0.0,
        metadata={"help": "a baseline value that is subtracted from the reward"},
    )
    batched_gen: Optional[bool] = field(
        default=False, metadata={"help": "whether to use the batched text gen"}
    )
    save_freq: Optional[int] = field(
        default=None, metadata={"help": "n steps to save the model"}
    )
    # The help text here used to be a copy of save_freq's.
    output_dir: Optional[str] = field(
        default="runs/",
        metadata={"help": "directory where model checkpoints are saved"},
    )
    seed: Optional[int] = field(default=0, metadata={"help": "the seed"})
    # Forwarded as PPOConfig(steps=...), so this counts training steps, not epochs.
    steps: Optional[int] = field(
        default=20000, metadata={"help": "total number of PPO training steps"}
    )
    init_kl_coef: Optional[float] = field(
        default=0.2,
        metadata={
            "help": "Initial KL penalty coefficient (used for adaptive and linear control)"
        },
    )

    adap_kl_ctrl: Optional[bool] = field(
        default=True, metadata={"help": "Use adaptive KL control, otherwise linear"}
    )

In [3]:
# Run configuration. Only the policy checkpoint and the reward-model checkpoint
# are overridden; every other field keeps its ScriptArguments default.
# Policy checkpoints that were tried previously:
#   "/root/AISMicroOrg/finetuning/results_ft"
#   "gpt2"
script_args = ScriptArguments(
    model_name="/root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
    reward_model_name="/root/AISMicroOrg/reward_model/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9_peft_stack-exchange--paired_micro_rmts__20000_2e-05_peft_last_checkpoint",
)

In [4]:
# `reward_model_name` is consumed when the reward pipeline is built further down;
# `dataset_name` is not referenced again in the visible cells.
dataset_name = "../stack-exchange-paired_micro"
reward_model_name = script_args.reward_model_name

# PPO hyper-parameters, copied from `script_args` and grouped by purpose.
ppo_config = PPOConfig(
    # What to train, for how long, and with which seed.
    model_name=script_args.model_name,
    steps=script_args.steps,
    seed=script_args.seed,
    # Batch geometry.
    batch_size=script_args.batch_size,
    mini_batch_size=script_args.mini_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    ppo_epochs=script_args.ppo_epochs,
    # Optimisation.
    learning_rate=script_args.learning_rate,
    optimize_cuda_cache=True,
    # KL control and early stopping.
    init_kl_coef=script_args.init_kl_coef,
    adap_kl_ctrl=script_args.adap_kl_ctrl,
    target_kl=script_args.target_kl,
    early_stopping=script_args.early_stopping,
    # Experiment tracking.
    log_with=script_args.log_with,
)

In [5]:
# Keyword arguments forwarded to the reward pipeline on every scoring call.
# The training loop reads `output[0]["score"]` from each result, so the score
# must come back un-normalised: `function_to_apply="none"` requests raw scores
# (see the transformers text-classification pipeline docs).
sent_kwargs = dict(
    top_k=None,
    function_to_apply="none",
    batch_size=16,
    truncation=True,
)

In [6]:
# Load the tokenizer from its own checkpoint when one is configured; an empty
# `tokenizer_name` means "reuse the policy model's checkpoint".
tokenizer = AutoTokenizer.from_pretrained(
    script_args.tokenizer_name
    if script_args.tokenizer_name != ""
    else script_args.model_name
)

# Tokenizers that ship without a pad token get EOS as the padding token, so
# that `tokenizer.pad_token_id` is usable by the cells that follow.
if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token

In [7]:
from dataset_utils import build_rlhf_dataset

In [8]:
# Build the prompt dataset with the project helper imported from `dataset_utils`.
# NOTE(review): the training loop reads `input_ids` and `query` from every batch,
# so the helper presumably produces both columns — confirm in dataset_utils.
dataset = build_rlhf_dataset(tokenizer)


def collator(data):
    """Regroup a list of per-sample dicts into one dict of per-key lists.

    The key set is taken from the first sample only: keys that appear solely in
    later samples are ignored, and a later sample lacking one of those keys
    raises ``KeyError``. An empty ``data`` raises ``IndexError``.
    """
    first_sample = data[0]
    return {key: [sample[key] for sample in data] for key in first_sample}

  table = cls._concat_blocks(blocks, axis=0)


In [9]:
# Seed with the value stored in the PPO config before any model is created.
set_seed(ppo_config.seed)

# Index of this process on the local machine. Later cells use it as the single
# target device in the `device_map` of both the policy model and the reward pipeline.
current_device = Accelerator().local_process_index

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [11]:
# LoRA adapter settings applied to the policy model.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Policy model with a value head, loaded in 8-bit and placed entirely on this
# process's device, with the LoRA adapter configuration attached.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    ppo_config.model_name,
    peft_config=lora_config,
    device_map={"": current_device},
    load_in_8bit=True,
)

# Optional Adafactor over the trainable parameters only. With `optimizer=None`
# the choice is left to PPOTrainer (presumably its built-in default — see the
# TRL docs).
if script_args.adafactor:
    optimizer = Adafactor(
        (param for param in model.parameters() if param.requires_grad),
        lr=ppo_config.learning_rate,
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
    )
else:
    optimizer = None

# Build the PPO trainer from the config, the policy model, the tokenizer and the
# prompt dataset. No separate reference model is supplied (`ref_model=None`).
ppo_trainer = PPOTrainer(
    ppo_config,
    model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [12]:
# Pipeline task name; reused when the reward pipeline is constructed below.
task = "sentiment-analysis"

# Model config for the reward pipeline, read from the same Llama-2-7b snapshot
# directory that `script_args.model_name` points at (the path is repeated here
# as a literal rather than taken from `script_args`).
model_reward_config = AutoConfig.from_pretrained(
    "/root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
    _from_pipeline=task,
)
# Pad with EOS, mirroring the tokenizer's `pad_token = eos_token` fallback.
model_reward_config.pad_token_id = model_reward_config.eos_token_id

In [14]:
# NOTE(review): `device` is computed here but never passed to `pipeline(...)`
# below (placement is done through `device_map`), and no later visible cell
# reads it.
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug

# Reward model wrapped as a text-classification pipeline: loaded in 8-bit with a
# single output label, placed on this process's device, and sharing the policy
# tokenizer and the config prepared in the previous cell.
# NOTE(review): the load log recorded under this cell reports `score.weight` as
# newly initialized, and a later scratch cell shows a reward of `nan`. Confirm
# the classification head is actually restored from the reward checkpoint
# before trusting these scores.
sentiment_pipe = pipeline(
    task=task,
    model=reward_model_name,
    device_map={"": current_device},
    model_kwargs={"load_in_8bit": True, "num_labels": 1},
    tokenizer=tokenizer,
    return_token_type_ids=False,
    config=model_reward_config,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# Scratch check: call the reward model directly on the first prompt of `batch`.
# NOTE(review): `r_m` is not defined in any visible cell, and `batch` is only
# bound by the training loop further down — this cell relies on leftover kernel
# state and will fail on a fresh Restart & Run All.
output = r_m(tokenizer(batch["query"][0], return_tensors="pt")["input_ids"])

In [55]:
# First element of the model output; the recorded value matches `output["logits"]`.
output[0]

tensor([[-1.0255]])

In [57]:
# Logits returned by the direct reward-model call above (one value for the one prompt).
output["logits"]

tensor([[-1.0255]])

In [15]:
# Bounds for the response length. The sampler is handed to
# `ppo_trainer.generate(..., length_sampler=...)`; for the exact inclusivity of
# the bounds see `trl.core.LengthSampler`.
output_min_length = 32
output_max_length = script_args.output_max_length
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings for response generation. `min_length` is deliberately left
# unset (an earlier `-1` entry was commented out).
# NOTE(review): `eos_token_id=100_000` is presumably chosen to lie outside the
# tokenizer's vocabulary so sampling never stops early on EOS — confirm.
generation_kwargs = dict(
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=100_000,
)

In [None]:
# PPO training loop. Each iteration:
#   1. samples a response for every prompt in the batch,
#   2. scores prompt + response with the reward pipeline,
#   3. runs one PPO optimisation step and logs its statistics,
#   4. optionally checkpoints every `save_freq` iterations.
#
# tqdm wraps the dataloader itself rather than the `enumerate` object: an
# enumerate has no length, so the bar could show neither a total nor an ETA.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    if epoch >= ppo_config.total_ppo_epochs:
        break

    question_tensors = batch["input_ids"]

    # Sample responses only (the prompt is not echoed back); the length of each
    # generation is drawn from `output_length_sampler`.
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(
        response_tensors, skip_special_tokens=True
    )

    # Compute reward score (using the sentiment analysis pipeline)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [
        torch.tensor(output[0]["score"] - script_args.reward_baseline)
        for output in pipe_outputs
    ]

    # Fail fast on NaN/inf rewards. A NaN score has been observed from this
    # reward pipeline in this notebook; feeding it into the PPO update would
    # silently corrupt the policy instead of surfacing the problem.
    non_finite = [
        idx for idx, reward in enumerate(rewards) if not torch.isfinite(reward)
    ]
    if non_finite:
        raise ValueError(
            f"Reward model returned non-finite scores at batch positions "
            f"{non_finite} (iteration {epoch}); aborting before the PPO update."
        )

    # Run PPO step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # `epoch and ...` skips iteration 0, so the first checkpoint is written
    # after `save_freq` iterations rather than immediately.
    if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
        ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")

In [25]:
# Scratch re-run of the generation step from the training loop, applied to
# whatever `batch` the loop last left bound in the kernel.
# NOTE(review): duplicates the loop body and depends on hidden kernel state.
response_tensors = ppo_trainer.generate(
    batch["input_ids"],
    return_prompt=False,
    length_sampler=output_length_sampler,
    **generation_kwargs,
)
batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)



In [47]:
# Inspect the prompt text of sample 6 in the current batch.
batch["query"][6]

'Question: To see, means to consciously perceive, unlike detect, which is a mechanical process, which requires no consciousness. A infrared camera can detect light, but it sees nothing. We, humans, see a representation of reality. As answered by my question, "Do we really see objects?", we perceive our brains representation of objects, not the objects themselves. But what of light? Do we see actual electromagnetic radiation, or do we perceive brightness, because of light?\n\nAnswer: '

In [36]:
# Inspect the decoded response generated for sample 1.
batch["response"][1]

'1) I don\'t know what you mean by "all" here.\n\n2) I don\'t think you can learn all of philosophy.'

In [46]:
# Inspect the decoded response generated for sample 6 (pairs with the prompt shown for sample 6).
batch["response"][6]

'1. There is no such thing as a "representation of reality" in consciousness.\n\n2. We see light by means of light sensitive cells in our eyes (photoreceptors).\n\n3. Light is a form of energy, and when it is absorbed by a photoreceptor, the photoreceptor changes its state of electrical charge.  These changes in electrical charge are what we see as light.\n\n4. The light sensitive'

In [44]:
# Number of token ids obtained by re-tokenizing the decoded response of sample 6.
len(tokenizer(batch["response"][6])["input_ids"])

100

In [27]:
# Scratch copy of the loop's scoring input: each prompt concatenated with its response.
texts = [q + r for q, r in zip(batch["query"], batch["response"])]

In [28]:
# Scratch copy of the loop's scoring call: run the reward pipeline over `texts`.
pipe_outputs = sentiment_pipe(texts, **sent_kwargs)



In [29]:
# Scratch copy of the loop's reward construction: one scalar tensor per sample,
# taken from the first entry of each pipeline result minus the configured baseline.
rewards = [
    torch.tensor(output[0]["score"] - script_args.reward_baseline)
    for output in pipe_outputs
]

In [35]:
# Raw score the reward pipeline produced for the first sample.
# NOTE(review): the recorded output of this cell is `nan`, so the rewards built
# from `pipe_outputs` are not usable for PPO until the reward model is fixed.
pipe_outputs[0][0]["score"]

nan

In [18]:
# Looks up the Llama sequence-classification class by its fully qualified name.
# NOTE(review): the visible cells only use `from transformers import ...`, so the
# bare name `transformers` is not bound by them — this cell depends on an import
# made elsewhere in the kernel session.
transformers.models.llama.modeling_llama.LlamaForSequenceClassification

transformers.models.llama.modeling_llama.LlamaForSequenceClassification