In [None]:
import torch
from tqdm import tqdm

import pandas as pd
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, AutoModelForCausalLM, pipeline

from peft import AutoPeftModelForCausalLM, AutoPeftModelForSequenceClassification

from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead

from peft import load_peft_weights, set_peft_model_state_dict, get_peft_model, LoraConfig

from datasets import Dataset

In [None]:
# Load the reward model: the PEFT checkpoint stored in "reward_model", loaded
# with a single-output classification head (num_labels=1).
model_rm = AutoPeftModelForSequenceClassification.from_pretrained(
    "reward_model",
    num_labels=1,
)
rm_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# Use EOS as the padding token id so batched inputs can be padded.
model_rm.config.update({"pad_token_id": rm_tokenizer.eos_token_id})
model_rm.to("cuda")

# Scoring pipeline over the merged (adapter folded in) reward model.
# The recorded output of this cell warned "no `device` argument is passed to
# the `Pipeline` object. Model will be on CPU": `device_map` did not place the
# already-instantiated model. Pass `device` explicitly so the pipeline keeps
# the model and its inputs on GPU 0.
reward_pipe = pipeline(
    "sentiment-analysis",
    model=model_rm.merge_and_unload(),
    tokenizer=rm_tokenizer,
    return_token_type_ids=False,
    # NOTE(review): with an already-built model this may not cast anything;
    # call `.half()` on the merged model if fp16 scoring is really wanted.
    torch_dtype=torch.float16,
    device=0,
)

Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2-0.5B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
def print_trainable_parameters(model):
    """Print the trainable and total parameter counts of ``model``.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs where ``param`` exposes ``numel()`` and ``requires_grad``.

    Prints a single line of the form
    ``trainable params: N || all params: M || trainable%: P``.
    A model without any parameters is reported as ``0.0`` percent instead of
    raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: an empty model has all_param == 0.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
import copy

In [None]:
# Build the PPO policy: load the base causal LM, fold the SFT LoRA adapter
# stored in 'sft_model' into it, then wrap the merged model with a value head
# (once as a reference, once with a fresh LoRA adapter as the trainable policy).
model_name= "SeaLLMs/SeaLLMs-v3-1.5B"

model_sft= AutoModelForCausalLM.from_pretrained(model_name)
# The tokenizer is read from the SFT checkpoint directory, not from model_name.
tokenizer_sft= AutoTokenizer.from_pretrained('sft_model', use_fast= True)
# Disabled tokenizer tweaks (custom pad token / padding side), left as-is:
# tokenizer_sft.add_special_tokens({'pad_token': '<[PAD]>'})
# tokenizer_sft.padding= 'left'

# Make the model config agree with the tokenizer's pad token id.
model_sft.config.update({'pad_token_id': tokenizer_sft.pad_token_id})

# LoRA settings (rank 16, alpha 16, no dropout) applied to the listed projection
# modules. Used twice below: to rebuild the SFT adapter structure, and again
# for the new PPO adapter.
lora_cfg = LoraConfig(
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0.,
    bias = "none",
    task_type="CAUSAL_LM",
)

# Recreate the adapter layout, load the saved SFT adapter weights into it, and
# merge them into the base weights so `model_sft` is a plain model again.
# NOTE(review): assumes the adapter in 'sft_model' was trained with this same
# LoRA layout - confirm against the SFT run.
model_sft= get_peft_model(model_sft, lora_cfg)
set_peft_model_state_dict(model_sft, load_peft_weights('sft_model'))
model_sft= model_sft.merge_and_unload()

# NOTE(review): `model_ref` is built from the very same `model_sft` object that
# is wrapped again (with LoRA) a few lines below. If `from_pretrained` wraps the
# instance instead of copying it, both wrappers share one backbone and this is
# not an independent reference - confirm (e.g. wrap a `copy.deepcopy`) before
# passing `model_ref` to the trainer.
# NOTE(review): `torch_dtype` is passed together with an already-instantiated
# model; verify that it actually casts anything.
model_ref= AutoModelForCausalLMWithValueHead.from_pretrained(model_sft, torch_dtype=torch.float16) # reference model
model_ref.to("cuda")

# Wrap the merged SFT model with a value head and a new LoRA adapter; from here
# on `model_sft` names the PPO policy rather than the plain SFT model.

model_sft= AutoModelForCausalLMWithValueHead.from_pretrained(model_sft, peft_config= lora_cfg, torch_dtype=torch.float16)
model_sft.to("cuda")

print('Build model success!!!')

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Build model success!!!


In [None]:
# Report how many parameters of the value-head policy (`model_sft`) are trainable.
print_trainable_parameters(model_sft)

trainable params: 18466305 || all params: 1562180609 || trainable%: 1.182085150309275


In [None]:
# Prompt template: first slot is the question, second slot the answer.
prompt = "### Question:\n{}\n### Answer:\n{}"

# Stack both RLHF question files. `ignore_index=True` gives the combined frame
# a fresh 0..n-1 index; without it the two files' row labels repeat, and a
# non-default index ends up as an extra `__index_level_0__` column once the
# frame is converted with `Dataset.from_pandas`.
rlhf_raw = pd.concat(
    [
        pd.read_json('1k9_rlhf.json').rename(columns={'answers': 'answer'}),
        pd.read_json('4k_rlhf.json'),
    ],
    ignore_index=True,
)

# One PPO query per question, with the answer slot left empty so the policy
# generates the answer.
data = pd.DataFrame({'query': [prompt.format(question, "") for question in rlhf_raw['question']]})

In [None]:
# Convert the query DataFrame into a `datasets.Dataset` for the PPO trainer.
dataset= Dataset.from_pandas(data)

In [None]:
def tokenize(sample):
    """Attach the token ids of ``sample["query"]`` under ``"input_ids"``.

    The sample is updated in place and returned, as expected by ``Dataset.map``.
    """
    token_ids = tokenizer_sft.encode(sample["query"])
    sample["input_ids"] = token_ids
    return sample


# Tokenize every query, then have the dataset hand back torch tensors.
dataset = dataset.map(tokenize)
dataset.set_format(type="torch")

Map:   0%|          | 0/5929 [00:00<?, ? examples/s]

In [None]:
def collator(data):
    """Regroup a list of per-example dicts into one dict of per-key lists.

    The keys are taken from the first example; values are collected as plain
    Python lists (no padding or stacking is performed).
    """
    return {key: [example[key] for example in data] for key in data[0]}

# PPO hyper-parameters: batches of 4 rollouts, optimised in mini-batches of 1
# with 4 gradient-accumulation steps. `steps` and `kl_penalty` are not set here
# and keep their library defaults.
config = PPOConfig(
    batch_size=4,
    mini_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=1.41e-5,
    optimize_cuda_cache=True,
    remove_unused_columns=True,
)

# No explicit reference model is handed to the trainer.
ppo_trainer = PPOTrainer(
    config,
    model_sft,
    dataset=dataset,
    tokenizer=tokenizer_sft,
    data_collator=collator,
)

In [None]:
# Sampling settings used for PPO rollouts.
generation_kwargs = dict(
    min_length=-1,
    max_new_tokens=200,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
    pad_token_id=tokenizer_sft.pad_token_id,
    eos_token_id=tokenizer_sft.eos_token_id,
)

# Arguments forwarded to the reward pipeline on every scoring call.
sent_kwargs = dict(top_k=None, function_to_apply="none")

In [None]:
# PPO training loop: generate -> score with the reward model -> PPO update.
# tqdm wraps the dataloader itself (not `enumerate`) so the bar knows the total.
for step, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get responses from the policy and from the reference
    # `generate` returns prompt + completion unless `return_prompt=False`.
    # Ask for the completion only: `ppo_trainer.step` receives the queries
    # separately, so responses that still contain the prompt would make the
    # policy be optimised on the prompt tokens a second time.
    response_tensors, ref_response_tensors = ppo_trainer.generate(
        query_tensors,
        return_prompt=False,
        generate_ref_response=True,
        **generation_kwargs,
    )

    batch["response"] = tokenizer_sft.batch_decode(response_tensors)
    batch["ref_response"] = tokenizer_sft.batch_decode(ref_response_tensors)

    #### Compute reward scores
    # The reward model keeps scoring the full "question + answer" text, now
    # rebuilt explicitly from the query string and the generated completion.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]

    # Reference rewards are only logged; they do not enter the PPO update.
    ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
    ref_pipe_outputs = reward_pipe(ref_texts, **sent_kwargs)
    ref_rewards = [torch.tensor(output[0]["score"]) for output in ref_pipe_outputs]
    batch["ref_rewards"] = ref_rewards

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])

0it [00:00, ?it/s]You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
5it [01:26, 17.38s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
61it [17:07, 14.07s/it]

In [None]:
# Inspect the texts scored in the last loop iteration (policy, reference).
texts, ref_texts

In [None]:
# Inspect the reward scores from the last loop iteration (policy, reference).
rewards, ref_rewards

In [None]:
# Qualitative check: sample one answer from the PPO-trained policy.
# The question fills the instruction slot; the answer slot stays empty.
sample_query = prompt.format("Đường cao tốc là đường gì?", "")
inputs = tokenizer_sft([sample_query], return_tensors="pt").to("cuda")

sampling_args = dict(
    max_new_tokens=256,
    do_sample=True,
    top_p=0.9,
    top_k=100,
    temperature=0.7,
)
outputs = ppo_trainer.model.generate(**inputs, **sampling_args)
tokenizer_sft.batch_decode(outputs)

In [None]:
# Save the model held by the PPO trainer to the "ppo_model" directory.
ppo_trainer.model.save_pretrained("ppo_model")

In [None]:
# Decode the response tensors left over from the last loop iteration.
[tokenizer_sft.decode(tokens.squeeze()) for tokens in response_tensors]