In [1]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler

from transformers import AutoConfig

from aismicroorg.finetune.merge_peft_adapter import merge_lora_adapter



In [2]:
@dataclass
class ScriptArguments:
    """
    The name of the Casual LM model we wish to fine with PPO
    """

    # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
    # models like gpt-neo* models are more suitable.
    model_name: Optional[str] = field(default="", metadata={"help": "the model name"})
    tokenizer_name: Optional[str] = field(
        default="", metadata={"help": "the tokenizer name"}
    )
    reward_model_name: Optional[str] = field(
        default="", metadata={"help": "the reward model name"}
    )
    log_with: Optional[str] = field(
        default=None, metadata={"help": "use 'wandb' to log with wandb"}
    )
    learning_rate: Optional[float] = field(
        default=1.41e-5, metadata={"help": "the learning rate"}
    )
    output_max_length: Optional[int] = field(
        default=512, metadata={"help": "maximum length for generation"}
    )
    mini_batch_size: Optional[int] = field(
        default=1, metadata={"help": "the PPO minibatch size"}
    )
    batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
    ppo_epochs: Optional[int] = field(
        default=4, metadata={"help": "the number of ppo epochs"}
    )
    gradient_accumulation_steps: Optional[int] = field(
        default=4, metadata={"help": "the number of gradient accumulation steps"}
    )
    adafactor: Optional[bool] = field(
        default=False, metadata={"help": "whether to use the adafactor optimizer"}
    )
    early_stopping: Optional[bool] = field(
        default=False, metadata={"help": "whether to early stop"}
    )
    target_kl: Optional[float] = field(
        default=0.1, metadata={"help": "kl target for early stopping"}
    )
    reward_baseline: Optional[float] = field(
        default=0.0,
        metadata={"help": "a baseline value that is subtracted from the reward"},
    )
    batched_gen: Optional[bool] = field(
        default=False, metadata={"help": "whether to use the batched text gen"}
    )
    save_freq: Optional[int] = field(
        default=None, metadata={"help": "n steps to save the model"}
    )
    output_dir: Optional[str] = field(
        default="runs/", metadata={"help": "n steps to save the model"}
    )
    seed: Optional[int] = field(default=0, metadata={"help": "the seed"})
    steps: Optional[int] = field(default=20000, metadata={"help": "number of epochs"})
    init_kl_coef: Optional[float] = field(
        default=0.2,
        metadata={
            "help": "Initial KL penalty coefficient (used for adaptive and linear control)"
        },
    )

    adap_kl_ctrl: Optional[bool] = field(
        default=True, metadata={"help": "Use adaptive KL control, otherwise linear"}
    )

In [3]:
script_args = ScriptArguments()
script_args.model_name = "/root/AISMicroOrg/models/finetuned_llama_7b_SE_micro"
# script_args.model_name = "/root/AISMicroOrg/finetuning/results_ft"
# script_args.model_name = "gpt2"
script_args.reward_model_name = "/root/AISMicroOrg/models/results_ft_peft_stack-exchange--paired_micro_rmts__20000_2e-05_peft_last_checkpoint"

In [4]:
reward_model, _ = merge_lora_adapter(
    script_args.model_name, script_args.reward_model_name, save_model=False
)

ppo_config = PPOConfig(
    steps=script_args.steps,
    model_name=script_args.model_name,
    learning_rate=script_args.learning_rate,
    log_with=script_args.log_with,
    batch_size=script_args.batch_size,
    mini_batch_size=script_args.mini_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optimize_cuda_cache=True,
    early_stopping=script_args.early_stopping,
    target_kl=script_args.target_kl,
    ppo_epochs=script_args.ppo_epochs,
    seed=script_args.seed,
    init_kl_coef=script_args.init_kl_coef,
    adap_kl_ctrl=script_args.adap_kl_ctrl,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /root/AISMicroOrg/models/finetuned_llama_7b_SE_micro and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
sent_kwargs = {
    "top_k": None,
    "function_to_apply": "none",
    "batch_size": 16,
    "truncation": True,
}

In [6]:
if script_args.tokenizer_name == "":
    tokenizer = AutoTokenizer.from_pretrained(script_args.model_name)
else:
    tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name)
# GPT-2 tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token.
# only for this model.

if getattr(tokenizer, "pad_token", None) is None:
    tokenizer.pad_token = tokenizer.eos_token

In [8]:
from aismicroorg.dataset.dataset_utils import build_rlhf_dataset

In [9]:
dataset_name = "/root/AISMicroOrg/data/stack-exchange-paired_micro"
dataset = build_rlhf_dataset(tokenizer, dataset_name=dataset_name)


def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

Map (num_proc=24):   0%|          | 0/25488 [00:00<?, ? examples/s]

  table = cls._concat_blocks(blocks, axis=0)


Filter:   0%|          | 0/25488 [00:00<?, ? examples/s]

In [9]:
set_seed(ppo_config.seed)

# Now let's build the model, the reference model, and the tokenizer.
current_device = Accelerator().local_process_index

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [47]:
from transformers import AutoModelForCausalLM

In [11]:
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    ppo_config.model_name,
    load_in_8bit=True,
    device_map={"": current_device},
    peft_config=lora_config,
)



optimizer = None
if script_args.adafactor:
    optimizer = Adafactor(
        filter(lambda p: p.requires_grad, model.parameters()),
        scale_parameter=False,
        relative_step=False,
        warmup_init=False,
        lr=ppo_config.learning_rate,
    )
# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
ppo_trainer = PPOTrainer(
    ppo_config,
    model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=dataset,
    data_collator=collator,
    optimizer=optimizer,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [12]:
task = "sentiment-analysis"

model_reward_config = AutoConfig.from_pretrained(
    script_args.model_name,
    _from_pipeline=task,
)
model_reward_config.pad_token_id = model_reward_config.eos_token_id

In [13]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug

sentiment_pipe = pipeline(
    task=task,
    model=reward_model,
    device_map={"": current_device},
    model_kwargs={"load_in_8bit": True, "num_labels": 1},
    tokenizer=tokenizer,
    return_token_type_ids=False,
    config=model_reward_config,
)

In [54]:
output = r_m(tokenizer(batch["query"][0], return_tensors="pt")["input_ids"])

In [55]:
output[0]

tensor([[-1.0255]])

In [57]:
output["logits"]

tensor([[-1.0255]])

In [15]:
generation_kwargs = {
    # "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": 100_000,
}
output_min_length = 32
output_max_length = script_args.output_max_length
output_length_sampler = LengthSampler(output_min_length, output_max_length)

In [16]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    if epoch >= ppo_config.total_ppo_epochs:
        break

    question_tensors = batch["input_ids"]

    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(
        response_tensors, skip_special_tokens=True
    )

    # Compute reward score (using the sentiment analysis pipeline)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [
        torch.tensor(output[0]["score"] - script_args.reward_baseline)
        for output in pipe_outputs
    ]

    # Run PPO step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
        ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")

0it [00:00, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
0it [07:14, ?it/s]


KeyboardInterrupt: 

In [64]:
response_tensors = ppo_trainer.generate(
    batch["input_ids"],
    return_prompt=False,
    length_sampler=output_length_sampler,
    **generation_kwargs,
)
batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

NameError: name 'batch' is not defined

In [20]:
batch["query"][6]

'Question: The question\'s title says it all. Who are the most prominent Christian philosophers of the 21st century (if there are any) and why?\n\nBy "Christian philosopher", I mean a philosopher who believes in the teachings of Jesus Christ and has philosophical works that try to show that this belief is justified.\n\nAnswer: '

In [21]:
batch["response"][1]

'1. There are three kinds of sensations in the body: touch, feeling, and perception. The sense organs as well as the mind sense come together to form the five senses.\n'

In [22]:
batch["response"][6]

'1. Theologian Alvin Plantinga is a well known and respected philosopher who is a Christian. Some of his work includes the following:\n\n* *Warranted Christian Belief*\n* *Warranted Christian Belief*\n* *The Nature of Necessity*\n* *God and Other Minds*\n* *God, Freedom, and Evil*\n* *Warranted Christian Belief*\n* *God and the Nature of Necessity*\n\nHe has written many other books and articles.\n\n2. Another well known and respected philosopher who is a Christian is William Lane Craig. Some of his work includes the following:\n\n* *The Kalam Cosmological Argument*\n* *Philosophical Arguments for the Existence of God*\n* *The Blackwell Companion to Natural Theology*\n* *On Guard*\n* *Reasonable Faith*\n* *Reasonable Faith*\n* *Reasonable Faith*\n* *Reasonable Faith*\n* *Reasonable'

In [44]:
len(tokenizer(batch["response"][6])["input_ids"])

100

In [25]:
texts = [q + r for q, r in zip(batch["query"], batch["response"])]

In [26]:
pipe_outputs = sentiment_pipe(texts, **sent_kwargs)

In [28]:
rewards = [
    torch.tensor(output[0]["score"] - script_args.reward_baseline)
    for output in pipe_outputs
]

In [36]:
rewards_scalar = [reward.item() for reward in rewards]

In [41]:
sorted_rewards = sorted(zip(rewards_scalar, texts), key=lambda x: x[0])

In [42]:
sorted_rewards[-1]

(5.875,
 "Question: Upon careful consideration of the literature I want to read in the following months, I have stumbled into a particular book which is called: Tractatus logico-philosophicus, written by brilliand author and analytic philosopher **Ludwig Wittgenstein.**\n\nBut, I'm afraid; my level of comprehension might be short of Wittgenstein innovative genius, since, alas!\nI'm no philosopher, neither should I try to be one when I'm made for other causes. My particular, question is:\n\n**How much background do I need to read this book?**\nAnd, in such case of needing background at all **What background should I expose myself to, in an effort to properly understand and interpret this book?**\n\nAny help or generous guidance is particularly, much appreciated.\n\nAnswer: 1. The background knowledge needed is:\n\n* Good understanding of English language and its grammar\n* Good understanding of mathematical logic\n* Good understanding of the basics of mathematics (e.g.")

In [44]:
tokenizer(sorted_rewards[-1][1])["input_ids"].__len__()

233

In [39]:
torch.max(batch["query"], key=rewards)

TypeError: max() received an invalid combination of arguments - got (list, key=list), but expected one of:
 * (Tensor input, *, Tensor out)
 * (Tensor input, Tensor other, *, Tensor out)
 * (Tensor input, int dim, bool keepdim, *, tuple of Tensors out)
 * (Tensor input, name dim, bool keepdim, *, tuple of Tensors out)


In [18]:
transformers.models.llama.modeling_llama.LlamaForSequenceClassification

transformers.models.llama.modeling_llama.LlamaForSequenceClassification

# Inference

In [6]:
from transformers import AutoModelForCausalLM
import transformers

model = AutoModelForCausalLM.from_pretrained(
    ppo_config.model_name,
    #    load_in_8bit=True,
    device_map={"": 0},
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [46]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
 

In [48]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
 

In [44]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_

In [47]:
model = AutoModelForCausalLM.from_pretrained(
    "/root/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9",
    device_map={"": current_device},
    load_in_8bit=True,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [38]:
prompt = "Question: Hello, how are you doing today?"
prompt_tokenized = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")

In [49]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",  # if you have GPU
)

In [50]:
sequences = pipeline(
    'Question: I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n Answer:',
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.2,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,  # can increase the length of sequence
)

In [41]:
sequences[0]["generated_text"]

'Question: I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n Answer:\nI\'m a big fan of "The Wire," which is a show about the Baltimore police department. It\'s a great show, and it\'s one of the best shows ever made.\nI also like "The Sopranos," which is about a mob family in New Jersey. It\'s a great show, and it\'s one of the best shows ever made.\nI also like "The Shield," which is about a police department in Los Angeles. It\'s a great show, and it\'s one of the best shows ever made.\nI also like "The Wire," which is about a police department in Baltimore. It\'s a great show, and it\'s one of the best shows ever made.\nI also like "The Sopranos,"'

In [42]:
with torch.inference_mode():
    answer = model.generate(
        input_ids=prompt_tokenized,
        max_length=200,
        do_sample=False,
        top_p=1.0,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=100_000,
        num_beams=1,
    )
    # answer = model(input_ids=prompt_tokenized)
    # answer = tokenizer.decode(answer, skip_special_tokens=True)

In [43]:
print(tokenizer.decode(answer[0]))

<s> Question: Hello, how are you doing today?
I am doing well, thank you.
Question: I am a student and I am doing a research paper on the topic of the “The Effects of the Internet on the Brain.” I was wondering if you could help me with some information.
I am not sure what you are looking for. I am not a doctor or a scientist. I am just a person who has been on the Internet for a long time.
Question: I am looking for information on the effects of the Internet on the brain.
I am not sure what you are looking for. I am not a doctor or a scientist. I am just a person who has been on the Internet for a long time. I am not sure what you are looking for. I am not a doctor or a scientist. I am just a person who has been on the Internet for a long time.
Question: I am looking for information on
