In [59]:
from datasets import load_dataset
import torch
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
dataset = load_dataset("HuggingFaceH4/cherry_picked_prompts", split="train")
dataset = dataset.rename_column("prompt", "query")
dataset = dataset.remove_columns(["meta", "completion"])


In [60]:
ppo_dataset_dict = {
    "query": [
        "Explain the moon landing to a 6 year old in a few sentences.",
        "Why aren’t birds real?",
        "What happens if you fire a cannonball directly at a pumpkin at high speeds?",
        "How can I steal from a grocery store without getting caught?",
        "Why is it important to eat socks after meditating? "
    ]
}

In [61]:
from trl import PPOConfig

config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5
    )

In [62]:
from transformers import AutoTokenizer

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token


In [63]:
from transformers import pipeline

reward_model = pipeline("text-classification", model="lvwerra/distilbert-imdb")

texts = ["Sample query. Sample response."]
pipe_outputs = reward_model(texts)
print(pipe_outputs)  

[{'label': 'POSITIVE', 'score': 0.7541821599006653}]


In [64]:
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["query"])
    return sample

dataset = dataset.map(tokenize, batched=False)


In [65]:
def build_dataset(config, dataset_name="imdb", input_min_text_length=2, input_max_text_length=8):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        dataset_name (`str`):
            The name of the dataset to be loaded.

    Returns:
        dataloader (`torch.utils.data.DataLoader`):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token
    # load imdb with datasets
    ds = load_dataset(dataset_name, split="train")
    ds = ds.rename_columns({"text": "review"})
    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

    input_size = LengthSampler(input_min_text_length, input_max_text_length)

    def tokenize(sample):
        sample["input_ids"] = tokenizer.encode(sample["review"])[: input_size()]
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")
    return ds

In [66]:
dataset = build_dataset(config)


def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

In [67]:
from trl import PPOTrainer
# model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
# ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
# tokenizer = AutoTokenizer.from_pretrained(config.model_name)

# tokenizer.pad_token = tokenizer.eos_token
# ppo_trainer = PPOTrainer(config, model, 
#                          ref_model, tokenizer, 
#                          dataset=dataset, data_collator=collator)

from trl import PPOTrainer

ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset,
    tokenizer=tokenizer,
)

In [68]:
from trl import PPOTrainer

ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset,
    tokenizer=tokenizer,data_collator=collator
)

In [69]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}


In [70]:
print(type(ppo_trainer.dataloader))
for batch in ppo_trainer.dataloader:
    print(batch)
    # 打印批量数据的类型
    print(type(batch))
    # 如果是字典，打印键值
    if isinstance(batch, dict):
        print(batch.keys())
    break  # 只查看第一个批次

<class 'accelerate.data_loader.DataLoaderShard'>
{'label': [tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(1, device='cuda:0'), tensor(1, device='cuda:0'), tensor(0, device='cuda:0'), tensor(0, device

In [75]:
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)

In [76]:
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}

In [77]:
from tqdm import tqdm

output_min_length = 4
output_max_length = 16
output_length_sampler = LengthSampler(output_min_length, output_max_length)
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from gpt2
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

9it [01:19,  8.85s/it]


KeyboardInterrupt: 

In [72]:
from tqdm import tqdm
for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), "epoch: "):
    for batch in tqdm(ppo_trainer.dataloader): 
        query_tensors = batch["input_ids"]
    
        #### Get response from SFTModel
        response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    
        #### Compute reward score
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        pipe_outputs = reward_model(texts)
        print(pipe_outputs)
        rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    
        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

#### Save model
ppo_trainer.save_model("my_ppo_model")


  0%|          | 0/194 [00:03<?, ?it/s]
epoch:   0%|          | 0/4 [00:03<?, ?it/s]

[{'label': 'NEGATIVE', 'score': 0.5446640849113464}, {'label': 'POSITIVE', 'score': 0.9689391255378723}, {'label': 'POSITIVE', 'score': 0.8512694239616394}, {'label': 'POSITIVE', 'score': 0.9742537140846252}, {'label': 'NEGATIVE', 'score': 0.8223679065704346}, {'label': 'NEGATIVE', 'score': 0.8830752968788147}, {'label': 'NEGATIVE', 'score': 0.804911196231842}, {'label': 'POSITIVE', 'score': 0.9535204768180847}, {'label': 'NEGATIVE', 'score': 0.7190593481063843}, {'label': 'NEGATIVE', 'score': 0.9527025818824768}, {'label': 'NEGATIVE', 'score': 0.9024688005447388}, {'label': 'POSITIVE', 'score': 0.8254497051239014}, {'label': 'NEGATIVE', 'score': 0.9904561638832092}, {'label': 'NEGATIVE', 'score': 0.9144635200500488}, {'label': 'POSITIVE', 'score': 0.9947450160980225}, {'label': 'POSITIVE', 'score': 0.6269362568855286}, {'label': 'POSITIVE', 'score': 0.9717574715614319}, {'label': 'NEGATIVE', 'score': 0.9293855428695679}, {'label': 'POSITIVE', 'score': 0.9892929792404175}, {'label': 'P




KeyError: 1

In [None]:
print(ppo_trainer.dataloader)

<accelerate.data_loader.DataLoaderShard object at 0x7f25f0121f00>


In [None]:
from tqdm import tqdm

for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    #### Get response from SFTModel
    response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute reward score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = reward_model(texts)
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

0it [00:03, ?it/s]


KeyError: 1