In [57]:
from datasets import load_dataset
import torch

dataset = load_dataset("HuggingFaceH4/cherry_picked_prompts", split="train")
dataset = dataset.rename_column("prompt", "query")
dataset = dataset.remove_columns(["meta", "completion"])


In [58]:
ppo_dataset_dict = {
    "query": [
        "Explain the moon landing to a 6 year old in a few sentences.",
        "Why aren’t birds real?",
        "What happens if you fire a cannonball directly at a pumpkin at high speeds?",
        "How can I steal from a grocery store without getting caught?",
        "Why is it important to eat socks after meditating? "
    ]
}

In [59]:
from trl import PPOConfig

config = PPOConfig(
    model_name="gpt2",
    learning_rate=1.41e-5
    )

In [60]:
from transformers import AutoTokenizer

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer

model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
tokenizer = AutoTokenizer.from_pretrained(config.model_name)

tokenizer.pad_token = tokenizer.eos_token


In [61]:
from transformers import pipeline

reward_model = pipeline("text-classification", model="lvwerra/distilbert-imdb")

texts = ["Sample query. Sample response."]
pipe_outputs = reward_model(texts)
print(pipe_outputs)  

[{'label': 'POSITIVE', 'score': 0.7541820406913757}]


In [62]:
def tokenize(sample):
    sample["input_ids"] = tokenizer.encode(sample["query"])
    return sample

dataset = dataset.map(tokenize, batched=False)
dataset.set_format(type='torch', columns=['input_ids'])
for i, sample in enumerate(dataset):
    print(sample)
    if i > 5:  # Just check the first few samples
        break

{'input_ids': tensor([18438,   391,   262,  8824,  9581,   284,   257,   718,   614,  1468,
          287,   257,  1178, 13439,    13])}
{'input_ids': tensor([   48,    25,  5338,   373,  1893,   286,   262,  1578,  1829,   287,
        25325,    30,   317,    25, 29902,   360,    13, 35015,   373,  1893,
          286,   262,  1578,  1829,   287, 25325,    13,  1195,    25,  1374,
          857,   257, 24344,   670,    30,   317,    25, 34495, 13920,   779,
        18405,   393, 22353,   284,  2962,  1657,   290,   787,  5563,  1656,
         5699,    13,  1195,    25,  4162,   466, 10087, 32492,  5366,   329,
          262,  7374,    30,   317,    25])}
{'input_ids': tensor([ 5195,  3588,   447,   247,    83, 10087,  1103,    30])}
{'input_ids': tensor([ 2061,  4325,   611,   345,  2046,   257, 21202,  1894,  3264,   379,
          257, 30089,   379,  1029, 12055,    30])}
{'input_ids': tensor([16447,   257,  9735,  1351,   422,   428,  8364,    25,   198,  2898,
          320,   262

In [63]:
from torch.utils.data import DataLoader

# Assuming `dataset` is your dataset after applying the `tokenize` mapping
standard_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)  # Adjust parameters as needed

for i, batch in enumerate(standard_dataloader):
    print(batch)


{'input_ids': tensor([[ 5195,  3588,   447,   247,    83, 10087,  1103,    30]])}
{'input_ids': tensor([[15001,   319,   262,  1708, 10066,    11,  2148,   530, 10492,   966,
           286,  2370,   286,   257,  3967,  5182,   287,   262,  7184,  1910,
            11,   290,   530, 10492,   966,   286,   257,  4633,  5182,   287,
           262,  7184,  1910,    11,   357,  1904,   257,   366, 21215,   355,
           257, 10492,   966,    11,  9747,  1096,   262,   717,  3850,   286,
           262,   717,  1573,   329,  1123, 10492,   966,    11,   290,  2291,
           257,  2278,   379,   262,   886,   286,  1123, 10492,   966,   737,
           198,   464,  1271,   286,  5479,   329, 10681,  4034,   468,  2714,
         11831,   287,  2693,   379,   655,   739, 15897,    11,   830,   257,
          1285,    11,   355,  9749, 13479,   546,   262,  3034,  7628,  2237,
          1933,   656,   262, 26920,   615, 19397, 19798,  5314,  3767,   284,
         39300, 12965,  8810,    13

In [64]:
from trl import PPOTrainer

ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset,
    tokenizer=tokenizer,
)

In [65]:
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}


In [68]:
print(type(ppo_trainer.dataloader))
for batch in ppo_trainer.dataloader:
    print(batch)
    # 打印批量数据的类型
    print(type(batch))
    # 如果是字典，打印键值
    if isinstance(batch, dict):
        print(batch.keys())
    break  # 只查看第一个批次

<class 'accelerate.data_loader.DataLoaderShard'>
None
<class 'NoneType'>


In [69]:
for batch in ppo_trainer.dataloader:
    if 'input_ids' in batch:
        print(batch['input_ids'].shape)
    break


TypeError: argument of type 'NoneType' is not iterable

In [66]:
from tqdm import tqdm
for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), "epoch: "):
    for batch in tqdm(ppo_trainer.dataloader): 
        query_tensors = batch["batch_ids"]
    
        #### Get response from SFTModel
        response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
    
        #### Compute reward score
        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
        pipe_outputs = reward_model(texts)
        rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
    
        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
        ppo_trainer.log_stats(stats, batch, rewards)

#### Save model
ppo_trainer.save_model("my_ppo_model")


0it [00:00, ?it/s]    | 0/4 [00:00<?, ?it/s]
epoch:   0%|          | 0/4 [00:00<?, ?it/s]


TypeError: 'NoneType' object is not subscriptable