# Fine-Tune with Reinforcement Learning (PPO) and PEFT to Generate Less-Toxic Summaries


* Make sure you change the kernel to **PyTorch 2.6** to run the notebook
* We mark **TODO** in the notebook cells to indicate the place where you need to complete the missing code. You can refer to the exercises in the course repository for code examples.

In [1]:
# import necessary packages
import os
import sys
import torch

!{sys.executable} -m pip install --upgrade transformers huggingface_hub peft \
  accelerate bitsandbytes datasets trl==0.11.4 ipywidgets evaluate tqdm

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# or use an input box on this notebook to copy/paste the token
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
USE_CPU = False
device = "xpu" if torch.xpu.is_available() else "cpu"
if USE_CPU:
    device = "cpu"
print(f"using device: {device}")

using device: xpu


In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig, Trainer, TrainingArguments
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType, get_peft_model

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead, SFTTrainer, SFTConfig
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

## Load FLAN-T5 Model, Prepare Reward Model and Toxicity Evaluator

In [5]:
model_name="google/flan-t5-base"
huggingface_dataset_name = "knkarthick/dialogsum"

dataset_original = load_dataset(huggingface_dataset_name)

dataset_original

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
})

In [6]:
def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length, 
                  input_max_text_length):

    # load dataset (only "train" part will be enough for this lab).
    dataset = load_dataset(dataset_name, split="train")
    
    # Filter the dialogues of length between input_min_text_length and input_max_text_length characters.
    dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length, batched=False)

    # Prepare tokenizer. Setting device_map="auto" allows to switch between GPU and CPU automatically.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")
    
    def tokenize(sample):
        
        # Wrap each dialogue with the instruction.
        prompt = f"""
Summarize the following conversation.

{sample["dialogue"]}

Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)
        
        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")
    
    # Split the dataset into train and test parts.
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

dataset = build_dataset(model_name=model_name,
                        dataset_name=huggingface_dataset_name,
                        input_min_text_length=200, 
                        input_max_text_length=1000)

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


In [7]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"\ntrainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

## Model Fine-Tuning

In [8]:
# import transformers

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

#TODO: create tokenizer using AutoTokenizer class
#NOTE: you need to set device_map argument properly to choose XPU device
# tokenizer = ...
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map = "xpu")


#TODO: create model using AutoModelForSeq2SeqLM class
# model = ...
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
                              

# create PEFT model for fine-tuning
peft_model = get_peft_model(model, lora_config)

print(f'PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}\n')

def process_dataset(batch):
    prompt = [f'Summarize the following conversation:\n{dialogue}\n\nSummary:\n{summary}\n' for dialogue, summary in zip(batch['dialogue'], batch['dialogue'])]
    batch['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    batch['labels'] = tokenizer(batch["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return batch

processed_dataset = dataset_original.map(process_dataset, batched=True)

output_dir = "peft-dialogue-finetuned"

#TODO: create trainer using SFTTrainer class
# trainer = SFTTrainer(...)
PUSH_TO_HUB = True
USE_WANDB = False

# Calculate max_steps based on the subset size
num_train_samples = len(dataset['train'])

print(num_train_samples)
batch_size = 2
gradient_accumulation_steps = 8
steps_per_epoch = num_train_samples // (batch_size * gradient_accumulation_steps)
num_epochs = 5
max_steps = steps_per_epoch * num_epochs
print(f"Finetuning for max number of steps: {max_steps}")

# training_args = transformers.TrainingArguments(
training_args = TrainingArguments(
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        warmup_ratio=0.05,
        max_steps=max_steps,
        learning_rate=1e-5,
        evaluation_strategy="steps",
        save_steps=100,
        bf16=True,
        logging_steps=100,
        output_dir=output_dir,
        # hub_model_id=output_dir if PUSH_TO_HUB else None,
        use_ipex=False,
        # report_to="wandb" if USE_WANDB else None,
        #push_to_hub=PUSH_TO_HUB,
        max_grad_norm=0.6,
        weight_decay=0.01,
        group_by_length=True
)

trainer = SFTTrainer(
    model=model,
    train_dataset= dataset['train'],
    eval_dataset= dataset['test'],
    # tokenizer=tokenizer,
    args=training_args,
    peft_config=lora_config,
    dataset_text_field="text",
    max_seq_length=512,
    # packing=True
)

trainer.train()

peft_model_path="./peft-dialogue-summary-checkpoint"

trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


PEFT model parameters to be updated:

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%

8017
Finetuning for max number of steps: 2505



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  super().__init__(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mchelsen[0m ([33mumass-lowell[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
100,0.7933,0.550527
200,0.6586,0.357912
300,0.4433,0.218302
400,0.2902,0.119862
500,0.1858,0.08019
600,0.1239,0.022758
700,0.0762,0.008624
800,0.0545,0.003502
900,0.0426,0.002109
1000,0.0371,0.001584


('./peft-dialogue-summary-checkpoint/tokenizer_config.json',
 './peft-dialogue-summary-checkpoint/special_tokens_map.json',
 './peft-dialogue-summary-checkpoint/spiece.model',
 './peft-dialogue-summary-checkpoint/added_tokens.json',
 './peft-dialogue-summary-checkpoint/tokenizer.json')

In [9]:
peft_model_path="./peft-dialogue-summary-checkpoint"

ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model_path,                                                               
                                                               torch_dtype=torch.bfloat16,
                                                               device_map="auto",
                                                               is_trainable=True)

print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)



PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 3539713
all model parameters: 251117569
percentage of trainable model parameters: 1.41%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


## Setup Reward Model

![](img/hf_facebook_hatespeec_reward_model.png)

In [11]:
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"

#TODO: create toxicity_tokenizer
#toxicity_tokenizer = ...
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name)

#TODO: create toxicity_model using AutoModelForSequenceClassification class
# toxicity_model = ...
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name)

print(toxicity_model.config.id2label)

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/816 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

{0: 'nothate', 1: 'hate'}


![](img/rlhf_reward_model_binary_classifier.png)

In [29]:
# import torch.nn.functional as F

non_toxic_text = "You are a great person and I like you"

device = "cpu"

toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids.to(device)

#TODO: perform model inference on the input tokens
#TODO: and capture the logits (the outputs from the last level of the neural network)
#NOTE: please refer to lecture slides
# logits = ...
with torch.no_grad():
    logits = toxicity_model(toxicity_input_ids).logits
    
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

#TODO: Print the probabilities for [not hate, hate]
#TODO: please refer to lecture slides
# probabilities = ...
# probabilities = F.softmax(logits, dim=-1).squeeze().tolist()
probabilities = logits.softmax(dim=-1).tolist()[0]


print(f'probabilities [not hate, hate]: {probabilities}')

# get the logits for "not hate" - this is the reward!
# TODO: please refer to lecture slides
# not_hate_index = ...
# nothate_reward = ...
not_hate_index = 0  # {0: 'nothate', 1: 'hate'}

nothate_reward = logits[0][not_hate_index].item()

print(f'reward (high): {nothate_reward}')

logits [not hate, hate]: [4.641770362854004, -4.23326301574707]
probabilities [not hate, hate]: [0.9998601675033569, 0.0001398174063069746]
reward (high): 4.641770362854004


In [30]:
toxic_text = "You are disgusting and terrible and i damn hate you"

#TODO: tokenize the toxic text
# toxicity_input_ids = ...
toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt").input_ids.to(device)


#TODO: perform model inference on the input tokens
#TODO: and capture the logits (the outputs from the last level of the neural network)
#NOTE: please refer to lecture slides
# logits = ...
with torch.no_grad():
    logits = toxicity_model(toxicity_input_ids).logits
    
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

#TODO: Print the probabilities for [not hate, hate]
#TODO: please refer to lecture slides
# probabilities = ...
# probabilities = F.softmax(logits, dim=-1).squeeze().tolist()
probabilities = logits.softmax(dim=-1).tolist()[0]


print(f'probabilities [not hate, hate]: {probabilities}')

# get the logits for "not hate" - this is the reward!
# TODO: please refer to lecture slides
# not_hate_index = ...
# nothate_reward = ...
not_hate_index = 0  # {0: 'nothate', 1: 'hate'}

nothate_reward = logits[0][not_hate_index].item()

print(f'reward (high): {nothate_reward}')

logits [not hate, hate]: [-2.0610787868499756, 1.5835537910461426]
probabilities [not hate, hate]: [0.025465568527579308, 0.9745343923568726]
reward (high): -2.0610787868499756


In [25]:
sentiment_pipe = pipeline("sentiment-analysis", 
                          model=toxicity_model_name,
                          tokenizer=toxicity_tokenizer,
                          max_length=512,
                          truncation=True,
                          device=device)
reward_logits_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # Set to "none" to retrieve raw logits.
    "batch_size": 16
}

reward_probabilities_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "softmax", # Set to "softmax" to apply softmax and retrieve probabilities.
    "batch_size": 16
}

print("Reward model output for non-toxic text:")
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))
print("\nReward model output for toxic text:")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

Device set to use cpu


Reward model output for non-toxic text:
[{'label': 'nothate', 'score': 4.641770362854004}, {'label': 'hate', 'score': -4.23326301574707}]
[{'label': 'nothate', 'score': 0.9998601675033569}, {'label': 'hate', 'score': 0.00013981739175505936}]

Reward model output for toxic text:
[{'label': 'hate', 'score': 1.5835537910461426}, {'label': 'nothate', 'score': -2.0610787868499756}]
[{'label': 'hate', 'score': 0.9745343923568726}, {'label': 'nothate', 'score': 0.025465568527579308}]


In [26]:
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))

[{'label': 'nothate', 'score': 4.641770362854004}, {'label': 'hate', 'score': -4.23326301574707}]
[{'label': 'nothate', 'score': 0.9998601675033569}, {'label': 'hate', 'score': 0.00013981739175505936}]


In [27]:
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

[{'label': 'hate', 'score': 1.5835537910461426}, {'label': 'nothate', 'score': -2.0610787868499756}]
[{'label': 'hate', 'score': 0.9745343923568726}, {'label': 'nothate', 'score': 0.025465568527579308}]


## Evaluate Toxicity

In [39]:
import evaluate

#TODO: create toxicity_evaluator using evaluate.load()
#NOTE: please refer to exercise Toxicity_Detector_by_Meta.ipynb
# toxicity_evaluator = ...

toxicity_evaluator = evaluate.load("toxicity", 
                                    toxicity_model_name,
                                    module_type="measurement",
                                    toxic_label="hate")


Device set to use xpu:0


In [40]:
toxicity_score = toxicity_evaluator.compute(predictions=[
    non_toxic_text
])

print("Toxicity score for non-toxic text:")
print(toxicity_score["toxicity"])

toxicity_score = toxicity_evaluator.compute(predictions=[
    toxic_text
])

print("\nToxicity score for toxic text:")
print(toxicity_score["toxicity"])

Toxicity score for non-toxic text:
[0.00013981753727421165]

Toxicity score for toxic text:
[0.9745345115661621]


In [41]:
def evaluate_toxicity(model, 
                      toxicity_evaluator, 
                      tokenizer, 
                      dataset, 
                      num_samples):

    max_new_tokens=100

    toxicities = []
    input_texts = []
    for i, sample in tqdm(enumerate(dataset)):
        input_text = sample["query"]

        if i > num_samples:
            break
            
        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids.to(device)
        
        generation_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                             tok_k=0.0,
                                             top_p=1.0,
                                             do_sample=True)

        response_token_ids = model.generate(input_ids=input_ids,
                                            generation_config=generation_config)
        
        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)
        
        toxicity_score = toxicity_evaluator.compute(predictions=[(input_text + " " + generated_text)])

        toxicities.extend(toxicity_score["toxicity"])

    # TODO: Compute mean & std using numpy functions.
    # mean = ...
    # std = ...
    mean = np.mean(toxicities)
    std = np.std(toxicities)
        
    return mean, std

In [42]:
def evaluate_toxicity(model, 
                      toxicity_evaluator, 
                      tokenizer, 
                      dataset, 
                      num_samples=10):

    # 自动检测 device（优先使用 XPU）
    device = torch.device("xpu" if torch.xpu.is_available() else "cpu")
    model = model.to(device)

    max_new_tokens = 100
    toxicities = []

    for i, sample in tqdm(enumerate(dataset), total=num_samples):
        if i >= num_samples:
            break

        input_text = sample["query"]

        # 编码输入并迁移到目标设备
        inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        # 配置生成策略
        generation_config = GenerationConfig(
            max_new_tokens=max_new_tokens,
            top_k=0,
            top_p=1.0,
            do_sample=True
        )

        # 模型生成文本
        with torch.no_grad():
            response_token_ids = model.generate(
                input_ids=inputs["input_ids"],
                generation_config=generation_config
            )

        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)

        # 拼接输入和输出作为 toxicity 评估的目标
        full_text = input_text + " " + generated_text

        # 评估 toxicity（兼容 evaluate.load("toxicity")）
        toxicity_score = toxicity_evaluator.compute(predictions=[full_text])

        toxicities.extend(toxicity_score["toxicity"])

    # 计算均值与标准差
    mean = np.mean(toxicities)
    std = np.std(toxicities)

    return mean, std

In [43]:
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")


mean_before_detoxification, std_before_detoxification = evaluate_toxicity(model=ppo_model, 
                                                                          toxicity_evaluator=toxicity_evaluator, 
                                                                          tokenizer=tokenizer, 
                                                                          dataset=dataset["test"], 
                                                                          num_samples=10)

print(f'toxicity [mean, std] before detox: [{mean_before_detoxification}, {std_before_detoxification}]')

100%|██████████| 10/10 [00:24<00:00,  2.43s/it]

toxicity [mean, std] before detox: [0.025736885907826947, 0.03679577066516888]





## Perform Fine-Tuning to Detoxify the Summaries
Optimize a RL policy against the reward model using Proximal Policy Optimization (PPO).

In [44]:
#TODO: create a refenence model to be used as a frozen model
# ref_model = ...
ref_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
ref_model.eval()  # 设置为评估模式（关闭 dropout 等）
for param in ref_model.parameters():
    param.requires_grad = False  # 冻结所有参数

print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 247577856
percentage of trainable model parameters: 0.00%



![](img/rlhf_kl_divergence.png)

In [46]:
from trl import PPOConfig, PPOTrainer

learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(
    model_name=model_name,    
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

#TODO: create ppo_trainer using PPOTrainer class
# ppo_trainer = ...
ppo_trainer = PPOTrainer(
    config=config,
    model=ppo_model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=dataset['train'],     # 通常是训练集（带 prompt）
    data_collator=collator
)


### Fine-Tune the Model

In [47]:
output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 16
}

max_ppo_steps = 10

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break   

    prompt_tensors = batch["input_ids"]

    # Get response from FLAN-T5/PEFT LLM.
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()        
            
        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])
        
    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]    
    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

    # You use the `nothate` item because this is the score for the positive `nothate` class.
    reward_tensors = [torch.tensor(reward[not_hate_index]["score"]) for reward in rewards]    

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)
    
    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    print('-'.join('' for x in range(100)))

1it [01:12, 72.77s/it]

objective/kl: 175.46324157714844
ppo/returns/mean: -3.2221853733062744
ppo/policy/advantages_mean: -0.0005682185292243958
---------------------------------------------------------------------------------------------------


2it [02:48, 86.48s/it]

objective/kl: 172.35850524902344
ppo/returns/mean: -3.179933547973633
ppo/policy/advantages_mean: -0.0052349865436553955
---------------------------------------------------------------------------------------------------


3it [03:59, 79.22s/it]

objective/kl: 94.30601501464844
ppo/returns/mean: -1.92532217502594
ppo/policy/advantages_mean: 0.001079365611076355
---------------------------------------------------------------------------------------------------


4it [05:10, 76.20s/it]

objective/kl: 101.38346862792969
ppo/returns/mean: -1.8475204706192017
ppo/policy/advantages_mean: -0.0023028627038002014
---------------------------------------------------------------------------------------------------


5it [06:22, 74.37s/it]

objective/kl: 176.7908935546875
ppo/returns/mean: -3.317161798477173
ppo/policy/advantages_mean: 0.023739691823720932
---------------------------------------------------------------------------------------------------


6it [07:37, 74.82s/it]

objective/kl: 139.14227294921875
ppo/returns/mean: -2.5710415840148926
ppo/policy/advantages_mean: -0.004548355937004089
---------------------------------------------------------------------------------------------------


7it [08:49, 73.65s/it]

objective/kl: 126.79751586914062
ppo/returns/mean: -2.4160685539245605
ppo/policy/advantages_mean: -0.0194123275578022
---------------------------------------------------------------------------------------------------


8it [09:57, 72.05s/it]

objective/kl: 165.66525268554688
ppo/returns/mean: -3.5792791843414307
ppo/policy/advantages_mean: -0.043989673256874084
---------------------------------------------------------------------------------------------------


9it [11:08, 71.73s/it]

objective/kl: 119.80738830566406
ppo/returns/mean: -2.3504462242126465
ppo/policy/advantages_mean: -0.009064823389053345
---------------------------------------------------------------------------------------------------


10it [12:15, 73.60s/it]

objective/kl: 184.65892028808594
ppo/returns/mean: -3.824169397354126
ppo/policy/advantages_mean: 0.0716240406036377
---------------------------------------------------------------------------------------------------





## Evaluate the Model Quantitatively

In [48]:
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model, 
                                                                        toxicity_evaluator=toxicity_evaluator, 
                                                                        tokenizer=tokenizer, 
                                                                        dataset=dataset["test"], 
                                                                        num_samples=10)
print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

100%|██████████| 10/10 [00:29<00:00,  2.99s/it]

toxicity [mean, std] after detox: [0.02678967604297213, 0.034421060826898914]





In [49]:
mean_improvement = (mean_before_detoxification - mean_after_detoxification) / mean_before_detoxification
std_improvement = (std_before_detoxification - std_after_detoxification) / std_before_detoxification

print(f'Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement*100:.2f}%')
print(f'std: {std_improvement*100:.2f}%')

Percentage improvement of toxicity score after detoxification:
mean: -4.09%
std: 6.45%


## Evaluate the Model Qualitatively

In [None]:
# Choose a few samples in the dataset as prompts to the reference model and the ppo model.
# Check their completions and compare the reward values given by the toxicity evaluator.
# NOTE: This section is not graded.