In [1]:
%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch==1.13.1 \
    torchdata==0.5.1 --quiet

%pip install \
    transformers==4.27.2 \
    datasets==2.11.0 \
    evaluate==0.4.0 \
    rouge_score==0.1.2 \
    peft==0.3.0 --quiet

# Installing the Reinforcement Learning library directly from github.
%pip install git+https://github.com/lvwerra/trl.git@25fa1bd

[0mCollecting git+https://github.com/lvwerra/trl.git@25fa1bd
  Cloning https://github.com/lvwerra/trl.git (to revision 25fa1bd) to /tmp/pip-req-build-qqmdqtnl
  Running command git clone --filter=blob:none --quiet https://github.com/lvwerra/trl.git /tmp/pip-req-build-qqmdqtnl
[0m  Running command git checkout -q 25fa1bd
  Resolved https://github.com/lvwerra/trl.git to commit 25fa1bd
  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m

In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, GenerationConfig
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, TaskType

# trl: Transformer Reinforcement Learning library
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead
from trl import create_reference_model
from trl.core import LengthSampler

import torch
import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()

In [3]:
model_name="google/flan-t5-base"
huggingface_dataset_name = "knkarthick/dialogsum"

dataset_original = load_dataset(huggingface_dataset_name)

dataset_original

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 12460
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 1500
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 500
    })
})

In [4]:
def build_dataset(model_name,
                  dataset_name,
                  input_min_text_length,
                  input_max_text_length):

    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model_name (str): Tokenizer model name.
    - dataset_name (str): Name of the dataset to load.
    - input_min_text_length (int): Minimum length of the dialogues.
    - input_max_text_length (int): Maximum length of the dialogues.

    Returns:
    - dataset_splits (datasets.dataset_dict.DatasetDict): Preprocessed dataset containing train and test parts.
    """

    # load dataset (only "train" part will be enough for this lab).
    dataset = load_dataset(dataset_name, split="train")

    # Filter the dialogues of length between input_min_text_length and input_max_text_length characters.
    dataset = dataset.filter(lambda x: len(x["dialogue"]) > input_min_text_length and len(x["dialogue"]) <= input_max_text_length, batched=False)

    # Prepare tokenizer. Setting device_map="auto" allows to switch between GPU and CPU automatically.
    tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

    def tokenize(sample):

        # Wrap each dialogue with the instruction.
        prompt = f"""
Summarize the following conversation.

{sample["dialogue"]}

Summary:
"""
        sample["input_ids"] = tokenizer.encode(prompt)

        # This must be called "query", which is a requirement of our PPO library.
        sample["query"] = tokenizer.decode(sample["input_ids"])
        return sample

    # Tokenize each dialogue.
    dataset = dataset.map(tokenize, batched=False)
    dataset.set_format(type="torch")

    # Split the dataset into train and test parts.
    dataset_splits = dataset.train_test_split(test_size=0.2, shuffle=False, seed=42)

    return dataset_splits

dataset = build_dataset(model_name=model_name,
                        dataset_name=huggingface_dataset_name,
                        input_min_text_length=200,
                        input_max_text_length=1000)

print(dataset)



DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 8017
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic', 'input_ids', 'query'],
        num_rows: 2005
    })
})


In [7]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"\ntrainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [8]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                              torch_dtype=torch.bfloat16)




In [9]:
peft_model = PeftModel.from_pretrained(model,
                                       'z7ye/peft-dialogue-summary-checkpoint',
                                       lora_config=lora_config,
                                       torch_dtype=torch.bfloat16,
                                       device_map="auto",
                                       is_trainable=True)

print(f'PEFT model parameters to be updated:\n{print_number_of_trainable_model_parameters(peft_model)}\n')

PEFT model parameters to be updated:

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%



In [10]:
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(peft_model,
                                                               torch_dtype=torch.bfloat16,
                                                               is_trainable=True)

print(f'PPO model parameters to be updated (ValueHead + 769 params):\n{print_number_of_trainable_model_parameters(ppo_model)}\n')
print(ppo_model.v_head)

PPO model parameters to be updated (ValueHead + 769 params):

trainable model parameters: 3539713
all model parameters: 251117569
percentage of trainable model parameters: 1.41%

ValueHead(
  (dropout): Dropout(p=0.1, inplace=False)
  (summary): Linear(in_features=768, out_features=1, bias=True)
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [11]:
ref_model = create_reference_model(ppo_model)

print(f'Reference model parameters to be updated:\n{print_number_of_trainable_model_parameters(ref_model)}\n')

Reference model parameters to be updated:

trainable model parameters: 0
all model parameters: 251117569
percentage of trainable model parameters: 0.00%



## Reward model

In [12]:
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name, device_map="auto")
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name, device_map="auto")
print(toxicity_model.config.id2label)

{0: 'nothate', 1: 'hate'}


In [13]:
import torch

# Check if a GPU is available and set the device accordingly
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming `toxicity_model` is your model and `toxicity_tokenizer` is your tokenizer
# Move the model to the chosen device
toxicity_model = toxicity_model.to(device)

# Define the non-toxic text
non_toxic_text = "This is a harmless sentence."

# Tokenize the input text and move the input tensor to the same device as the model
toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids.to(device)

# Now, when you run your model, it should not raise the device mismatch error
logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

logits [not hate, hate]: [4.604450225830078, -4.116073131561279]


In [14]:
non_toxic_text = "#Person 1# tells Tommy that he didn't like the movie."

#toxicity_input_ids = toxicity_tokenizer(non_toxic_text, return_tensors="pt").input_ids

#logits = toxicity_model(input_ids=toxicity_input_ids).logits
#print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# get the logits for "not hate" - this is the reward!
not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (high): {nothate_reward}')

probabilities [not hate, hate]: [0.999836802482605, 0.0001631750346859917]
reward (high): [4.604450225830078]


Let's show a toxic comment.  This will have a low reward because it is more toxic.

In [15]:
import torch

# Check if a GPU is available and set the device accordingly
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming `toxicity_model` is your model and `toxicity_tokenizer` is your tokenizer
# Move the model to the chosen device
toxicity_model = toxicity_model.to(device)

# Define the toxic text
toxic_text = "#Person 1# tells Tommy that the movie was terrible, dumb and stupid."

# Tokenize the input text
toxicity_input_ids = toxicity_tokenizer(toxic_text, return_tensors="pt").input_ids

# Move the input tensor to the same device as the model
toxicity_input_ids = toxicity_input_ids.to(device)

# Now, when you run your model, it should not raise the device mismatch error
logits = toxicity_model(input_ids=toxicity_input_ids).logits
print(f'logits [not hate, hate]: {logits.tolist()[0]}')

# Print the probabilities for [not hate, hate]
probabilities = logits.softmax(dim=-1).tolist()[0]
print(f'probabilities [not hate, hate]: {probabilities}')

# Get the logits for "not hate" - this is the reward!
not_hate_index = 0
nothate_reward = (logits[:, not_hate_index]).tolist()
print(f'reward (low): {nothate_reward}')

logits [not hate, hate]: [-0.6921164393424988, 0.37227070331573486]
probabilities [not hate, hate]: [0.2564719617366791, 0.7435280084609985]
reward (low): [-0.6921164393424988]


In [16]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

device = 0 if torch.cuda.is_available() else "cpu"

toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"
tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name)
model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_name)

# Move the model to the appropriate device
if device == "cpu":
    model = model.to("cpu")
else:
    model = model.to(f"cuda:{device}")

sentiment_pipe = pipeline("text-classification",
                           model=model,
                           tokenizer=tokenizer,
                           device=device)

reward_logits_kwargs = {
    "top_k": None,  # Return all scores.
    "function_to_apply": "none",  # Set to "none" to retrieve raw logits.
    "batch_size": 16
}

reward_probabilities_kwargs = {
    "top_k": None,  # Return all scores.
    "function_to_apply": "softmax",  # Set to "softmax" to apply softmax and retrieve probabilities.
    "batch_size": 16
}

non_toxic_text = "This is a harmless sentence."
toxic_text = "#Person 1# tells Tommy that the movie was terrible, dumb and stupid."

print("Reward model output:")
print("For non-toxic text")
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))
print("For toxic text")
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

Reward model output:
For non-toxic text
[{'label': 'nothate', 'score': 4.604450225830078}, {'label': 'hate', 'score': -4.116073131561279}]
[{'label': 'nothate', 'score': 0.999836802482605}, {'label': 'hate', 'score': 0.00016317504923790693}]
For toxic text
[{'label': 'hate', 'score': 0.37227070331573486}, {'label': 'nothate', 'score': -0.6921164393424988}]
[{'label': 'hate', 'score': 0.7435280084609985}, {'label': 'nothate', 'score': 0.25647199153900146}]


In [17]:
print(sentiment_pipe(non_toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(non_toxic_text, **reward_probabilities_kwargs))

[{'label': 'nothate', 'score': 4.604450225830078}, {'label': 'hate', 'score': -4.116073131561279}]
[{'label': 'nothate', 'score': 0.999836802482605}, {'label': 'hate', 'score': 0.00016317504923790693}]


In [18]:
print(sentiment_pipe(toxic_text, **reward_logits_kwargs))
print(sentiment_pipe(toxic_text, **reward_probabilities_kwargs))

[{'label': 'hate', 'score': 0.37227070331573486}, {'label': 'nothate', 'score': -0.6921164393424988}]
[{'label': 'hate', 'score': 0.7435280084609985}, {'label': 'nothate', 'score': 0.25647199153900146}]


## Evaluate

In [22]:
import torch
from transformers import AutoTokenizer, RobertaForSequenceClassification
import evaluate

# Check for GPU availability
device = 0 if torch.cuda.is_available() else "cpu"

# Pre-trained model name for toxicity classification
toxicity_model_name = "facebook/roberta-hate-speech-dynabench-r4-target"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(toxicity_model_name)
model = RobertaForSequenceClassification.from_pretrained(toxicity_model_name)

# Move the model to the appropriate device (CPU or GPU)
if device == "cpu":
    model = model.to("cpu")
else:
    model = model.to(f"cuda:{device}")

# Load the toxicity evaluator from the "evaluate" library
toxicity_evaluator = evaluate.load("toxicity",
                                    module_type="measurement",
                                    task="hateXplain")

# Define sample texts - non-toxic and potentially toxic
non_toxic_text = "This is a harmless sentence."
toxic_text = "#Person 1# tells Tommy that the movie was terrible, dumb and stupid."

# Tokenize the texts using the loaded tokenizer
non_toxic_inputs = tokenizer(non_toxic_text, return_tensors="pt")
toxic_inputs = tokenizer(toxic_text, return_tensors="pt")

# Move the input tensors to the same device as the model
if device == "cpu":
    non_toxic_inputs = {k: v.to("cpu") for k, v in non_toxic_inputs.items()}
    toxic_inputs = {k: v.to("cpu") for k, v in toxic_inputs.items()}
else:
    non_toxic_inputs = {k: v.to(f"cuda:{device}") for k, v in non_toxic_inputs.items()}
    toxic_inputs = {k: v.to(f"cuda:{device}") for k, v in toxic_inputs.items()}

# Get model predictions with disabled gradient calculation for efficiency
with torch.no_grad():
    non_toxic_outputs = model(**non_toxic_inputs)
    toxic_outputs = model(**toxic_inputs)

# Convert logits to predicted labels
non_toxic_predictions = torch.argmax(non_toxic_outputs.logits, dim=1)
toxic_predictions = torch.argmax(toxic_outputs.logits, dim=1)

# Convert predicted labels to string format
non_toxic_predictions_str = [tokenizer.decode(p.item()) for p in non_toxic_predictions]
toxic_predictions_str = [tokenizer.decode(p.item()) for p in toxic_predictions]

# Evaluate the outputs using the loaded toxicity evaluator
non_toxic_eval = toxicity_evaluator.compute(predictions=non_toxic_predictions_str, references=non_toxic_text)
toxic_eval = toxicity_evaluator.compute(predictions=toxic_predictions_str, references=toxic_text)

print("Non-toxic text evaluation:")
print(non_toxic_eval)
print("\nToxic text evaluation:")
print(toxic_eval)




Non-toxic text evaluation:
{'toxicity': [0.0006544557982124388]}

Toxic text evaluation:
{'toxicity': [0.042242664843797684]}


In [23]:
toxicity_score = toxicity_evaluator.compute(predictions=[
    non_toxic_text
])

print("Toxicity score for non-toxic text:")
print(toxicity_score["toxicity"])

toxicity_score = toxicity_evaluator.compute(predictions=[
    toxic_text
])

print("\nToxicity score for toxic text:")
print(toxicity_score["toxicity"])

Toxicity score for non-toxic text:
[0.00016317519475705922]

Toxicity score for toxic text:
[0.7435289621353149]


In [24]:
def evaluate_toxicity(model,
                      toxicity_evaluator,
                      tokenizer,
                      dataset,
                      num_samples):

    """
    Preprocess the dataset and split it into train and test parts.

    Parameters:
    - model (trl model): Model to be evaluated.
    - toxicity_evaluator (evaluate_modules toxicity metrics): Toxicity evaluator.
    - tokenizer (transformers tokenizer): Tokenizer to be used.
    - dataset (dataset): Input dataset for the evaluation.
    - num_samples (int): Maximum number of samples for the evaluation.

    Returns:
    tuple: A tuple containing two numpy.float64 values:
    - mean (numpy.float64): Mean of the samples toxicity.
    - std (numpy.float64): Standard deviation of the samples toxicity.
    """

    max_new_tokens=100

    toxicities = []
    input_texts = []
    for i, sample in tqdm(enumerate(dataset)):
        input_text = sample["query"]

        if i > num_samples:
            break

        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids

        generation_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                             tok_k=0.0,
                                             top_p=1.0,
                                             do_sample=True)

        response_token_ids = model.generate(input_ids=input_ids,
                                            generation_config=generation_config)

        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)

        toxicity_score = toxicity_evaluator.compute(predictions=[(input_text + " " + generated_text)])

        toxicities.extend(toxicity_score["toxicity"])

    # Compute mean & std using np.
    mean = np.mean(toxicities)
    std = np.std(toxicities)

    return mean, std

And now perform the calculation of the model toxicity before fine-tuning/detoxification:

In [25]:
tokenizer = AutoTokenizer.from_pretrained(model_name, device_map="auto")

mean_before_detoxification, std_before_detoxification = evaluate_toxicity(model=ref_model,
                                                                          toxicity_evaluator=toxicity_evaluator,
                                                                          tokenizer=tokenizer,
                                                                          dataset=dataset["test"],
                                                                          num_samples=10)

print(f'toxicity [mean, std] before detox: [{mean_before_detoxification}, {std_before_detoxification}]')

11it [01:08,  6.20s/it]

toxicity [mean, std] before detox: [0.03603109280811623, 0.03913682690751614]





## PPO TRAINER

In [None]:
def collator(data):
    return dict((key, [d[key] for d in data]) for key in data[0])

test_data = [{"key1": "value1", "key2": "value2", "key3": "value3"}]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

Collator input: [{'key1': 'value1', 'key2': 'value2', 'key3': 'value3'}]
Collator output: {'key1': ['value1'], 'key2': ['value2'], 'key3': ['value3']}


In [27]:
learning_rate=1.41e-5
max_ppo_epochs=1
mini_batch_size=4
batch_size=16

config = PPOConfig(
    model_name=model_name,
    learning_rate=learning_rate,
    ppo_epochs=max_ppo_epochs,
    mini_batch_size=mini_batch_size,
    batch_size=batch_size
)

ppo_trainer = PPOTrainer(config=config,
                         model=ppo_model,
                         ref_model=ref_model,
                         tokenizer=tokenizer,
                         dataset=dataset["train"],
                         data_collator=collator)

##  Fine-Tune the Model

In [28]:
output_min_length = 100
output_max_length = 400
output_length_sampler = LengthSampler(output_min_length, output_max_length)

generation_kwargs = {
    "min_length": 5,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True
}

reward_kwargs = {
    "top_k": None, # Return all scores.
    "function_to_apply": "none", # You want the raw logits without softmax.
    "batch_size": 16
}

max_ppo_steps = 10

for step, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    # Break when you reach max_steps.
    if step >= max_ppo_steps:
        break

    prompt_tensors = batch["input_ids"]

    # Get response from FLAN-T5/PEFT LLM.
    summary_tensors = []

    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()

        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)

        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    # This needs to be called "response".
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in summary_tensors]

    # Compute reward outputs.
    query_response_pairs = [q + r for q, r in zip(batch["query"], batch["response"])]
    rewards = sentiment_pipe(query_response_pairs, **reward_kwargs)

    # You use the `nothate` item because this is the score for the positive `nothate` class.
    reward_tensors = [torch.tensor(reward[not_hate_index]["score"]) for reward in rewards]

    # Run PPO step.
    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)

    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')
    print('-'.join('' for x in range(100)))

0it [00:00, ?it/s]You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
1it [00:26, 26.57s/it]

objective/kl: 33.88230895996094
ppo/returns/mean: -0.7903618812561035
ppo/policy/advantages_mean: -5.413176573654255e-09
---------------------------------------------------------------------------------------------------


2it [01:05, 34.04s/it]

objective/kl: 31.419647216796875
ppo/returns/mean: -0.6617884039878845
ppo/policy/advantages_mean: -1.4894410149679516e-09
---------------------------------------------------------------------------------------------------


3it [01:30, 29.80s/it]

objective/kl: 31.082683563232422
ppo/returns/mean: -0.7173729538917542
ppo/policy/advantages_mean: 7.628644382862149e-09
---------------------------------------------------------------------------------------------------


4it [01:50, 25.81s/it]

objective/kl: 21.914356231689453
ppo/returns/mean: -0.20917607843875885
ppo/policy/advantages_mean: -8.551532815204155e-09
---------------------------------------------------------------------------------------------------


5it [02:15, 25.53s/it]

objective/kl: 26.173057556152344
ppo/returns/mean: -0.3390524387359619
ppo/policy/advantages_mean: 5.719476448007299e-09
---------------------------------------------------------------------------------------------------


6it [02:39, 24.90s/it]

objective/kl: 31.201862335205078
ppo/returns/mean: -0.6619513630867004
ppo/policy/advantages_mean: -2.3052741937590326e-08
---------------------------------------------------------------------------------------------------


7it [03:03, 24.88s/it]

objective/kl: 33.2901611328125
ppo/returns/mean: -0.8298996686935425
ppo/policy/advantages_mean: -4.608569970088183e-09
---------------------------------------------------------------------------------------------------


8it [03:27, 24.35s/it]

objective/kl: 27.60460662841797
ppo/returns/mean: -0.5186558365821838
ppo/policy/advantages_mean: -1.6903483057717494e-09
---------------------------------------------------------------------------------------------------


9it [03:48, 23.55s/it]

objective/kl: 27.871986389160156
ppo/returns/mean: -0.5890558958053589
ppo/policy/advantages_mean: -4.94391727556831e-09
---------------------------------------------------------------------------------------------------


10it [04:08, 24.87s/it]

objective/kl: 23.12503433227539
ppo/returns/mean: -0.18833088874816895
ppo/policy/advantages_mean: 3.1327278549042603e-09
---------------------------------------------------------------------------------------------------





## Evaluate the Model Quantitatively


In [52]:
def evaluate_toxicity(model,
                      toxicity_evaluator,
                      tokenizer,
                      dataset,
                      num_samples):

    # Get the device from one of the model's parameters
    device = next(model.parameters()).device

    max_new_tokens = 100

    toxicities = []
    input_texts = []
    for i, sample in tqdm(enumerate(dataset)):
        input_text = sample["query"]

        if i > num_samples:
            break

        input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids.to(device)

        generation_config = GenerationConfig(max_new_tokens=max_new_tokens,
                                             tok_k=0.0,
                                             top_p=1.0,
                                             do_sample=True)

        response_token_ids = model.generate(input_ids=input_ids,
                                            generation_config=generation_config)

        generated_text = tokenizer.decode(response_token_ids[0], skip_special_tokens=True)

        toxicity_score = toxicity_evaluator.compute(predictions=[(input_text + " " + generated_text)])

        toxicities.extend(toxicity_score["toxicity"])

    # Compute mean & std using np.
    mean = np.mean(toxicities)
    std = np.std(toxicities)

    return mean, std

# Use the corrected evaluate_toxicity function
mean_after_detoxification, std_after_detoxification = evaluate_toxicity(model=ppo_model,
                                                                        toxicity_evaluator=toxicity_evaluator,
                                                                        tokenizer=tokenizer,
                                                                        dataset=dataset["test"],
                                                                        num_samples=10)

print(f'toxicity [mean, std] after detox: [{mean_after_detoxification}, {std_after_detoxification}]')

11it [00:36,  3.35s/it]

toxicity [mean, std] after detox: [0.03264276627239517, 0.04217520740045629]





And compare the toxicity scores of the reference model (before detoxification) and fine-tuned model (after detoxification).

In [53]:
mean_improvement = (mean_before_detoxification - mean_after_detoxification) / mean_before_detoxification
std_improvement = (std_before_detoxification - std_after_detoxification) / std_before_detoxification

print(f'Percentage improvement of toxicity score after detoxification:')
print(f'mean: {mean_improvement*100:.2f}%')
print(f'std: {std_improvement*100:.2f}%')

Percentage improvement of toxicity score after detoxification:
mean: 9.40%
std: -7.76%


## Evaluate the Model Qualitatively


In [54]:
batch_size = 20
compare_results = {}

df_batch = dataset["test"][0:batch_size]

compare_results["query"] = df_batch["query"]
prompt_tensors = df_batch["input_ids"]

summary_tensors_ref = []
summary_tensors = []

# Get response from ppo and base model.
for i in tqdm(range(batch_size)):
    gen_len = output_length_sampler()
    generation_kwargs["max_new_tokens"] = gen_len

    summary = ref_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors_ref.append(summary)

    summary = ppo_model.generate(
        input_ids=torch.as_tensor(prompt_tensors[i]).unsqueeze(dim=0).to(device),
        **generation_kwargs
    ).squeeze()[-gen_len:]
    summary_tensors.append(summary)

# Decode responses.
compare_results["response_before"] = [tokenizer.decode(summary_tensors_ref[i]) for i in range(batch_size)]
compare_results["response_after"] = [tokenizer.decode(summary_tensors[i]) for i in range(batch_size)]

# Sentiment analysis of query/response pairs before/after.
texts_before = [d + s for d, s in zip(compare_results["query"], compare_results["response_before"])]
rewards_before = sentiment_pipe(texts_before, **reward_kwargs)
compare_results["reward_before"] = [reward[not_hate_index]["score"] for reward in rewards_before]

texts_after = [d + s for d, s in zip(compare_results["query"], compare_results["response_after"])]
rewards_after = sentiment_pipe(texts_after, **reward_kwargs)
compare_results["reward_after"] = [reward[not_hate_index]["score"] for reward in rewards_after]

100%|██████████| 20/20 [00:49<00:00,  2.49s/it]


In [55]:
pd.set_option('display.max_colwidth', 500)
df_compare_results = pd.DataFrame(compare_results)
df_compare_results["reward_diff"] = df_compare_results['reward_after'] - df_compare_results['reward_before']
df_compare_results_sorted = df_compare_results.sort_values(by=['reward_diff'], ascending=False).reset_index(drop=True)
df_compare_results_sorted

Unnamed: 0,query,response_before,response_after,reward_before,reward_after,reward_diff
0,"Summarize the following conversation. #Person1#: Hello? #Person2#: Hello? #Person1#: Can I speak to Li Hong, please? #Person2#: Speaking. #Person1#: Hi, Li Hong. This is Alice. #Person2#: Hi, Alice. How are you? #Person1#: Not bad. Li Hong, I am sorry that I can't go to see Mrs. Brown with you tomorrow morning. My mother is ill. I must take care of her. #Person2#: I'm sorry to hear that. You'd better stay at home. After all, we can visit Mrs. Brown later #Person1#: OK. Bye - bye. #Person2#: ...",<pad> Alice is not going to see Mrs. Brown with Li Hong tomorrow morning because her mother is sick. Li Hong apologizes. They will visit Mrs. Brown later.</s>,<pad> Alice could not see Mrs. Brown tomorrow morning as Alice's mother is ill.</s>,1.58157,2.710231,1.128661
1,"Summarize the following conversation. #Person1#: Judy, what is everybody talking about? #Person2#: Haven't you heard? Richard was fired by our manager. #Person1#: You're kidding. It can't be true. #Person2#: Believe it or not. Everybody is talking about it in the company. #Person1#: Really? I'm surprised. #Person2#: Me too. Summary: </s>",<pad> Judy criticizes Rich because Richard was fired and shows him his boss.</s>,<pad> Judy and Judy find it out that Richard was fired by their manager and that everybody knows about it. Judy thinks it's great.</s>,1.078422,1.707718,0.629296
2,"Summarize the following conversation. #Person1#: Hello. I want to reconfirm our flight to London. #Person2#: Yes, sir. Did you call the airline? #Person1#: Yes, I did. But I couldn't communicate with them in English. They speak only Spanish. So I need your help. #Person2#: Certainly, sir. What is the flight number and when are you leaving? #Person1#: We are taking IB 385 to London tomorrow at 1 p. m. #Person2#: Oh, I see, sir. We have the airline office inside the hotel. They have an English...","<pad> #Person2#'ll help #Person1# reconfirm their flight to London. #Person1# calls the airline. #Person2# gives #Person1# the flight number, and reminds she'll call 35 to confirm her flight.</s>",<pad> #Person1# asks #Person2# for help in confirming a flight to London tomorrow noon at the IB 385.</s>,1.666884,2.176962,0.510078
3,"Summarize the following conversation. #Person1#: I'd like to have this cashed, please. #Person2#: Please put you name and address here. May I see your passport? #Person1#: Yes. #Person2#: How would you like it? #Person1#: Ten hundreds and ten twenties, and the rest in small change, please. #Person2#: OK. Here you are. Summary: </s>",<pad> @pandeleting and enabling #Person1#'s passport to cash him.</s>,<pad> #Person1#'d like to have this cashed ten hundred dollars with the rest in small change.</s>,1.476176,1.837705,0.361529
4,"Summarize the following conversation. #Person1#: Mom, I just finished my paper. Can you proofread it before I hand it in? #Person2#: Sure, let's take a look. Sweetie, this is terrific. Your ideas are so original. #Person1#: Thanks. #Person2#: I can tell you worked hard on it. #Person1#: I really did! I started thinking about what I wanted to say three weeks ago. #Person2#: Well, it was definitely worth all the time. #Person1#: Let's just hope my teacher agrees. Summary: </s>","<pad> #Person1# tells mom she has just finished her paper and recommends those ideas because #Person1# told her she learned to make ""all the time"".</s>",<pad> #Person1# says her mother believes her original paper was really worth it. #Person1#'s teacher accepts the classic paper.</s>,2.028174,2.329811,0.301637
5,"Summarize the following conversation. #Person1#: Here is the final draft of our contract. I'm glad that we have reached an agreement on almost every term in our trade. #Person2#: Yes, it seems to me we have come quite a long way. However, let me take a close look at the final draft. #Person1#: Do you have some points to bring up? #Person2#: Well, everything we've discussed seems to be here. #Person1#: Yes, including a description of the shirts you want to purchase this time, the total amount...",<pad> #Person2# is glad everyone has reached an agreement on almost every term in their trade. #Person1# tells #Person2# the terms of the contract are there. #Person2# wants to sign the contrat because the sample 26 is the standard for others.</s>,<pad> #Person1# and #Person2# discuss the final draft of the contract. They focus on details at once on each clause in the contract and check over the quality standards. They are still evaluating the contract.</s>,3.252924,3.544171,0.291246
6,"Summarize the following conversation. #Person1#: Where shall I register, please? #Person2#: Here. Do you have a registration card? #Person1#: Yes. Here you are. #Person2#: Please register your information here and pay for it. And I'll make a medical record for you. #Person1#: OK. How much do I need to pay for the registration? #Person2#: Please pay ten yuan for the registration. #Person1#: Here is my money. #Person2#: This is your registration card. Please don't lose it and bring it whenever...","<pad> #Person2# asks #Person1# if #Person1# will be registered and pays ten yuan for the registration, #Person1# will make a medical record for @drugstore and help @drugstore to work.</s>",<pad> #Person1# asks #Person2# for directions about Center for Academic Medicine. #Person2# gives #Person1# access to the Dr. Merde and any other hospital's database.</s>,1.516118,1.783075,0.266957
7,"Summarize the following conversation. #Person1#: Oh, my God! What's this? #Person2#: What? #Person1#: Look! This window is open. #Person2#: Did you open it before we left? #Person1#: Are you kidding? It's winter. Why would I open it? #Person2#: I don't know. Wait. Is this yours? #Person1#: No! Oh, my God! Someone has broken into the house. #Person2#: It looks that way. That's probably why the door wasn't locked when we came in. #Person1#: I locked it when I left though. #Person2#: Yes, but t...","<pad> Allen wonders how an unnamed person broke into the house. Allen tells #Person1# and #Person2# that the robber broke in because #Person1# thought it was winter about it. Allen thinks someone stole the TV, but #2 suggests they look upstairs again.</s>",<pad> Allen and #Person1# talking about a broken door. Allen notices another robber broke his house window but exits one place. They decide to go upstairs.</s>,1.97235,2.218864,0.246514
8,"Summarize the following conversation. #Person1#: I'm forming a music band. #Person2#: Do you already know how to play an instrument? #Person1#: Uh... Yeah! I'Ve told you a thousand times that I'm learning to play the drums. Now that I know how to play well, I would like to form a rock band. #Person2#: Aside from yourself, who are the other members of the band? #Person1#: We have a guy who plays guitar, and another who plays bass. Although we still haven't found anyone to be our singer. You t...","<pad> #Person1# tells #Person2# #Person1# is forming a music band and the members of the band haven't found anyone to be the singer. #Person2# will audition for the band, where to keep the amplifiers, microphones or even the drums at #Person1#'s house.</s>",<pad> #Person1# wants to form a rock band. #Person1# tells #Person2# about the lack of room for the instruments. #Person2# invites #Person1# to audition this weekend at #Person1#'s house due to their musical talent.</s>,2.611892,2.807337,0.195445
9,"Summarize the following conversation. #Person1#: It smells like an ashtray in here! #Person2#: Hi honey! What's wrong? Why do you have that look on your face? #Person1#: What's wrong? I thought we agreed that you were gonna quit smoking. #Person2#: No! I said I was going to cut down which is very different. You can't just expect me to go cold turkey overnight! #Person1#: Look, there are other ways to quit. You can try the nicotine patch, or nicotine chewing gum. We spend a fortune on cigaret...","<pad> The hair aging is because #Person2# can't find the willpower, but #Person1# suggests #Person2# try nicotine patches or nicotine chewing gum, but #Person2# is on the edge of quitting because she's trying to save money despite having the willpower to quit.</s>",<pad> #Person1# tells honey #Person2# has the smell of an ashtray but Honey is not willing to quit. They agree more alternatives can be found.</s>,1.455218,1.629175,0.173957


In [56]:
import pandas as pd
df_compare_results_sorted.to_csv('google.csv', index=False)