# Reinforcement Learning from AI Feedback (RLAIF)

## Enhancing T5-Base Summarization with Proximal Policy Optimization (PPO) and PEFT Fine-Tuning

In [2]:
!pip install -q torch
!pip install -q transformers
!pip install -q datasets
!pip install -q trl
!pip install -q peft
!pip install -q numpy
!pip install -q pandas
!pip install -q tqdm
!pip install -q openai
!pip install -q wandb
!pip install -U -q sentencepiece

In [3]:
import torch

from transformers import AutoModelForSequenceClassification, AutoTokenizer, T5Tokenizer, T5ForConditionalGeneration

from torch.utils.data import DataLoader, Dataset as TorchDataset
from torch.optim import AdamW

from datasets import load_dataset, Dataset as HFDataset

from peft import PeftModel, PeftConfig,  TaskType

from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    PeftType,
    LoraConfig,
)

# AutoModelForCausalLMWithValueHead & AutoModelForSeq2SeqLMWithValueHead: A transformer model with an additional scalar output for each token which can be used as a value function in reinforcement learning.
# https://huggingface.co/docs/trl/models#trl.AutoModelForSeq2SeqLMWithValueHead

# trl: Transformer Reinforcement Learning library
import trl
from trl import PPOTrainer, PPOConfig, AutoModelForSeq2SeqLMWithValueHead # https://huggingface.co/docs/trl/quickstart
from trl import create_reference_model
from trl.core import LengthSampler

# import evaluate

import numpy as np
import pandas as pd

# tqdm library makes the loops show a smart progress meter.
from tqdm import tqdm
tqdm.pandas()


RuntimeError: ignored

In [None]:
# Run on the GPU when CUDA is available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Drop cached CUDA allocations before the models are loaded.
torch.cuda.empty_cache()

In [None]:
import os
import getpass

# Reuse a key that is already exported in the environment; only prompt (without echo)
# when none is set, so a full re-run does not block on input unnecessarily.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API Key: ")
os.environ["OPENAI_API_KEY"] = openai_api_key
# SECURITY: never paste a real key into this notebook, not even inside a comment.
# Any key that has ever been saved in this file must be treated as leaked and revoked.

In [None]:
# Fetch the `test` split of the CarperAI/openai_summarize_comparisons dataset.
orig_dataset = load_dataset(
    "CarperAI/openai_summarize_comparisons",
    split="test",
)

In [None]:
# Display one raw record to inspect the fields the dataset provides.
orig_dataset[10000]

# RLAIF Fine-Tuning

In [None]:
# Architecture name (handed to PPOConfig later) and the Hub checkpoint that
# provides the starting policy weights and tokenizer.
policy_model_name = "t5-base"
policy_model_path = "JuanKO/rlhf_base_model"

# Load the seq2seq policy and move it to the selected device in one go.
policy_model = T5ForConditionalGeneration.from_pretrained(policy_model_path).to(device)
policy_tokenizer = T5Tokenizer.from_pretrained(policy_model_path)

In [None]:
# LoRA hyper-parameters, gathered in one place so they are easy to scan and tweak.
lora_settings = dict(
    r=8,                              # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.10,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,  # T5
)
lora_config = LoraConfig(**lora_settings)

# Wrap the policy with the LoRA adapters and keep it on the selected device.
policy_peft_model = get_peft_model(policy_model, lora_config)
policy_peft_model.to(device)

In [None]:
# Report the trainable-parameter summary of the LoRA-wrapped policy.
policy_peft_model.print_trainable_parameters()

In [None]:
# https://huggingface.co/docs/trl/quickstart
# Wrap the LoRA policy in TRL's seq2seq model with a value head, as required for PPO.
value_head_kwargs = {
    "torch_dtype": torch.bfloat16,
    "is_trainable": True,
}
ppo_model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(policy_peft_model, **value_head_kwargs)

ppo_model.to(device)

In [None]:
# Build the frozen reference from the value-head wrapper that PPO actually trains,
# so the KL penalty compares like with like. The bare `policy_model` has no value head.
# NOTE(review): `create_reference_model` is documented as taking a TRL model wrapper —
# confirm against the installed TRL version.
ref_model = create_reference_model(ppo_model)
ref_model.to(device)

In [None]:
# Load the dataset
orig_dataset = load_dataset('CarperAI/openai_summarize_comparisons', split='test')

# Tunable limits for building the PPO dataset.
MAX_PROMPT_WORDS = 450  # keep prompts with at most this many whitespace-separated words
NUM_SAMPLES = 2000      # upper bound on the number of examples kept after shuffling

# Keep only prompts that are short enough (length is counted in words, not characters).
filtered_dataset = orig_dataset.filter(
    lambda example: len(example['prompt'].split()) <= MAX_PROMPT_WORDS
)

# Shuffle deterministically and keep at most NUM_SAMPLES rows. The min() guard avoids an
# out-of-range selection when the filter leaves fewer rows than requested.
num_selected = min(NUM_SAMPLES, len(filtered_dataset))
shuffled_dataset = filtered_dataset.shuffle(seed=42).select(range(num_selected))

# Extract the desired features. "chosen" is renamed to "response" to follow the
# column naming used with the PPO trainer.
new_dataset_dict = {
    "prompt": shuffled_dataset["prompt"],
    "response": shuffled_dataset["chosen"]
}

# Convert the dictionary to a new Dataset
dataset = HFDataset.from_dict(new_dataset_dict)

# Split the new dataset into train_dataset and eval_dataset (contiguous ranges of the
# already-shuffled rows).
split_ratio = 0.8  # 80% for training, 20% for evaluation
num_train_samples = int(split_ratio * len(dataset))
train_dataset = dataset.select(range(num_train_samples))
eval_dataset = dataset.select(range(num_train_samples, len(dataset)))

In [None]:
# Show the column names of the first row of each split.
for split_dataset in (train_dataset, eval_dataset):
    print(split_dataset[0].keys())

In [None]:
from transformers import T5Tokenizer

# Tokenize prompts with the policy's own tokenizer. The PPO trainer and the training
# loop decode with `policy_tokenizer`, so encoding with a tokenizer loaded from a
# different checkpoint ("t5-small") risks mismatched ids.
tokenizer = policy_tokenizer

def tokenize_function(example):
    """Tokenize one example's prompt for PPO.

    Args:
        example: mapping with a ``"prompt"`` string and a ``"response"`` string.

    Returns:
        dict with ``"input_ids"`` (flat list of token ids for the prompt, truncated to
        at most 1024 tokens) and the untouched ``"response"``.
    """
    # Ask the tokenizer for plain Python ids instead of a (1, n) tensor that then has
    # to be squeezed: squeeze() turns a one-token encoding into a 0-d scalar, and the
    # training loop already treats `input_ids` as plain lists.
    encoded = tokenizer(example["prompt"], truncation=True, max_length=1024)
    return {
        "input_ids": encoded["input_ids"],
        "response": example["response"],
    }

# Run the per-example tokenizer over the training split, then the evaluation split.
train_dataset, eval_dataset = (
    split_dataset.map(tokenize_function, batched=False)
    for split_dataset in (train_dataset, eval_dataset)
)


In [None]:
# Sanity check: inspect one training example after tokenization.
print(train_dataset[0])  # print the first example from the training dataset

In [None]:
def collator(data):
    """Transpose a list of per-example dicts into one dict of per-key lists.

    The key set is taken from the first example; every example must contain those keys.
    """
    first_example = data[0]
    return {key: [example[key] for example in data] for key in first_example}

# Toy input that makes the transposition performed by the collator visible.
test_data = [
    {"key1": "value1", "key2": "value2", "key3": "value3"},
    {"key1": "value4", "key2": "value5", "key3": "value6"},
]
print(f'Collator input: {test_data}')
print(f'Collator output: {collator(test_data)}')

# Same check on real data: collate the first three training examples.
sample_data = [train_dataset[idx] for idx in range(3)]
collated_data = collator(sample_data)
print(collated_data.keys())

In [None]:
# PPO hyper-parameters; these are passed to PPOConfig.
learning_rate = 1e-4
batch_size = 8
mini_batch_size = 2
max_ppo_epochs = 5

In [None]:
# Check out https://huggingface.co/docs/trl/trainer
# Collect the PPO settings defined above into a single config object.
config = PPOConfig(
    model_name=policy_model_name,
    batch_size=batch_size,
    mini_batch_size=mini_batch_size,
    ppo_epochs=max_ppo_epochs,
    learning_rate=learning_rate,
)

In [None]:
# Check out https://huggingface.co/docs/trl/trainer
# Everything the trainer needs: config, trainable policy, frozen reference,
# tokenizer, training data and the batch collator.
trainer_kwargs = dict(
    config=config,
    model=ppo_model,
    ref_model=ref_model,
    tokenizer=policy_tokenizer,
    dataset=train_dataset,
    data_collator=collator,
)
ppo_trainer = PPOTrainer(**trainer_kwargs)

In [None]:
# Bounds handed to the sampler that picks `max_new_tokens` for each generated summary.
output_min_length, output_max_length = 128, 2048
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Sampling settings forwarded to the policy's generate call for every completion.
generation_kwargs = dict(
    temperature=0.5,
    min_length=5,
    top_k=0.0,
    top_p=1.0,
    do_sample=True,
)

# Upper bound on the number of PPO steps executed by the training loop.
max_ppo_steps = 256

In [None]:
import openai
import re

# System prompt for the judge model (text kept exactly as sent to the API).
_JUDGE_SYSTEM_PROMPT = (
    "You are an expert in text summarization. Below, you are given the full text and its summarization.\n"
    "  Your role is to rate the provided summarization with scores ranging from 0 to 1, where: 0 is the lowest score, 1 is the highest score.\n"
    "  Your response should only be a double precision number that represents the scoring rate.\n"
    "  "
)

# First well-formed decimal or integer in the reply, with an optional sign.
_SCORE_PATTERN = re.compile(r"[-+]?(?:\d+\.\d+|\.\d+|\d+)")


def _parse_score(reply):
    """Extract the first number from the judge's reply and clamp it to [0, 1]."""
    match = _SCORE_PATTERN.search(reply)
    if match is None:
        # Fail with a message that shows the offending reply instead of a bare IndexError.
        raise ValueError(f"Could not find a numeric score in the judge reply: {reply!r}")
    # The judge is asked for a value in [0, 1]; clamp so a stray "8" or "-1" cannot
    # turn into an out-of-range reward.
    return min(1.0, max(0.0, float(match.group())))


def score_summaries(full_text, summarized_text):
    """Ask the GPT judge to rate a summary of ``full_text``.

    Args:
        full_text: the source text that was summarized.
        summarized_text: the candidate summary to rate.

    Returns:
        float score in [0, 1] parsed from the judge's reply.

    Raises:
        ValueError: if the reply contains no number.
    """
    prompt = f"### FULL TEXT:\n {full_text} \n\n  ### SUMMARIZED TEXT: \n {summarized_text}"

    response = openai.ChatCompletion.create(
        temperature=0.,  # deterministic scoring
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": _JUDGE_SYSTEM_PROMPT},
            {"role": "user", "content": prompt},
        ],
        # NOTE(review): if this timeout is expressed in seconds, 60000 is ~16.7 hours —
        # confirm the intended unit.
        request_timeout=60000,
    )

    reply = response['choices'][0]['message']['content']
    return _parse_score(reply)

In [None]:
# Display only the prompt text of raw record 10000.
orig_dataset[10000]['prompt']

In [None]:
import time

# Per-step training statistics, kept for inspection and plotting after the run.
objective_kl    = []
returns_mean    = []
advantages_mean = []

start = time.time()

for step, batch in enumerate(ppo_trainer.dataloader):

    if step >= max_ppo_steps:  # Break when we reach max_steps.
        break

    # One 1-D tensor of prompt token ids per example. `as_tensor` accepts both the
    # plain lists produced by the collator and ready-made tensors, so no padding or
    # second conversion is needed (each prompt is generated from individually).
    prompt_tensors = [torch.as_tensor(ids, device=device) for ids in batch["input_ids"]]

    # Decode EVERY prompt in the batch; the judge needs the full text that matches
    # each summary, without special tokens.
    prompts = [policy_tokenizer.decode(ids, skip_special_tokens=True) for ids in prompt_tensors]

    # Generate one summary per prompt with a freshly sampled length budget.
    summary_tensors = []
    for prompt_tensor in prompt_tensors:
        max_new_tokens = output_length_sampler()
        generation_kwargs["max_new_tokens"] = max_new_tokens
        summary = ppo_trainer.generate(prompt_tensor, **generation_kwargs)
        summary_tensors.append(summary.squeeze()[-max_new_tokens:])

    batch["response"] = [
        policy_tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
        for summary_ids in summary_tensors
    ]

    # Reward = judge score for each (prompt, its own summary) pair.
    reward_tensors = [
        torch.tensor(score_summaries(prompt, summary))
        for prompt, summary in zip(prompts, batch["response"])
    ]

    stats = ppo_trainer.step(prompt_tensors, summary_tensors, reward_tensors)
    ppo_trainer.log_stats(stats, batch, reward_tensors)

    # NOTE(review): per the TRL docs, objective/kl is the KL divergence between the
    # trained policy and the reference model — confirm for the installed version.
    print(f'objective/kl: {stats["objective/kl"]}')
    print(f'ppo/returns/mean: {stats["ppo/returns/mean"]}')  # Average return; higher is better.
    print(f'ppo/policy/advantages_mean: {stats["ppo/policy/advantages_mean"]}')  # How much better an action is than the average action in that state.
    print(f'STEP: {step}')

    objective_kl.append(stats["objective/kl"])
    returns_mean.append(stats["ppo/returns/mean"])
    advantages_mean.append(stats["ppo/policy/advantages_mean"])

    print('-' * 99)  # visual separator between steps

end = time.time()
print(f'TIME: {end - start}')

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Data for plotting: the mean return recorded at each PPO step, against its index.
mean_returns = np.array(returns_mean)
step_index = range(len(returns_mean))

fig, ax = plt.subplots()
ax.plot(step_index, mean_returns)
ax.set_xlabel('episodes')
ax.set_ylabel('mean return')
ax.set_title('Policy optimization')

# Keep a copy of the figure on disk, then render it inline.
fig.savefig("test.png")
plt.show()

## Saving the Model and Tokenizer

After the fine-tuning process, it's crucial to save the model's weights and the tokenizer's configuration for future use, whether it's for inference, further training, or sharing with the community.

### 1. Saving the Model

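To preserve the state of your model post-training and share it, call the `push_to_hub` method on both the trained model and the tokenizer so that they are uploaded to the same Hugging Face Hub repository: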


In [None]:
# SECURITY: the Hub write token is never stored in the notebook. It is read from the
# HF_TOKEN environment variable, or prompted for without echo when that is not set.
# Any token that was ever saved in this file must be treated as leaked and revoked.
hf_token = os.environ.get("HF_TOKEN") or getpass.getpass("Enter your Hugging Face write token: ")
hub_repo_id = 'PanoEvJ/T5_summarization_RLAIF'

# Upload the trained model and its tokenizer to the same repository.
ppo_trainer.model.push_to_hub(hub_repo_id, token=hf_token)
policy_tokenizer.push_to_hub(hub_repo_id, token=hf_token)

In [None]:
# Per-step `objective/kl` values collected during training.
objective_kl

In [None]:
# Per-step `ppo/returns/mean` values collected during training.
returns_mean

In [None]:
# Per-step `ppo/policy/advantages_mean` values collected during training.
advantages_mean