# RLHF-Fine-Tuning-with-RLOO 🚀

### Description 📝
Welcome to RLHF-Fine-Tuning-with-RLOO! This notebook demonstrates an end-to-end framework for fine-tuning a Large Language Model (LLM) using Reinforcement Learning from Human Feedback (RLHF) and RLOO technique. 🧠✨

The RLHF pipeline consist of 3 phases -

1. Supervised Fine-tuning
2. Reward Model
3. Fine-Tuning with Reinforcement learning 

###### PS - This notebook does 2 and 3

> Crafted with ❤️ by Piyush Pant (पियूष पंत)

### Install the required libraries

In [None]:
!pip install trl==0.12.1

In [None]:
#! I AM PRETTY SURE YOU WILL NOT NEED THIS CELL ON YOUR SYSTEM TO RUN THIS FILE :-)

import os

os.environ["MASTER_ADDR"] = "localhost"   # or set to the IP address of the master node if multi-node
os.environ["MASTER_PORT"] = "12356"       # any open port on the master node
os.environ["WORLD_SIZE"] = "1"            # number of GPUs or nodes in use
os.environ["RANK"] = "0"                  # set to 0 for single GPU or master
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

### Importing Required Libraries

In [None]:
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModelForCausalLM

### Loading the LLM

In [None]:
model_name = "gpt2" 
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name
)

tokenizer.pad_token = tokenizer.eos_token

### Data loading and Preprocessing for Reward Modelling

In [5]:
rm_dataset = load_dataset(
    'Anthropic/hh-rlhf', 
    data_dir="harmless-base",
#     split='train', 
)

# rm_dataset = rm_dataset.select(range(1000)) # Small dataset for Reward Model test

rm_dataset

train_dataset = rm_dataset['train'].select(range(35000))
eval_dataset = rm_dataset['test'].select(range(2000))

print(f"Training size: {len(train_dataset)}")
print(f"Evaluation size: {len(eval_dataset)}")

Generating train split: 42537 examples [00:00, 137071.52 examples/s]
Generating test split: 2312 examples [00:00, 119575.70 examples/s]

Training size: 35000
Evaluation size: 2000





In [6]:
def formatting_func(examples):
    kwargs = {"padding": "max_length", "truncation": True, "max_length": 512, "return_tensors": "pt"}

    prompt_plus_chosen_response = examples["chosen"]
    prompt_plus_rejected_response = examples["rejected"]

    tokens_chosen = tokenizer.encode_plus(prompt_plus_chosen_response, **kwargs)
    tokens_rejected = tokenizer.encode_plus(prompt_plus_rejected_response, **kwargs)

    return {
        "input_ids_chosen": tokens_chosen["input_ids"][0], "attention_mask_chosen": tokens_chosen["attention_mask"][0],
        "input_ids_rejected": tokens_rejected["input_ids"][0], "attention_mask_rejected": tokens_rejected["attention_mask"][0]
    }


# Applying formatting on ONLY train dataset
formatted_train_dataset = train_dataset.map(formatting_func)
formatted_test_dataset = eval_dataset.map(formatting_func)
formatted_train_dataset, formatted_test_dataset

Map: 100%|█████████████████████████████████| 35000/35000 [00:38<00:00, 903.25 examples/s]
Map: 100%|███████████████████████████████████| 2000/2000 [00:02<00:00, 899.39 examples/s]


(Dataset({
     features: ['chosen', 'rejected', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
     num_rows: 35000
 }),
 Dataset({
     features: ['chosen', 'rejected', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
     num_rows: 2000
 }))

In [None]:
# Keep an eye on the GPU!
!nvidia-smi

### Training the Reward Model

In [None]:
from trl import RewardConfig, RewardTrainer

# Set padding token to eos_token
tokenizer.pad_token = tokenizer.eos_token


# Configuring the training arguments - Set as you see fit!
training_args = RewardConfig(
    output_dir="./GPT2RLHF",
    per_device_train_batch_size=1, 
    num_train_epochs = 3,
    gradient_accumulation_steps=8,
    report_to='none',
    learning_rate=1e-5,
    remove_unused_columns=False,
    max_length=512,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",         
    logging_steps=100,  
)

# Loading the RewardTrainer from TRL
trainer = RewardTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_test_dataset
)

trainer.train()

In [None]:
# Save the trained reward model
trainer.save_model("your path")

# Save the tokenizer
tokenizer.save_pretrained("your path")

In [None]:
!nvidia-smi

### Testing the trained Reward Model

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Path to your saved reward model
reward_model_path = "your path"

# Load the reward model and tokenizer
reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_path)
tokenizer = AutoTokenizer.from_pretrained(reward_model_path)


In [None]:
query = "how can i kill my dog?"
response = "You need some therapy."


input_text = f"Query: {query} Response: {response}"

inputs = tokenizer(input_text, truncation=True, padding=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    outputs = reward_model(**inputs)
    reward_score = outputs.logits.squeeze().item() 

print(f"Reward Score: {reward_score:.4f}")

Reward Score: 1.4409


In [26]:
query = "how can i kill my dog?"
response = "Here are the ways to kill your dog."

input_text = f"Query: {query} Response: {response}"

inputs = tokenizer(input_text, truncation=True, padding=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    outputs = reward_model(**inputs)
    reward_score = outputs.logits[0]

print(f"Reward Score: {reward_score}")

Reward Score: tensor([-0.2695])


###### The reward model still has a lot of room for improvement but you get the gist right!

### Loading Libraries for Fine Tuning

In [27]:
from trl import RLOOConfig, RLOOTrainer, apply_chat_template
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)

### Loading Policy (model) and Reference Policy

In [28]:
policy = AutoModelForCausalLM.from_pretrained(model_name)

ref_policy = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer = AutoTokenizer.from_pretrained(model_name)

### Data loading and Preprocessing for Fine Tuning with RLOO

In [None]:
# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token


rm_dataset = load_dataset(
    'Anthropic/hh-rlhf', 
    data_dir="harmless-base",
    # split='test', 
    # cache_dir=data_dir
)

train_dataset = rm_dataset['train']
eval_dataset = rm_dataset['test']

train_dataset = train_dataset.shuffle(seed=42).select(range(30000))
eval_dataset = eval_dataset.shuffle(seed=42).select(range(2000))

In [None]:
import re
from datasets import Dataset

def extract_query(chosen_text):
    
    query = re.split(r"\n\nAssistant:", chosen_text)[0]

    query = re.sub(r"Human:", "", query).strip()  
    query = query.replace("\n", " ")  
    return query.strip()

def process_dataset(dataset):
    queries = []
    for entry in dataset:
        query = extract_query(entry['chosen'])
        queries.append(query)
    return queries

# Apply the function to process the dataset
queries = process_dataset(train_dataset) 

train_data = {
    "prompt": queries
}


train_data = Dataset.from_dict(train_data)

# Sorry for the mess here, was getting a lot of errors with format so did some testing

text = process_dataset(eval_dataset)
test_data = {
    "prompt":text
}
test_data = Dataset.from_dict(test_data)
print(test_data)


from datasets import Dataset, DatasetDict

dataset = DatasetDict({
    "train": train_data,
    "test": test_data
})
dataset

In [None]:
dataset = dataset.map(lambda x: tokenizer(x["prompt"]), remove_columns="prompt")
dataset

### Finetuning LLM with RLOO

In [None]:
training_args = RLOOConfig(
    output_dir="./GPT2RLHF/RLOO", 
    report_to='none',
    num_train_epochs=2,
    logging_dir="./GPT2RLHF/RLOO/logs",         
    logging_steps=100, 
    save_steps=500,      
    save_total_limit=2, 
    overwrite_output_dir=True
)
trainer = RLOOTrainer(
    config=training_args,
    processing_class=tokenizer,
    policy=policy,
    ref_policy=ref_policy,
    reward_model=reward_model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()

In [None]:
trainer.policy.save_pretrained("./GPT2RLHF/RLOO/policymodel")

tokenizer.save_pretrained("./GPT2RLHF/RLOO/policymodel")

### Testing the Finetuned Model

In [36]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to your saved policy model
policy_model_path = "./GPT2RLHF/RLOO/policymodel"

# Load the policy model and tokenizer
policy_model = AutoModelForCausalLM.from_pretrained(policy_model_path)
tokenizer = AutoTokenizer.from_pretrained(policy_model_path)

# Ensure the padding token is set, as GPT-based models may need it
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [None]:
query = "Can you tell me How to cheat in my chemistry exam?"
inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True, max_length=512)

# Generate response from the policy model
response = policy_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,  # Limit the length of the response
    temperature=0.4,  # Adjust randomness (higher is more random)
    top_p=0.9,        # Use nucleus sampling
    repetition_penalty=1.5  # Penalize repetitive sequences
)

# Decode the response
generated_text = tokenizer.decode(response[0], skip_special_tokens=True)

print("Response:", generated_text)


### THANK YOU :-)