<a href="https://colab.research.google.com/github/Papa-Panda/Paper_reading/blob/main/RLHF_instruct_GPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Supervised Fine-Tuning (SFT): Train a base language model on labeled datasets to perform specific instructions.

In [6]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load pre-trained model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Mock dataset: Instruction-Output pairs
dataset = [
    {"instruction": "Translate 'Hello' to Spanish.", "output": "Hola"},
    {"instruction": "What is 2 + 2?", "output": "4"},
    {"instruction": "Write a short poem about the moon.", "output": "The moon glows bright, in the silent night."}
]


# Preprocess dataset
# Set the pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token
def preprocess(data):
    inputs = [f"Instruction: {d['instruction']}\nOutput:" for d in data]
    labels = [f"{d['output']}" for d in data]
    tokenized = tokenizer(inputs, padding=True, truncation=True, return_tensors="pt")
    tokenized_labels = tokenizer(labels, padding=True, truncation=True, return_tensors="pt").input_ids
    tokenized['labels'] = tokenized_labels
    return tokenized

tokenized_dataset = preprocess(dataset)

# Fine-tuning settings
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=1000,
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


KeyError: 'Invalid key. Only three types of key are available: (1) string, (2) integers for backend Encoding, and (3) slices for data subsetting.'

In [2]:
# Reward Model Training (RMT): Use human-labeled rankings of model outputs to train a reward model that scores outputs based on their alignment with instructions.

In [5]:
import numpy as np

# Mock reward data: Ranked outputs
reward_data = [
    {"instruction": "Translate 'Hello' to Spanish.", "outputs": ["Hola", "Ola", "Hi"], "ranking": [2, 1, 0]},
    {"instruction": "What is 2 + 2?", "outputs": ["4", "four", "5"], "ranking": [2, 1, 0]},
]

# Mock Reward Model: Simple ranking system
def reward_model(output, ranking):
    return ranking[output]

# Convert outputs to numeric IDs
reward_dataset = []
for item in reward_data:
    for i, output in enumerate(item["outputs"]):
        reward_dataset.append((item["instruction"], output, item["ranking"][i]))

# Example for demonstration:
# In practice, this would involve training a neural network on the reward dataset.


In [None]:
# Reinforcement Learning (RL): Fine-tune the model using reinforcement learning to optimize the reward model's score.

In [4]:
from transformers import pipeline
from torch.optim import Adam

# Load the fine-tuned model
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./results")

# PPO setup
optimizer = Adam(fine_tuned_model.parameters(), lr=5e-5)

# Mock PPO Loop
for epoch in range(5):  # Simulate 5 epochs of RL
    for item in reward_data:
        instruction = item["instruction"]
        outputs = item["outputs"]
        ranking = item["ranking"]

        # Generate new output and compute reward
        response = pipeline("text-generation", model=fine_tuned_model)(f"Instruction: {instruction}\nOutput:")
        generated_text = response[0]["generated_text"]
        reward = reward_model(generated_text, ranking)

        # Optimize using the reward
        loss = -reward  # Negative reward as loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


OSError: Incorrect path_or_model_id: './results'. Please provide either the path to a local folder or the repo_id of a model on the Hub.