In [1]:
from dataclasses import dataclass, field
from typing import Optional

import torch
import pandas as pd
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
from torch.optim import Adam
from tqdm import tqdm
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    HfArgumentParser,
    RobertaForSequenceClassification,
    RobertaTokenizer,
)

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, create_reference_model, set_seed
from trl.core import LengthSampler

tqdm.pandas()

In [2]:
@dataclass
class ScriptArguments:
    model_name: Optional[str] = field(default="ybelkada/gpt-j-6b-sharded-bf16", metadata={"help": "the model name"})
    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
    learning_rate: Optional[float] = field(default=(1.47e-5) * 2, metadata={"help": "the learning rate"})
    mini_batch_size: Optional[int] = field(default=4, metadata={"help": "the PPO minibatch size"})
    batch_size: Optional[int] = field(default=16, metadata={"help": "the batch size"})
    gradient_accumulation_steps: Optional[int] = field(default=1, metadata={"help": "the number of gradient accumulation steps"})
    model_save_path: Optional[str] = field(default="./gpt-j-6B-detoxified-long-context-26-shl-1e4-final", metadata={"help": "the path to save the model"})
# Now you can use ScriptArguments
script_args = ScriptArguments(
    model_name="ybelkada/gpt-j-6b-sharded-bf16",
    log_with=None,
    learning_rate=(1.47e-5) * 2,
    mini_batch_size=4,
    gradient_accumulation_steps=1,
    model_save_path="./gpt-j-6B-detoxified-long-context-26-shl-1e4-final"
)

# Proceed with the configuration and the rest of your script as intended
config = PPOConfig(
    model_name=script_args.model_name,
    learning_rate=script_args.learning_rate,
    batch_size=32,  
    log_with=script_args.log_with,
    ppo_epochs=100,
    mini_batch_size=script_args.mini_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
)

In [3]:
def build_dataset(
    config, dataset_name="allenai/real-toxicity-prompts", input_min_text_length=5, input_max_text_length=10
):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        config (dict): Configuration parameters.
        dataset_name (str):
            The name of the dataset to be loaded.
        input_min_text_length (int):
            Minimum length of input text.
        input_max_text_length (int):
            Maximum length of input text.

    Returns:
        dataloader (torch.utils.data.DataLoader):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load the dataset
    ds = load_dataset(dataset_name, split="train")

    # Check if the dataset contains the "text" column
    if "text" not in ds.column_names:
        raise KeyError("The dataset does not contain a column named 'text'.")

    # Tokenize and process the dataset
    tokenized_ds = ds.map(
        lambda x: tokenizer(x["text"], padding="max_length", truncation=True, max_length=input_max_text_length),
        batched=True
    )

    # Convert the tokenized dataset to PyTorch tensors
    tensor_ds = tokenized_ds.remove_columns(["text"]).with_format("torch")

    # Create a PyTorch DataLoader
    dataloader = torch.utils.data.DataLoader(
        tensor_ds,
        batch_size=config.batch_size,
        shuffle=True
    )

    return dataloader

In [4]:
# Load the IMDb dataset
dataset = load_dataset("imdb")

if dataset is None:
    print("Error: Failed to load dataset")
else:
    print("Dataset loaded successfully")

# Define input_min_text_length and input_max_text_length as per your requirements
input_min_text_length = 100
input_max_text_length = 500

# Define your filter function
def filter_fn(sample):
    text_length = len(sample["text"])
    return input_min_text_length <= text_length <= input_max_text_length

# Apply the filter function to the dataset
filtered_dataset = dataset["train"].filter(filter_fn, batched=False)

# Assuming you have a LengthSampler class defined elsewhere
input_size = LengthSampler(input_min_text_length, input_max_text_length)

Dataset loaded successfully


In [5]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
def tokenize(sample):
    sample["query"] = sample["text"]
    return sample
    
    ds = ds.map(tokenize, batched=False)
    ds.set_format(type="torch")

    ds = ds.train_test_split(test_size=0.2, shuffle=False)["train"]

    return ds

In [6]:
def build_dataset(
    config, dataset_name="allenai/real-toxicity-prompts", input_min_text_length=5, input_max_text_length=10
):
    """
    Build dataset for training. This builds the dataset from `load_dataset`, one should
    customize this function to train the model on its own dataset.

    Args:
        config (dict): Configuration parameters.
        dataset_name (str):
            The name of the dataset to be loaded.
        input_min_text_length (int):
            Minimum length of input text.
        input_max_text_length (int):
            Maximum length of input text.

    Returns:
        dataloader (torch.utils.data.DataLoader):
            The dataloader for the dataset.
    """
    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
    tokenizer.pad_token = tokenizer.eos_token

    # Load the dataset
    ds = load_dataset(dataset_name, split="train")

    # Check if the dataset contains the specified column
    target_column = "review"
    if target_column not in ds.column_names:
        raise KeyError(f"The dataset does not contain a column named '{target_column}'.")

    # Tokenize and process the dataset
    tokenized_ds = ds.map(
        lambda x: tokenizer(x[target_column], padding="max_length", truncation=True, max_length=input_max_text_length),
        batched=True
    )

    # Convert the tokenized dataset to PyTorch tensors
    tensor_ds = tokenized_ds.remove_columns([target_column]).with_format("torch")

    # Create a PyTorch DataLoader
    dataloader = torch.utils.data.DataLoader(
        tensor_ds,
        batch_size=config.batch_size,
        shuffle=True
    )

    return dataloader

In [7]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
dataset = load_dataset("imdb")

# Define batch size
batch_size = 4

# Create DataLoader with collate function
dataloader = DataLoader(
    dataset["train"],
    batch_size=batch_size,
    collate_fn=data_collator,
    shuffle=True
)

In [8]:
# set seed before initializing value head for deterministic eval
set_seed(config.seed)

# Now let's build the model, the reference model, and the tokenizer. We first load the model
# in bfloat16 to save memory using `transformers`.
model = AutoModelForCausalLM.from_pretrained(config.model_name, torch_dtype=torch.bfloat16)
# And then we pass the loaded model to `AutoModelForCausalLMWithValueHead`.
model = AutoModelForCausalLMWithValueHead.from_pretrained(model)

# We create a reference model by sharing 20 layers
ref_model = create_reference_model(model, num_shared_layers=20)

# We make sure to use `Adam` optimizer on the model parameters that require gradients.
optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
for sample in filtered_dataset:
    print(sample)

{'text': "My interest in Dorothy Stratten caused me to purchase this video. Although it had great actors/actresses, there were just too many subplots going on to retain interest. Plus it just wasn't that interesting. Dialogue was stiff and confusing and the story just flipped around too much to be believable. I was pretty disappointed in what I believe was one of Audrey Hepburn's last movies. I'll always love John Ritter best in slapstick. He was just too pathetic here.", 'label': 0}
{'text': "I think I will make a movie next weekend. Oh wait, I'm working..oh I'm sure I can fit it in. It looks like whoever made this film fit it in. I hope the makers of this crap have day jobs because this film sucked!!! It looks like someones home movie and I don't think more than $100 was spent making it!!! Total crap!!! Who let's this stuff be released?!?!?!", 'label': 0}
{'text': "Ned aKelly is such an important story to Australians but this movie is awful. It's an Australian story yet it seems like

In [10]:
for sample in filtered_dataset:
    print(sample["text"])
    print(sample["label"])
    # Print other fields as needed

My interest in Dorothy Stratten caused me to purchase this video. Although it had great actors/actresses, there were just too many subplots going on to retain interest. Plus it just wasn't that interesting. Dialogue was stiff and confusing and the story just flipped around too much to be believable. I was pretty disappointed in what I believe was one of Audrey Hepburn's last movies. I'll always love John Ritter best in slapstick. He was just too pathetic here.
0
I think I will make a movie next weekend. Oh wait, I'm working..oh I'm sure I can fit it in. It looks like whoever made this film fit it in. I hope the makers of this crap have day jobs because this film sucked!!! It looks like someones home movie and I don't think more than $100 was spent making it!!! Total crap!!! Who let's this stuff be released?!?!?!
0
Ned aKelly is such an important story to Australians but this movie is awful. It's an Australian story yet it seems like it was set in America. Also Ned was an Australian yet

In [11]:
sample_index = 0  # Choose the index of the sample you want to retrieve
sample = filtered_dataset.select([sample_index])
print(sample)

Dataset({
    features: ['text', 'label'],
    num_rows: 1
})


In [12]:
print("Text:", sample["text"])
print("Label:", sample["label"])

Text: ["My interest in Dorothy Stratten caused me to purchase this video. Although it had great actors/actresses, there were just too many subplots going on to retain interest. Plus it just wasn't that interesting. Dialogue was stiff and confusing and the story just flipped around too much to be believable. I was pretty disappointed in what I believe was one of Audrey Hepburn's last movies. I'll always love John Ritter best in slapstick. He was just too pathetic here."]
Label: [0]


In [13]:
def collator(data):
    input_texts = [item["text"] for item in data]
    labels = [item["label"] for item in data]

    # Tokenize input texts
    tokenized_texts = tokenizer(input_texts, padding=True, truncation=True, return_tensors="pt")

    # Extract input_ids and attention_masks
    input_ids = tokenized_texts["input_ids"]
    attention_masks = tokenized_texts["attention_mask"]

    # Convert labels to tensors
    label_tensors = torch.tensor(labels)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": label_tensors
    }


    return batch

In [14]:
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
tokenizer.pad_token = tokenizer.eos_token

imdb_dataset = load_dataset("imdb")

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(imdb_dataset['train'])

# Define a custom PyTorch dataset class
class CustomDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data.iloc[idx]
# Create an instance of the custom dataset class
custom_dataset = CustomDataset(df)
# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    dataset=custom_dataset,
    data_collator=collator,
    optimizer=optimizer,
)

In [15]:
# We then build the reward pipeline, we will use the toxicity model to compute the reward.
# We first load the toxicity model and tokenizer.
toxicity_model_id = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_model_id)
# We load the toxicity model in fp16 to save memory.
toxicity_model = RobertaForSequenceClassification.from_pretrained(toxicity_model_id, torch_dtype=torch.float16).to(
    ppo_trainer.accelerator.device
)

In [16]:
# We then define the arguments to pass to the `generate` function. These arguments
# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
# the `generate` function of the trained model.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}
output_min_length = 20
output_max_length = 30
output_length_sampler = LengthSampler(output_min_length, output_max_length)

model_save_path = script_args.model_save_path

In [None]:
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    query_tensors = batch["input_ids"]

    # Get response from the policy model
    response_tensors = []
    for query in query_tensors:
        gen_len = output_length_sampler()
        generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **generation_kwargs)
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    # Compute sentiment score
    texts = batch["response"]
    toxicity_inputs = toxicity_tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(
        ppo_trainer.accelerator.device
    )
    logits = toxicity_model(**toxicity_inputs).logits.float()
    toxicity_labels = (logits[:, 0]).tolist()

    rewards = [torch.tensor(output) for output in toxicity_labels]

    # Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Save model every 100 epochs
    if epoch % 100 == 0:
        if ppo_trainer.accelerator.is_main_process:
            ppo_trainer.save_pretrained(model_save_path)

0it [00:00, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
