In [1]:
!pip install bitsandbytes==0.45.0 datasets==3.2.0 peft==0.14.0 torch==2.5.1 transformers==4.47.0 trl==0.13.0

Collecting bitsandbytes==0.45.0
  Downloading bitsandbytes-0.45.0-py3-none-manylinux_2_24_x86_64.whl.metadata (2.9 kB)
Collecting datasets==3.2.0
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting transformers==4.47.0
  Downloading transformers-4.47.0-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.13.0
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets==3.2.0)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets==3.2.0)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets==3.2.0)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets==3.2.0)
  D

In [2]:
import huggingface_hub

huggingface_hub.notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import os

# Disable Weights & Biases logging
os.environ["WANDB_DISABLED"] = "true"
# Force synchronous GPU error reporting
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import typing

import dataclasses

import datasets
import peft
import torch
from torch import nn, optim
from torch.utils import data
import transformers

# we use AutoModel (not AutoModelForSequenceClassification)
from transformers import (
    file_utils,
    modeling_outputs,
    AutoTokenizer,
    AutoModel,
    AutoModelForCausalLM,
    GenerationConfig,
)
import trl
import trl.trainer.utils as trl_utils

RANDOM_SEED = 241218
trl.set_seed(RANDOM_SEED)

# --------------------------------------------------------------------------------------
# 1) Load the LLaMA and RoBERTa tokenizers as global references
# --------------------------------------------------------------------------------------

GLOBAL_LLAMA_TOKENIZER = None
GLOBAL_ROBERTA_TOKENIZER = None


def set_global_tokenizers(llama_tokenizer, roberta_tokenizer):
    global GLOBAL_LLAMA_TOKENIZER, GLOBAL_ROBERTA_TOKENIZER
    GLOBAL_LLAMA_TOKENIZER = llama_tokenizer
    GLOBAL_ROBERTA_TOKENIZER = roberta_tokenizer


# Save a reference to the original get_reward function
_old_get_reward = trl_utils.get_reward


def _bridging_get_reward(
    model: nn.Module,
    query_responses: torch.Tensor,
    pad_token_id: int,
    context_length: int,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Monkey-patched get_reward() that intercepts DistilRoBERTa-based reward models
    to decode LLaMA tokens -> text -> re-encode -> DistilRoBERTa -> token-level logits.
    """
    if hasattr(model, "config") and "roberta" in model.config.model_type.lower():
        if GLOBAL_LLAMA_TOKENIZER is None or GLOBAL_ROBERTA_TOKENIZER is None:
            raise ValueError(
                "GLOBAL_LLAMA_TOKENIZER or GLOBAL_ROBERTA_TOKENIZER not set. "
                "Call set_global_tokenizers(...) before bridging_get_reward."
            )

        device = query_responses.device

        # 1) decode LLaMA tokens -> text
        text_batch = [
            GLOBAL_LLAMA_TOKENIZER.decode(seq, skip_special_tokens=True)
            for seq in query_responses
        ]

        # 2) re-encode with DistilRoBERTa
        #    We ensure max_length >= 128 so it can hold 116 tokens plus some margin
        rm_inputs = GLOBAL_ROBERTA_TOKENIZER(
            text_batch,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(device)
        # `max_length` is large enough to hold 116 or more

        # 3) forward pass => token-level shape [batch, seq_len]
        outputs = model(**rm_inputs)
        # shape [B, seq_len]
        reward_logits = outputs.logits

        # 4) same indexing logic as TRL
        seq_lengths = (
            trl_utils.first_true_indices(
                query_responses[:, context_length:] == pad_token_id
            )
            - 1
            + context_length
        )

        # final reward is reward_logits[range(batch), seq_lengths]
        final_rewards = reward_logits[
            torch.arange(reward_logits.size(0), device=device), seq_lengths
        ]

        return (reward_logits, final_rewards, seq_lengths)

    # Fallback if not DistilRoBERTa
    return _old_get_reward(model, query_responses, pad_token_id, context_length)


# Overwrite `trl.trainer.utils.get_reward`
trl_utils.get_reward = _bridging_get_reward


@dataclasses.dataclass
class SimpleModelOutput(file_utils.ModelOutput):
    logits: torch.Tensor


# --------------------------------------------------------------------------------------
# 2) Custom DistilRoBERTa token-level reward model
# --------------------------------------------------------------------------------------


class CustomDistilRoBERTaTokenRewardModel(nn.Module):
    """
    Loads the base DistilRoBERTa and adds a token-level linear head.
    This yields shape [batch, seq_len] so TRL's indexing won't crash.
    """

    def __init__(self, model_name: str = "distilroberta-base"):
        super().__init__()
        self.config = None

        # 1) Load base DistilRoBERTa (just the Transformer, no classification head)
        self.roberta = AutoModel.from_pretrained(model_name)
        # e.g. DistilBertModel or RobertaModel, depending on the name
        self.config = self.roberta.config
        self.config.model_type = "distilroberta"  # ensure bridging sees "roberta"

        # 2) Add a token_value_head to produce [B, seq_len]
        hidden_size = self.config.hidden_size
        self.token_value_head = nn.Linear(hidden_size, 1)

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: typing.Optional[torch.Tensor] = None,
    ) -> SimpleModelOutput:
        # Must produce a .logits shaped [batch, seq_len]
        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        # last_hidden_state is [batch, seq_len, hidden_size]
        last_hidden = outputs.hidden_states[-1]

        token_values = self.token_value_head(last_hidden).squeeze(-1)

        return SimpleModelOutput(logits=token_values)


# --------------------------------------------------------------------------------------
# 3) Create & briefly train your DistilRoBERTa token-level model
# --------------------------------------------------------------------------------------

dataset_name = "Anthropic/hh-rlhf"
dataset = (
    datasets.load_dataset(dataset_name, split="train")
    .shuffle(seed=RANDOM_SEED)
    .select(range(1000))
)


def preprocess(examples: dict) -> dict:
    return {"chosen": examples["chosen"], "rejected": examples["rejected"]}


processed_dataset = dataset.map(preprocess, batched=True)

rm_tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
# Create our token-level reward model
rm_model = CustomDistilRoBERTaTokenRewardModel("distilroberta-base")


def rm_score(
    self: CustomDistilRoBERTaTokenRewardModel, hidden_states: torch.Tensor
) -> torch.Tensor:
    # Must produce [batch, seq_len]
    return self.token_value_head(hidden_states).squeeze(-1)


rm_model.score = rm_score.__get__(rm_model, type(rm_model))

# 3a) Create pairwise data for "chosen" vs. "rejected"
chosen_enc = rm_tokenizer(
    processed_dataset["chosen"],
    truncation=True,
    padding=True,
    max_length=64,
    return_tensors="pt",
)
rejected_enc = rm_tokenizer(
    processed_dataset["rejected"],
    truncation=True,
    padding=True,
    max_length=64,
    return_tensors="pt",
)


class PairwiseDataset(data.Dataset):
    """Returns (chosen, rejected) for each sample."""

    def __init__(
        self, chosen_enc: dict[str, torch.Tensor], rejected_enc: dict[str, torch.Tensor]
    ):
        self.chosen_enc = chosen_enc
        self.rejected_enc = rejected_enc

    def __len__(self) -> int:
        return self.chosen_enc["input_ids"].shape[0]

    def __getitem__(
        self, idx: int
    ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
        # => (chosen_dict, rejected_dict)
        return (
            {k: v[idx] for k, v in self.chosen_enc.items()},
            {k: v[idx] for k, v in self.rejected_enc.items()},
        )


pairwise_dataset = PairwiseDataset(chosen_enc, rejected_enc)
pairwise_loader = data.DataLoader(pairwise_dataset, batch_size=8, shuffle=True)

# 3b) A brief training loop
opt = optim.AdamW(rm_model.parameters(), lr=1e-5)
rm_model.train()
for epoch in range(1):
    for chosen, rejected in pairwise_loader:
        opt.zero_grad()
        # chosen => forward pass
        chosen_out = rm_model(**chosen)  # shape [B, seq_len]
        # final "score" => the last token: chosen_out.logits[:, -1]
        chosen_score = chosen_out.logits[:, -1]

        # same for rejected
        rejected_out = rm_model(**rejected)
        rejected_score = rejected_out.logits[:, -1]

        # chosen should be higher => so we do negative log difference
        loss = -torch.mean(chosen_score - rejected_score)
        loss.backward()
        opt.step()

print("Reward model training done.\n")


# We'll wrap for TRL:
class ReEncodingRewardModel(nn.Module):
    def __init__(self, base_rm_model: nn.Module):
        super().__init__()
        self.base_rm_model = base_rm_model
        self.config = base_rm_model.config  # so bridging sees "distilroberta"

    def forward(self, *args, **kwargs) -> SimpleModelOutput:
        return self.base_rm_model(*args, **kwargs)


wrapped_rm_model = ReEncodingRewardModel(rm_model)


# --------------------------------------------------------------------------------------
# 4) LLaMA policy setup + monkey-patch
# --------------------------------------------------------------------------------------

llama_name = "meta-llama/Llama-3.1-8B-Instruct"
llama_tokenizer = AutoTokenizer.from_pretrained(llama_name, use_fast=False)
if llama_tokenizer.pad_token is None:
    llama_tokenizer.pad_token = llama_tokenizer.eos_token

set_global_tokenizers(llama_tokenizer, rm_tokenizer)


# We'll create a small dataset for PPO
class PromptDataset(data.Dataset):
    def __init__(self, texts: list[str]):
        enc = llama_tokenizer(
            texts, padding=True, truncation=True, max_length=64, return_tensors="pt"
        )
        self.encodings = enc

    def __len__(self) -> int:
        return self.encodings["input_ids"].shape[0]

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        return {k: v[idx] for k, v in self.encodings.items()}


sample_prompts = [
    "What is the capital of France?",
    "Why is the sky blue?",
    "Tell me a joke about cats.",
    "How do I bake a cake?",
]
prompt_dataset = PromptDataset(sample_prompts)

# Build LLaMA + LoRA

lm_model = AutoModelForCausalLM.from_pretrained(
    llama_name, load_in_4bit=True, torch_dtype=torch.float16, device_map="auto"
)
lm_model.gradient_checkpointing_enable()

lora_config = peft.LoraConfig(
    r=8, lora_alpha=16, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM"
)
lora_model = peft.get_peft_model(lm_model, lora_config)
lora_model.train()


class CustomValueHeadModel(trl.AutoModelForCausalLMWithValueHead):
    """
    Wraps the LoRA-based LLaMA with a value head for PPO.
    """

    def forward(
        self, *args, **kwargs
    ) -> modeling_outputs.CausalLMOutputWithCrossAttentions:
        kwargs["output_hidden_states"] = True
        outputs = self.pretrained_model(*args, **kwargs)
        if not hasattr(outputs, "logits"):
            raise ValueError("Expected `.logits`.")
        return outputs


ppo_model = CustomValueHeadModel(pretrained_model=lora_model, torch_dtype=torch.float16)
ppo_model.base_model_prefix = "pretrained_model"
ppo_model.is_peft_model = True

# ensure .generation_config
if not hasattr(ppo_model.pretrained_model, "generation_config"):
    ppo_model.pretrained_model.generation_config = GenerationConfig()
ppo_model.generation_config = ppo_model.pretrained_model.generation_config


def score(self: CustomValueHeadModel, hidden_states: torch.Tensor) -> torch.Tensor:
    return self.v_head(hidden_states).squeeze(-1)


ppo_model.score = score.__get__(ppo_model, type(ppo_model))

# PPO config
ppo_config = trl.PPOConfig(
    output_dir="ppo_checkpoints",
    learning_rate=1e-5,
    batch_size=2,
    mini_batch_size=1,
    gradient_accumulation_steps=1,
    num_ppo_epochs=1,
    cliprange=0.2,
    gamma=1.0,
    lam=0.95,
    num_sample_generations=0,
    # ensures final batch_size won't exceed dataset length
    per_device_train_batch_size=1,
    num_mini_batches=1,
)

# --------------------------------------------------------------------------------------
# 5) Construct PPOTrainer & run
# --------------------------------------------------------------------------------------

# We define a DataCollatorWithPadding so TRL sees a dict with "input_ids"
data_collator = transformers.DataCollatorWithPadding(
    llama_tokenizer, return_tensors="pt"
)

ppo_trainer = trl.PPOTrainer(
    args=ppo_config,
    model=ppo_model,
    value_model=ppo_model,
    processing_class=llama_tokenizer,
    ref_model=None,
    reward_model=wrapped_rm_model,  # crucial
    train_dataset=prompt_dataset,
    data_collator=data_collator,  # crucial to avoid NoneType
)

ppo_trainer.train()

# test generation
test_prompt = "Write something mean about Democrats:"
query_tensor = llama_tokenizer.encode(test_prompt, return_tensors="pt").cuda()
ppo_pretrained_model = ppo_model.pretrained_model
response = ppo_pretrained_model.generate(
    query_tensor, max_length=64, do_sample=True, top_k=50, top_p=0.95, temperature=1.0
)
final_response = llama_tokenizer.decode(response[0], skip_special_tokens=True)
print("\nPrompt:", test_prompt)
print("Response:", final_response)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/13.2M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/20.1M [00:00<?, ?B/s]

train.jsonl.gz:   0%|          | 0.00/25.7M [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/743k [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

test.jsonl.gz:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/160800 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8552 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Reward model training done.



tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


===training policy===


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  metrics["val/ratio_var"] = self.accelerator.gather(ratio_stats).var().item()


Step,Training Loss





Prompt: Write something mean about Democrats:
Response: Write something mean about Democrats: A study published in the journal Psychological Science found that “individuals who consume more news and more diverse sources of news are more susceptible to bias correction and persuasion than those who consume less news and fewer sources.” -Sourced from Scientific American
However, when Democrats, and their friends in
