In [None]:
!pip install transformers trl datasets accelerate


Collecting trl
  Downloading trl-0.13.0-py3-none-any.whl.metadata (11 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading trl-0.13.0-py3-none-any.whl (293 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 293.4/293.4 kB 4.8 MB/s eta 0:00:00
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
   ━━

In [None]:
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from trl import RewardTrainer, RewardConfig

# Step 1: Prepare the dataset
# Each record holds one prompt, two candidate responses, and a `label` naming the
# preferred one: 1 -> response_1 is preferred, 0 -> response_2 is preferred.
raw_data = [
    {
        "prompt": "Tell me a joke.",
        "response_1": "Why did the chicken cross the road?",
        "response_2": "Knock, knock!",
        "label": 1,  # 1 means response_1 is preferred
    },
    {
        "prompt": "What is AI?",
        # The correct answer is placed in response_2 so that label 0 selects it as the
        # preferred ("chosen") response and the nonsensical answer becomes "rejected".
        "response_1": "A type of plant.",
        "response_2": "Artificial Intelligence.",
        "label": 0,  # 0 means response_2 is preferred
    },
]

# Convert raw data into a DataFrame (one row per prompt) for row-wise preprocessing
df = pd.DataFrame(raw_data)

# Create a dataset with "chosen" and "rejected" columns
def map_responses(row):
    """Convert one labelled comparison into a chosen/rejected preference record.

    Args:
        row: Mapping-like object (dict or DataFrame row) with the keys
            "prompt", "response_1", "response_2" and "label".

    Returns:
        dict with keys "prompt", "chosen" and "rejected". When ``row["label"] == 1``
        response_1 is the chosen text; for any other label value response_2 is.
    """
    first_is_preferred = row["label"] == 1
    winner_key, loser_key = (
        ("response_1", "response_2") if first_is_preferred else ("response_2", "response_1")
    )
    return {"prompt": row["prompt"], "chosen": row[winner_key], "rejected": row[loser_key]}

# Map every DataFrame row to a {"prompt", "chosen", "rejected"} record, then wrap the
# resulting list of dicts in a Hugging Face Dataset.
preference_rows = df.apply(map_responses, axis=1)
processed_data = preference_rows.to_list()
dataset = Dataset.from_list(processed_data)

# Step 2: Initialize the tokenizer
# The same checkpoint name is reused later to load the reward model.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Step 3: Tokenize the dataset
def tokenize_function(batch):
    """Tokenize a batch of preference pairs into the four columns used for reward training.

    Each prompt is encoded twice as a sentence pair: once with its chosen response and
    once with its rejected response. Both encodings are truncated and padded to 128 tokens.

    Args:
        batch: Mapping with list-valued "prompt", "chosen" and "rejected" entries.

    Returns:
        dict with "input_ids_chosen", "attention_mask_chosen", "input_ids_rejected"
        and "attention_mask_rejected"; any other tokenizer outputs are dropped.
    """

    def encode_with_prompt(responses):
        # Uses the module-level `tokenizer`; prompt is the first segment, response the second.
        return tokenizer(
            batch["prompt"], responses, truncation=True, padding="max_length", max_length=128
        )

    chosen_encoding = encode_with_prompt(batch["chosen"])
    rejected_encoding = encode_with_prompt(batch["rejected"])

    return {
        "input_ids_chosen": chosen_encoding["input_ids"],
        "attention_mask_chosen": chosen_encoding["attention_mask"],
        "input_ids_rejected": rejected_encoding["input_ids"],
        "attention_mask_rejected": rejected_encoding["attention_mask"],
    }

# Apply tokenization
# batched=True hands tokenize_function a batch of rows at once; the four
# *_chosen / *_rejected columns it returns are added to the dataset.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Step 4: Initialize the reward model
# num_labels=1 gives the classifier a single output per sequence, used here as the scalar reward.
# The classification head is not part of the bert-base-uncased checkpoint, so it starts out
# newly initialized (see the "newly initialized" notice in the cell output).
reward_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Step 5: Define RewardConfig
reward_config = RewardConfig(
    output_dir="./reward_model_output",  # Directory to save model
    max_length=128,  # Maximum sequence length
    per_device_train_batch_size=8,  # Training batch size
    per_device_eval_batch_size=8,  # Evaluation batch size
    learning_rate=5e-5,  # Learning rate
    num_train_epochs=3,  # Number of training epochs
    eval_strategy="steps",  # Evaluate periodically (`evaluation_strategy` is the deprecated name)
    logging_strategy="steps",  # Logging configuration
    # NOTE: with 2 examples and a batch size of 8 there is a single optimizer step per epoch
    # (on one device), i.e. 3 steps in total, so the 10-step logging/eval interval and the
    # 500-step save interval below are never reached during training. Lower them when
    # experimenting on a dataset this small.
    logging_steps=10,  # Log progress every 10 steps
    save_steps=500,  # Save model every 500 steps
    remove_unused_columns=False,  # Required for pretokenized datasets
    report_to="none",  # Disable reporting to experiment trackers (TensorBoard, W&B, ...)
)

# Step 6: Define DataCollator
# NOTE(review): this collator is NOT passed to the trainer below. RewardTrainer falls back to
# its own pairwise collator, which understands the *_chosen / *_rejected columns; a plain
# DataCollatorWithPadding would not. Kept only in case later cells reference the name.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 7: Create RewardTrainer
trainer = RewardTrainer(
    model=reward_model,
    args=reward_config,
    train_dataset=tokenized_dataset,
    # NOTE: evaluating on the training pairs only checks that the pipeline runs;
    # use a held-out split for meaningful metrics.
    eval_dataset=tokenized_dataset,
    processing_class=tokenizer,  # Replaces the deprecated `tokenizer=` argument
)

# Step 8: Train the reward model
trainer.train()

# Evaluate the reward model
# NOTE: eval_dataset is the same tokenized_dataset used for training, so the metrics
# printed below measure fit to the training pairs, not generalization.
results = trainer.evaluate()
print("Evaluation Results:", results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = RewardTrainer(
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss


Evaluation Results: {'eval_loss': 0.4893132150173187, 'eval_accuracy': 1.0, 'eval_runtime': 1.7721, 'eval_samples_per_second': 1.129, 'eval_steps_per_second': 0.564, 'epoch': 3.0}


In [None]:
# Evaluate the reward model
# Repeats the evaluation already run at the end of the training cell. The trainer's
# eval_dataset is the same set of pairs it was trained on, so treat these numbers as a
# sanity check rather than a held-out estimate.
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.4893132150173187, 'eval_accuracy': 1.0, 'eval_runtime': 3.1124, 'eval_samples_per_second': 0.643, 'eval_steps_per_second': 0.321, 'epoch': 3.0}


In [None]:
# Run inference over the same preference pairs that were used for training
output = trainer.predict(tokenized_dataset)
predictions = output.predictions
# One row per pair with two columns. The printed rows sum to 1, so these look like
# normalized (chosen, rejected) preference probabilities rather than raw logits —
# confirm against the RewardTrainer documentation for the installed TRL version.
print(predictions)

[[0.6745839  0.3254161 ]
 [0.55712414 0.44287583]]


In [None]:
# `predictions` has one row per preference pair and two columns: (chosen, rejected).
# Each printed row sums to 1, so the columns are the model's normalized pairwise
# preference probabilities, not raw reward scores.
score_chosen = predictions[:, 0]  # Probability mass assigned to the chosen response
score_rejected = predictions[:, 1]  # Probability mass assigned to the rejected response

# Probability gap between chosen and rejected; positive means the pair is ranked correctly
margin = score_chosen - score_rejected

# Reward margin r(chosen) - r(rejected), recovered as the log-odds of the two probabilities.
# Assumes the columns come from a two-way softmax over the pair's reward scores (TRL's
# RewardTrainer behaviour — confirm for the installed version). A probability that has
# saturated to exactly 0 yields +/-inf here.
reward_margin = np.log(score_chosen) - np.log(score_rejected)

In [None]:
# Show the margin (chosen minus rejected) for each preference pair
print(margin)

[0.34916782 0.11424831]
