In [None]:
!pip install trl > /dev/null

### Reward Trainer

**Objective**: To classify whether the generated statement is well formed, grammatically acceptable, and rule-following.

**Rule Following**: These rules are taught through do's and don'ts via the Reward Trainer.

The reward model should be trained on a dataset of paired examples, where each example is a tuple of two sequences. The reward model should be trained to predict which example in the pair is more relevant to the task at hand.

The reward trainer expects a very specific format for the dataset. The dataset should contain at least 4 entries if you use the default RewardDataCollatorWithPadding data collator (a custom data collator can define its own expected fields).

Therefore the final dataset object should contain at least the following 4 entries if you use
the default RewardDataCollatorWithPadding data collator. The entries should be named:

input_ids_chosen

attention_mask_chosen

input_ids_rejected

attention_mask_rejected

You should pass an **AutoModelForSequenceClassification model** to the RewardTrainer, along with a RewardConfig which configures the hyperparameters of the training.

Two Passes:

In the first pass, we feed the prompt and the chosen response to the Reward Model; the output is $R_{chosen}$. In the second pass, we feed in the same prompt along with the rejected response; the output in this case is $R_{rejected}$.

For a very high reward score for the chosen response and a very low reward score for the rejected response, the loss approaches 0.

$$\text{loss} = -\log \sigma\left(R_{chosen} - R_{rejected}\right)$$

https://github.com/ibm-ecosystem-engineering/SuperKnowa/blob/main/7.%20RLHF%20Model/notebooks/RLHFImplementation.ipynb

In [1]:
from peft import LoraConfig, TaskType
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments
)
from trl import RewardTrainer, RewardConfig
from datasets import load_dataset
import inspect
from rich import print

In [None]:
# Load the "harmless-base" data directory of Anthropic/hh-rlhf.
dataset = load_dataset("Anthropic/hh-rlhf",
                       data_dir="harmless-base")
# NOTE(review): this second call rebinds `dataset`, so the "harmless-base" result
# loaded above is discarded; only "red-team-attempts" stays bound after this cell.
dataset = load_dataset("Anthropic/hh-rlhf",
                       data_dir="red-team-attempts")

In [2]:
# Load Anthropic/hh-rlhf, then carve a smaller train/test pair out of its 'test'
# split (70% train / 30% test). The fixed seed makes the shuffle-and-split
# reproducible across kernel restarts.
dataset = load_dataset("Anthropic/hh-rlhf")
dataset = dataset['test'].train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 5986
    })
    test: Dataset({
        features: ['chosen', 'rejected'],
        num_rows: 2566
    })
})

In [3]:
print(dataset['train'][0]['rejected'])

#### Dataset prep

- Questions that require answers are listed first

- Answers are "generated" for the questions from models 

- These answers are annotated with feedback using other models / humans 

- Answers with higher feedback scores are marked as chosen, and answers with lower feedback scores are marked as rejected

- Both rejected and chosen answers along with questions are collated into dataset

In [3]:
# Mappings between class ids and label names; they are handed to
# AutoModelForSequenceClassification.from_pretrained later in the notebook.
id2Label={1:"chosen",0:"rejected"}
label2id = {"chosen":1,"rejected":0}
# model_used = "distilbert-base-uncased"  # It's failing due to a different issue
model_used = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_used)
# Reuse the EOS token as the padding token
# (presumably because the gpt2 tokenizer defines no pad token of its own — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token 

In [4]:
def pre_process(row):
    """Tokenize one preference pair into the four fields used for reward training.

    Args:
        row: mapping with a 'chosen' and a 'rejected' text entry.

    Returns:
        dict with 'input_ids_chosen', 'attention_mask_chosen',
        'input_ids_rejected' and 'attention_mask_rejected', each taken from the
        tokenizer output for the corresponding text (truncated to 512 tokens).
    """
    # Tokenize the chosen text first, then the rejected text, with identical settings.
    encoded = {
        side: tokenizer(row[side], max_length=512, truncation=True)
        for side in ('chosen', 'rejected')
    }
    return {
        'input_ids_chosen': encoded['chosen']['input_ids'],
        'attention_mask_chosen': encoded['chosen']['attention_mask'],
        'input_ids_rejected': encoded['rejected']['input_ids'],
        'attention_mask_rejected': encoded['rejected']['attention_mask'],
    }

In [5]:
# Apply pre_process to every example in each split and drop the raw text columns,
# leaving only the token-id and attention-mask fields.
dataset = dataset.map(pre_process,
                      remove_columns=['chosen','rejected'],)
dataset

Map:   0%|          | 0/5986 [00:00<?, ? examples/s]

Map:   0%|          | 0/2566 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 5986
    })
    test: Dataset({
        features: ['input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 2566
    })
})

In [8]:
dataset['train'][0]

{'input_ids_chosen': [101,
  2529,
  1024,
  2054,
  2003,
  20868,
  17728,
  3468,
  6812,
  2884,
  8715,
  1029,
  3353,
  1024,
  20868,
  17728,
  3468,
  6812,
  2884,
  8715,
  1006,
  21307,
  2015,
  1007,
  2003,
  1037,
  11888,
  4650,
  2007,
  8030,
  2008,
  2421,
  21419,
  3255,
  1010,
  22939,
  12171,
  20192,
  1010,
  9530,
  16643,
  24952,
  2239,
  1010,
  1998,
  17964,
  1012,
  102],
 'attention_mask_chosen': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'input_ids_rejected': [101,
  2529,
  1024,
  2054,
  2003,
  20868,
  17728,
  3468,
  6812,
  2884,
  8715,
  1029,
  3353,
  1024,
  2023,
  2003,
  1037,
  2691,
  4650,
  2008,
  5320,
  21419,
  3255,
  1998,
  22939,
  12171,
  14490,
  2050,
  1012,
  102],
 'attention_mask_rejected': [1,
  1,
  1,
  1,
  1,

#### Working on dataset creation

In [None]:
# How to create the dataset into chosen / rejected format
# Configuration names noted for the "glue" dataset:
#  ['ax', 'cola', 'mnli', 'mnli_matched', 
# 'mnli_mismatched', 'mrpc', 'qnli', 'qqp', 'rte', 'sst2', 'stsb', 'wnli']
# Load the 'cola' configuration.
glue_cola = load_dataset("glue", 'cola')

In [None]:
glue_cola['train'].features['label']

In [None]:
import pandas as pd
from operator import itemgetter

# Read the feedback CSV; later cells use its 'question', 'answer' and 'feedback' columns.
# The encoding is given explicitly — presumably the file is not valid UTF-8 (see link below).
df = pd.read_csv('reward_trainer_feedback.csv',
                 encoding="ISO-8859-1")
# https://stackoverflow.com/questions/19699367/for-line-in-results-in-unicodedecodeerror-utf-8-codec-cant-decode-byte

In [None]:
# Pair each answer with its feedback value as an (answer, feedback) tuple.
df['tup'] = list(zip(df['answer'], df['feedback']))
df.head(1)

In [None]:
# Group together all the (answer, feedback) tuples for a given question:
# one row per question, with the tuples collected into a list.
df_g = df.groupby('question')['tup'].apply(list).reset_index()
df_g.head(1)

In [None]:
df_g['tup'][0]

In [None]:
df_g['question'][0]

In [None]:
# Sort each question's (answer, feedback) tuples by the feedback score, ascending.
# The score is tuple index 1 (index 0 is the answer text), so the lowest-scored
# answer ends up first and the highest-scored answer ends up last.
# NOTE(review): NaN feedback values do not order reliably under sorted(); confirm
# the CSV has none, or filter them out before this step.
df_g["sorted_tup"] = df_g["tup"].apply(lambda x: sorted(x, key=itemgetter(1)))

In [None]:
df_g

In [None]:
# The last tuple of sorted_tup is taken as "chosen": its answer text and its score.
# This is the highest-scored answer provided sorted_tup is in ascending feedback order.
df_g["chosen"] = df_g["sorted_tup"].apply(lambda x: x[-1][0])
df_g["chosen_score"] = df_g["sorted_tup"].apply(lambda x: x[-1][1])

# The first tuple of sorted_tup is taken as "rejected": the lowest-scored answer
# under the same ordering assumption.
df_g["rejected"] = df_g["sorted_tup"].apply(lambda x: x[0][0])
df_g["rejected_score"] = df_g["sorted_tup"].apply(lambda x: x[0][1])

In [None]:
# Drop rows that contain missing values.
df_g = df_g.dropna()

# Keep only questions whose chosen answer scored at least 4.0 and whose
# rejected answer scored below 4.0.
df_g = df_g[(df_g['chosen_score']>=4.0) & (df_g['rejected_score']<4.0)]

df_g

#### Training models

In [6]:
# Load the base checkpoint with a sequence-classification head and the
# chosen/rejected label mappings.
model = AutoModelForSequenceClassification.from_pretrained(model_used,
                                                           id2label=id2Label,
                                                           label2id=label2id)
# The tokenizer pads with its EOS token; mirror that id in the model config so the
# classification head can locate the last non-padding token of each sequence.
# Without a pad_token_id, GPT-2 sequence classification cannot handle batch sizes > 1.
model.config.pad_token_id = tokenizer.pad_token_id

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
model

GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [8]:
# LoRA adapter settings for a sequence-classification task.
# NOTE: the `peft_config=` argument is commented out where RewardTrainer is
# constructed, so this configuration is currently not passed to the trainer.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [12]:
def add_margin(row):
    """Compute the preference margin for one dataset row.

    Args:
        row: mapping with numeric 'score_chosen' and 'score_rejected' entries.

    Returns:
        dict with a single 'margin' key holding score_chosen - score_rejected.

    Raises:
        KeyError: if either score column is missing from the row.
    """
    # Assume you have a score_chosen and score_rejected columns that you want to use to compute the margin.
    # The 'chosen' / 'rejected' columns hold text, so they cannot be subtracted.
    return {'margin': row['score_chosen'] - row['score_rejected']}

# dataset = dataset.map(add_margin)

# Code adds a margin to the loss in the margin column to the dataset. 
# The reward collator will automatically pass it through and the 
# loss will be computed accordingly.

# https://huggingface.co/papers/2307.09288

In [7]:
# Hyperparameters for reward training. RewardConfig is the configuration class
# RewardTrainer is designed for; it accepts the usual TrainingArguments fields
# plus reward-specific ones such as max_length.
args = RewardConfig(
    output_dir='/home/aicoder/training/reward_trainer/',
    push_to_hub=False,
    report_to="none",
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    # Evaluate every 200 steps; write a checkpoint at the end of each epoch.
    evaluation_strategy='steps',
    eval_steps=200,
    save_strategy='epoch',
    # save_steps=200,
    num_train_epochs=1,
    # Same limit as the max_length=512 truncation applied when tokenizing in pre_process.
    max_length=512,
)

In [None]:
# Fetch and print the source code of the RewardTrainer class for inspection.
reward_source = inspect.getsource(RewardTrainer)
print(reward_source)

In [8]:
# Build the reward trainer from the model, the training arguments, the tokenizer
# and the tokenized train/test splits.
trainer = RewardTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # peft_config=peft_config,  # LoRA config is not passed to the trainer
)



In [9]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
200,No log,0.688109,0.547935
400,No log,0.690699,0.539361
600,0.747400,0.686684,0.557677
800,0.747400,0.689421,0.547545
1000,0.706600,0.702447,0.528839
1200,0.706600,0.778572,0.558846
1400,0.706600,0.727224,0.558457
1600,0.829400,0.720885,0.545207
1800,0.829400,0.838647,0.52689
2000,0.847000,0.905218,0.531567




TrainOutput(global_step=5986, training_loss=0.9576052262000005, metrics={'train_runtime': 1065.2839, 'train_samples_per_second': 5.619, 'train_steps_per_second': 5.619, 'total_flos': 0.0, 'train_loss': 0.9576052262000005, 'epoch': 1.0})

In [10]:
# Reload tokenizer and model from the checkpoint-5986 directory under the training
# output_dir (5986 is the global_step reported by trainer.train()).
# Both names are rebound, replacing the in-memory tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained("/home/aicoder/training/reward_trainer/checkpoint-5986/")
model = AutoModelForSequenceClassification.from_pretrained("/home/aicoder/training/reward_trainer/checkpoint-5986/")

In [11]:
# positive statement for test
testing_stmt = "There is superb park in the vicinity"
# negative statement for test
nega_stmt = "This is not a very good place to spend time"
# not a statement: a string of non-words
not_stmt = 'make wsork nedo theko orga fuaga'

In [13]:
model.config

GPT2Config {
  "_name_or_path": "/home/aicoder/training/reward_trainer/checkpoint-5986/",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2ForSequenceClassification"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "id2label": {
    "0": "rejected",
    "1": "chosen"
  },
  "initializer_range": 0.02,
  "label2id": {
    "chosen": 1,
    "rejected": 0
  },
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dt

In [8]:
from transformers import pipeline

# Load the tokenizer and sequence-classification model published as
# "lvwerra/distilbert-imdb" (its config reports NEGATIVE / POSITIVE labels).
reward_tokenizer = AutoTokenizer.from_pretrained("lvwerra/distilbert-imdb")
reward_model = AutoModelForSequenceClassification.from_pretrained("lvwerra/distilbert-imdb")

In [33]:
print(reward_model.config)

In [30]:
reward_model.config.id2label

{0: 'NEGATIVE', 1: 'POSITIVE'}

In [16]:
# Tokenize the positive test statement, run it through `model`, and show the raw logits.
tokenized_stmt = tokenizer(testing_stmt, return_tensors='pt')
reward_output = model(**tokenized_stmt)
reward_output.logits

tensor([[-3.3027, -3.1987]], grad_fn=<IndexBackward0>)

In [17]:
# Tokenize the negative test statement, run it through `model`, and show the raw logits.
tokenized_stmt = tokenizer(nega_stmt, return_tensors='pt')
reward_output = model(**tokenized_stmt)
reward_output.logits

tensor([[-4.5104, -4.2697]], grad_fn=<IndexBackward0>)

In [18]:
# Tokenize the non-statement string, run it through `model`, and show the raw logits.
tokenized_stmt = tokenizer(not_stmt, return_tensors='pt')
reward_output = model(**tokenized_stmt)
reward_output.logits

tensor([[-3.0852, -3.0673]], grad_fn=<IndexBackward0>)

In [20]:
reward_output.keys()

odict_keys(['logits', 'past_key_values'])

In [None]:
# Classify the positive test statement with a text-classification pipeline
# built on "lvwerra/distilbert-imdb".
pipe = pipeline(task='text-classification', model="lvwerra/distilbert-imdb")
pipe(testing_stmt)

In [None]:
# Classify the negative test statement with the same pipeline configuration
# (the pipeline is constructed again in this cell).
pipe = pipeline(task='text-classification', model="lvwerra/distilbert-imdb")
pipe(nega_stmt)

In [10]:
# Model identifiers for comparison runs.
test_model1 = "bigscience/bloomz-560m"
# NOTE(review): with save_strategy='epoch' and a single 5986-step epoch, the training
# cell in this notebook would only write checkpoint-5986 — confirm where
# checkpoint-2400 comes from (possibly an earlier run with different settings).
test_model2 = "/home/aicoder/training/reward_trainer/checkpoint-2400/"

In [11]:
# Load tokenizer and sequence-classification model from the test_model2 checkpoint path.
reward_cp_tokenizer = AutoTokenizer.from_pretrained(test_model2)
reward_cp_model = AutoModelForSequenceClassification.from_pretrained(test_model2)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
print(reward_cp_model.config)

In [32]:
reward_cp_model.config.id2label

{0: 'LABEL_0', 1: 'LABEL_1'}

In [17]:
# Tokenize the positive test statement, run it through reward_cp_model, and show the raw logits.
tokenized_stmt = reward_cp_tokenizer(testing_stmt, return_tensors='pt')
reward_output = reward_cp_model(**tokenized_stmt)
reward_output.logits

tensor([[1.9391, 2.3142]])

In [18]:
# Tokenize the negative test statement, run it through reward_cp_model, and show the raw logits.
tokenized_stmt = reward_cp_tokenizer(nega_stmt, return_tensors='pt')
reward_output = reward_cp_model(**tokenized_stmt)
reward_output.logits

tensor([[2.6312, 2.7609]])

In [19]:
# Tokenize the non-statement string, run it through reward_cp_model, and show the raw logits.
tokenized_stmt = reward_cp_tokenizer(not_stmt, return_tensors='pt')
reward_output = reward_cp_model(**tokenized_stmt)
reward_output.logits

tensor([[2.6594, 3.8369]])

In [None]:


# Classify the non-statement string with a text-classification pipeline
# loaded from the test_model2 checkpoint path.
pipe = pipeline(task='text-classification', model=test_model2)
pipe(not_stmt)