In [1]:
%pwd

'd:\\software_3\\Generative_models\\Text_models\\chat_gpt2\\RLHF_DPO_Preference_alignment\\RLHF_with_PPO'

In [2]:
import os

os.chdir("../../")

In [3]:
%pwd

'd:\\software_3\\Generative_models\\Text_models\\chat_gpt2'

# Reward Model for the RLHF (ppo, GRPO)

This Notebook implements the Bradly-Terry reward model for the feedback which was from this paper [Bradley–Terry and Multi-Objective Reward Modeling Are Complementary](https://arxiv.org/html/2507.07375v1) this model gives the higher preference to teh chose reward and lower to the rejected rewards.

In [None]:
import os
import json
import urllib
from tqdm import tqdm 
from functools import partial
import torch
from torch.utils.data import Dataset, DataLoader
from gpt import GPTModel
from model_args import BASE_CONFIG
from utils.load_and_save_models import save_model
from utils.token_converter import get_tokenizer

In [5]:
def download_and_load_file(file_path, url):

    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)
    else:
        with open(file_path, "r", encoding="utf-8") as file:
            text_data = file.read()

    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return data


file_path = "preference_reward_model_data.json"
url = " "

data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 5000


In [6]:
data[0]

{'rejected': [{'content': {'content': 'Is having your medical records online safe?',
    'role': 'user'}},
  {'content': {'content': 'You mean being able to share them with your doctor, or making them public to the internet in general?',
    'role': 'assistant'}}],
 'chosen': [{'content': {'content': 'Is having your medical records online safe?',
    'role': 'user'}},
  {'content': {'content': 'Hm, I think so! If you have your records online, it makes it easier for you to access them, and also for your doctor to share updates with you, and for you to share updates with your doctor. It also makes it easier for you to share your information with other people, if you want, for example to join a health advocacy or support group. The only risk I see is that if someone else has access to your records, they might abuse that access, or they might accidentally make a change to your records that you don’t want. But overall, I think online access is a good thing.',
    'role': 'assistant'}}]}

In [7]:
data[0]["chosen"][0]["content"]["content"]

'Is having your medical records online safe?'

In [8]:
def process_data(data):
    """Process data to create chosen/rejected pairs with consistent prompt formatting"""
    new_data = []
    for i in range(1000):
        js = {}
        # Extract prompt and responses
        prompt = data[i]["chosen"][0]["content"]["content"]
        chosen_response = data[i]["chosen"][1]["content"]["content"]
        rejected_response = data[i]["rejected"][1]["content"]["content"]
        
        # Format consistently
        js["chosen"] = f"prompt: {prompt} ### \n\n Response: {chosen_response}"
        js["rejected"] = f"prompt: {prompt} ### \n\n Response: {rejected_response}"
        new_data.append(js)
    return new_data

processed_data = process_data(data)

In [9]:
data = processed_data

In [10]:
data[0]

{'chosen': 'prompt: Is having your medical records online safe? ### \n\n Response: Hm, I think so! If you have your records online, it makes it easier for you to access them, and also for your doctor to share updates with you, and for you to share updates with your doctor. It also makes it easier for you to share your information with other people, if you want, for example to join a health advocacy or support group. The only risk I see is that if someone else has access to your records, they might abuse that access, or they might accidentally make a change to your records that you don’t want. But overall, I think online access is a good thing.',
 'rejected': 'prompt: Is having your medical records online safe? ### \n\n Response: You mean being able to share them with your doctor, or making them public to the internet in general?'}

In [11]:
def split_data(data):
    train_portion = int(len(data) * 0.85)
    test_portion = int(len(data) * 0.1)
    val_portion = len(data) - train_portion - test_portion

    train_data = data[:train_portion]
    test_data = data[train_portion:train_portion + test_portion]
    val_data = data[train_portion + test_portion:]
    
    print("Training set length:", len(train_data))
    print("Validation set length:", len(val_data))
    print("Test set length:", len(test_data))

    return train_data, test_data, val_data

train_data, tes_data, val_data = split_data(data)


Training set length: 850
Validation set length: 50
Test set length: 100


In [12]:
train_data[0]

{'chosen': 'prompt: Is having your medical records online safe? ### \n\n Response: Hm, I think so! If you have your records online, it makes it easier for you to access them, and also for your doctor to share updates with you, and for you to share updates with your doctor. It also makes it easier for you to share your information with other people, if you want, for example to join a health advocacy or support group. The only risk I see is that if someone else has access to your records, they might abuse that access, or they might accidentally make a change to your records that you don’t want. But overall, I think online access is a good thing.',
 'rejected': 'prompt: Is having your medical records online safe? ### \n\n Response: You mean being able to share them with your doctor, or making them public to the internet in general?'}

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [14]:
class RewardDataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
        
        # Pre-tokenize texts
        self.encoded_texts = []
        for entry in data:
            chosen_full_text = entry["chosen"]
            rejected_full_text = entry["rejected"]
            
            # For tiktoken, encode() returns a list of token IDs directly
            chosen_full_tokens = tokenizer.encode(chosen_full_text)
            rejected_full_tokens = tokenizer.encode(rejected_full_text)
            
            # Add EOS token (GPT-2 uses token ID 50256 as EOS)
            eos_token_id = 50256
            chosen_full_tokens = chosen_full_tokens + [eos_token_id]
            rejected_full_tokens = rejected_full_tokens + [eos_token_id]
            
            # Find prompt end position for masking
            prompt_end_marker = " ### \n\n Response:"
            try:
                prompt_part = entry["chosen"].split(prompt_end_marker)[0] + prompt_end_marker
                prompt_tokens = tokenizer.encode(prompt_part)
            except:
                # Fallback if split fails
                prompt_tokens = tokenizer.encode(entry["chosen"])[:len(tokenizer.encode(entry["chosen"])) // 2]
            
            self.encoded_texts.append({
                "chosen": chosen_full_tokens,
                "rejected": rejected_full_tokens,
                "prompt_len": len(prompt_tokens)
            })

    def __getitem__(self, index):
        return self.encoded_texts[index]

    def __len__(self):
        return len(self.data)

In [15]:
def custom_collate_fn(
    batch,
    pad_token_id=50256,
    allowed_max_length=None,
    mask_prompt_tokens=True,
    device="cpu"
):
    """Fixed collate function with proper masking"""
    
    batch_data = {
        "chosen": [],
        "rejected": [],
        "rejected_mask": [],
        "chosen_mask": []
    }

    # Determine the longest sequence to set a common padding length
    max_length_common = 0
    if batch:
        for key in ["chosen", "rejected"]:
            current_max = max(len(item[key]) for item in batch)
            max_length_common = max(max_length_common, current_max)

    # Process each item in the batch
    for item in batch:
        prompt_len = item["prompt_len"]
        
        for key in ["chosen", "rejected"]:
            # Adjust the padding according to the common max length
            sequence = item[key]
            padded = sequence + [pad_token_id] * (max_length_common - len(sequence))
            
            # Create attention mask (True for real tokens, False for padding)
            mask = torch.ones(len(padded)).bool()
            mask[len(sequence):] = False  # Set padding tokens to False
            
            # If mask_prompt_tokens is True, set prompt tokens to False
            # We only want to compute loss on the response tokens
            if mask_prompt_tokens:
                loss_mask = mask.clone()
                loss_mask[:prompt_len] = False  # Don't compute loss on prompt tokens
                batch_data[f"{key}_mask"].append(loss_mask)
            else:
                batch_data[f"{key}_mask"].append(mask)
            
            batch_data[key].append(torch.tensor(padded))

    # Stack all tensors
    for key in ["chosen", "rejected", "chosen_mask", "rejected_mask"]:
        tensor_stack = torch.stack(batch_data[key])
        
        if allowed_max_length is not None:
            tensor_stack = tensor_stack[:, :allowed_max_length]
        
        batch_data[key] = tensor_stack.to(device)

    return batch_data

In [19]:
tokenizer = get_tokenizer()

example_dataset = RewardDataset(val_data, tokenizer)

example_dataloader = DataLoader(
    example_dataset,
    batch_size=2,
    collate_fn=customized_collate_func,
    shuffle=False
)

AttributeError: 'Encoding' object has no attribute 'eos_token_id'

In [18]:
for batch in example_dataloader:
    break
print(batch.keys())

dict_keys(['chosen', 'rejected', 'rejected_mask', 'chosen_mask'])


In [20]:
def decode_tokens_from_batch(token_ids, tokenizer):
    ids_in_python_list = token_ids.flatten().tolist()
    return tokenizer.decode(ids_in_python_list)

In [21]:
text = decode_tokens_from_batch(
    token_ids=batch["rejected"][0],  # [0] for the first entry in the batch
    tokenizer=tokenizer,
)
print(text)

NameError: name 'batch' is not defined

In [22]:
batch["chosen_mask"][0].shape

NameError: name 'batch' is not defined

Bradly-terry Loss function



In [16]:
def compute_loss(model, batch):
    """Compute Bradley-Terry loss with proper reward extraction"""
    
    # Get logits from the model
    chosen_logits = model(batch["chosen"])
    rejected_logits = model(batch["rejected"])
    
    # Extract rewards - take the last non-padded token's reward for each sequence
    chosen_rewards = []
    rejected_rewards = []
    
    for i in range(chosen_logits.size(0)):  # For each item in batch
        # Find the last non-padded token position
        chosen_mask = batch["chosen_mask"][i]
        rejected_mask = batch["rejected_mask"][i]
        
        # Get the last True position (last non-padded token)
        chosen_last_pos = chosen_mask.sum() - 1
        rejected_last_pos = rejected_mask.sum() - 1
        
        # Extract reward from the last position
        chosen_reward = chosen_logits[i, chosen_last_pos, 0]  # Assuming single output
        rejected_reward = rejected_logits[i, rejected_last_pos, 0]
        
        chosen_rewards.append(chosen_reward)
        rejected_rewards.append(rejected_reward)
    
    chosen_rewards = torch.stack(chosen_rewards)
    rejected_rewards = torch.stack(rejected_rewards)
    
    # Bradley-Terry model: P(chosen > rejected) = σ(r_chosen - r_rejected)
    # Loss = -log(σ(r_chosen - r_rejected))
    loss = -torch.nn.functional.logsigmoid(chosen_rewards - rejected_rewards).mean()
    
    return loss, chosen_rewards.mean(), rejected_rewards.mean()


def compute_bradley_terry_probability(r_chosen, r_rejected):
    """Compute P(chosen > rejected) using Bradley-Terry Model"""
    return torch.sigmoid(r_chosen - r_rejected)

    

In [17]:
def train_reward_model(model, optimizer, device, train_data, val_data, tokenizer, start_context, epochs=3, batch_size=4, lr=5e-5):
    """Fixed training loop with proper evaluation"""
    
    # Create collate function
    customized_collate_func = partial(
        custom_collate_fn,
        device=device,
        mask_prompt_tokens=True,
        allowed_max_length=1024
    )
    
    train_dataset = RewardDataset(train_data, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=customized_collate_func)

    val_dataset = RewardDataset(val_data, tokenizer)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=customized_collate_func)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        total_chosen_reward = 0
        total_rejected_reward = 0
        total_bt_prob = 0

        train_iter = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch} [Train]")
        for batch_idx, batch in train_iter:
            optimizer.zero_grad()
            
            loss, chosen_reward, rejected_reward = compute_loss(model, batch)
            bt_prob = torch.sigmoid(chosen_reward - rejected_reward).item()
            
            loss.backward()
            
            # Add gradient clipping to prevent instability
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            
            total_loss += loss.item()
            total_chosen_reward += chosen_reward.item()
            total_rejected_reward += rejected_reward.item()
            total_bt_prob += bt_prob
            
            if batch_idx % 10 == 0:
                train_iter.set_postfix({"loss": loss.item(), "BT_prob": bt_prob})

        avg_loss = total_loss / len(train_loader)
        avg_chosen = total_chosen_reward / len(train_loader)
        avg_rejected = total_rejected_reward / len(train_loader)
        avg_bt_prob = total_bt_prob / len(train_loader)

        print(f'Epoch {epoch}: Loss={avg_loss:.4f}, '
              f'Chosen={avg_chosen:.4f}, Rejected={avg_rejected:.4f}, '
              f'BT_Prob={avg_bt_prob:.4f}')

        # Validation
        model.eval()
        val_loss = 0
        val_chosen_reward = 0
        val_rejected_reward = 0
        val_bt_prob = 0

        val_iter = tqdm(val_dataloader, total=len(val_dataloader), desc=f"Epoch {epoch} [Val]")
        with torch.no_grad():
            for batch in val_iter:
                loss, chosen_reward, rejected_reward = compute_loss(model, batch)
                bt_prob = torch.sigmoid(chosen_reward - rejected_reward).item()
                val_loss += loss.item()
                val_chosen_reward += chosen_reward.item()
                val_rejected_reward += rejected_reward.item()
                val_bt_prob += bt_prob
                val_iter.set_postfix({"val_loss": loss.item(), "val_BT_prob": bt_prob})

        val_avg_loss = val_loss / len(val_dataloader)
        val_avg_chosen = val_chosen_reward / len(val_dataloader)
        val_avg_rejected = val_rejected_reward / len(val_dataloader)
        val_avg_bt_prob = val_bt_prob / len(val_dataloader)

        print(f'Validation: Loss={val_avg_loss:.4f}, '
              f'Chosen={val_avg_chosen:.4f}, Rejected={val_avg_rejected:.4f}, '
              f'BT_Prob={val_avg_bt_prob:.4f}')

        # Test model on sample context
        if start_context:
            model.eval()
            with torch.no_grad():
                start_tokens = tokenizer.encode(start_context)
                start_tensor = torch.tensor(start_tokens).unsqueeze(0).to(device)
                feedback = model(start_tensor)
                print(f"Sample reward: {feedback[0, -1, 0].item():.4f}")

    return model

In [7]:
def setup_reward_model(base_model, base_config, device):
    """Setup reward model from pre-trained base model"""
    
    # Freeze all parameters first
    for param in base_model.parameters():
        param.requires_grad = False
    
    # Replace output head with reward head (single scalar output)
    num_classes = 1
    base_model.out_head = torch.nn.Linear(
        in_features=base_config["emb_dim"], 
        out_features=num_classes
    )
    
    # Unfreeze last transformer block for fine-tuning
    for param in base_model.trf_blocks[-1].parameters():
        param.requires_grad = True
    
    # Unfreeze final normalization layer
    for param in base_model.final_norm.parameters():
        param.requires_grad = True
    
    # Unfreeze the new output head
    for param in base_model.out_head.parameters():
        param.requires_grad = True
    
    base_model.to(device)
    return base_model

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_path = "gpt_models\\instruct-GPT2-355M-SFT.pth"
model = GPTModel(BASE_CONFIG)
model.load_state_dict(
    torch.load(
    model_path,
    map_location=torch.device("cpu"),
    weights_only=True
    )['model_state_dict']
)
model = setup_reward_model(base_model=model, base_config=BASE_CONFIG, device=device)

In [20]:
print(model)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.0, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.0, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [21]:
tokenizer = get_tokenizer()
start_context = tes_data[0]["chosen"]
epochs = 20
batch_size = 10
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [22]:
reward_model = train_reward_model(model,
    optimizer,
    device,
    train_data,
    val_data,
    tokenizer,
    start_context,
    epochs,
    batch_size,
)


Epoch 0 [Train]: 100%|██████████| 85/85 [08:33<00:00,  6.04s/it, loss=0.486, BT_prob=0.642]


Epoch 0: Loss=0.7621, Chosen=0.3019, Rejected=0.2826, BT_Prob=0.5048


Epoch 0 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.09s/it, val_loss=0.681, val_BT_prob=0.53] 


Validation: Loss=0.8587, Chosen=0.0524, Rejected=0.2787, BT_Prob=0.4443
Sample reward: 0.0251


Epoch 1 [Train]: 100%|██████████| 85/85 [08:25<00:00,  5.94s/it, loss=0.662, BT_prob=0.54] 


Epoch 1: Loss=0.6717, Chosen=0.2282, Rejected=0.0585, BT_Prob=0.5415


Epoch 1 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=0.711, val_BT_prob=0.512]


Validation: Loss=0.8476, Chosen=0.0457, Rejected=0.2510, BT_Prob=0.4493
Sample reward: 0.0597


Epoch 2 [Train]: 100%|██████████| 85/85 [08:18<00:00,  5.86s/it, loss=0.551, BT_prob=0.612]


Epoch 2: Loss=0.5820, Chosen=0.3109, Rejected=-0.0765, BT_Prob=0.5944


Epoch 2 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.10s/it, val_loss=0.724, val_BT_prob=0.508]


Validation: Loss=0.8306, Chosen=0.1994, Rejected=0.3487, BT_Prob=0.4631
Sample reward: 0.3910


Epoch 3 [Train]: 100%|██████████| 85/85 [08:11<00:00,  5.78s/it, loss=0.532, BT_prob=0.649]


Epoch 3: Loss=0.4744, Chosen=0.5864, Rejected=-0.1480, BT_Prob=0.6730


Epoch 3 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.12s/it, val_loss=0.779, val_BT_prob=0.487]


Validation: Loss=0.8791, Chosen=0.3001, Rejected=0.4733, BT_Prob=0.4573
Sample reward: 0.7441


Epoch 4 [Train]: 100%|██████████| 85/85 [08:16<00:00,  5.84s/it, loss=0.24, BT_prob=0.821] 


Epoch 4: Loss=0.3687, Chosen=0.8117, Rejected=-0.3566, BT_Prob=0.7567


Epoch 4 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.10s/it, val_loss=0.763, val_BT_prob=0.494]


Validation: Loss=0.9076, Chosen=0.2356, Rejected=0.3829, BT_Prob=0.4639
Sample reward: 1.1603


Epoch 5 [Train]: 100%|██████████| 85/85 [08:23<00:00,  5.92s/it, loss=0.326, BT_prob=0.829]


Epoch 5: Loss=0.2804, Chosen=1.0718, Rejected=-0.6096, BT_Prob=0.8368


Epoch 5 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=0.845, val_BT_prob=0.466]


Validation: Loss=0.9825, Chosen=0.4267, Rejected=0.5807, BT_Prob=0.4633
Sample reward: 1.7561


Epoch 6 [Train]: 100%|██████████| 85/85 [08:37<00:00,  6.09s/it, loss=0.233, BT_prob=0.93] 


Epoch 6: Loss=0.2160, Chosen=1.2842, Rejected=-1.0305, BT_Prob=0.9008


Epoch 6 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=0.954, val_BT_prob=0.431]


Validation: Loss=1.0543, Chosen=0.7271, Rejected=0.8331, BT_Prob=0.4759
Sample reward: 2.4781


Epoch 7 [Train]: 100%|██████████| 85/85 [08:50<00:00,  6.24s/it, loss=0.269, BT_prob=0.93]  


Epoch 7: Loss=0.1727, Chosen=1.8201, Rejected=-1.2675, BT_Prob=0.9481


Epoch 7 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=0.875, val_BT_prob=0.483]


Validation: Loss=1.1414, Chosen=0.3699, Rejected=0.4583, BT_Prob=0.4808
Sample reward: 2.0601


Epoch 8 [Train]: 100%|██████████| 85/85 [08:35<00:00,  6.06s/it, loss=0.149, BT_prob=0.976] 


Epoch 8: Loss=0.1484, Chosen=2.1939, Rejected=-1.6984, BT_Prob=0.9732


Epoch 8 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.12s/it, val_loss=0.917, val_BT_prob=0.484]


Validation: Loss=1.2358, Chosen=0.6818, Rejected=0.7506, BT_Prob=0.4873
Sample reward: 2.9444


Epoch 9 [Train]: 100%|██████████| 85/85 [08:28<00:00,  5.98s/it, loss=0.0726, BT_prob=0.995]


Epoch 9: Loss=0.1363, Chosen=2.4275, Rejected=-2.2135, BT_Prob=0.9874


Epoch 9 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.12s/it, val_loss=0.969, val_BT_prob=0.466]


Validation: Loss=1.3025, Chosen=0.5825, Rejected=0.6022, BT_Prob=0.4989
Sample reward: 2.4487


Epoch 10 [Train]: 100%|██████████| 85/85 [08:43<00:00,  6.16s/it, loss=0.0708, BT_prob=0.997] 


Epoch 10: Loss=0.1315, Chosen=2.5865, Rejected=-2.5667, BT_Prob=0.9909


Epoch 10 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=0.999, val_BT_prob=0.463]


Validation: Loss=1.3497, Chosen=0.4353, Rejected=0.4329, BT_Prob=0.5046
Sample reward: 2.4292


Epoch 11 [Train]: 100%|██████████| 85/85 [08:23<00:00,  5.92s/it, loss=0.139, BT_prob=0.998] 


Epoch 11: Loss=0.1300, Chosen=2.6636, Rejected=-2.8663, BT_Prob=0.9950


Epoch 11 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.12s/it, val_loss=1.02, val_BT_prob=0.462]


Validation: Loss=1.3982, Chosen=0.4481, Rejected=0.4188, BT_Prob=0.5111
Sample reward: 2.5181


Epoch 12 [Train]: 100%|██████████| 85/85 [08:42<00:00,  6.15s/it, loss=0.00154, BT_prob=0.999]


Epoch 12: Loss=0.1288, Chosen=2.8099, Rejected=-3.0371, BT_Prob=0.9945


Epoch 12 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=1.04, val_BT_prob=0.454] 


Validation: Loss=1.4293, Chosen=0.5051, Rejected=0.4546, BT_Prob=0.5155
Sample reward: 2.5939


Epoch 13 [Train]: 100%|██████████| 85/85 [08:25<00:00,  5.95s/it, loss=0.14, BT_prob=0.997]  


Epoch 13: Loss=0.1283, Chosen=2.9789, Rejected=-3.1109, BT_Prob=0.9965


Epoch 13 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=1.05, val_BT_prob=0.456]


Validation: Loss=1.4583, Chosen=0.5206, Rejected=0.4538, BT_Prob=0.5190
Sample reward: 2.6190


Epoch 14 [Train]: 100%|██████████| 85/85 [08:33<00:00,  6.04s/it, loss=0.209, BT_prob=0.995]  


Epoch 14: Loss=0.1279, Chosen=3.0600, Rejected=-3.2501, BT_Prob=0.9968


Epoch 14 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.12s/it, val_loss=1.06, val_BT_prob=0.451]


Validation: Loss=1.4846, Chosen=0.5201, Rejected=0.4399, BT_Prob=0.5214
Sample reward: 2.6117


Epoch 15 [Train]: 100%|██████████| 85/85 [08:37<00:00,  6.09s/it, loss=0.347, BT_prob=0.982] 


Epoch 15: Loss=0.1278, Chosen=3.1557, Rejected=-3.3393, BT_Prob=0.9974


Epoch 15 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.10s/it, val_loss=1.07, val_BT_prob=0.452]


Validation: Loss=1.5129, Chosen=0.5711, Rejected=0.4836, BT_Prob=0.5232
Sample reward: 2.7057


Epoch 16 [Train]: 100%|██████████| 85/85 [08:36<00:00,  6.08s/it, loss=0.139, BT_prob=0.998] 


Epoch 16: Loss=0.1277, Chosen=3.2278, Rejected=-3.4362, BT_Prob=0.9978


Epoch 16 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=1.08, val_BT_prob=0.448]


Validation: Loss=1.5357, Chosen=0.5223, Rejected=0.4258, BT_Prob=0.5248
Sample reward: 2.6612


Epoch 17 [Train]: 100%|██████████| 85/85 [08:38<00:00,  6.11s/it, loss=0.208, BT_prob=0.997] 


Epoch 17: Loss=0.1276, Chosen=3.2523, Rejected=-3.5706, BT_Prob=0.9980


Epoch 17 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=1.1, val_BT_prob=0.441] 


Validation: Loss=1.5571, Chosen=0.4806, Rejected=0.3771, BT_Prob=0.5257
Sample reward: 2.6792


Epoch 18 [Train]: 100%|██████████| 85/85 [08:20<00:00,  5.89s/it, loss=0.0696, BT_prob=1]   


Epoch 18: Loss=0.1275, Chosen=3.2808, Rejected=-3.6829, BT_Prob=0.9981


Epoch 18 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.11s/it, val_loss=1.11, val_BT_prob=0.443]


Validation: Loss=1.5751, Chosen=0.4884, Rejected=0.3746, BT_Prob=0.5277
Sample reward: 2.7087


Epoch 19 [Train]: 100%|██████████| 85/85 [08:16<00:00,  5.84s/it, loss=0.347, BT_prob=0.985] 


Epoch 19: Loss=0.1275, Chosen=3.3835, Rejected=-3.7153, BT_Prob=0.9985


Epoch 19 [Val]: 100%|██████████| 5/5 [00:20<00:00,  4.12s/it, val_loss=1.12, val_BT_prob=0.438]

Validation: Loss=1.5912, Chosen=0.4863, Rejected=0.3613, BT_Prob=0.5296
Sample reward: 2.7306





In [23]:
save_model(reward_model, "RLHF_reward_model.pth")

Pretrained model has been saved successfully at d:\software_3\Generative_models\Text_models\chat_gpt2\gpt_models\RLHF_reward_model.pth


In [9]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model_path = "gpt_models\\RLHF_reward_model.pth"
model.load_state_dict(
    torch.load(
    model_path,
    map_location=torch.device("cpu"),
    weights_only=True
    )
)
#model = setup_reward_model(base_model=model, base_config=BASE_CONFIG, device=device)

<All keys matched successfully>

In [15]:
def evaluate_response(model, tokenizer, prompt, response, device):
    model.eval()
    with torch.no_grad():
        start_tokens = f"{prompt} \n\nResponse: {response}"
        start_tokens = tokenizer.encode(start_tokens)
        start_tensor = torch.tensor(start_tokens).unsqueeze(0).to(device)
        reward = model(start_tensor)
        # reward shape: [1, seq_len, 1], get the last token's reward
        return reward[0, -1, 0].item()

In [25]:
prompt = "Is having your medical records online safe?"
response1 = "You mean being able to share them with your doctor, or making them public to the internet in general?"
response2 = "Hm, I think so! If you have your records online, it makes it easier for you to access them, and also for your doctor to share updates with you, and for you to share updates with your doctor. It also makes it easier for you to share your information with other people, if you want, for example to join a health advocacy or support group. The only risk I see is that if someone else has access to your records, they might abuse that access, or they might accidentally make a change to your records that you don\u2019t want. But overall, I think online access is a good thing."
    
score1 = evaluate_response(model, tokenizer, prompt, response1, device)
score2 = evaluate_response(model, tokenizer, prompt, response2, device)
    
    # Compute Bradley-Terry probability
bt_prob = torch.sigmoid(torch.tensor(score2-score1)).item()
    
print(f"Response 1 score: {score1}")
print(f"Response 2 score: {score2}")
print(f"P(Response 2 > Response 1) = {bt_prob:.4f}")
print(f"Chosen response has higher score: {score2 > score1}")

Response 1 score: -6.01485538482666
Response 2 score: -4.657999515533447
P(Response 2 > Response 1) = 0.7952
Chosen response has higher score: True
