In [None]:
# Installation
# !pip install torch transformers trl peft sentence-transformers numpy pandas tqdm

In [None]:
# Imports and Setup
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import (
    GPT2LMHeadModel, 
    GPT2Tokenizer,
    AutoModelForCausalLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple, Optional
import json
from tqdm import tqdm
from dataclasses import dataclass
import os
from copy import deepcopy

# Device setup: use the GPU when PyTorch can see one, otherwise the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Using device: {device}")

# Load models
print("Loading models...")

# Policy starting point and its tokenizer; the EOS token is reused as the pad token.
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2')
gpt2_model = gpt2_model.to(device)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Reference model: a second pretrained copy, kept in eval mode with gradients
# disabled, used as the anchor for the KL penalty.
ref_model = GPT2LMHeadModel.from_pretrained('gpt2')
ref_model = ref_model.to(device).eval()
ref_model.requires_grad_(False)

# Sentence encoder whose embeddings feed the reward model.
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
print(" Models loaded")


In [None]:
# CELL 3: Reward Model 
class RewardModel(nn.Module):
    """
    Reward model for RLHF.

    A small MLP (Linear -> ReLU -> Dropout per hidden layer, then a linear
    scalar output without activation) that maps a text embedding to a reward.
    """

    def __init__(
        self,
        input_dim: int = 768,
        hidden_dims: Optional[List[int]] = None,
        dropout: float = 0.2
    ):
        """
        Args:
            input_dim: Size of the input embedding.
            hidden_dims: Hidden layer widths; defaults to [256, 128, 64].
            dropout: Dropout probability applied after every hidden layer.
        """
        super().__init__()

        # Avoid a shared mutable default argument.
        if hidden_dims is None:
            hidden_dims = [256, 128, 64]

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim

        # Output layer (scalar reward, no activation)
        layers.append(nn.Linear(prev_dim, 1))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """
        Args:
            x: Embeddings (batch_size, embedding_dim)

        Returns:
            Rewards (batch_size,)
        """
        return self.network(x).squeeze(-1)

    def get_reward(self, texts: List[str], embedding_model) -> np.ndarray:
        """
        Score texts with the reward model.

        Args:
            texts: Texts to score.
            embedding_model: Object exposing ``encode(texts)`` that returns one
                embedding row per text.

        Returns:
            NumPy array of shape (len(texts),) with one reward per text.

        Notes:
            Inference runs in eval mode (dropout off); the previous
            train/eval mode is restored afterwards so that scoring does not
            silently disable dropout for later training.
        """
        if len(texts) == 0:
            return np.zeros(0, dtype=np.float32)

        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                embeddings = embedding_model.encode(texts)
                # Follow the module's own device instead of a notebook global.
                target_device = next(self.parameters()).device
                embeddings_tensor = torch.as_tensor(
                    embeddings, dtype=torch.float32, device=target_device
                )
                rewards = self.forward(embeddings_tensor)
        finally:
            self.train(was_training)

        return rewards.cpu().numpy()

# Initialize reward model
# Untrained instance with the default hidden layers. The name `reward_model` is
# rebound later in this notebook to the model returned by
# train_reward_model_from_feedback().
reward_model = RewardModel(input_dim=768).to(device)
print(" Reward model initialized")

In [None]:
# Load or Train Reward Model
def train_reward_model_from_feedback(
    feedback_file: str = 'data/logs/feedback.jsonl',
    embedding_model = None,
    num_epochs: int = 50,
    lr: float = 0.001
) -> RewardModel:
    """
    Train reward model from logged feedback.

    The feedback file is read as JSON Lines. Only entries whose `event_type`
    is 'select', 'rate' or 'reject' and that carry `data.text` are used.
    When fewer than 10 usable entries exist (or the file is missing), a small
    built-in synthetic data set is used instead.

    Args:
        feedback_file: Path to feedback logs
        embedding_model: Sentence transformer (required; must expose `encode`)
        num_epochs: Training epochs (full-batch gradient steps)
        lr: Learning rate

    Returns:
        Trained reward model (weights after the last epoch), in eval mode.

    Raises:
        ValueError: If `embedding_model` is not provided.
    """
    if embedding_model is None:
        raise ValueError("embedding_model is required to embed the feedback texts")

    print(f"Training reward model from {feedback_file}...")

    # Load feedback, tolerating blank or corrupt lines and incomplete entries
    # so that one bad log record does not abort training.
    feedback_data = []
    if os.path.exists(feedback_file):
        with open(feedback_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    print(f"Skipping malformed feedback line {line_no}")
                    continue
                if not isinstance(entry, dict):
                    continue
                if entry.get('event_type') not in ('select', 'rate', 'reject'):
                    continue
                data = entry.get('data')
                if not isinstance(data, dict) or 'text' not in data:
                    continue
                feedback_data.append(entry)

    if len(feedback_data) < 10:
        print("Not enough feedback data, using synthetic data...")
        # Create synthetic data
        positive_texts = [
            "The gentle breeze whispers through ancient trees",
            "Moonlight dances on the tranquil lake",
            "Mountains stand tall in majestic silence",
            "Stars shimmer across the velvet night sky",
            "Rivers flow endlessly toward distant shores"
        ]
        negative_texts = [
            "The weather is okay today",
            "There are trees and stuff",
            "Water is wet and blue",
            "It's night and there are stars",
            "Mountains are big rocks"
        ]

        texts = positive_texts + negative_texts
        ratings = [0.9] * len(positive_texts) + [0.2] * len(negative_texts)
    else:
        texts = [e['data']['text'] for e in feedback_data]
        ratings = []
        for e in feedback_data:
            data = e['data']
            if 'rating' in data:
                # An explicit rating always wins over the event-type default.
                ratings.append(data['rating'])
            elif e['event_type'] == 'select':
                ratings.append(data.get('score', 0.8))
            elif e['event_type'] == 'reject':
                ratings.append(0.2)
            else:
                ratings.append(0.5)

    # Compute embeddings
    embeddings = embedding_model.encode(texts)
    X = torch.as_tensor(embeddings, dtype=torch.float32, device=device)
    y = torch.tensor(ratings, dtype=torch.float32, device=device)

    # Create reward model sized to the embedding width
    model = RewardModel(input_dim=X.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Train (full batch; the reported loss is measured with dropout active)
    print(f"Training on {len(texts)} samples...")
    best_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        predictions = model(X)
        loss = criterion(predictions, y)

        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs} - Loss: {loss.item():.4f}")

        best_loss = min(best_loss, loss.item())

    print(f"✓ Reward model trained (best loss: {best_loss:.4f})")
    # Hand the model back ready for inference.
    model.eval()
    return model

# Train reward model (falls back to synthetic data when the log is too small)
reward_model = train_reward_model_from_feedback(embedding_model=embedding_model, num_epochs=50)

# Test reward model: each vivid line is followed by a plain counterpart.
print("\n--- Testing Reward Model ---")
test_texts = [
    "The majestic mountain rises above the clouds",
    "There is a mountain",
    "Silver moonlight illuminates the quiet forest",
    "The moon is in the sky",
]

rewards = reward_model.get_reward(test_texts, embedding_model)
for reward, text in zip(rewards, test_texts):
    print(f"Reward: {reward:.3f} | {text}")

In [None]:
# PPO Components - Value Function
class ValueHead(nn.Module):
    """
    PPO critic: a two-layer MLP mapping each hidden-state vector to a scalar
    value estimate V(s).
    """

    def __init__(self, input_dim: int = 768):
        super().__init__()
        # Layers are built in the same order they appear in the Sequential.
        projection = nn.Linear(input_dim, 256)
        activation = nn.ReLU()
        readout = nn.Linear(256, 1)
        self.value_head = nn.Sequential(projection, activation, readout)

    def forward(self, hidden_states):
        """
        Args:
            hidden_states: (batch, seq_len, hidden_dim)

        Returns:
            values: (batch, seq_len)
        """
        per_position = self.value_head(hidden_states)
        # Drop the trailing singleton produced by the scalar readout.
        return per_position.squeeze(-1)

# Create value function
# Notebook-level critic instance. The small-scale training cell and
# RLHFPoetryGenerator each construct their own ValueHead instead of using this one.
value_model = ValueHead(input_dim=768).to(device)
print(" Value function initialized")

In [None]:
# PPO Data Structures
@dataclass
class PPOExperience:
    """Single experience for PPO training."""
    # Prompt text the response was generated from.
    query: str
    # Decoded response text (prompt excluded).
    response: str
    # Generated token ids for the response only (prompt tokens sliced off).
    response_tokens: torch.Tensor
    # Log-probability recorded at rollout time, one entry per generated token.
    log_probs: torch.Tensor
    # Value-head estimate recorded at rollout time, one entry per generated token.
    values: torch.Tensor
    # Scalar reward for the whole sequence, produced by the reward model.
    reward: float
    # Filled in by PPOMemory.compute_advantages(); None until then.
    advantages: Optional[torch.Tensor] = None
    # Discounted reward-to-go per token, filled in by PPOMemory.compute_advantages().
    returns: Optional[torch.Tensor] = None

class PPOMemory:
    """Experience buffer for PPO: stores rollouts and annotates them with GAE."""

    def __init__(self):
        self.experiences = []

    def add(self, experience: "PPOExperience"):
        """Append one rollout to the buffer."""
        self.experiences.append(experience)

    def compute_advantages(self, gamma: float = 0.99, lam: float = 0.95):
        """
        Compute GAE advantages and discounted returns for all experiences.

        The reward is treated as terminal: it is credited only at the last
        generated token, every earlier step has zero immediate reward, and the
        value after the last token is taken to be zero.

        Args:
            gamma: Discount factor.
            lam: GAE lambda.

        Side effects:
            Sets `advantages` and `returns` (both of shape (seq_len,)) on each
            stored experience.
        """
        for exp in self.experiences:
            values = exp.values.reshape(-1)
            seq_len = values.shape[0]
            # Plain Python float so NumPy scalars (e.g. np.float32 from the
            # reward model) never take part in tensor arithmetic.
            reward = float(exp.reward)

            # Allocate next to the values instead of on a notebook-global device.
            rewards_to_go = torch.zeros(seq_len, device=values.device)
            advantages = torch.zeros(seq_len, device=values.device)

            running_return = reward
            running_advantage = 0.0

            for t in reversed(range(seq_len)):
                # TD error: r_t + gamma * V(s_{t+1}) - V(s_t)
                if t == seq_len - 1:
                    td_error = reward - values[t]
                else:
                    td_error = gamma * values[t + 1] - values[t]

                # GAE recursion
                running_advantage = td_error + gamma * lam * running_advantage
                advantages[t] = running_advantage

                # Discounted reward-to-go of the single terminal reward
                rewards_to_go[t] = running_return
                running_return = gamma * running_return

            exp.advantages = advantages
            exp.returns = rewards_to_go

    def get_batch(self):
        """Get all experiences as batch."""
        return self.experiences

    def clear(self):
        """Clear memory."""
        self.experiences = []

print(" PPO memory and experience structures defined")

In [None]:
# PPO Rollout - Generate and Collect
def generate_with_values(
    model: GPT2LMHeadModel,
    value_model: ValueHead,
    tokenizer: GPT2Tokenizer,
    prompts: List[str],
    max_length: int = 40,
    temperature: float = 0.9,
    top_k: int = 50
) -> List[Tuple[str, torch.Tensor, torch.Tensor, torch.Tensor]]:
    """
    Generate responses and collect log probs and values.

    Tokens are sampled one at a time with temperature + top-k sampling. The
    log-probability stored for each sampled token is taken from the policy's
    full-vocabulary softmax over the raw logits (no temperature, no top-k
    renormalisation), i.e. the same quantity the PPO update recomputes, so the
    importance ratio is 1 before the first gradient step.

    Args:
        model: Policy model (GPT-2)
        value_model: Value function
        tokenizer: Tokenizer
        prompts: List of prompts
        max_length: Max number of tokens to generate (must be >= 1)
        temperature: Sampling temperature (must be > 0)
        top_k: Top-k sampling; clamped to the vocabulary size, and values <= 0
            disable top-k filtering

    Returns:
        List of (response_text, response_tokens, log_probs, values); the three
        tensors are 1-D with one entry per generated token.

    Raises:
        ValueError: If `max_length` < 1 or `temperature` <= 0.
    """
    if max_length < 1:
        raise ValueError("max_length must be at least 1")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    model.eval()
    value_model.eval()

    model_device = next(model.parameters()).device
    results = []

    for prompt in prompts:
        # Encode prompt
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model_device)
        prompt_len = input_ids.shape[1]

        # Storage
        all_log_probs = []
        all_values = []

        # Generate token by token (full re-encode each step; no KV cache)
        generated = input_ids

        with torch.no_grad():
            for step in range(max_length):
                # Forward pass
                outputs = model(generated, output_hidden_states=True, return_dict=True)
                hidden_states = outputs.hidden_states[-1]  # Last layer
                raw_logits = outputs.logits[:, -1, :]  # Next token logits

                # Value of the current state (before the next token is chosen)
                value = value_model(hidden_states)[:, -1]

                # Sample next token from the tempered top-k distribution
                vocab_size = raw_logits.size(-1)
                k = vocab_size if top_k is None or top_k <= 0 else min(top_k, vocab_size)
                top_k_logits, top_k_indices = torch.topk(raw_logits / temperature, k, dim=-1)
                probs = F.softmax(top_k_logits, dim=-1)
                next_token_idx = torch.multinomial(probs, num_samples=1)
                next_token = top_k_indices.gather(-1, next_token_idx)

                # Log prob of the sampled token under the untempered,
                # full-vocabulary policy distribution.
                token_log_prob = F.log_softmax(raw_logits, dim=-1).gather(-1, next_token)

                # Store
                all_log_probs.append(token_log_prob.squeeze())
                all_values.append(value.squeeze())

                # Append token
                generated = torch.cat([generated, next_token], dim=-1)

                # Check for EOS
                if next_token.item() == tokenizer.eos_token_id:
                    break

        # Decode response (excluding prompt)
        response_tokens = generated[0, prompt_len:]
        response_text = tokenizer.decode(response_tokens, skip_special_tokens=True)

        # Stack tensors
        log_probs_tensor = torch.stack(all_log_probs)
        values_tensor = torch.stack(all_values)

        results.append((response_text, response_tokens, log_probs_tensor, values_tensor))

    return results

print("Rollout generation function defined")

In [None]:

# PPO Training Step
def ppo_training_step(
    model: GPT2LMHeadModel,
    value_model: ValueHead,
    ref_model: GPT2LMHeadModel,
    memory: PPOMemory,
    optimizer_policy: torch.optim.Optimizer,
    optimizer_value: torch.optim.Optimizer,
    clip_eps: float = 0.2,
    kl_coef: float = 0.1,
    value_coef: float = 0.5,
    entropy_coef: float = 0.01,
    num_epochs: int = 4
) -> Dict[str, float]:
    """
    Perform PPO update on collected experiences.

    One optimizer step is taken per experience per epoch. Queries are
    re-tokenized with the notebook-level `gpt2_tokenizer`, and tensors are
    placed on the notebook-level `device`.

    Args:
        model: Policy model to train
        value_model: Value function to train
        ref_model: Reference model (frozen)
        memory: Experience buffer
        optimizer_policy: Optimizer for policy
        optimizer_value: Optimizer for value
        clip_eps: PPO clipping epsilon
        kl_coef: KL penalty coefficient
        value_coef: Value loss coefficient
        entropy_coef: Entropy bonus coefficient
        num_epochs: PPO epochs per batch

    Returns:
        Dictionary of training metrics ('policy_loss', 'value_loss',
        'kl_divergence'), averaged over the updates performed; all zeros when
        no update was performed.

    Notes:
        Assumes `exp.log_probs` were computed under the same full-vocabulary
        softmax that is recomputed here; otherwise the importance ratio is
        biased from the first step.
    """
    model.train()
    value_model.train()

    # Compute advantages
    memory.compute_advantages()

    experiences = memory.get_batch()

    # Everything that stays constant across PPO epochs is computed once per
    # experience: token tensors, the frozen reference log-probs and the
    # normalised advantages.
    prepared = []
    for exp in experiences:
        response_len = exp.response_tokens.shape[0]
        if response_len == 0:
            # Nothing was generated; there is no token to score.
            continue

        # Reconstruct full input (query + response)
        query_tokens = gpt2_tokenizer.encode(exp.query, return_tensors='pt').to(device)
        full_tokens = torch.cat([query_tokens, exp.response_tokens.unsqueeze(0)], dim=1)
        token_index = exp.response_tokens.unsqueeze(1)

        with torch.no_grad():
            ref_logits = ref_model(full_tokens).logits
            ref_response_logits = ref_logits[0, -response_len-1:-1, :]
            ref_token_log_probs = F.log_softmax(ref_response_logits, dim=-1).gather(
                1, token_index
            ).squeeze(-1)

        advantages = exp.advantages.detach().reshape(-1)
        # std() of a single element is NaN, which would poison the weights;
        # a one-token response keeps its raw advantage.
        if advantages.numel() > 1:
            advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)

        prepared.append({
            'full_tokens': full_tokens,
            'token_index': token_index,
            'response_len': response_len,
            'old_log_probs': exp.log_probs.detach().reshape(-1),
            'ref_log_probs': ref_token_log_probs,
            'advantages': advantages,
            'returns': exp.returns.detach().reshape(-1),
        })

    total_policy_loss = 0.0
    total_value_loss = 0.0
    total_kl = 0.0
    num_updates = 0

    for epoch in range(num_epochs):
        for item in prepared:
            response_len = item['response_len']

            # Forward pass through policy
            outputs = model(item['full_tokens'], output_hidden_states=True, return_dict=True)
            hidden_states = outputs.hidden_states[-1]

            # Logits at position i predict token i+1, hence the shift by one.
            response_logits = outputs.logits[0, -response_len-1:-1, :]
            log_probs_new = F.log_softmax(response_logits, dim=-1)

            # Gather log probs for actual tokens; squeeze(-1) keeps shape (T,)
            # even when T == 1.
            token_log_probs_new = log_probs_new.gather(1, item['token_index']).squeeze(-1)

            # Clipped surrogate loss
            ratio = torch.exp(token_log_probs_new - item['old_log_probs'])
            advantages = item['advantages']
            surr1 = ratio * advantages
            surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
            policy_loss = -torch.min(surr1, surr2).mean()

            # KL penalty with reference model (sample-based estimate on the
            # taken tokens; it can be negative)
            kl_penalty = (token_log_probs_new - item['ref_log_probs']).mean()

            # Entropy bonus (encourage exploration)
            entropy = -(log_probs_new.exp() * log_probs_new).sum(dim=-1).mean()

            # Value loss on the same positions the rollout values came from
            response_hidden = hidden_states[0, -response_len-1:-1, :]
            values_new = value_model(response_hidden.unsqueeze(0)).squeeze(0)
            value_loss = F.mse_loss(values_new, item['returns'])

            # Total loss
            total_loss = (
                policy_loss +
                kl_coef * kl_penalty -
                entropy_coef * entropy +
                value_coef * value_loss
            )

            # Backward
            optimizer_policy.zero_grad()
            optimizer_value.zero_grad()
            total_loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            torch.nn.utils.clip_grad_norm_(value_model.parameters(), 1.0)

            optimizer_policy.step()
            optimizer_value.step()

            # Accumulate metrics
            total_policy_loss += policy_loss.item()
            total_value_loss += value_loss.item()
            total_kl += kl_penalty.item()
            num_updates += 1

    # Avoid dividing by zero when the buffer was empty or num_epochs == 0.
    n = max(num_updates, 1)
    return {
        'policy_loss': total_policy_loss / n,
        'value_loss': total_value_loss / n,
        'kl_divergence': total_kl / n
    }

print(" PPO training step defined")

In [None]:
# RLHF Training Loop
def train_rlhf(
    model: GPT2LMHeadModel,
    value_model: ValueHead,
    ref_model: GPT2LMHeadModel,
    reward_model: RewardModel,
    embedding_model,
    tokenizer: GPT2Tokenizer,
    prompts: List[str],
    num_iterations: int = 10,
    batch_size: int = 4,
    lr_policy: float = 1e-5,
    lr_value: float = 1e-4
):
    """
    Main RLHF training loop with PPO.

    Each iteration samples up to `batch_size` distinct prompts, rolls out one
    response per prompt, scores "prompt + ' ' + response" with the reward
    model, and runs one PPO update on the collected experiences. A checkpoint
    is written to models/rlhf/ every 5 iterations.

    Args:
        model: Policy model to train (updated in place)
        value_model: Value function (updated in place)
        ref_model: Reference model (frozen)
        reward_model: Reward model
        embedding_model: For computing rewards
        tokenizer: Tokenizer
        prompts: Training prompts (must be non-empty)
        num_iterations: Number of training iterations
        batch_size: Prompts per iteration (must be >= 1)
        lr_policy: Learning rate for policy
        lr_value: Learning rate for value

    Raises:
        ValueError: If `prompts` is empty or `batch_size` < 1.
    """
    # Fail early with a clear message instead of an empty rollout later on.
    if len(prompts) == 0:
        raise ValueError("prompts must contain at least one prompt")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Optimizers
    optimizer_policy = torch.optim.Adam(model.parameters(), lr=lr_policy)
    optimizer_value = torch.optim.Adam(value_model.parameters(), lr=lr_value)

    print(f"\nStarting RLHF training for {num_iterations} iterations")
    print(f"Batch size: {batch_size}, Policy LR: {lr_policy}, Value LR: {lr_value}")
    print("=" * 60)

    for iteration in range(num_iterations):
        print(f"\n--- Iteration {iteration + 1}/{num_iterations} ---")

        # Sample prompts without replacement
        batch_prompts = np.random.choice(
            prompts, size=min(batch_size, len(prompts)), replace=False
        ).tolist()

        # Rollout: generate responses
        print("Generating responses...")
        rollout_results = generate_with_values(
            model, value_model, tokenizer,
            batch_prompts,
            max_length=30
        )

        # Create memory
        memory = PPOMemory()

        # Compute rewards and store experiences
        print("Computing rewards...")
        for i, (prompt, (response, resp_tokens, log_probs, values)) in enumerate(
            zip(batch_prompts, rollout_results), start=1
        ):
            full_text = prompt + " " + response
            # get_reward returns a NumPy array; store a plain float as the
            # PPOExperience.reward annotation promises.
            reward = float(reward_model.get_reward([full_text], embedding_model)[0])

            experience = PPOExperience(
                query=prompt,
                response=response,
                response_tokens=resp_tokens,
                log_probs=log_probs,
                values=values,
                reward=reward
            )
            memory.add(experience)

            print(f"  [{i}] Reward: {reward:.3f} | {prompt} → {response[:50]}...")

        # PPO update
        print("Performing PPO update...")
        metrics = ppo_training_step(
            model, value_model, ref_model, memory,
            optimizer_policy, optimizer_value,
            clip_eps=0.2,
            kl_coef=0.1,
            num_epochs=4
        )

        print(f"  Policy Loss: {metrics['policy_loss']:.4f}")
        print(f"  Value Loss: {metrics['value_loss']:.4f}")
        print(f"  KL Divergence: {metrics['kl_divergence']:.4f}")

        # Save checkpoint
        if (iteration + 1) % 5 == 0:
            os.makedirs('models/rlhf', exist_ok=True)
            torch.save({
                'iteration': iteration,
                'model_state_dict': model.state_dict(),
                'value_model_state_dict': value_model.state_dict(),
                'optimizer_policy': optimizer_policy.state_dict(),
                'optimizer_value': optimizer_value.state_dict(),
            }, f'models/rlhf/checkpoint_iter_{iteration+1}.pt')
            print(f"  Saved checkpoint")

    print("RLHF training complete!")

In [None]:
# Test RLHF Training (Small Scale)
print("TESTING RLHF TRAINING (SMALL SCALE)")

# Prompts sampled during the PPO iterations
training_prompts = [
    "The mountain",
    "A gentle breeze",
    "The moonlight",
    "In autumn",
    "The ocean waves",
    "A quiet moment",
    "The starlight",
    "Through the forest",
]

# Train on a copy so the original GPT-2 stays available for comparison,
# paired with a freshly initialised value head.
trainable_model = deepcopy(gpt2_model)
trainable_value = ValueHead(input_dim=768)
trainable_value = trainable_value.to(device)

# Run RLHF for a few iterations
print("\nTraining RLHF model...")
rlhf_run_config = dict(
    num_iterations=5,
    batch_size=3,
    lr_policy=1e-5,
    lr_value=1e-4,
)
train_rlhf(
    model=trainable_model,
    value_model=trainable_value,
    ref_model=ref_model,
    reward_model=reward_model,
    embedding_model=embedding_model,
    tokenizer=gpt2_tokenizer,
    prompts=training_prompts,
    **rlhf_run_config
)

In [None]:
#Compare Before/After RLHF
print("COMPARING BEFORE/AFTER RLHF")

test_prompts = ["The forest", "Moonlight falls", "A river flows"]


def _sample_and_score(language_model, prompt_list):
    """Sample one continuation per prompt and print it with its reward-model score."""
    for prompt in prompt_list:
        with torch.no_grad():
            input_ids = gpt2_tokenizer.encode(prompt, return_tensors='pt').to(device)
            sampling_options = dict(
                max_length=input_ids.shape[1] + 30,
                do_sample=True,
                top_k=50,
                temperature=0.9,
                pad_token_id=gpt2_tokenizer.eos_token_id,
            )
            output = language_model.generate(input_ids, **sampling_options)
            text = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)
            reward = reward_model.get_reward([text], embedding_model)[0]
            print(f"\n[Reward: {reward:.3f}] {text}")


print("\nBEFORE RLHF (Original GPT-2) ")
_sample_and_score(gpt2_model, test_prompts)

print("\n AFTER RLHF (Fine-tuned) ")
trainable_model.eval()
_sample_and_score(trainable_model, test_prompts)


In [None]:
#  RLHF with User Feedback Integration
class RLHFPoetryGenerator:
    """
    RLHF-based poetry generator with reciprocal learning.

    Holds a trainable policy copy and a frozen reference copy of the base
    model, plus its own value head and reward model. User ratings are
    collected in memory and used to refit the reward model and then to run a
    short PPO fine-tuning of the policy.
    """

    def __init__(
        self,
        base_model: GPT2LMHeadModel,
        tokenizer: GPT2Tokenizer,
        embedding_model,
        user_id: str
    ):
        """
        Args:
            base_model: Pretrained language model; it is deep-copied and never
                modified by this class.
            tokenizer: Tokenizer matching `base_model`.
            embedding_model: Encoder used to embed texts for the reward model.
            user_id: Identifier of the user this generator adapts to.
        """
        self.base_model = base_model
        self.policy_model = deepcopy(base_model)

        # Frozen anchor for the KL penalty.
        self.ref_model = deepcopy(base_model)
        self.ref_model.eval()
        self.ref_model.requires_grad_(False)

        self.value_model = ValueHead().to(device)
        self.reward_model = RewardModel().to(device)

        self.tokenizer = tokenizer
        self.embedding_model = embedding_model
        self.user_id = user_id

        self.feedback_history = []

        print(f"✓ RLHF Poetry Generator initialized for user {user_id}")

    def generate(self, prompt: str, max_length: int = 40) -> str:
        """
        Generate poetry using current policy.

        Args:
            prompt: Text to continue.
            max_length: Maximum number of new tokens beyond the prompt.

        Returns:
            Decoded text including the prompt.
        """
        self.policy_model.eval()

        with torch.no_grad():
            input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(device)
            output = self.policy_model.generate(
                input_ids,
                max_length=input_ids.shape[1] + max_length,
                do_sample=True,
                top_k=50,
                temperature=0.9,
                pad_token_id=self.tokenizer.eos_token_id
            )
            text = self.tokenizer.decode(output[0], skip_special_tokens=True)

        return text

    def add_feedback(self, text: str, rating: float):
        """
        Add user feedback.

        Args:
            text: The rated text.
            rating: Numeric rating; coerced to float so it can be used as a
                regression target later.
        """
        rating = float(rating)
        self.feedback_history.append({'text': text, 'rating': rating})
        print(f"✓ Added feedback: rating={rating:.2f}")

    def update_from_feedback(
        self,
        prompts: Optional[List[str]] = None,
        min_feedback: int = 5
    ):
        """
        Update reward model and retrain policy.

        Args:
            prompts: Prompts used for the PPO rollouts; defaults to a small
                built-in set of nature prompts.
            min_feedback: Minimum number of feedback examples required before
                an update is attempted.

        Returns:
            None. Does nothing (apart from printing a notice) when fewer than
            `min_feedback` examples have been collected.
        """
        if len(self.feedback_history) < min_feedback:
            print(f"Need at least {min_feedback} feedback examples to update")
            return

        print(f"\nUpdating from {len(self.feedback_history)} feedback examples...")

        # Retrain reward model
        texts = [f['text'] for f in self.feedback_history]
        ratings = [f['rating'] for f in self.feedback_history]

        embeddings = self.embedding_model.encode(texts)
        X = torch.as_tensor(embeddings, dtype=torch.float32, device=device)
        y = torch.tensor(ratings, dtype=torch.float32, device=device)

        optimizer = torch.optim.Adam(self.reward_model.parameters(), lr=0.001)
        criterion = nn.MSELoss()

        # Scoring calls may have left the reward model in eval mode; switch
        # back explicitly so every refit trains under the same dropout regime.
        self.reward_model.train()
        for epoch in range(50):
            optimizer.zero_grad()
            preds = self.reward_model(X)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()

        print(f"✓ Reward model updated (final loss: {loss.item():.4f})")

        # Retrain policy with RLHF
        if prompts is None:
            prompts = ["The mountain", "A gentle breeze", "The moonlight", "In autumn"]
        train_rlhf(
            model=self.policy_model,
            value_model=self.value_model,
            ref_model=self.ref_model,
            reward_model=self.reward_model,
            embedding_model=self.embedding_model,
            tokenizer=self.tokenizer,
            prompts=prompts,
            num_iterations=3,
            batch_size=2,
            lr_policy=1e-5,
            lr_value=1e-4
        )

        print("Policy updated with RLHF")

In [None]:
# Test RLHF Poetry Generator
# The generator deep-copies `gpt2_model` for its policy and reference models,
# so the notebook-level `gpt2_model` is not trained by the cells below.
print("\nTesting RLHF Poetry Generator ")
rlhf_gen = RLHFPoetryGenerator(
    base_model=gpt2_model,
    tokenizer=gpt2_tokenizer,
    embedding_model=embedding_model,
    user_id='bob'
)

In [None]:
# Generate initial poems (baseline samples before any feedback is applied)
print("\n Initial Generation ")
prompts = ["The stars", "A whisper", "The dawn"]
initial_poems = []
for prompt in prompts:
    initial_poems.append(rlhf_gen.generate(prompt, max_length=30))
    print(f"{prompt} → {initial_poems[-1]}")

In [None]:
# Simulate user feedback
print("\nSimulating User Feedback ")
# User likes poetic, flowing language
liked_examples = [
    ("The gentle breeze whispers through ancient trees", 0.95),
    ("Moonlight dances on the tranquil lake", 0.90),
    ("Mountains stand tall in majestic silence", 0.88),
    ("Stars shimmer across the velvet night", 0.92),
    ("Rivers flow endlessly toward distant shores", 0.89),
]
for liked_text, liked_rating in liked_examples:
    rlhf_gen.add_feedback(liked_text, liked_rating)


In [None]:
# User dislikes plain descriptions
disliked_examples = [
    ("The weather is okay today", 0.15),
    ("There are trees and stuff", 0.10),
    ("Water is wet and blue", 0.12),
]
for disliked_text, disliked_rating in disliked_examples:
    rlhf_gen.add_feedback(disliked_text, disliked_rating)


In [None]:
# Update models: refit the reward model on the collected ratings, then run PPO
rlhf_gen.update_from_feedback()

# Generate after feedback, using the same prompts as the initial generation
print("\n--- After Feedback Update ---")
for prompt in prompts:
    print(f"{prompt} → {rlhf_gen.generate(prompt, max_length=30)}")

In [None]:
# RLHF Metrics and Evaluation
def evaluate_rlhf_model(
    model: GPT2LMHeadModel,
    ref_model: GPT2LMHeadModel,
    reward_model: RewardModel,
    embedding_model,
    tokenizer: GPT2Tokenizer,
    test_prompts: List[str],
    num_samples: int = 5
) -> Dict[str, float]:
    """
    Evaluate RLHF-trained model.

    For every prompt, `num_samples` continuations are sampled from `model`.
    Each sample is scored by the reward model, and the KL divergence
    KL(policy || reference) is computed over the full vocabulary at every
    generated position and averaged over those positions. Both distributions
    come from a plain forward pass over the sampled sequence (raw logits, not
    the temperature/top-k processed sampling scores, whose -inf entries would
    make the divergence infinite).

    Args:
        model: Trained policy model
        ref_model: Reference model
        reward_model: Reward model
        embedding_model: For computing rewards
        tokenizer: Tokenizer
        test_prompts: Test prompts
        num_samples: Samples per prompt

    Returns:
        Dictionary of metrics as plain Python numbers (JSON-serialisable):
        'mean_reward', 'std_reward', 'mean_kl', 'std_kl', 'diversity'
        (unique / total whitespace-separated lowercase words) and
        'num_samples'. Means and stds are NaN when nothing was sampled.
    """
    model.eval()

    all_rewards = []
    all_kl_divs = []
    all_texts = []

    for prompt in test_prompts:
        for _ in range(num_samples):
            with torch.no_grad():
                input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)
                prompt_len = input_ids.shape[1]

                # Sample a continuation from the policy
                sequences = model.generate(
                    input_ids,
                    max_length=prompt_len + 30,
                    do_sample=True,
                    top_k=50,
                    temperature=0.9,
                    pad_token_id=tokenizer.eos_token_id
                )

                text = tokenizer.decode(sequences[0], skip_special_tokens=True)
                all_texts.append(text)

                # Compute reward
                reward = float(reward_model.get_reward([text], embedding_model)[0])
                all_rewards.append(reward)

                # KL divergence on the generated positions only. Logits at
                # position i predict token i+1, so the distributions for the
                # response tokens live at [prompt_len - 1, total_len - 1).
                if sequences.shape[1] > prompt_len:
                    policy_logits = model(sequences, return_dict=True).logits
                    ref_logits = ref_model(sequences, return_dict=True).logits

                    policy_log_probs = F.log_softmax(
                        policy_logits[:, prompt_len - 1:-1, :], dim=-1
                    )
                    ref_log_probs = F.log_softmax(
                        ref_logits[:, prompt_len - 1:-1, :], dim=-1
                    )

                    token_kl = (
                        policy_log_probs.exp() * (policy_log_probs - ref_log_probs)
                    ).sum(dim=-1)
                    all_kl_divs.append(token_kl.mean().item())

    # Lexical diversity: distinct words over total words across all samples
    all_words = []
    for text in all_texts:
        all_words.extend(text.lower().split())

    total_words = len(all_words)
    diversity = len(set(all_words)) / total_words if total_words > 0 else 0

    def _mean(values):
        return float(np.mean(values)) if values else float('nan')

    def _std(values):
        return float(np.std(values)) if values else float('nan')

    return {
        'mean_reward': _mean(all_rewards),
        'std_reward': _std(all_rewards),
        'mean_kl': _mean(all_kl_divs),
        'std_kl': _std(all_kl_divs),
        'diversity': diversity,
        'num_samples': len(all_texts)
    }

In [None]:
# Evaluate trained model
test_prompts_eval = ["The mountain", "A gentle breeze", "The moonlight", "In autumn", "The ocean"]

# Both runs share everything except the model under test.
shared_eval_kwargs = dict(
    ref_model=ref_model,
    reward_model=reward_model,
    embedding_model=embedding_model,
    tokenizer=gpt2_tokenizer,
    test_prompts=test_prompts_eval,
    num_samples=3,
)

metrics_before = evaluate_rlhf_model(model=gpt2_model, **shared_eval_kwargs)

metrics_after = evaluate_rlhf_model(model=trainable_model, **shared_eval_kwargs)

In [None]:
# Dump both metric sets, then the after-minus-before deltas.
for heading, metric_set in (
    ("\n BEFORE RLHF ", metrics_before),
    ("\n AFTER RLHF ", metrics_after),
):
    print(heading)
    for key, value in metric_set.items():
        print(f"  {key:15s}: {value:.4f}")

print("\n IMPROVEMENT ")
print(f"  Reward :  {metrics_after['mean_reward'] - metrics_before['mean_reward']:+.4f}")
print(f"  KL :      {metrics_after['mean_kl'] - metrics_before['mean_kl']:+.4f}")
print(f"  Diversity : {metrics_after['diversity'] - metrics_before['diversity']:+.4f}")

In [None]:
#Save RLHF Models
print("SAVING RLHF MODELS")

os.makedirs('models/rlhf', exist_ok=True)

# Policy weights and tokenizer files share one Hugging Face-style directory.
for artifact in (trainable_model, gpt2_tokenizer):
    artifact.save_pretrained('models/rlhf/policy_model')

# Value head and reward model are plain nn.Modules: store their state dicts.
for module, destination in (
    (trainable_value, 'models/rlhf/value_model.pt'),
    (reward_model, 'models/rlhf/reward_model.pt'),
):
    torch.save({'model_state_dict': module.state_dict()}, destination)

print(" Saved policy model to models/rlhf/policy_model/")
print(" Saved value model to models/rlhf/value_model.pt")
print(" Saved reward model to models/rlhf/reward_model.pt")

In [None]:
# Save experiment results
def _json_default(obj):
    """Convert NumPy scalars/arrays to plain Python values for json.dump.

    The standard encoder rejects values such as np.float32, which the metric
    dictionaries may contain.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


experiment_results = {
    'metrics_before': metrics_before,
    'metrics_after': metrics_after,
    # Mirrors the arguments of the small-scale train_rlhf run in this notebook.
    'training_config': {
        'num_iterations': 5,
        'batch_size': 3,
        'lr_policy': 1e-5,
        'lr_value': 1e-4,
        'clip_eps': 0.2,
        'kl_coef': 0.1
    }
}

os.makedirs('outputs/rlhf', exist_ok=True)
with open('outputs/rlhf/experiment_results.json', 'w') as f:
    json.dump(experiment_results, f, indent=2, default=_json_default)
