In [None]:
# 🔧 Setup: Run this cell first!
# Check GPU availability, report library versions, and seed the random number generators

import torch
import sys

# Check GPU
# Pick the compute device once; later cells reuse `device`.
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"✅ GPU available: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (in bytes); `total_mem` does not exist.
    print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    device = torch.device('cpu')
    print("⚠️ No GPU detected. Some cells may run slowly.")
    print("   Go to Runtime → Change runtime type → GPU")

# Report interpreter and framework versions for reproducibility.
print(f"\n📦 Python {sys.version.split()[0]}")
print(f"🔥 PyTorch {torch.__version__}")

# Set random seeds for reproducibility
import random
import numpy as np

# One seed shared by every random number generator the notebook touches.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)
# Seed every visible CUDA device as well when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"🎲 Random seed set to {SEED}")

%matplotlib inline

# Case Study: Personalizing an Enterprise AI Coding Assistant with Conversation-Based RL

## Implementation Notebook — NexaCode Technologies

In this notebook, you will build a simplified but realistic version of NexaCode's developer personalization pipeline. You will implement session-aware rollout collection, a Process Reward Model with majority voting, GRPO-TCR training, and evaluate personalization on real conversation data.

**Estimated time:** 75-90 minutes on a T4 GPU.

## 0. Setup and Environment

In [None]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter, deque
from typing import List, Tuple, Dict, Optional
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Seed everything
torch.manual_seed(42)
np.random.seed(42)

# Select the compute device used by every model and tensor below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The device-properties attribute is `total_memory` (in bytes); `total_mem` does not exist.
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
!pip install -q datasets transformers accelerate

## 1. Data Acquisition — Loading Anthropic HH-RLHF

NexaCode's developers generate conversations with the AI assistant every day. For this case study, we use the Anthropic HH-RLHF dataset, which contains 170,000+ conversations with chosen (preferred) and rejected responses -- the same structure as NexaCode's correction data.

Each conversation has a "chosen" path (the response the human preferred) and a "rejected" path (the response that was worse). We will use the human's follow-up messages as next-state signals, just as NexaCode's system uses developer corrections.

In [None]:
from datasets import load_dataset

# Small slices of the train and test splits keep the notebook within T4 memory.
dataset, val_dataset = (
    load_dataset("Anthropic/hh-rlhf", split=split_spec)
    for split_spec in ("train[:5000]", "test[:500]")
)

print(f"Training samples: {len(dataset)}\nValidation samples: {len(val_dataset)}")
# Preview the first 500 characters of the first preferred ("chosen") conversation.
print("\nSample conversation (chosen):")
print(dataset[0]['chosen'][:500])

### TODO 1: Parse Conversations into Training Triples

The raw HH-RLHF data is formatted as alternating "Human:" and "Assistant:" turns. Parse each conversation into structured (context, response, feedback) triples.

In [None]:
def parse_conversation(raw_text: str) -> List[Dict[str, str]]:
    """
    Parse a raw HH-RLHF conversation string into a list of turns.

    The raw format looks like (newlines shown escaped):
        \\n\\nHuman: What is Python?\\n\\nAssistant: Python is a programming language...
        \\n\\nHuman: Can you show me an example?\\n\\nAssistant: Sure, here is...

    Args:
        raw_text: Raw conversation string from the dataset

    Returns:
        List of dicts with keys 'role' ('user' or 'assistant') and 'content',
        in conversation order. Until the TODO below is completed this is
        always an empty list.

    Example:
        >>> parse_conversation("\\n\\nHuman: Hi\\n\\nAssistant: Hello!")
        [{'role': 'user', 'content': 'Hi'}, {'role': 'assistant', 'content': 'Hello!'}]
    """
    # ============ TODO ============
    # Step 1: Split the raw text at the "\n\nHuman:" and "\n\nAssistant:" turn markers
    # Step 2: For each segment, determine if it is a Human or Assistant turn
    # Step 3: Strip whitespace and create the turn dict
    #         (map "Human" -> role 'user', "Assistant" -> role 'assistant')
    # Step 4: Return the list of turns, filtering out empty turns
    # Hint: Splitting on "\n\n" leaves segments that start with "Human:" or "Assistant:".
    #       A segment with neither prefix is a continuation paragraph of the previous
    #       turn, not a new turn.
    # ==============================

    turns = []  # YOUR CODE HERE

    return turns


def extract_training_triples(turns: List[Dict]) -> List[Dict]:
    """
    Extract (context, response, feedback) triples from parsed turns.

    For each assistant turn that is followed by a user turn, create a triple:
    - context: all turns before the assistant response
    - response: the assistant's response text
    - feedback: the user's next message (next-state signal)

    A final assistant turn with no following user turn produces no triple.

    Args:
        turns: List of parsed turn dicts (keys 'role' and 'content')

    Returns:
        List of training triple dicts with keys:
        'context' (list of turn dicts), 'response' (str), 'feedback' (str).
        Until the TODO below is completed this is always an empty list.
    """
    # ============ TODO ============
    # Step 1: Iterate through turns
    # Step 2: For each assistant turn at index i, check that i+1 is in range and
    #         that turns[i+1] is a user turn
    # Step 3: If so, create a triple with context=turns[:i],
    #         response=turns[i]['content'], feedback=turns[i+1]['content']
    #         (response and feedback are the message strings, not the turn dicts)
    # Step 4: Return all triples
    # ==============================

    triples = []  # YOUR CODE HERE

    return triples

In [None]:
# Verification: a five-turn conversation with one correction and one approval.
sample_text = (
    "\n\nHuman: Write a sorting function in Python"
    "\n\nAssistant: Here is a JavaScript sorting function: function sort(arr) { return arr.sort(); }"
    "\n\nHuman: No, I asked for Python not JavaScript."
    "\n\nAssistant: Sorry! Here is the Python version: def sort_list(lst): return sorted(lst)"
    "\n\nHuman: Perfect, thanks!"
)

# The parser must recover all five turns, starting with the user.
turns = parse_conversation(sample_text)
assert len(turns) == 5, f"Expected 5 turns, got {len(turns)}"
assert turns[0]['role'] == 'user', f"First turn should be user, got {turns[0]['role']}"
assert turns[1]['role'] == 'assistant', "Second turn should be assistant"

# Each assistant turn followed by a user turn yields one triple.
triples = extract_training_triples(turns)
assert len(triples) == 2, f"Expected 2 triples, got {len(triples)}"
assert "Python not JavaScript" in triples[0]['feedback'], "First feedback should contain the correction"
print("Parser verification passed.")

## 2. Exploratory Data Analysis

Let us understand the structure of the feedback signals in our data.

In [None]:
# Turn every "chosen" conversation into triples, tallying the ones that fail to parse.
all_triples = []
parse_errors = 0

for sample in dataset:
    try:
        all_triples += extract_training_triples(parse_conversation(sample['chosen']))
    except Exception:
        parse_errors += 1

print(f"Parsed {len(dataset)} conversations")
print(f"Extracted {len(all_triples)} training triples")
print(f"Parse errors: {parse_errors}")

### TODO 2: Analyze Feedback Signal Distribution

Generate three visualizations that characterize the training data.

In [None]:
def analyze_feedback_signals(triples: List[Dict]) -> Dict:
    """
    Analyze the distribution of feedback signals in the training data.

    Produce the following analysis:
    1. Distribution of feedback lengths (in words)
    2. Distribution of response lengths (in words)
    3. Classify each feedback as 'positive', 'negative', or 'neutral' using
       simple keyword heuristics (positive: "thanks", "great", "perfect", etc.;
       negative: "no", "wrong", "not", "incorrect", etc.)

    Args:
        triples: List of training triple dicts (keys 'context', 'response', 'feedback')

    Returns:
        Dict with keys:
        - 'feedback_lengths': list of int (word counts)
        - 'response_lengths': list of int (word counts)
        - 'sentiments': list of str ('positive', 'negative', 'neutral')
        - 'sentiment_counts': dict mapping sentiment to count

        Until the TODO below is completed this is an empty dict.
    """
    # ============ TODO ============
    # Step 1: Compute word counts for feedback and response
    # Step 2: Classify each feedback using keyword matching
    #         Hint: match whole words (e.g. compare against feedback.lower().split()).
    #         A substring test for "no" also fires on "know", "not" fires on
    #         "nothing", and "correct" fires on "incorrect".
    # Step 3: Count sentiment distribution
    # Step 4: Return the analysis dict
    # ==============================

    analysis = {}  # YOUR CODE HERE

    return analysis


# Run analysis and generate 3 visualizations
analysis = analyze_feedback_signals(all_triples)

# One row of three panels; axes[0], axes[1], axes[2] hold plots 1-3 respectively.
fig, axes = plt.subplots(1, 3, figsize=(16, 4))

# Plot 1: Feedback length distribution   (analysis['feedback_lengths'])
# Plot 2: Response length distribution   (analysis['response_lengths'])
# Plot 3: Sentiment distribution bar chart (analysis['sentiment_counts'])
# Give every panel a title and axis labels so it can be read on its own.

# YOUR VISUALIZATION CODE HERE

plt.tight_layout()
plt.show()

Answer these questions based on your EDA:
1. What is the median feedback length? What does this suggest about the richness of the next-state signal?
2. What fraction of feedback is corrective (negative)? Is this enough to train on?
3. Are there any outliers in response length that might cause issues with the overlong reward shaping?

## 3. Baseline — Prompt Injection

Before building the RL pipeline, let us establish a baseline using NexaCode's current approach: prepending recent corrections to the system prompt.

In [None]:
class PromptInjectionBaseline:
    """
    NexaCode's current baseline: cache the last N corrections and
    inject them into the system prompt.

    Attributes:
        correction_cache: Bounded deque of the most recent correction strings;
            the oldest entry is dropped once `max_corrections` is exceeded.
    """
    # Whole words that mark a feedback message as approving.
    POSITIVE_WORDS = frozenset({'thanks', 'great', 'perfect', 'good', 'correct', 'yes'})
    # Punctuation trimmed from both ends of each whitespace-separated token.
    _EDGE_PUNCTUATION = '.,!?;:"\'()[]'

    def __init__(self, max_corrections: int = 10):
        self.correction_cache = deque(maxlen=max_corrections)

    def add_correction(self, correction: str):
        """Add a developer correction to the cache."""
        self.correction_cache.append(correction)

    def build_prompt(self, user_query: str) -> str:
        """
        Build a prompt with injected corrections.

        Returns `user_query` unchanged when the cache is empty; otherwise
        prefixes it with a "DEVELOPER PREFERENCES:" bullet list and a blank line.
        """
        if not self.correction_cache:
            return user_query
        bullet_lines = "".join(f"- {c}\n" for c in self.correction_cache)
        return "DEVELOPER PREFERENCES:\n" + bullet_lines + "\n" + user_query

    def _is_positive(self, feedback: str) -> bool:
        """Return True when the feedback contains a positive keyword as a whole word."""
        # Whole-word matching: a substring test would treat "incorrect" as
        # containing "correct" and "eyes" as containing "yes".
        tokens = {tok.strip(self._EDGE_PUNCTUATION) for tok in feedback.lower().split()}
        return not tokens.isdisjoint(self.POSITIVE_WORDS)

    def evaluate(self, triples: List[Dict]) -> float:
        """
        Evaluate the baseline with a simplified heuristic.

        A triple counts as a success when its feedback contains a positive
        keyword. Any other feedback is treated as a correction and its first
        100 characters are added to the cache, so calling this method mutates
        `correction_cache`.

        Args:
            triples: Triple dicts; only the 'feedback' string is read.

        Returns:
            Fraction of triples with positive feedback (0.0 for empty input).
        """
        corrected = 0
        total = 0
        for triple in triples:
            total += 1
            if self._is_positive(triple['feedback']):
                corrected += 1
            else:
                # Add the correction for future use
                self.add_correction(triple['feedback'][:100])
        return corrected / max(total, 1)

# Score the prompt-injection baseline on the first 200 training triples.
baseline = PromptInjectionBaseline()
baseline_accuracy = baseline.evaluate(all_triples[:200])
print(
    f"Baseline preference accuracy: {baseline_accuracy:.1%}\n"
    f"Corrections cached: {len(baseline.correction_cache)}"
)

## 4. Model Design — Building the Personalization Pipeline

### 4.1 Process Reward Model with Majority Voting

In [None]:
from transformers import AutoTokenizer

# Load the distilgpt2 tokenizer; its vocabulary size sets the embedding size of the models below.
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
# Reuse the end-of-sequence token as the padding token so padded batches can be built.
tokenizer.pad_token = tokenizer.eos_token

### TODO 3: Implement the PRM Scoring Function

In [None]:
class ProcessRewardModel(nn.Module):
    """
    A Process Reward Model that scores (response, feedback) pairs.
    Uses a small transformer encoder with a classification head.

    Architecture:
        - Embedding layer (shared vocab with the policy)
        - 2-layer transformer encoder
        - Mean pooling over sequence
        - Linear classifier: hidden_size -> 3 (for scores -1, 0, +1)

    Args:
        vocab_size: Number of token IDs the embedding table must cover
        hidden_size: Embedding / encoder width
        num_heads: Attention heads per encoder layer
        padding_idx: Optional token ID whose embedding is pinned to zero and
            excluded from gradient updates. Defaults to None (no such row):
            the notebook's tokenizer pads with its EOS token, so hard-coding
            ID 0 would zero out and freeze an ordinary vocabulary token.
    """
    def __init__(self, vocab_size: int, hidden_size: int = 128, num_heads: int = 4,
                 padding_idx: Optional[int] = None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=padding_idx)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=num_heads,
            dim_feedforward=256, batch_first=True, dropout=0.1
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.classifier = nn.Linear(hidden_size, 3)  # -1, 0, +1

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor = None):
        """
        Forward pass. Returns logits for 3 classes.

        Args:
            input_ids: (batch, seq_len) token IDs
            attention_mask: (batch, seq_len) binary mask

        Returns:
            logits: (batch, 3)
        """
        x = self.embedding(input_ids)
        # NOTE(review): the encoder is called without a padding mask, so padded
        # positions still take part in self-attention; only the pooling below
        # excludes them. Consider passing src_key_padding_mask — confirm first.
        x = self.encoder(x)
        # Mean pooling (respecting the mask)
        if attention_mask is not None:
            mask = attention_mask.unsqueeze(-1).float()
            # clamp guards against division by zero for an all-zero mask row
            x = (x * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        else:
            x = x.mean(dim=1)
        return self.classifier(x)

    def majority_vote(self, input_ids, attention_mask=None, num_votes=5):
        """
        Run majority voting over num_votes forward passes with dropout.

        Args:
            input_ids: (batch, seq_len)
            attention_mask: (batch, seq_len)
            num_votes: Number of votes (m)

        Returns:
            rewards: (batch,) tensor with values in {-1, 0, +1}
            all_votes: (num_votes, batch) tensor of individual votes

            Until the TODO below is completed both values are None.
        """
        # ============ TODO ============
        # Step 1: Set model to train mode (enables dropout for diversity)
        # Step 2: Run num_votes forward passes, collecting argmax predictions
        # Step 3: Map class indices to scores: 0->-1, 1->0, 2->+1
        # Step 4: For each sample, take the most common vote (majority)
        # Step 5: Set model back to eval mode
        # Step 6: Return (majority_rewards, all_votes_tensor)
        # Hint: Use collections.Counter for majority voting
        # ==============================

        rewards = None  # YOUR CODE HERE
        all_votes = None  # YOUR CODE HERE

        return rewards, all_votes


# Instantiate the reward model over the tokenizer's vocabulary and move it to the device.
prm = ProcessRewardModel(vocab_size=len(tokenizer))
prm = prm.to(device)
print("PRM parameters: {:,}".format(sum(p.numel() for p in prm.parameters())))

In [None]:
# Verification: four random 20-token sequences with no padding.
test_ids = torch.randint(0, 100, (4, 20)).to(device)
test_mask = torch.ones(4, 20).to(device)
rewards, votes = prm.majority_vote(test_ids, test_mask, num_votes=5)
assert rewards.shape == (4,), f"Expected shape (4,), got {rewards.shape}"
# Every voted reward must be one of the three allowed scores.
assert set(rewards.tolist()) <= {-1, 0, 1}, "Rewards must be in {-1, 0, +1}"
print(f"PRM majority voting verification passed. Rewards: {rewards.tolist()}")

### 4.2 GRPO Advantage Computation and Loss

### TODO 4: Implement the GRPO-TCR Loss Function

In [None]:
def compute_grpo_advantages(rewards: torch.Tensor) -> torch.Tensor:
    """
    Compute group-relative advantages from rewards.

    A_i = (r_i - mean(r)) / std(r)

    Args:
        rewards: (G,) tensor of rewards for G responses to the same prompt.
            Integer tensors (e.g. votes in {-1, 0, +1}) are promoted to float.

    Returns:
        advantages: (G,) floating-point tensor of normalized advantages.
        All zeros when the group has fewer than two responses or when every
        reward is (numerically) identical, since no response is then better
        than the group average.
    """
    # torch cannot take the mean/std of an integer tensor; promote first.
    if not torch.is_floating_point(rewards):
        rewards = rewards.float()
    # The unbiased std of fewer than two samples is NaN, which would slip past
    # the threshold check below and poison the advantages.
    if rewards.numel() < 2:
        return torch.zeros_like(rewards)
    mean = rewards.mean()
    std = rewards.std()
    if std < 1e-8:
        return torch.zeros_like(rewards)
    return (rewards - mean) / std


def overlong_reward(response_length: int, L_max: int = 512, L_cache: int = 100) -> float:
    """
    Soft length penalty for overlong responses.

    Responses up to `L_max - L_cache` tokens are not penalized. Inside the
    transition zone that follows, the penalty falls linearly from 0 to -1,
    and anything longer than `L_max` receives the full -1.

    Args:
        response_length: Token count of the response
        L_max: Maximum allowed length
        L_cache: Size of the penalty transition zone

    Returns:
        Penalty value in [-1, 0]
    """
    penalty_free_limit = L_max - L_cache

    # Short enough: no penalty.
    if response_length <= penalty_free_limit:
        return 0.0
    # Inside the transition zone: linear ramp down to -1 at L_max.
    if response_length <= L_max:
        return (penalty_free_limit - response_length) / L_cache
    # Past the hard limit: full penalty.
    return -1.0


def grpo_tcr_loss(
    log_probs_new: torch.Tensor,
    log_probs_ref: torch.Tensor,
    advantages: torch.Tensor,
    response_lengths: torch.Tensor,
    eps_low: float = 0.2,
    eps_high: float = 0.28,
    beta_kl: float = 0.01,
    L_max: int = 512,
    L_cache: int = 100,
) -> Tuple[torch.Tensor, Dict]:
    """
    Compute the full GRPO-TCR loss.

    Combines:
    - Token-level clipped surrogate loss with clip-higher
    - KL divergence penalty
    - Overlong reward shaping

    Args:
        log_probs_new: (batch, seq_len) log probs under current policy
        log_probs_ref: (batch, seq_len) log probs under reference policy
        advantages: (batch,) group-relative advantages
        response_lengths: (batch,) token counts per response
        eps_low: Lower clip bound offset
        eps_high: Upper clip bound offset (clip-higher)
        beta_kl: KL penalty coefficient
        L_max, L_cache: Overlong shaping parameters

    Returns:
        (loss, metrics_dict) where loss is scalar and metrics_dict contains
        'mean_advantage', 'mean_kl', 'mean_length_penalty', 'mean_ratio'.
        Store the metrics as Python floats (e.g. via .item()) so they can be
        formatted with ':.4f'. Until the TODO below is completed this
        returns (None, {}).
    """
    # ============ TODO ============
    # Step 1: Compute policy ratio: ratio = exp(log_new - log_ref)
    # Step 2: Expand advantages to token level: (batch,) -> (batch, 1)
    # Step 3: Unclipped objective = ratio * advantages
    # Step 4: Clipped ratio = clamp(ratio, 1-eps_low, 1+eps_high)
    # Step 5: Clipped objective = clipped_ratio * advantages
    # Step 6: Surrogate loss = -mean(min(unclipped, clipped))
    # Step 7: KL penalty = beta_kl * mean(log_new - log_ref)
    #         (a simple sample estimate; it can be negative on a finite batch)
    # Step 8: Length penalties: compute overlong_reward for each response
    #         (pass each length as a Python number, e.g. via .item())
    # Step 9: Total loss = surrogate + KL - mean(length_penalties)
    #         overlong_reward returns a reward-side value in [-1, 0], so it is
    #         subtracted: an overlong batch must raise the loss, not lower it.
    #         The term is a constant w.r.t. the policy parameters, so it shifts
    #         the reported loss without changing gradients.
    # Step 10: Collect metrics
    # ==============================

    loss = None  # YOUR CODE HERE
    metrics = {}  # YOUR CODE HERE

    return loss, metrics

In [None]:
# Verification: random log-probs near -2.0 for 8 responses of 20 tokens each.
batch, seq = 8, 20
log_new = torch.randn(batch, seq, device=device) * 0.1 - 2.0
log_ref = torch.randn(batch, seq, device=device) * 0.1 - 2.0
advs = torch.randn(batch, device=device)
# Lengths span the safe zone, the transition zone and the overlong region.
lengths = torch.randint(50, 600, (batch,), device=device).float()

loss, metrics = grpo_tcr_loss(log_new, log_ref, advs, lengths)
assert loss.dim() == 0, f"Loss should be scalar, got shape {loss.shape}"
assert not torch.isnan(loss), "Loss is NaN"
assert 'mean_kl' in metrics, "Metrics should include mean_kl"
print(f"GRPO-TCR loss verification passed. Loss: {loss.item():.4f}")
formatted_metrics = {name: f'{value:.4f}' for name, value in metrics.items()}
print(f"Metrics: {formatted_metrics}")

## 5. Training Strategy

### TODO 5: Implement the Training Loop

In [None]:
class SimplePolicy(nn.Module):
    """
    Simplified policy model for T4 training.

    Token embedding -> 2-layer transformer encoder -> linear head producing
    per-position logits over the vocabulary.

    Args:
        vocab_size: Number of token IDs (embedding rows and output logits)
        hidden_size: Embedding / encoder width
        num_heads: Attention heads per encoder layer
        padding_idx: Optional token ID whose embedding is pinned to zero and
            excluded from gradient updates. Defaults to None (no such row):
            the notebook's tokenizer pads with its EOS token, so hard-coding
            ID 0 would zero out and freeze an ordinary vocabulary token.
    """
    def __init__(self, vocab_size, hidden_size=128, num_heads=4, padding_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=padding_idx)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=hidden_size, nhead=num_heads,
            dim_feedforward=256, batch_first=True, dropout=0.1
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=2)
        self.head = nn.Linear(hidden_size, vocab_size)

    def forward(self, input_ids):
        """Return logits of shape (batch, seq_len, vocab_size) for (batch, seq_len) token IDs."""
        x = self.embedding(input_ids)
        # NOTE(review): no attention mask is passed, so every position attends
        # to all others, including later tokens — confirm this is acceptable
        # for the simplified policy.
        x = self.encoder(x)
        return self.head(x)

    def get_log_probs(self, input_ids, target_ids):
        """
        Return the log-probability assigned to each target token.

        Args:
            input_ids: (batch, seq_len) token IDs fed to the model
            target_ids: (batch, seq_len) token IDs to score at each position

        Returns:
            (batch, seq_len) tensor of log-probabilities
        """
        logits = self.forward(input_ids)
        log_probs = F.log_softmax(logits, dim=-1)
        # Pick, at every position, the log-prob of that position's target token.
        return log_probs.gather(2, target_ids.unsqueeze(-1)).squeeze(-1)


def train_grpo_tcr(
    policy: nn.Module,
    ref_policy: nn.Module,
    prm: ProcessRewardModel,
    train_triples: List[Dict],
    tokenizer,
    num_steps: int = 300,
    batch_size: int = 8,
    lr: float = 1e-4,
    beta_kl: float = 0.01,
) -> Dict[str, List]:
    """
    Train the policy using GRPO-TCR on conversation triples.

    Args:
        policy: The trainable policy model
        ref_policy: Frozen reference policy (same architecture, initial weights)
        prm: Process Reward Model for scoring
        train_triples: List of (context, response, feedback) triples
        tokenizer: Tokenizer for encoding text
        num_steps: Number of training steps
        batch_size: Batch size
        lr: Learning rate
        beta_kl: KL penalty coefficient

    Returns:
        Dict of training metrics lists: 'losses', 'kl_divs', 'mean_rewards',
        'prm_accuracies'. Each list gets one entry per training step; until
        the TODO below is completed all four lists are empty.

    Implementation:
        1. Initialize AdamW optimizer with weight_decay=0.01
        2. For each step:
           a. Sample a batch of triples
           b. Tokenize the response and feedback
           c. Get PRM rewards via majority voting
           d. Compute GRPO advantages from rewards
           e. Get log-probs from policy and ref_policy
           f. Compute GRPO-TCR loss
           g. Backprop and step (with gradient clipping at max_norm=1.0)
           h. Log metrics
        3. Return metrics dict
    """
    # ============ TODO ============
    # Implement the full training loop following the steps above.
    # Log loss, KL divergence, mean reward, and PRM accuracy at each step.
    # Print progress every 50 steps.
    # ==============================

    # Every key promised in the docstring is present from the start, so
    # callers can index the result even before any step has been logged.
    metrics = {
        'losses': [],
        'kl_divs': [],
        'mean_rewards': [],
        'prm_accuracies': [],
    }

    # YOUR CODE HERE

    return metrics

In [None]:
# Initialize models
vocab_size = len(tokenizer)
policy = SimplePolicy(vocab_size).to(device)
ref_policy = SimplePolicy(vocab_size).to(device)
# The reference starts as an exact copy of the policy's initial weights.
ref_policy.load_state_dict(policy.state_dict())
# Freeze the reference: no gradients, and eval mode so dropout is disabled and
# its log-probs are deterministic (a freshly built module starts in train mode).
ref_policy.requires_grad_(False)
ref_policy.eval()

print(f"Policy parameters: {sum(p.numel() for p in policy.parameters()):,}")
print("Starting GRPO-TCR training...")

# Train (reduce num_steps for faster iteration during development)
train_metrics = train_grpo_tcr(
    policy, ref_policy, prm, all_triples,
    tokenizer, num_steps=300, batch_size=8
)

## 6. Evaluation

### TODO 6: Evaluate and Compare Against Baseline

In [None]:
def evaluate_personalization(
    policy: nn.Module,
    ref_policy: nn.Module,
    test_triples: List[Dict],
    tokenizer,
    baseline_accuracy: float,
) -> Dict:
    """
    Evaluate the personalized model against the baseline.

    Compute:
    1. Average log-probability improvement (personalized vs reference)
    2. KL divergence distribution
    3. Simulated correction rate reduction

    Args:
        policy: Trained personalized policy
        ref_policy: Frozen reference policy
        test_triples: Held-out test triples
        tokenizer: Tokenizer
        baseline_accuracy: The prompt injection baseline's accuracy

    Returns:
        Dict with 'preference_accuracy', 'mean_kl', 'correction_rate_reduction',
        'log_prob_improvements'. Until the TODO below is completed this is an
        empty dict.
    """
    # ============ TODO ============
    # Step 1: For each test triple, tokenize the response
    # Step 2: Compute log-probs under both policy and ref_policy
    #         (put both models in eval mode and wrap this in torch.no_grad())
    # Step 3: If policy log-prob > ref log-prob, count as "improved"
    # Step 4: Compute KL divergence
    # Step 5: Estimate correction rate reduction
    # Step 6: Return results dict
    #         (store scalars as Python floats and 'log_prob_improvements' as a list)
    # ==============================

    results = {}  # YOUR CODE HERE

    return results

In [None]:
# Parse validation triples, counting (rather than silently dropping) failures
val_triples = []
val_parse_errors = 0
for sample in val_dataset:
    try:
        val_turns = parse_conversation(sample['chosen'])
        val_triples.extend(extract_training_triples(val_turns))
    except Exception:
        val_parse_errors += 1

print(f"Extracted {len(val_triples)} validation triples")
print(f"Validation parse errors: {val_parse_errors}")

# Evaluate on the first 100 held-out triples.
eval_results = evaluate_personalization(
    policy, ref_policy, val_triples[:100], tokenizer, baseline_accuracy
)

def _plot_with_moving_average(ax, values, color, window):
    """Draw the raw series faintly plus its moving average on `ax`."""
    ax.plot(values, alpha=0.3, color=color)
    if len(values) > window:
        smoothed = np.convolve(values, np.ones(window) / window, mode='valid')
        # 'valid' output k averages steps k .. k+window-1, so anchor each point
        # at the last step of its window instead of shifting the curve left.
        ax.plot(np.arange(window - 1, len(values)), smoothed, linewidth=2, color=color)


# Generate comparison plots
fig, axes = plt.subplots(1, 3, figsize=(16, 4))
window = 20

# Plot 1: Training loss curve
_plot_with_moving_average(axes[0], train_metrics['losses'], 'blue', window)
axes[0].set_xlabel('Training Step')
axes[0].set_ylabel('GRPO-TCR Loss')
axes[0].set_title('Training Convergence')
axes[0].grid(True, alpha=0.3)

# Plot 2: Preference accuracy comparison
# YOUR CODE HERE — bar chart comparing baseline vs RL

# Plot 3: KL divergence over training
_plot_with_moving_average(axes[2], train_metrics['kl_divs'], 'red', window)
axes[2].axhline(y=5.0, color='black', linestyle='--', label='KL threshold')
axes[2].set_xlabel('Training Step')
axes[2].set_ylabel('KL Divergence')
axes[2].set_title('Policy Drift Monitoring')
axes[2].legend()
axes[2].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print(f"\nEvaluation Results:")
for k, v in eval_results.items():
    if isinstance(v, (list, tuple, np.ndarray)):
        # An empty sequence has no mean; report that instead of printing nan.
        mean_text = f"{np.mean(v):.4f}" if len(v) else "n/a"
        print(f"  {k}: {len(v)} values (mean={mean_text})")
    elif isinstance(v, (int, float, np.integer, np.floating)):
        print(f"  {k}: {v:.4f}")
    else:
        # Show any other value as-is rather than dropping it silently.
        print(f"  {k}: {v}")

Write a 200-word analysis: How does the RL-personalized model compare to the baseline? Where does it succeed, and where does it still struggle?

## 7. Error Analysis

### TODO 7: Categorize Failure Modes

In [None]:
def categorize_errors(
    policy: nn.Module,
    test_triples: List[Dict],
    tokenizer,
    num_samples: int = 50,
) -> Dict[str, int]:
    """
    Categorize errors in the personalized model's outputs.

    Error categories:
    - 'style_mismatch': Response has wrong formatting or tone
    - 'content_error': Response contains factually wrong information
    - 'preference_ignored': Response ignores a stated preference
    - 'too_verbose': Response is unnecessarily long
    - 'too_brief': Response is too short to be useful
    - 'correct': Response was good (not an error)

    Args:
        policy: Trained policy model
        test_triples: Test data
        tokenizer: Tokenizer
        num_samples: Number of samples to analyze

    Returns:
        Dict mapping error category to count. Until the TODO below is
        completed this is an empty dict.

    Approach:
        1. For each sample, tokenize and get model output
        2. Compare model output log-probs with reference
        3. Use the feedback text to classify the error type
           (keyword heuristics: "wrong" -> content_error,
            "verbose"/"long" -> too_verbose, "short"/"more" -> too_brief,
            "style"/"format" -> style_mismatch, "prefer"/"want" -> preference_ignored)
        4. Return counts
    """
    # ============ TODO ============
    # Implement error categorization
    # NOTE(review): the heuristics above name no keyword for 'correct';
    # presumably feedback matching none of them falls there — confirm.
    # NOTE(review): step 2 mentions a reference model, but none is passed
    # to this function — confirm where it should come from.
    # ==============================

    error_counts = {}  # YOUR CODE HERE

    return error_counts

In [None]:
errors = categorize_errors(policy, val_triples, tokenizer)
total = sum(errors.values())

# List categories from most to least frequent with their share of the total.
print("Error Distribution:")
ranked_errors = sorted(errors.items(), key=lambda item: item[1], reverse=True)
for category, count in ranked_errors:
    share = 100*count/total
    print(f"  {category}: {count} ({share:.0f}%)")

# Visualize
plt.figure(figsize=(8, 5))
plt.bar(errors.keys(), errors.values(), color='#e74c3c', alpha=0.8)
plt.title('Error Distribution by Category')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

Identify the top 3 failure modes. For each, explain: (a) why the RL system fails here, and (b) what modification to the training pipeline would address it.

## 8. Scalability and Deployment

### TODO 8: Inference Latency Benchmarking

In [None]:
def benchmark_inference(
    model: nn.Module,
    tokenizer,
    num_samples: int = 100,
    seq_lengths: Optional[List[int]] = None,
) -> Dict[str, Dict[str, float]]:
    """
    Benchmark inference latency at various sequence lengths.

    For each sequence length, run num_samples forward passes and
    record p50, p95, p99 latencies in milliseconds.

    Args:
        model: The policy model
        tokenizer: Tokenizer
        num_samples: Number of samples per sequence length
        seq_lengths: List of sequence lengths to test; defaults to
            [32, 64, 128, 256, 512] when omitted or None

    Returns:
        Dict mapping seq_length (str) to latency dict with
        keys 'p50', 'p95', 'p99' (values in milliseconds). Until the TODO
        below is completed this is an empty dict.
    """
    import time
    # A list literal as the default would be one object shared by every call;
    # build a fresh list per call instead.
    if seq_lengths is None:
        seq_lengths = [32, 64, 128, 256, 512]
    # ============ TODO ============
    # Step 1: For each sequence length, generate random input_ids
    # Step 2: Run num_samples forward passes, recording time for each
    # Step 3: Compute p50, p95, p99 from the latency distribution
    # Step 4: Return results
    # Hint: Use torch.cuda.synchronize() before timing for accurate GPU measurement
    # ==============================

    results = {}  # YOUR CODE HERE

    return results

In [None]:
latency_results = benchmark_inference(policy, tokenizer)
print("Inference Latency Benchmark:")
print(f"{'Seq Length':>12} {'p50':>10} {'p95':>10} {'p99':>10}")
print("-" * 45)
# Keys are sequence lengths stored as strings; sort them numerically.
for seq_len, stats in sorted(latency_results.items(), key=lambda x: int(x[0])):
    print(f"{seq_len:>12} {stats['p50']:>8.1f}ms {stats['p95']:>8.1f}ms {stats['p99']:>8.1f}ms")

# all() over no measurements is vacuously True, so an empty benchmark must not
# be reported as meeting the budget.
budget_200ms = bool(latency_results) and all(v['p99'] < 200 for v in latency_results.values())
if latency_results:
    print(f"\nAll p99 latencies within 200ms budget: {'YES' if budget_200ms else 'NO'}")
else:
    print("\nNo latency measurements recorded; the 200ms budget could not be checked.")

## 9. Ethical and Regulatory Analysis

### TODO 9: Ethical Impact Assessment

Write a 300-word ethical impact assessment for NexaCode's personalization system. Address the following three concerns:

1. **Reinforcement of bad practices:** Could the RL system learn to produce insecure code because a developer consistently approves it? What guardrails should be in place?

2. **Privacy of developer conversations:** The system learns from every conversation turn. What data minimization practices should NexaCode implement? How should correction data be handled when a developer leaves the company?

3. **Fairness across experience levels:** Junior developers make more corrections (because they learn from the AI's output). Could the RL system inadvertently optimize differently for junior vs. senior developers, creating a two-tier experience?

For each concern, propose a specific technical mitigation.

In [None]:
# Write your assessment as a multi-line string
# (replace the placeholder text; the check below requires at least 250 words)
ethical_assessment = """
YOUR 300-WORD ASSESSMENT HERE
"""

print(ethical_assessment)
# Word count = number of whitespace-separated tokens in the string.
assert len(ethical_assessment.split()) >= 250, "Assessment should be at least 250 words"
print(f"\nWord count: {len(ethical_assessment.split())}")

## Summary

In this case study notebook, you built a simplified version of NexaCode's developer personalization pipeline:

1. **Data acquisition:** Parsed Anthropic HH-RLHF conversations into training triples with next-state feedback signals.
2. **EDA:** Analyzed feedback distributions and correction patterns.
3. **Baseline:** Implemented and evaluated the prompt injection baseline.
4. **PRM:** Built a Process Reward Model with majority voting.
5. **GRPO-TCR:** Implemented the full loss function with clip-higher and overlong reward shaping.
6. **Training:** Ran an RL training loop on conversation data.
7. **Evaluation:** Compared RL personalization against the baseline.
8. **Error analysis:** Categorized failure modes to guide future improvements.
9. **Deployment:** Benchmarked inference latency for production readiness.

For the production system design (multi-GPU architecture, API design, monitoring, A/B testing, and cost analysis), refer to **Section 4** of the full case study document.

The key insight from this case study: **your everyday conversations with an AI assistant already contain rich training signal.** With the right RL framework (GRPO-TCR for implicit feedback, as implemented here; OPD for explicit corrections, which this notebook does not implement), this signal can be extracted and used to continuously personalize the model — all without manual labeling and without service interruption. Privacy is not automatic, however: it depends on safeguards such as the data-minimization practices you proposed in Section 9.