In [None]:

# !pip install torch transformers sentence-transformers numpy pandas tqdm matplotlib

In [None]:
#  Imports and Setup
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple, Optional
import json
import os
from copy import deepcopy
from datetime import datetime
from collections import Counter
import matplotlib.pyplot as plt

In [None]:
# Device setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Seed the RNGs this notebook samples from (torch for text generation,
# numpy for the simulated user ratings) so Restart & Run All is repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load models
print("Loading base models...")
gpt2_model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the EOS token as the padding token.
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
embedding_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
print("✓ Base models loaded")


In [None]:
#  Components from Previous Notebooks

# We'll use simplified versions of key components

class RewardModel(nn.Module):
    """Reward model from Notebook 7.

    A small MLP that maps a fixed-size text embedding to one scalar reward.
    Each hidden layer is Linear -> ReLU -> Dropout(0.2); the output layer is
    a single Linear unit whose trailing dimension is squeezed away.

    Args:
        input_dim: Size of the input embedding vectors.
        hidden_dims: Width of each hidden layer, in order. Defaults to
            [256, 128, 64] when omitted.
    """
    def __init__(self, input_dim: int = 768, hidden_dims: Optional[List[int]] = None):
        super().__init__()
        # Resolve the default here rather than in the signature so that no
        # list object is shared between instances.
        if hidden_dims is None:
            hidden_dims = [256, 128, 64]
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([nn.Linear(prev_dim, hidden_dim), nn.ReLU(), nn.Dropout(0.2)])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return one reward per input row (the last dimension is squeezed)."""
        return self.network(x).squeeze(-1)

    def get_reward(self, texts: List[str], embedding_model) -> np.ndarray:
        """Score raw texts.

        Args:
            texts: Strings to score.
            embedding_model: Object exposing ``encode(texts)`` that returns
                one embedding per text, of width ``input_dim``.

        Returns:
            1-D numpy array with one reward per text (empty for no texts).

        Note:
            Switches the module to eval mode and leaves it there.
        """
        self.eval()
        if len(texts) == 0:
            return np.empty(0, dtype=np.float32)
        # Run on whatever device the weights live on instead of relying on a
        # notebook-level global.
        own_device = next(self.parameters()).device
        with torch.no_grad():
            embeddings = embedding_model.encode(texts)
            embeddings_tensor = torch.as_tensor(
                embeddings, dtype=torch.float32, device=own_device
            )
            rewards = self.forward(embeddings_tensor)
            return rewards.cpu().numpy()

class BoWAttributeModel:
    """BoW model from Notebook 6.

    Holds the token ids of a bag of target words and scores next-token
    logits by how much probability mass falls on those ids.

    Args:
        word_list: Target words for the attribute.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            that returns a list of token ids.
    """
    def __init__(self, word_list: List[str], tokenizer):
        self.word_list = word_list
        self.tokenizer = tokenizer
        token_ids = set()
        for word in word_list:
            # Words are encoded with a leading space; every sub-token id of a
            # multi-token word is kept.
            token_ids.update(tokenizer.encode(' ' + word, add_special_tokens=False))
        # Sorted so the id list has a stable order across runs.
        self.target_token_ids = sorted(token_ids)

    def compute_loss(self, logits: torch.Tensor) -> torch.Tensor:
        """Negative log of the probability mass on the target tokens.

        Args:
            logits: Tensor whose last dimension is the vocabulary; any number
                of leading dimensions is accepted.

        Returns:
            Scalar tensor: mean over all leading positions of
            ``-log(sum of target-token probabilities + 1e-10)``.
        """
        probs = F.softmax(logits, dim=-1)
        # Index the vocabulary (last) axis explicitly. Indexing axis 1 only
        # works for 2-D input and raises IndexError for (batch, seq, vocab).
        target_probs = probs[..., self.target_token_ids].sum(dim=-1)
        return -torch.log(target_probs + 1e-10).mean()

# Theme word lists: theme name -> target words used to build one
# BoWAttributeModel per theme.
THEME_WORDS = dict(
    nature=['tree', 'forest', 'mountain', 'river', 'sky', 'wind', 'flower', 'leaf'],
    love=['heart', 'passion', 'romance', 'beloved', 'embrace', 'tender', 'devotion'],
    melancholy=['sorrow', 'tears', 'lonely', 'shadow', 'grief', 'darkness', 'silent'],
    ocean=['wave', 'tide', 'sea', 'shore', 'salt', 'deep', 'horizon', 'blue'],
)

print("✓ Component classes defined")

In [None]:
# Hybrid Poetry Generator Architecture
class HybridPoetryGenerator:
    """
    Combined PPLM + RLHF poetry generator for reciprocal learning.
    
    Architecture:
    1. RLHF-tuned base model (improved quality from feedback)
    2. PPLM steering on top (fine-grained control)
    3. Continuous learning from user interactions
    
    Reciprocal Learning:
    - Machine: Learns preferences (RLHF) and adapts generation (PPLM)
    - Human: Learns how to guide the system and refine preferences
    - Both: Co-evolve through dialogue and feedback
    """
    
    def __init__(
        self,
        base_model: GPT2LMHeadModel,
        tokenizer: GPT2Tokenizer,
        embedding_model,
        user_id: str
    ):
        """
        Build the generator state for one user.

        Args:
            base_model: Language model kept as-is and used as the 'Base' policy
            tokenizer: Tokenizer shared by all generation paths
            embedding_model: Text encoder handed to the reward model
            user_id: Identifier stamped on every feedback record
        """
        # Collaborators supplied by the caller
        self.tokenizer = tokenizer
        self.embedding_model = embedding_model
        self.user_id = user_id

        # Three copies of the policy: the untouched original, a copy intended
        # for RLHF tuning, and a frozen reference copy (for KL)
        self.base_model = base_model
        self.rlhf_model = deepcopy(base_model)
        self.ref_model = deepcopy(base_model)
        self.ref_model.eval()
        self.ref_model.requires_grad_(False)

        # Reward model (learns user preferences)
        self.reward_model = RewardModel().to(device)

        # One bag-of-words attribute model per theme, used for PPLM steering
        self.theme_models = {}
        for theme_name, theme_words in THEME_WORDS.items():
            self.theme_models[theme_name] = BoWAttributeModel(theme_words, tokenizer)

        # Logs of generations and of user feedback
        self.interaction_history = []
        self.feedback_data = []

        # Usage counters, all starting at zero
        self.stats = dict.fromkeys(
            ('generations', 'feedback_count', 'rlhf_updates', 'pplm_uses'), 0
        )

        theme_count = len(self.theme_models)
        print(f" Hybrid Poetry Generator initialized for user {user_id}")
        print("  - RLHF model: Ready for fine-tuning")
        print(f"  - PPLM: {theme_count} themes available")
        print("  - Reward model: Initialized")
    
    def generate(
        self,
        prompt: str,
        theme: Optional[str] = None,
        use_rlhf: bool = True,
        use_pplm: bool = False,
        pplm_strength: float = 0.02,
        max_length: int = 40
    ) -> Dict[str, any]:
        """
        Generate poetry with configurable RLHF/PPLM combination.
        
        Args:
            prompt: Starting prompt
            theme: Theme for PPLM steering (if use_pplm=True)
            use_rlhf: Use RLHF-tuned model vs base model
            use_pplm: Apply PPLM steering
            pplm_strength: PPLM step size
            max_length: Max tokens to generate
        
        Returns:
            Dictionary with generated text, metadata, and scores
        """
        self.stats['generations'] += 1
        
        # Select model
        model = self.rlhf_model if use_rlhf else self.base_model
        
        if use_pplm and theme:
            # Generate with PPLM steering
            self.stats['pplm_uses'] += 1
            text = self._generate_with_pplm(
                model, prompt, theme, pplm_strength, max_length
            )
        else:
            # Standard generation
            text = self._generate_standard(model, prompt, max_length)
        
        # Compute reward
        reward = self.reward_model.get_reward([text], self.embedding_model)[0]
        
        # Log interaction
        interaction = {
            'timestamp': datetime.now().isoformat(),
            'prompt': prompt,
            'theme': theme,
            'use_rlhf': use_rlhf,
            'use_pplm': use_pplm,
            'generated_text': text,
            'reward': float(reward)
        }
        self.interaction_history.append(interaction)
        
        return {
            'text': text,
            'reward': float(reward),
            'method': f"{'RLHF' if use_rlhf else 'Base'}{'+PPLM' if use_pplm else ''}",
            'theme': theme
        }
    
    def _generate_standard(
        self,
        model: GPT2LMHeadModel,
        prompt: str,
        max_length: int
    ) -> str:
        """Sample a continuation of `prompt` from `model`, with no steering.

        Uses top-k (k=50) sampling at temperature 0.9 and returns the decoded
        sequence, prompt included, with special tokens stripped.
        """
        model.eval()
        with torch.no_grad():
            prompt_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(device)
            # max_length counts new tokens, so add the prompt length on top
            total_length = prompt_ids.shape[1] + max_length
            sequences = model.generate(
                prompt_ids,
                max_length=total_length,
                do_sample=True,
                top_k=50,
                temperature=0.9,
                pad_token_id=self.tokenizer.eos_token_id
            )
            decoded = self.tokenizer.decode(sequences[0], skip_special_tokens=True)
        return decoded
    
    def _generate_with_pplm(
        self,
        model: GPT2LMHeadModel,
        prompt: str,
        theme: str,
        step_size: float,
        max_length: int
    ) -> str:
        """Generate with PPLM steering (simplified version)."""
        model.eval()
        
        if theme not in self.theme_models:
            theme = 'nature'
        
        bow_model = self.theme_models[theme]
        
        # Encode prompt
        input_ids = self.tokenizer.encode(prompt, return_tensors='pt').to(device)
        generated = input_ids
        past_key_values = None
        
        for step in range(max_length):
            # Forward pass
            with torch.no_grad():
                outputs = model(
                    input_ids=generated[:, -1:] if past_key_values else generated,
                    past_key_values=past_key_values,
                    use_cache=True,
                    return_dict=True
                )
                unmodified_logits = outputs.logits[:, -1, :]
                past_key_values = outputs.past_key_values
            
            # PPLM perturbation (simplified - just modify logits)
            if past_key_values is not None:
                # Compute gradient direction
                with torch.enable_grad():
                    logits_perturb = unmodified_logits.clone().requires_grad_(True)
                    loss = bow_model.compute_loss(logits_perturb.unsqueeze(0))
                    loss.backward()
                    
                    # Apply gradient to logits
                    grad = logits_perturb.grad
                    if grad is not None:
                        modified_logits = unmodified_logits - step_size * grad
                    else:
                        modified_logits = unmodified_logits
            else:
                modified_logits = unmodified_logits
            
            # Sample
            probs = F.softmax(modified_logits / 0.9, dim=-1)
            top_k_probs, top_k_indices = torch.topk(probs, 50, dim=-1)
            top_k_probs = top_k_probs / top_k_probs.sum(dim=-1, keepdim=True)
            next_token_idx = torch.multinomial(top_k_probs, num_samples=1)
            next_token = top_k_indices.gather(-1, next_token_idx)
            
            generated = torch.cat([generated, next_token], dim=-1)
            
            if next_token.item() == self.tokenizer.eos_token_id:
                break
        
        text = self.tokenizer.decode(generated[0], skip_special_tokens=True)
        return text
    
    def add_feedback(self, text: str, rating: float, feedback_type: str = 'rating'):
        """
        Add user feedback for reciprocal learning.
        
        Args:
            text: Generated text that was rated
            rating: User rating (0-1)
            feedback_type: 'rating', 'accept', 'reject', 'edit'
        """
        self.stats['feedback_count'] += 1
        
        feedback = {
            'timestamp': datetime.now().isoformat(),
            'text': text,
            'rating': rating,
            'type': feedback_type,
            'user_id': self.user_id
        }
        self.feedback_data.append(feedback)
        
        print(f"✓ Feedback added: {feedback_type} = {rating:.2f}")
        
        # Auto-update if we have enough new feedback
        if len(self.feedback_data) >= 10 and self.stats['feedback_count'] % 10 == 0:
            print("  → Triggering automatic model update...")
            self.update_from_feedback()
    
    def update_from_feedback(self, num_rlhf_iterations: int = 3):
        """
        Update models from accumulated feedback (reciprocal learning step).
        
        This is where the machine learns from the human.
        """
        if len(self.feedback_data) < 5:
            print("Need at least 5 feedback examples")
            return
        
        print(f"RECIPROCAL LEARNING UPDATE")
        print(f"Learning from {len(self.feedback_data)} feedback examples...")
        
        # Step 1: Update reward model
        print("\n1. Updating reward model (learning preferences)...")
        texts = [f['text'] for f in self.feedback_data]
        ratings = [f['rating'] for f in self.feedback_data]
        
        embeddings = self.embedding_model.encode(texts)
        X = torch.FloatTensor(embeddings).to(device)
        y = torch.FloatTensor(ratings).to(device)
        
        optimizer = torch.optim.Adam(self.reward_model.parameters(), lr=0.001)
        criterion = nn.MSELoss()
        
        for epoch in range(50):
            optimizer.zero_grad()
            preds = self.reward_model(X)
            loss = criterion(preds, y)
            loss.backward()
            optimizer.step()
        
        print(f"    Reward model updated (loss: {loss.item():.4f})")
        
        # Step 2: Fine-tune policy with RLHF (simplified)
        print("\n2. Fine-tuning policy with RLHF...")
        print("   (Simplified - in full version would run PPO)")
        self.stats['rlhf_updates'] += 1
        
        # In a full implementation, we would run RLHF training here
        # For this demo, we simulate the improvement
        
        print(f"    Policy updated (iteration {self.stats['rlhf_updates']})")
        
        print(f"\n{'='*60}")
        print(" Reciprocal learning update complete!")
        print(f"{'='*60}\n")
    
    def compare_methods(self, prompts: List[str]) -> pd.DataFrame:
        """
        Compare different generation methods on same prompts.
        Demonstrates machine learning different approaches.
        """
        import pandas as pd
        
        results = []
        
        configs = [
            ('Base', False, False, None),
            ('RLHF Only', True, False, None),
            ('Base+PPLM', False, True, 'nature'),
            ('RLHF+PPLM', True, True, 'nature')
        ]
        
        for prompt in prompts:
            for name, use_rlhf, use_pplm, theme in configs:
                result = self.generate(
                    prompt=prompt,
                    theme=theme,
                    use_rlhf=use_rlhf,
                    use_pplm=use_pplm,
                    max_length=30
                )
                
                results.append({
                    'prompt': prompt,
                    'method': name,
                    'text': result['text'][:80] + '...',
                    'reward': result['reward']
                })
        
        return pd.DataFrame(results)
    
    def get_stats(self) -> Dict:
        """Get generation and learning statistics."""
        return {
            **self.stats,
            'feedback_ratio': self.stats['feedback_count'] / max(self.stats['generations'], 1),
            'avg_reward': np.mean([i['reward'] for i in self.interaction_history]) if self.interaction_history else 0
        }

print("Hybrid Poetry Generator defined")

In [None]:

# Initialize and Test Hybrid System
separator = "=" * 60
print("\n" + separator)
print("TESTING HYBRID POETRY GENERATOR")
print(separator)


def show_result(label: str, result: dict) -> None:
    """Print one generation as '<label>[reward] <first 60 chars>...'."""
    print(f"{label}[{result['reward']:.3f}] {result['text'][:60]}...")


# Create generator
hybrid_gen = HybridPoetryGenerator(
    base_model=gpt2_model,
    tokenizer=gpt2_tokenizer,
    embedding_model=embedding_model,
    user_id='charlie'
)

# Test different configurations
print("\n--- Comparing Generation Methods ---")
test_prompts = ["The mountain", "A gentle breeze", "The moonlight"]

for prompt in test_prompts:
    print(f"\nPrompt: '{prompt}'")

    # Base model
    result_base = hybrid_gen.generate(prompt, use_rlhf=False, use_pplm=False)
    show_result("Base:         ", result_base)

    # RLHF only
    result_rlhf = hybrid_gen.generate(prompt, use_rlhf=True, use_pplm=False)
    show_result("RLHF:         ", result_rlhf)

    # PPLM only
    result_pplm = hybrid_gen.generate(
        prompt, theme='nature', use_rlhf=False, use_pplm=True
    )
    show_result("PPLM:         ", result_pplm)

    # Combined
    result_hybrid = hybrid_gen.generate(
        prompt, theme='nature', use_rlhf=True, use_pplm=True
    )
    show_result("RLHF+PPLM:    ", result_hybrid)

In [None]:
# Simulate Reciprocal Learning Session
print("SIMULATING RECIPROCAL LEARNING SESSION")

print("""
Scenario: User 'Charlie' interacts with the system over time
- Initial: System generates with base model
- User provides feedback on what they like/dislike
- System learns and improves (RLHF update)
- Continues with improved model + PPLM for fine control
- Both human and machine learn together
""")

# Words whose presence makes the simulated user rate a text as "poetic"
POETIC_WORDS = ['gentle', 'shimmer', 'whisper', 'dance', 'silver']


def print_generation(prompt, result):
    """Print a prompt, its generated text and its reward."""
    print(f"\n{prompt} →")
    print(f"  {result['text']}")
    print(f"  [Reward: {result['reward']:.3f}]")


# Session 1: Initial exploration
print("\nSESSION 1: Initial Exploration")
# Five prompts, not four: update_from_feedback() refuses to train on fewer
# than 5 feedback examples, so with four the learning step below was a no-op.
session1_prompts = ["The stars", "A whisper", "The dawn", "Silent night", "The river"]

for prompt in session1_prompts:
    result = hybrid_gen.generate(prompt, use_rlhf=False, use_pplm=False, max_length=30)
    print_generation(prompt, result)

    # Simulate user feedback (user learning what they like)
    # High reward for poetic language, low for plain
    generated_lower = result['text'].lower()
    if any(word in generated_lower for word in POETIC_WORDS):
        rating = np.random.uniform(0.8, 0.95)
        print(f"   User feedback:  {rating:.2f} (likes poetic language)")
    else:
        rating = np.random.uniform(0.2, 0.4)
        print(f"   User feedback:  {rating:.2f} (too plain)")

    hybrid_gen.add_feedback(result['text'], rating)

# Machine learns from feedback
print("\nMACHINE LEARNING from feedback...")
hybrid_gen.update_from_feedback()

# Session 2: After learning
print("\n SESSION 2: After Learning (RLHF Improved) ")
print("Machine has learned user preferences. Generating with RLHF model..")

for prompt in session1_prompts[:2]:  # Test on same prompts
    result = hybrid_gen.generate(prompt, use_rlhf=True, use_pplm=False, max_length=30)
    print_generation(prompt, result)

# Session 3: Fine-grained control with PPLM
print("\n SESSION 3: Fine-grained Control (RLHF + PPLM) ")
print("User wants ocean theme. Using RLHF base + PPLM steering...")

ocean_prompts = ["The waves", "By the shore", "Deep blue"]
for prompt in ocean_prompts:
    result = hybrid_gen.generate(
        prompt,
        theme='ocean',
        use_rlhf=True,
        use_pplm=True,
        pplm_strength=0.03,
        max_length=30
    )
    print_generation(prompt, result)

    # Human learns: PPLM helps steer toward themes
    print(f"   Human learns: PPLM effectively steers toward '{result['theme']}' theme")

In [None]:
# Analyze Learning Progress
print("LEARNING PROGRESS ANALYSIS")

stats = hybrid_gen.get_stats()
print("\n System Statistics ")
for key, value in stats.items():
    print(f"  {key:20s}: {value}")

# Plot reward evolution
import pandas as pd

interaction_df = pd.DataFrame(hybrid_gen.interaction_history)
if len(interaction_df) > 0:
    # The history may not carry a 'method' column; rebuild the label from the
    # logged flags so the column selection below cannot raise KeyError.
    if 'method' not in interaction_df.columns:
        interaction_df['method'] = [
            f"{'RLHF' if rlhf else 'Base'}{'+PPLM' if pplm else ''}"
            for rlhf, pplm in zip(interaction_df['use_rlhf'], interaction_df['use_pplm'])
        ]

    print("\n Reward Evolution ")
    print(interaction_df[['prompt', 'reward', 'method']].head(10))

    # The output directory is otherwise only created by the later save cell,
    # so make sure it exists before writing the figure.
    os.makedirs('outputs/hybrid', exist_ok=True)

    # Plot
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(interaction_df['reward'].values, marker='o', linestyle='-', alpha=0.7)
    ax.set_xlabel('Generation Step')
    ax.set_ylabel('Reward')
    ax.set_title('Reward Evolution Over Generations')
    ax.grid(True, alpha=0.3)
    fig.savefig('outputs/hybrid/reward_evolution.png', dpi=150, bbox_inches='tight')
    print(" Saved plot to outputs/hybrid/reward_evolution.png")


In [None]:
# Save Complete System State
print("SAVING SYSTEM STATE")

for directory in ('outputs/hybrid', 'models/hybrid'):
    os.makedirs(directory, exist_ok=True)

# Save models: the RLHF policy in Hugging Face format, the reward model as a
# checkpoint dict holding its state_dict
hybrid_gen.rlhf_model.save_pretrained('models/hybrid/rlhf_model')
reward_checkpoint = {
    'model_state_dict': hybrid_gen.reward_model.state_dict(),
}
torch.save(reward_checkpoint, 'models/hybrid/reward_model.pt')

# Save interaction history, feedback data and stats as indented JSON
json_artifacts = (
    ('outputs/hybrid/interaction_history.json', hybrid_gen.interaction_history),
    ('outputs/hybrid/feedback_data.json', hybrid_gen.feedback_data),
    ('outputs/hybrid/stats.json', stats),
)
for json_path, payload in json_artifacts:
    with open(json_path, 'w') as f:
        json.dump(payload, f, indent=2)

for message in (
    " Saved RLHF model to models/hybrid/rlhf_model/",
    " Saved reward model to models/hybrid/reward_model.pt",
    " Saved interaction history to outputs/hybrid/interaction_history.json",
    " Saved feedback data to outputs/hybrid/feedback_data.json",
    " Saved statistics to outputs/hybrid/stats.json",
):
    print(message)
