In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

# 1. Load pre-trained model and tokenizer
model_name = "gpt2"  # Start with a smaller model for demonstration
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set pad token

# 2. Define Reward Model for RLHF
class RewardModel(nn.Module):
    def __init__(self, model_name):
        super().__init__()
        # Use the same architecture as the policy model but with a different head
        self.backbone = AutoModelForCausalLM.from_pretrained(model_name).transformer
        self.reward_head = nn.Linear(self.backbone.config.n_embd, 1)
        
    def forward(self, input_ids, attention_mask=None):
        outputs = self.backbone(input_ids, attention_mask=attention_mask)
        last_hidden_states = outputs.last_hidden_state
        # Take the final token representation for reward prediction
        final_state = last_hidden_states[:, -1, :]
        return self.reward_head(final_state)

# 3. PPO Implementation for the Minecraft Environment
class PPOTrainer:
    def __init__(self, policy_model, reward_model, tokenizer, 
                 lr=1e-5, clip_ratio=0.2, value_coef=0.5, entropy_coef=0.01):
        self.policy_model = policy_model
        self.reward_model = reward_model
        self.tokenizer = tokenizer
        self.optimizer = optim.Adam(self.policy_model.parameters(), lr=lr)
        self.clip_ratio = clip_ratio
        self.value_coef = value_coef
        self.entropy_coef = entropy_coef
        
        # Clone policy for old policy in PPO
        self.old_policy = AutoModelForCausalLM.from_pretrained(model_name)
        self.update_old_policy()
        
    def update_old_policy(self):
        """Copy current policy parameters to old policy"""
        self.old_policy.load_state_dict(self.policy_model.state_dict())
    
    def compute_advantages(self, rewards, values, gamma=0.99, lambda_=0.95):
        """Compute GAE (Generalized Advantage Estimation)"""
        advantages = torch.zeros_like(rewards)
        returns = torch.zeros_like(rewards)
        
        last_gae_lam = 0
        for t in reversed(range(len(rewards))):
            # For simplicity, assuming all episodes end at the same time
            next_value = values[t+1] if t < len(rewards)-1 else 0
            delta = rewards[t] + gamma * next_value - values[t]
            last_gae_lam = delta + gamma * lambda_ * last_gae_lam
            advantages[t] = last_gae_lam
            returns[t] = advantages[t] + values[t]
            
        return advantages, returns
    
    def train_step(self, states, actions, rewards, next_states, dones):
        """
        Execute one PPO update step
        - states: tokenized prompts from Minecraft environment
        - actions: tokenized model responses
        - rewards: rewards from the environment or reward model
        """
        # Prepare batched inputs
        states_tensor = torch.tensor(states, dtype=torch.long)
        actions_tensor = torch.tensor(actions, dtype=torch.long)
        
        # Get value estimates from current policy
        with torch.no_grad():
            values = []
            for state in states_tensor:
                # Use a value head or reward model to estimate values
                value = self.reward_model(state.unsqueeze(0)).item()
                values.append(value)
            values = torch.tensor(values)
        
        # Compute advantages and returns
        rewards_tensor = torch.tensor(rewards)
        advantages, returns = self.compute_advantages(rewards_tensor, values)
        
        # Normalize advantages (helps with training stability)
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        
        # Policy and value optimization
        for _ in range(4):  # Multiple epochs of optimization on the same data
            # Get log probs from current and old policy
            curr_log_probs = self.get_log_probs(self.policy_model, states_tensor, actions_tensor)
            old_log_probs = self.get_log_probs(self.old_policy, states_tensor, actions_tensor)
            
            # PPO ratio and clipped objective
            ratio = torch.exp(curr_log_probs - old_log_probs.detach())
            clip_adv = torch.clamp(ratio, 1-self.clip_ratio, 1+self.clip_ratio) * advantages
            policy_loss = -torch.min(ratio * advantages, clip_adv).mean()
            
            # Value loss
            value_loss = F.mse_loss(values, returns)
            
            # Entropy loss for exploration
            entropy = self.compute_entropy(self.policy_model, states_tensor)
            entropy_loss = -entropy.mean()
            
            # Total loss
            loss = policy_loss + self.value_coef * value_loss + self.entropy_coef * entropy_loss
            
            # Optimize
            self.optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.policy_model.parameters(), 0.5)  # Prevent exploding gradients
            self.optimizer.step()
        
        # Update old policy
        self.update_old_policy()
        
        return {
            'policy_loss': policy_loss.item(),
            'value_loss': value_loss.item(),
            'entropy': entropy.mean().item(),
            'total_loss': loss.item()
        }
    
    def get_log_probs(self, model, states, actions):
        """Calculate log probabilities of actions under the policy"""
        # Forward pass through model
        outputs = model(states, labels=actions)
        logits = outputs.logits
        
        # Get log probs for each action token
        log_probs = F.log_softmax(logits[:, :-1, :], dim=-1)
        
        # Gather log probs of the actual actions
        action_log_probs = torch.gather(
            log_probs, 
            2, 
            actions[:, 1:].unsqueeze(-1)
        ).squeeze(-1)
        
        # Return mean log prob per sequence
        return action_log_probs.mean(dim=1)
    
    def compute_entropy(self, model, states):
        """Compute entropy of the policy for exploration"""
        with torch.no_grad():
            outputs = model(states)
            logits = outputs.logits
            probs = F.softmax(logits, dim=-1)
            log_probs = F.log_softmax(logits, dim=-1)
            entropy = -(probs * log_probs).sum(dim=-1).mean(dim=1)
        return entropy

# 4. Minecraft Environment Wrapper (simplified)
class MinecraftEnv:
    def __init__(self, reward_model):
        self.reward_model = reward_model
        # This would integrate with the actual Minecraft API like MineRL
        # (https://minerl.io/) or the Minecraft Python API
        # For demonstration, we'll use a mock environment
        
        # Example tasks in Minecraft
        self.tasks = [
            "Find and mine diamonds",
            "Build a shelter before nightfall",
            "Craft a diamond pickaxe",
            "Defeat the Ender Dragon",
            "Create an automated farm",
        ]
        
        # Example context templates
        self.context_templates = [
            "You are in a forest biome. {}",
            "You are in an underground cave. {}",
            "You are in a village. {}",
            "You are in the Nether. {}",
            "You are on a mountain top. {}"
        ]
    
    def reset(self):
        """Reset environment and return initial observation"""
        # Select random task and context
        task = np.random.choice(self.tasks)
        context = np.random.choice(self.context_templates).format(task)
        self.current_task = task
        self.current_context = context
        return context
    
    def step(self, action):
        """
        Takes an action (model's text response) and returns:
        - next_state: next prompt/context
        - reward: computed by reward model
        - done: whether the episode is complete
        - info: additional information
        """
        # In a real implementation, this would execute the action in Minecraft
        # and return the updated game state
        
        # For demonstration, we'll use the reward model to evaluate the action
        context_tokens = tokenizer.encode(self.current_context, return_tensors="pt")
        action_tokens = tokenizer.encode(action, return_tensors="pt")
        
        # Concatenate context and action for reward model
        full_sequence = torch.cat([context_tokens, action_tokens[:, 1:]], dim=1)
        
        # Calculate reward using the reward model
        with torch.no_grad():
            reward = self.reward_model(full_sequence).item()
        
        # Simple termination condition
        done = len(action.split()) > 50  # End episode if response is too long
        
        # Generate new context based on action (simplified)
        if not done:
            next_context = f"After {action[:20]}..., {self.current_task} still needs to be completed."
        else:
            next_context = self.reset()  # New episode
        
        return next_context, reward, done, {}
    
    def close(self):
        """Close environment resources"""
        pass

# 5. Main training loop
def train(policy_model, reward_model, tokenizer, num_episodes=1000):
    """
    Main RLHF training loop for Minecraft LLM agent
    
    Args:
        policy_model: Language model being fine-tuned
        reward_model: Trained reward model
        tokenizer: Tokenizer for the language model
        num_episodes: Number of training episodes
    
    Returns:
        Trained policy model and tokenizer
    """
    # Initialize trainer and environment
    trainer = PPOTrainer(policy_model, reward_model, tokenizer)
    env = MinecraftEnv(reward_model)
    
    # Training loop
    for episode in tqdm(range(num_episodes)):
        # Reset environment
        context = env.reset()
        done = False
        episode_rewards = []
        states = []
        actions = []
        rewards = []
        next_states = []
        dones = []
        
        # Episode loop
        while not done:
            # Tokenize state
            state_tokens = tokenizer.encode(context, return_tensors="pt")
            states.append(state_tokens.squeeze().tolist())
            
            # Generate action using policy model
            with torch.no_grad():
                action_tokens = policy_model.generate(
                    state_tokens,
                    max_length=state_tokens.shape[1] + 50,
                    temperature=0.7,
                    do_sample=True
                )
            
            # Convert tokens to text
            action_text = tokenizer.decode(action_tokens[0][state_tokens.shape[1]:])
            actions.append(action_tokens.squeeze().tolist())
            
            # Take step in environment
            next_context, reward, done, _ = env.step(action_text)
            rewards.append(reward)
            next_states.append(tokenizer.encode(next_context, return_tensors="pt").squeeze().tolist())
            dones.append(done)
            
            # Update current context
            context = next_context
        
        # Update policy with collected experience
        train_stats = trainer.train_step(states, actions, rewards, next_states, dones)
        
        # Log training progress
        if episode % 10 == 0:
            print(f"Episode {episode}: Avg reward: {np.mean(rewards):.4f}, Loss: {train_stats['total_loss']:.4f}")
            
            # Generate sample response for a test prompt
            test_prompt = "You are in a dangerous cave with zombies. Find diamonds."
            test_tokens = tokenizer.encode(test_prompt, return_tensors="pt")
            with torch.no_grad():
                output = policy_model.generate(
                    test_tokens, 
                    max_length=test_tokens.shape[1] + 100,
                    temperature=0.7
                )
            print(f"Sample response: {tokenizer.decode(output[0][test_tokens.shape[1]:])}\n")
            
    # Save the fine-tuned model
    policy_model.save_pretrained("minecraft_rl_model")
    tokenizer.save_pretrained("minecraft_rl_model")
    
    return policy_model, tokenizer

# 6. Reward Model Training (before RLHF)
def train_reward_model(model_name, preference_dataset, num_epochs=3, batch_size=8, lr=1e-5):
    # Initialize reward model
    reward_model = RewardModel(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    
    # In real implementation, load your preference dataset
    # dataset = load_dataset(preference_dataset)
    
    # For demonstration, create a simple mock dataset
    preferred_responses = [
        ("You are in a forest. Find diamonds.", "I'll look for a cave entrance or dig down carefully to layer 12, where diamonds are most common."),
        ("You are in a cave. Build a shelter.", "I'll secure this area by placing torches and building a wall with a door to keep monsters out.")
    ]
    
    rejected_responses = [
        ("You are in a forest. Find diamonds.", "I'll just start digging straight down."),
        ("You are in a cave. Build a shelter.", "I'll explore deeper into the cave without securing the area first.")
    ]
    
    # Loss function and optimizer
    optimizer = optim.Adam(reward_model.parameters(), lr=lr)
    
    # Training loop
    for epoch in range(num_epochs):
        reward_model.train()
        total_loss = 0
        
        # In real implementation, use DataLoader
        for i in range(len(preferred_responses)):
            # Tokenize preferred and rejected responses
            preferred_prompt, preferred_response = preferred_responses[i]
            rejected_prompt, rejected_response = rejected_responses[i]
            
            preferred_tokens = tokenizer(preferred_prompt + preferred_response, return_tensors="pt", truncation=True, max_length=512)
            rejected_tokens = tokenizer(rejected_prompt + rejected_response, return_tensors="pt", truncation=True, max_length=512)
            
            # Forward pass
            preferred_reward = reward_model(preferred_tokens.input_ids)
            rejected_reward = reward_model(rejected_tokens.input_ids)
            
            # Bradley-Terry loss for preference learning
            loss = -F.logsigmoid(preferred_reward - rejected_reward).mean()
            
            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(preferred_responses):.4f}")
    
    # Save the trained reward model
    torch.save(reward_model.state_dict(), "minecraft_reward_model.pt")
    return reward_model

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [5]:
import os
from kaggle_secrets import UserSecretsClient
os.environ["HF_TOKEN"] = UserSecretsClient().get_secret("HF_TOKEN")

In [6]:
"""
Use Case: Creating a Minecraft Assistant for New Players

This script demonstrates how to use the RLHF-trained model to create
an in-game assistant that helps new Minecraft players survive their first night.
"""

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

# Load our RLHF fine-tuned model
model_path = "minecraft_rl_model"
model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

class MinecraftAssistant:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.context = ""
        self.game_time = "day"  # Track in-game time
        self.inventory = []     # Track player inventory
        
    def update_game_state(self, time_of_day, current_biome, inventory):
        """Update the assistant's knowledge of game state"""
        self.game_time = time_of_day
        self.current_biome = current_biome
        self.inventory = inventory
        
    def get_advice(self, player_situation):
        """Generate contextual advice based on player's situation"""
        # Construct a detailed prompt with game state
        prompt = f"""
        You are in a {self.current_biome} biome. 
        It is currently {self.game_time}.
        Your inventory contains: {', '.join(self.inventory) if self.inventory else 'nothing'}.
        
        Situation: {player_situation}
        
        What should I do next to survive and progress?
        """
        
        # Generate response using our RLHF-trained model
        input_ids = self.tokenizer.encode(prompt, return_tensors="pt")
        
        # Set parameters to control generation
        output = self.model.generate(
            input_ids,
            max_length=300,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )
        
        # Extract only the generated response (not the prompt)
        response = self.tokenizer.decode(output[0][input_ids.shape[1]:], skip_special_tokens=True)
        return response

# Demo scenario: New player's first day
def first_day_scenario():
    """Simulate a new player's first day experience"""
    assistant = MinecraftAssistant(model, tokenizer)
    
    # Morning: Just spawned
    assistant.update_game_state("morning", "forest", [])
    situation = "I just spawned in a new world. I see trees around me and some sheep."
    print("Player:", situation)
    advice = assistant.get_advice(situation)
    print("Assistant:", advice)
    time.sleep(2)  # Simulate time passing
    
    # Midday: Collected some basic resources
    assistant.update_game_state("midday", "forest", ["wooden axe", "12 oak logs", "4 wool"])
    situation = "I've gathered some wood and wool. I can see a cave in the distance."
    print("\nPlayer:", situation)
    advice = assistant.get_advice(situation)
    print("Assistant:", advice)
    time.sleep(2)
    
    # Late afternoon: Need to prepare for night
    assistant.update_game_state("late afternoon", "forest", 
                              ["wooden axe", "8 oak logs", "4 wool", "wooden pickaxe", "12 cobblestone", "3 coal"])
    situation = "The sun is going down and I don't have a shelter yet."
    print("\nPlayer:", situation)
    advice = assistant.get_advice(situation)
    print("Assistant:", advice)
    time.sleep(2)
    
    # Night: Danger!
    assistant.update_game_state("night", "forest edge", 
                              ["wooden axe", "3 oak logs", "wooden pickaxe", "crafting table", "8 cobblestone", "3 coal"])
    situation = "It's dark and I hear zombies nearby! I didn't finish my shelter!"
    print("\nPlayer:", situation)
    advice = assistant.get_advice(situation)
    print("Assistant:", advice)

if __name__ == "__main__":
    print("Running Minecraft Assistant Demo for First Day Survival\n")
    first_day_scenario()

OSError: minecraft_rl_model is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`