# 🧪 GPT-Neo 125M Sensitivity Analysis

**Compare smaller vs larger LLM for reward shaping.**

This notebook tests whether a smaller, faster LLM (GPT-Neo 125M) can achieve
similar reward shaping benefits compared to GPT-Neo 1.3B.

## Research Questions:
- Does model size affect shaping quality?
- Can we get the benefits of LLM shaping with reduced compute?
- What's the latency vs performance tradeoff?

⚠️ **GPU required** for reasonable inference times.

## 1️⃣ Setup

In [None]:
# Install dependencies (Colab)
!pip install stable-baselines3 gymnasium 'shimmy>=1.3.0' transformers torch matplotlib pandas tqdm -q

In [None]:
# Clone and install Overcooked-AI
# The checkout is installed in editable mode (`pip install -e .`) from inside
# the cloned directory; the working directory is restored afterwards.
!git clone https://github.com/HumanCompatibleAI/overcooked_ai.git
%cd overcooked_ai
!pip install -e . -q
%cd ..

In [None]:
import os
import time
import torch
import numpy as np
import pandas as pd
import gymnasium as gym
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from gymnasium import Wrapper
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback
from transformers import AutoModelForCausalLM, AutoTokenizer
from overcooked_ai_py.mdp.overcooked_mdp import OvercookedGridworld
from overcooked_ai_py.mdp.overcooked_env import OvercookedEnv

# Plot styling for every figure in the notebook.
sns.set_theme(style="whitegrid")

# Report the PyTorch build and whether a GPU is visible.
has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Mount Google Drive (Colab)
# OUTPUT_DIR in the configuration cell lives under /content/drive, so the
# drive has to be mounted before any result is written.
from google.colab import drive
drive.mount('/content/drive')

## 2️⃣ Configuration

In [None]:
# ==========================================
# CONFIGURATION
# ==========================================

# Models to compare: display label -> identifier handed to `from_pretrained`
MODELS = {
    "GPT-Neo-125M": "EleutherAI/gpt-neo-125m",
    "GPT-Neo-1.3B": "EleutherAI/gpt-neo-1.3B",
}

# Environment settings: Overcooked layout name and episode horizon
LAYOUT = "asymmetric_advantages"
HORIZON = 400

# Training settings: PPO timesteps per run and the seeds run for each model
TIMESTEPS = 100_000  # Shorter runs for sensitivity analysis
SEEDS = [42, 123, 456]

# LLM shaping: multiplier applied to the LLM bonus, and the device the
# language models are moved to (GPU when available, otherwise CPU)
SHAPING_COEFF = 0.05
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Output: directory for CSVs and figures (created here if missing)
OUTPUT_DIR = "/content/drive/MyDrive/llm_sensitivity"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the effective configuration
print(f"Testing models: {list(MODELS.keys())}")
print(f"Device: {DEVICE}")
print(f"Seeds: {SEEDS}")
print(f"Timesteps: {TIMESTEPS:,}")

## 3️⃣ LLM Loading

In [None]:
def load_llm(model_name, model_path):
    """Fetch a causal language model and its tokenizer, ready for inference.

    Args:
        model_name: Human-readable label; used only in the progress output.
        model_path: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to ``DEVICE``
        and put in eval mode; the tokenizer pads with its end-of-sequence
        token.
    """
    print(f"Loading {model_name}...")

    # Half precision on the GPU, full precision everywhere else.
    if DEVICE == "cuda":
        weights_dtype = torch.float16
    else:
        weights_dtype = torch.float32

    tok = AutoTokenizer.from_pretrained(model_path)
    tok.pad_token = tok.eos_token

    llm = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=weights_dtype)
    llm = llm.to(DEVICE)
    llm.eval()

    # Report the model size in millions of parameters.
    total_params = 0
    for param in llm.parameters():
        total_params += param.numel()
    print(f"  Parameters: {total_params / 1e6:.1f}M")

    return llm, tok

In [None]:
# Load every configured model once and keep it with its tokenizer,
# keyed by display label.
loaded_models = {}
for label, hub_id in MODELS.items():
    llm, tok = load_llm(label, hub_id)
    loaded_models[label] = dict(model=llm, tokenizer=tok)

## 4️⃣ Latency Comparison

In [None]:
def measure_inference_latency(model, tokenizer, n_samples=100):
    """Time single forward passes of ``model`` on a fixed prompt.

    Args:
        model: Callable model exposing a ``device`` attribute; it is invoked
            as ``model(**inputs)``.
        tokenizer: Callable returning an object with ``.to(device)`` that can
            be unpacked into the model call.
        n_samples: Number of timed forward passes (must be >= 1).

    Returns:
        ``(mean_ms, std_ms)``: mean and standard deviation of the per-call
        latency in milliseconds.

    Raises:
        ValueError: If ``n_samples`` is smaller than 1 (the statistics would
            otherwise be NaN).
    """
    if n_samples < 1:
        raise ValueError("n_samples must be at least 1")

    test_prompt = (
        "In a cooperative cooking game, evaluate this state: "
        "Player 1 is holding a tomato near the pot. "
        "Player 2 is at the dish dispenser."
    )

    # Use the device the model actually lives on rather than a global, so the
    # inputs can never end up on a different device than the weights.
    device = torch.device(model.device)
    on_cuda = device.type == "cuda"

    # The prompt never changes, so tokenize it once instead of per iteration.
    inputs = tokenizer(test_prompt, return_tensors="pt", padding=True).to(device)

    latencies = []
    with torch.no_grad():
        # Warmup: the first calls pay one-off initialisation costs.
        for _ in range(5):
            model(**inputs)

        for _ in range(n_samples):
            # CUDA kernels launch asynchronously; synchronise on both sides of
            # the timed region so the wall-clock span covers the real work.
            if on_cuda:
                torch.cuda.synchronize()

            start = time.perf_counter()
            model(**inputs)

            if on_cuda:
                torch.cuda.synchronize()

            latencies.append((time.perf_counter() - start) * 1000)  # ms

    return np.mean(latencies), np.std(latencies)

In [None]:
# Compare inference latency of every loaded model on the same prompt.
print("Measuring inference latency...")
latency_results = []

for name, data in loaded_models.items():
    mean_ms, std_ms = measure_inference_latency(data["model"], data["tokenizer"])
    latency_results.append({
        "model": name,
        "latency_ms": mean_ms,
        "latency_std": std_ms
    })
    print(f"  {name}: {mean_ms:.2f} ± {std_ms:.2f} ms")

# Explicit columns keep the schema stable even if no model was measured, so
# the plotting and summary cells can still select columns by name.
latency_df = pd.DataFrame(
    latency_results, columns=["model", "latency_ms", "latency_std"]
)
latency_df

In [None]:
# Bar chart of mean latency per model, with the standard deviation as error bars.
fig, ax = plt.subplots(figsize=(8, 5))
bar_colors = ["#3498db", "#e74c3c"]
bars = ax.bar(
    latency_df["model"],
    latency_df["latency_ms"],
    yerr=latency_df["latency_std"],
    capsize=5,
    color=bar_colors,
)
ax.set_title("LLM Inference Latency Comparison")
ax.set_ylabel("Latency (ms)")

# With exactly two models, annotate how the second row's latency compares to
# the first row's.
if len(latency_df) == 2:
    first_ms, second_ms = latency_df["latency_ms"].iloc[:2]
    speedup = second_ms / first_ms
    ax.annotate(
        f"{speedup:.1f}x faster",
        xy=(0, first_ms),
        xytext=(0.3, second_ms * 0.8),
        fontsize=12,
        color="green",
        fontweight="bold",
    )

plt.tight_layout()
latency_png = os.path.join(OUTPUT_DIR, "latency_comparison.png")
plt.savefig(latency_png, dpi=150)
plt.show()

## 5️⃣ Environment Wrappers

In [None]:
class TwoAgentOvercookedEnv(gym.Env):
    """Gymnasium view of two-player Overcooked with a single joint action.

    Both players are controlled by one policy: the discrete action encodes
    the pair ``(a0, a1)`` as ``a0 * n_act + a1``, and the observation is the
    concatenation of the two per-player feature vectors.

    Args:
        layout: Overcooked layout name passed to
            ``OvercookedGridworld.from_layout_name``.
        horizon: Episode length passed to ``OvercookedEnv.from_mdp``.
    """

    def __init__(self, layout="asymmetric_advantages", horizon=400):
        super().__init__()
        # Local import keeps the class self-contained: the notebook's import
        # cell does not bring `Action` into scope.
        from overcooked_ai_py.mdp.actions import Action

        self.mdp = OvercookedGridworld.from_layout_name(layout)
        self.base_env = OvercookedEnv.from_mdp(self.mdp, horizon=horizon)
        self.horizon = horizon

        # Index -> Overcooked action lookup (4 moves, stay, interact).
        self._index_to_action = Action.INDEX_TO_ACTION
        self.n_act = len(self._index_to_action)  # per agent
        obs_dim = self.base_env.featurize_state_mdp(self.base_env.state)[0].shape[0]

        self.action_space = gym.spaces.Discrete(self.n_act * self.n_act)  # joint
        self.observation_space = gym.spaces.Box(
            low=-np.inf, high=np.inf, shape=(2 * obs_dim,), dtype=np.float32
        )

    def reset(self, seed=None, **kwargs):
        """Start a new episode and return ``(observation, info)``.

        ``seed`` is forwarded to ``gym.Env.reset`` so ``self.np_random`` is
        seeded as the Gymnasium API expects; other keyword arguments
        (e.g. ``options``) are accepted and ignored.
        """
        super().reset(seed=seed)
        self.base_env.reset()
        obs = self._get_obs()
        return obs, {}

    def step(self, joint_action):
        """Apply a joint action and return the Gymnasium 5-tuple.

        Args:
            joint_action: Integer in ``[0, n_act**2)`` encoding both players'
                action indices.

        Returns:
            ``(obs, reward, terminated, truncated, info)``; ``truncated`` is
            always ``False`` and ``terminated`` mirrors the base env's done
            flag.
        """
        a0, a1 = divmod(int(joint_action), self.n_act)
        # The base environment works with Overcooked action objects, not
        # integer indices, so translate before stepping.
        actions = (self._index_to_action[a0], self._index_to_action[a1])
        next_state, reward, done, info = self.base_env.step(actions)

        # Stable-Baselines3 reserves info["episode"] for Monitor statistics
        # with "r"/"l" keys. Move any episode summary the base env attaches
        # under a different key so the two conventions cannot collide.
        info = dict(info)
        if "episode" in info:
            info["overcooked_episode"] = info.pop("episode")

        obs = self._get_obs()
        # NOTE(review): a horizon-triggered end is reported as termination
        # here, not truncation — confirm this is the intended semantics.
        return obs, float(reward), done, False, info

    def _get_obs(self):
        """Return both players' features for the current state as one float32 vector."""
        f0, f1 = self.base_env.featurize_state_mdp(self.base_env.state)
        return np.concatenate([f0, f1]).astype(np.float32)

In [None]:
class LLMShapingWrapper(Wrapper):
    """Environment wrapper that adds an LLM-derived bonus to every step reward.

    After each step the first ten observation features are rendered into a
    text prompt, the language model is run on it, and
    ``coeff * tanh(mean(logits))`` is added to the environment reward, so the
    bonus magnitude never exceeds ``coeff``.

    Args:
        env: Environment to wrap.
        model: Language model whose output exposes ``.logits`` and which has
            a ``.device`` attribute.
        tokenizer: Tokenizer matching ``model``.
        coeff: Scale of the shaping bonus.
    """

    def __init__(self, env, model, tokenizer, coeff=0.05):
        super().__init__(env)
        self.model = model
        self.tokenizer = tokenizer
        self.coeff = coeff
        # Sum of bonuses handed out in the current episode.
        self.total_shaped_reward = 0

    def step(self, action):
        """Step the wrapped env and return its result with the bonus added to the reward."""
        obs, reward, done, truncated, info = self.env.step(action)

        # Compute LLM shaping bonus
        shaped = self._compute_shaping(obs)
        self.total_shaped_reward += shaped

        return obs, reward + shaped, done, truncated, info

    def _compute_shaping(self, obs):
        """Return the shaping bonus for ``obs`` as a Python float.

        Returns ``0.0`` when the model output reduces to NaN, so a single bad
        forward pass cannot inject NaN into the training reward.
        """
        prompt = f"State features: {obs[:10].tolist()}"  # Truncated for speed

        inputs = self.tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=128
        ).to(self.model.device)

        with torch.no_grad():
            logits = self.model(**inputs).logits

        # Reduce in float32: the model may run in half precision, and the
        # scalar should not inherit its limited range.
        score = torch.tanh(logits.float().mean())
        if not torch.isfinite(score):
            return 0.0
        return self.coeff * score.item()

    def reset(self, **kwargs):
        """Reset the wrapped env and clear the per-episode bonus total."""
        self.total_shaped_reward = 0
        return self.env.reset(**kwargs)

## 6️⃣ Training Comparison

In [None]:
def train_with_llm(model_name, model, tokenizer, seed, timesteps=100000):
    """Train a PPO agent under LLM reward shaping and score it without shaping.

    Args:
        model_name: Label stored in the result row.
        model: Language model used by the shaping wrapper.
        tokenizer: Tokenizer matching ``model``.
        seed: Seed passed to PPO.
        timesteps: Number of environment steps to train for.

    Returns:
        Dict with keys ``model``, ``seed``, ``mean_reward``, ``std_reward``
        (over 10 evaluation episodes on the unshaped environment) and
        ``train_time_s`` (wall-clock seconds spent in ``learn``).
    """
    # Training environment: base game wrapped with the LLM bonus.
    shaped_env = LLMShapingWrapper(
        TwoAgentOvercookedEnv(LAYOUT, HORIZON), model, tokenizer, SHAPING_COEFF
    )
    train_vec = DummyVecEnv([lambda: shaped_env])

    agent = PPO("MlpPolicy", train_vec, seed=seed, verbose=0)

    t0 = time.time()
    agent.learn(total_timesteps=timesteps)
    elapsed = time.time() - t0

    # Evaluation uses the plain game so scores are comparable across models.
    plain_env = TwoAgentOvercookedEnv(LAYOUT, HORIZON)
    reward_mean, reward_std = evaluate_policy(
        agent, DummyVecEnv([lambda: plain_env]), n_eval_episodes=10
    )

    return dict(
        model=model_name,
        seed=seed,
        mean_reward=reward_mean,
        std_reward=reward_std,
        train_time_s=elapsed,
    )

In [None]:
# Run training comparison: one PPO run per (model, seed) pair.
results = []
results_path = os.path.join(OUTPUT_DIR, "sensitivity_results.csv")

for model_name, data in tqdm(loaded_models.items(), desc="Models"):
    for seed in tqdm(SEEDS, desc=f"{model_name} seeds", leave=False):
        result = train_with_llm(
            model_name,
            data["model"],
            data["tokenizer"],
            seed,
            TIMESTEPS
        )
        results.append(result)
        # Checkpoint after every run: each run is long, and an interrupted
        # session would otherwise lose all completed results.
        pd.DataFrame(results).to_csv(results_path, index=False)
        print(f"  {model_name} seed={seed}: {result['mean_reward']:.2f}")

results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)
results_df

## 7️⃣ Analysis

In [None]:
# Aggregate results: mean and standard deviation across seeds, per model.
# sort=False keeps models in the order their rows appear in results_df
# instead of re-sorting the labels alphabetically.
agg = results_df.groupby("model", sort=False).agg({
    "mean_reward": ["mean", "std"],
    "train_time_s": ["mean", "std"]
}).round(2)

print("\n📊 Aggregated Results:")
display(agg)

In [None]:
# Visualization: evaluation reward and training time per model.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_colors = ["#3498db", "#e74c3c"]

# The default groupby sorts labels alphabetically, which puts "GPT-Neo-1.3B"
# before "GPT-Neo-125M" and swaps the bar colours. sort=False keeps the
# models in the order their rows appear in results_df.
per_model = results_df.groupby("model", sort=False)

# Reward comparison (error bars: standard deviation across seeds)
ax1 = axes[0]
model_means = per_model["mean_reward"].mean()
model_stds = per_model["mean_reward"].std()
ax1.bar(model_means.index, model_means.values, yerr=model_stds.values,
        capsize=5, color=bar_colors)
ax1.set_ylabel("Mean Reward")
ax1.set_title("Training Performance by LLM Size")

# Training time comparison, converted from seconds to minutes
ax2 = axes[1]
time_means = per_model["train_time_s"].mean()
time_stds = per_model["train_time_s"].std()
ax2.bar(time_means.index, time_means.values / 60, yerr=time_stds.values / 60,
        capsize=5, color=bar_colors)
ax2.set_ylabel("Training Time (minutes)")
ax2.set_title("Training Time by LLM Size")

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "model_comparison.png"), dpi=150)
plt.show()

In [None]:
# Efficiency metric: evaluation reward per minute of training, per run.
results_df["reward_per_minute"] = results_df["mean_reward"] / (results_df["train_time_s"] / 60)

# sort=False keeps models in the order their rows appear in results_df; the
# default alphabetical sort would put "GPT-Neo-1.3B" first and swap the
# bar colours.
efficiency = results_df.groupby("model", sort=False)["reward_per_minute"].mean()

plt.figure(figsize=(8, 5))
plt.bar(efficiency.index, efficiency.values, color=["#3498db", "#e74c3c"])
plt.ylabel("Reward per Training Minute")
plt.title("Training Efficiency by LLM Size")
plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "efficiency_comparison.png"), dpi=150)
plt.show()

## 8️⃣ Summary

In [None]:
# Combined summary: one row per model with latency, mean evaluation reward
# and mean training time.
summary = latency_df.merge(
    results_df.groupby("model").agg({
        "mean_reward": "mean",
        "train_time_s": "mean"
    }).reset_index(),
    on="model"
)

summary["train_time_min"] = summary["train_time_s"] / 60
summary = summary.drop(columns=["latency_std", "train_time_s"])

print("\n" + "="*60)
print("📈 LLM SIZE SENSITIVITY SUMMARY")
print("="*60)
display(summary.round(2))

# Key findings: head-to-head comparison of the two GPT-Neo sizes.
print("\n🔍 Key Findings:")
by_model = summary.set_index("model")
# Look the two models up by label and skip the comparison if either is
# missing, rather than assuming any two-row summary contains exactly these.
if {"GPT-Neo-125M", "GPT-Neo-1.3B"}.issubset(by_model.index):
    small = by_model.loc["GPT-Neo-125M"]
    large = by_model.loc["GPT-Neo-1.3B"]

    latency_speedup = large["latency_ms"] / small["latency_ms"]
    time_speedup = large["train_time_min"] / small["train_time_min"]

    print(f"  • 125M is {latency_speedup:.1f}x faster per inference")
    if large["mean_reward"] != 0:
        reward_diff = ((small["mean_reward"] - large["mean_reward"]) / abs(large["mean_reward"])) * 100
        print(f"  • Reward difference: {reward_diff:+.1f}%")
    else:
        # A relative difference is undefined against a zero baseline (it
        # would print inf/nan), so report the absolute gap instead.
        reward_gap = small["mean_reward"] - large["mean_reward"]
        print(f"  • Reward difference: {reward_gap:+.2f} (absolute; 1.3B mean reward is 0)")
    print(f"  • Training speedup: {time_speedup:.1f}x")

In [None]:
# Save the combined summary table next to the other outputs.
summary.to_csv(os.path.join(OUTPUT_DIR, "final_summary.csv"), index=False)
print(f"\n✅ All results saved to: {OUTPUT_DIR}")