# LLM Sensitivity Analysis

Analyze how sensitive the evaluated return is to the LLM reward weighting parameter (alpha).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# Apply seaborn's "whitegrid" theme to the plots drawn in this notebook.
sns.set_theme(style="whitegrid")

In [None]:
# Candidate LLM reward weights (alpha) for the sweep. In the combined reward
# used by evaluate_policy, alpha weights the LLM term and (1 - alpha) weights
# the environment term, so 0.0 is the environment-reward-only baseline.
ALPHA_VALUES = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]
# Number of evaluation episodes per (alpha, seed) run.
N_EVAL_EPISODES = 10
# Seeds to repeat each alpha setting with.
SEEDS = [0, 1, 2]

# CSV file the sweep results are loaded from.
# NOTE(review): presumably a Google Drive mount inside Colab - confirm the
# drive is mounted before running the load cell.
SENSITIVITY_CSV = "/content/drive/MyDrive/llm_sensitivity.csv"

In [None]:
def run_sensitivity_sweep(env_fn, llm_reward_fn, alpha_values, seeds, n_episodes):
    """Train and evaluate one PPO agent per (alpha, seed) combination.

    For every pair, a single-environment ``DummyVecEnv`` is built from
    ``env_fn``, a PPO ``MlpPolicy`` model is trained on it for 100,000
    timesteps, and the trained model is scored with ``evaluate_policy`` on
    that same environment. The environment is closed once the run finishes,
    even if training or evaluation raises.

    NOTE(review): ``alpha`` and ``llm_reward_fn`` are only forwarded to
    ``evaluate_policy``. Nothing in this function feeds them into training,
    so unless ``env_fn`` already applies LLM reward shaping, runs that share
    a seed train on the same reward signal. Confirm this is intended.

    Args:
        env_fn: Callable handed to ``DummyVecEnv`` to build the environment.
        llm_reward_fn: Callable forwarded to ``evaluate_policy``.
        alpha_values: Iterable of LLM reward weights to sweep over.
        seeds: Iterable of seeds passed to ``PPO``.
        n_episodes: Number of evaluation episodes per run.

    Returns:
        A ``pandas.DataFrame`` with one row per (alpha, seed) run and the
        columns ``alpha``, ``seed``, ``mean_return`` and ``std_return``.
        The columns are present even when the sweep is empty.
    """
    columns = ["alpha", "seed", "mean_return", "std_return"]
    results = []

    for alpha in alpha_values:
        for seed in seeds:
            env = DummyVecEnv([env_fn])
            try:
                model = PPO(
                    "MlpPolicy",
                    env,
                    n_steps=2048,
                    batch_size=2048,
                    learning_rate=3e-4,
                    seed=seed,
                    verbose=0,
                )
                model.learn(total_timesteps=100_000)
                returns = evaluate_policy(
                    model, env, n_episodes, alpha, llm_reward_fn
                )
            finally:
                # One environment is created per run; release it so that a
                # long sweep does not accumulate open environments.
                env.close()

            # Compute the summary statistics once and reuse them for both
            # the result row and the progress line.
            mean_return = float(np.mean(returns))
            std_return = float(np.std(returns))

            results.append({
                "alpha": alpha,
                "seed": seed,
                "mean_return": mean_return,
                "std_return": std_return,
            })

            print(f"alpha={alpha}, seed={seed}, return={mean_return:.2f}")

    # Passing the columns explicitly keeps the schema stable when no runs
    # were executed (empty alpha_values or seeds).
    return pd.DataFrame(results, columns=columns)

In [None]:
def evaluate_policy(model, env, n_episodes, alpha, llm_reward_fn):
    """Roll out ``model`` for ``n_episodes`` and return the blended returns.

    Each step's reward is ``(1 - alpha) * env_reward + alpha * llm_reward``.
    ``llm_reward_fn`` is called with the observation returned by
    ``env.step`` and the action just taken; it is not called at all when
    ``alpha`` is not positive, in which case the LLM term is 0.

    Args:
        model: Object exposing ``predict(obs, deterministic=True)`` that
            returns an ``(action, state)`` pair.
        env: Object exposing ``reset()`` and a ``step(action)`` that returns
            ``(obs, reward, done, info)``.
        n_episodes: Number of episodes to run.
        alpha: Weight of the LLM reward in the blend.
        llm_reward_fn: Callable ``(obs, action) -> reward``.

    Returns:
        A list with one accumulated blended return per episode.
    """
    use_llm = alpha > 0
    env_weight = 1 - alpha
    episode_totals = []

    for _episode in range(n_episodes):
        observation = env.reset()
        total = 0
        finished = False

        while not finished:
            # Act greedily: deterministic=True is passed to model.predict.
            action, _state = model.predict(observation, deterministic=True)
            observation, env_reward, finished, _info = env.step(action)

            if use_llm:
                llm_reward = llm_reward_fn(observation, action)
            else:
                llm_reward = 0

            total += env_weight * env_reward + alpha * llm_reward

        episode_totals.append(total)

    return episode_totals

In [None]:
# To regenerate the results, uncomment and run the sweep:
# df = run_sensitivity_sweep(make_env, get_llm_reward, ALPHA_VALUES, SEEDS, N_EVAL_EPISODES)
# df.to_csv(SENSITIVITY_CSV, index=False)

# Otherwise reuse a previously saved sweep, if there is one.
# NOTE: when the file is missing only a message is printed and `df` is not
# assigned here, so later cells that read `df` need the sweep to be run first.
try:
    df = pd.read_csv(SENSITIVITY_CSV)
except FileNotFoundError:
    print("No sensitivity results found. Run the sweep first.")
else:
    print(f"Loaded {len(df)} sensitivity results")

In [None]:
# Collapse the per-run rows into one row per alpha: the mean and the standard
# deviation of `mean_return` across the rows sharing that alpha.
agg = df.groupby("alpha").agg(
    mean=pd.NamedAgg(column="mean_return", aggfunc="mean"),
    std=pd.NamedAgg(column="mean_return", aggfunc="std"),
).reset_index()

# Plot mean return against alpha, with the per-alpha std as error bars.
fig, ax = plt.subplots(figsize=(10, 6))
ax.errorbar(agg["alpha"], agg["mean"], yerr=agg["std"], marker="o", capsize=5)
ax.set_xlabel("Alpha (LLM Weight)")
ax.set_ylabel("Mean Return")
ax.set_title("Sensitivity to LLM Reward Weighting")
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Locate the row of `agg` with the highest mean return and report it.
best_idx = agg["mean"].idxmax()
best_alpha = agg.loc[best_idx, "alpha"]
best_return = agg.loc[best_idx, "mean"]

print(f"Optimal alpha: {best_alpha}")
print(f"Best mean return: {best_return:.2f}")