In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gymnasium as gym
import os
import sys
import inspect
import random
import torch
from collections import deque
from pathlib import Path
from datetime import datetime
from difflib import Differ
from tqdm.notebook import tqdm
import copy

# Set plotting style
# Apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet and make
# "viridis" the default seaborn colour palette for every figure in the notebook.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("viridis")

# Set paths
# The project root is taken to be two directory levels above the current
# working directory, so this cell depends on where the kernel was started.
# Appending it to sys.path is what lets the project-package imports that
# follow resolve. NOTE(review): re-running this cell appends the same entry
# to sys.path again.
current_dir = os.getcwd()
project_root = str(Path(current_dir).parent.parent)
sys.path.append(project_root)

# Import required modules from existing codebase
from AdaptiveRewardFunctionLearning.Prompts.prompts import device, apiKey, modelName
from RLEnvironment.env import CustomBipedalWalkerEnv
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCritic import RewardUpdateSystem
from RLEnvironment.training.agent import DQLearningAgent
from RLEnvironment.training.training import trainDQLearning
from AdaptiveRewardFunctionLearning.RewardGeneration.bipedalRewardFunctions import (
    badRewardBipedal, stabilityRewardBipedal, efficiencyRewardBipedal,
    potentialBasedRewardBipedal, energyBasedRewardBipedal, baselineRewardBipedal
)

# Helper function to save plots
def save_plot(fig, name, folder="BipedalWalkerRobustnessResults"):
    """
    Save a figure as a timestamped PNG inside the project's Experiments directory.

    The file is written to
    ``<project_root>/AdaptiveRewardFunctionLearning/Experiments/<folder>/<name>_<timestamp>.png``
    where ``<timestamp>`` is the current local time formatted as ``ddmmYYYY_HHMMSS``.

    Args:
        fig: Object exposing ``savefig(path, bbox_inches=..., dpi=...)``
            (e.g. a matplotlib Figure).
        name: Base file name, without extension.
        folder: Sub-folder of the Experiments directory; created (including
            any missing parents) when it does not exist yet.

    Returns:
        str: Full path of the file that was written.
    """
    logs_dir = Path(project_root) / 'AdaptiveRewardFunctionLearning' / 'Experiments' / folder
    # exist_ok=True removes the check-then-create race of a separate
    # "exists?" test followed by makedirs.
    logs_dir.mkdir(parents=True, exist_ok=True)

    # Timestamp has one-second resolution: two saves of the same name within
    # the same second target the same file.
    timestamp = datetime.now().strftime("%d%m%Y_%H%M%S")
    filename = f"{name}_{timestamp}.png"
    filepath = os.path.join(logs_dir, filename)

    fig.savefig(filepath, bbox_inches='tight', dpi=300)
    print(f"Saved plot: {filename} in {folder}")
    return filepath

In [None]:
def runBipedalRobustnessTest(episodes=5000, update_interval=1000,
                            leg_changes=[30, 50],
                            terrain_roughness=1.0,
                            gravity=9.8,
                            seed=42,
                            discretize_bins=3,
                            transition_episode=500):
    """
    Run a robustness test for the BipedalWalker environment.

    Training starts on ``badRewardBipedal``. When the episode counter equals
    ``transition_episode`` the environment is switched to its component-based
    reward (stability + efficiency); on later episodes the reward update
    system may replace either component. Independently, every
    ``update_interval`` episodes the leg length advances to the next entry of
    ``leg_changes`` (cycling back to the first).

    Args:
        episodes: Number of training episodes
        update_interval: Episodes between environment changes
        leg_changes: Sequence of leg lengths to cycle through; the first entry
            is the initial leg length
        terrain_roughness: Terrain roughness parameter
        gravity: Gravity parameter
        seed: Random seed
        discretize_bins: Number of bins for discretizing continuous actions
        transition_episode: Episode at which the component-based reward is
            activated. Pass ``None`` to stay on the bad reward for the whole
            run. (This used to be hard-coded to 5000, which contradicted the
            documented value of 500 and equals the default ``episodes``, so
            the component/LLM phase was at best reached on the final episode.)

    Returns:
        dict: ``{'adaptivereward': {...}}`` whose inner dict holds ``rewards``,
        ``distances``, ``rewardChanges``, ``environmentChanges``,
        ``component_weights`` and ``component_updates``.
    """
    print(f"Starting BipedalWalker Robustness Test with seed {seed}...")

    # Snapshot the schedule so neither the shared default list nor a caller's
    # list can change underneath a running experiment.
    leg_lengths = list(leg_changes)

    # Set random seeds
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Create and seed the base environment, then wrap it
    env = gym.make('BipedalWalker-v3', hardcore=False, render_mode=None)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    env.reset(seed=seed)
    env = CustomBipedalWalkerEnv(env, numComponents=2, discretize_bins=discretize_bins)

    # Set initial parameters
    env.setEnvironmentParameters(
        leg_length=leg_lengths[0],
        terrain_roughness=terrain_roughness,
        gravity=gravity
    )

    # Create agent - sizes are read from the wrapped (discretized) environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQLearningAgent(
        env=env,
        stateSize=state_size,
        actionSize=action_size,
        device=device,
        learningRate=0.0005,
        discountFactor=0.99,
        epsilon=1.0,
        epsilonDecay=0.9995,
        epsilonMin=0.05,
        replayBufferSize=100000,
        batchSize=64,
        targetUpdateFreq=200
    )

    # Create update system
    update_system = RewardUpdateSystem(apiKey, modelName)

    # Start with bad reward function; register the components that the
    # transition activates later
    env.setRewardFunction(badRewardBipedal)
    env.setComponentReward(1, stabilityRewardBipedal)
    env.setComponentReward(2, efficiencyRewardBipedal)
    update_system.lastUpdateEpisode = 0

    # Storage for episode data
    episode_rewards = []
    episode_distances = []  # hull x-position read at the end of each episode
    reward_change_episodes = []
    environment_changes = []
    component_weights = []
    component_updates = []

    # Index into leg_lengths of the currently active leg length
    current_param_idx = 0

    def function_source(func):
        """Return the source text of ``func``, or ``str(func)`` when unavailable."""
        if not callable(func):
            return str(func)
        try:
            return inspect.getsource(func)
        except (OSError, TypeError):
            # getsource raises these when no source file backs the object
            return str(func)

    def onEpisodeEnd(env, updatesystem, episode, reward, steps):
        # Only this name is rebound; the lists are mutated in place.
        nonlocal current_param_idx

        # Record metrics
        episode_rewards.append(reward)
        hull = getattr(env.env.unwrapped, 'hull', None)
        episode_distances.append(hull.position[0] if hull is not None else 0)

        # Metrics handed to the update system. Always slices, so the update
        # system never holds a reference to the live reward list.
        recent_distances = episode_distances[-100:]
        metrics = {
            'currentEpisode': episode,
            'recentRewards': episode_rewards[-100:],
            'averageDistance': np.mean(recent_distances),
            'distanceVariance': np.var(recent_distances) if len(episode_distances) > 1 else 0
        }

        # Collect component weights (queried once per episode)
        weights = env.getCurrentWeights() if hasattr(env, 'getCurrentWeights') else None
        if weights is not None:
            component_weights.append({
                'episode': episode,
                'stability': weights['stability'],
                'efficiency': weights['efficiency']
            })

        # Print debug info periodically
        if episode % 100 == 0:
            print(f"\nMetrics at Episode {episode}:")
            print(f"Recent Average Reward: {np.mean(metrics['recentRewards']):.2f}")
            print(f"Average Distance: {metrics['averageDistance']:.2f}")
            if weights is not None:
                print(f"Component Weights - Stability: {weights['stability']:.2f}, "
                      f"Efficiency: {weights['efficiency']:.2f}")

        # Transition from bad reward to component-based reward
        if transition_episode is not None and episode == transition_episode:
            print("\nTransitioning from bad reward to component-based reward")
            env.usingComponents = True  # Activate component-based reward
            reward_change_episodes.append(episode)

        # Handle LLM updates for adaptive reward after the transition
        adaptive_phase = transition_episode is not None and episode > transition_episode
        if updatesystem is not None and adaptive_phase:
            for component in (1, 2):
                updatesystem.targetComponent = component
                if not updatesystem.waitingTime(f'component_{component}', metrics,
                                                updatesystem.lastUpdateEpisode):
                    continue

                current_func = env.rewardComponents[f'rewardFunction{component}']
                new_function, updated = updatesystem.validateAndUpdate(current_func)
                if not updated:
                    continue

                # Apply the update
                env.setComponentReward(component, new_function)
                reward_change_episodes.append(episode)
                updatesystem.lastUpdateEpisode = episode

                # Record update details
                component_updates.append({
                    'episode': episode,
                    'component': component,
                    'old_function': function_source(current_func),
                    'new_function': function_source(new_function),
                    'pre_update_performance': np.mean(episode_rewards[-20:]) if len(episode_rewards) >= 20 else 0
                })

                print(f"✓ LLM update for component {component} at episode {episode}")

        # Environment changes at specified intervals
        if episode % update_interval == 0 and episode > 0:
            current_param_idx = (current_param_idx + 1) % len(leg_lengths)
            new_leg_length = leg_lengths[current_param_idx]

            # Apply new parameters
            env.setEnvironmentParameters(
                leg_length=new_leg_length,
                terrain_roughness=terrain_roughness,
                gravity=gravity
            )

            # Record change
            environment_changes.append(episode)
            print(f"\nChanged leg length to: {new_leg_length} at episode {episode}")

    # Train the agent; all bookkeeping happens inside onEpisodeEnd
    agent, env, _ = trainDQLearning(
        agent=agent,
        env=env,
        numEpisodes=episodes,
        updateSystem=update_system,
        onEpisodeEnd=onEpisodeEnd
    )

    # Store results
    results = {
        'adaptivereward': {
            'rewards': episode_rewards,
            'distances': episode_distances,
            'rewardChanges': reward_change_episodes,
            'environmentChanges': environment_changes,
            'component_weights': component_weights,
            'component_updates': component_updates
        }
    }

    # Print final metrics (NaN instead of a numpy warning when nothing was recorded)
    final_reward = np.mean(episode_rewards[-100:]) if episode_rewards else float('nan')
    final_distance = np.mean(episode_distances[-100:]) if episode_distances else float('nan')
    print(f"\nCompleted testing adaptive reward")
    print(f"Final average reward: {final_reward:.2f}")
    print(f"Final average distance: {final_distance:.2f}")

    return results

In [None]:
def runRewardUpdateExperiment(episodes=5000, update_interval=1000, seed=42, discretize_bins=3):
    """
    Run an experiment to test the effectiveness of LLM-based reward function updates
    without environmental changes.

    The environment parameters stay fixed (leg length 40, terrain roughness
    0.5, gravity 9.8). An update round for both reward components is attempted
    every ``update_interval`` episodes, and additionally whenever the update
    system's ``waitingTime`` check asks for one. For every round in which at
    least one component changed, the mean reward/distance of the 20 episodes
    before and the 20 episodes after the update are compared.

    Args:
        episodes: Number of training episodes
        update_interval: Episodes between forced reward function updates
        seed: Random seed
        discretize_bins: Number of bins for discretizing continuous actions

    Returns:
        dict: ``rewards``, ``distances``, ``reward_changes``,
        ``component_weights``, ``component_updates`` and ``update_performance``.
        Each ``update_performance`` entry holds ``episode``, ``pre_avg_*``,
        ``post_avg_*`` and ``*_change_pct``; the ``post_*`` values are NaN when
        no episode followed the update.
    """
    print(f"Starting BipedalWalker Reward Update Experiment with seed {seed}...")

    # Number of episodes averaged on each side of an update
    perf_window = 20

    # Set random seeds
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Create and seed the base environment, then wrap it
    env = gym.make('BipedalWalker-v3', hardcore=False, render_mode=None)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    env.reset(seed=seed)
    env = CustomBipedalWalkerEnv(env, numComponents=2, discretize_bins=discretize_bins)

    # Set fixed parameters - no environmental changes
    env.setEnvironmentParameters(
        leg_length=40,  # Medium length
        terrain_roughness=0.5,  # Moderate terrain
        gravity=9.8
    )

    # Create agent - sizes are read from the wrapped (discretized) environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQLearningAgent(
        env=env,
        stateSize=state_size,
        actionSize=action_size,
        device=device,
        learningRate=0.0005,
        discountFactor=0.99,
        epsilon=1.0,
        epsilonDecay=0.9995,
        epsilonMin=0.05,
        replayBufferSize=100000,
        batchSize=64,
        targetUpdateFreq=200
    )

    # Create update system
    update_system = RewardUpdateSystem(apiKey, modelName)

    # Initialize with component rewards
    env.setComponentReward(1, stabilityRewardBipedal)
    env.setComponentReward(2, efficiencyRewardBipedal)
    update_system.lastUpdateEpisode = 0

    # Storage for episode data
    episode_rewards = []
    episode_distances = []
    reward_change_episodes = []
    component_weights = []
    component_updates = []

    # Pre/post comparison records, plus (in parallel) the index in
    # episode_rewards at which each record's post-update window begins
    update_performance = []
    post_window_starts = []

    def function_source(func):
        """Return the source text of ``func``, or ``str(func)`` when unavailable."""
        if not callable(func):
            return str(func)
        try:
            return inspect.getsource(func)
        except (OSError, TypeError):
            # getsource raises these when no source file backs the object
            return str(func)

    def pct_change(before, after):
        """Percentage change relative to |before|; 0 when the baseline is exactly 0."""
        if before == 0:
            return 0
        # abs() keeps the sign meaningful for negative baselines, which are
        # common for this task's rewards.
        return (after - before) / abs(before) * 100

    def onEpisodeEnd(env, updatesystem, episode, reward, steps):
        # Record metrics
        episode_rewards.append(reward)
        hull = getattr(env.env.unwrapped, 'hull', None)
        episode_distances.append(hull.position[0] if hull is not None else 0)

        # Metrics handed to the update system (always slices, never the live list)
        recent_distances = episode_distances[-100:]
        metrics = {
            'currentEpisode': episode,
            'recentRewards': episode_rewards[-100:],
            'averageDistance': np.mean(recent_distances),
            'distanceVariance': np.var(recent_distances) if len(episode_distances) > 1 else 0
        }

        # Collect component weights (queried once per episode)
        weights = env.getCurrentWeights() if hasattr(env, 'getCurrentWeights') else None
        if weights is not None:
            component_weights.append({
                'episode': episode,
                'stability': weights['stability'],
                'efficiency': weights['efficiency']
            })

        # Print debug info periodically
        if episode % 100 == 0:
            print(f"\nMetrics at Episode {episode}:")
            print(f"Recent Average Reward: {np.mean(metrics['recentRewards']):.2f}")
            print(f"Average Distance: {metrics['averageDistance']:.2f}")
            if weights is not None:
                print(f"Component Weights - Stability: {weights['stability']:.2f}, "
                      f"Efficiency: {weights['efficiency']:.2f}")

        if updatesystem is None:
            return

        # Forced reward function updates at specified intervals. The `or`
        # short-circuit means waitingTime is not consulted on forced rounds.
        forced_update = episode % update_interval == 0 and episode > 0
        if not (forced_update or updatesystem.waitingTime('adaptive', metrics,
                                                          updatesystem.lastUpdateEpisode)):
            return

        # Baseline over the most recent episodes, including the one just recorded
        pre_update_performance = {
            'episode': episode,
            'pre_avg_reward': np.mean(episode_rewards[-perf_window:]),
            'pre_avg_distance': np.mean(episode_distances[-perf_window:])
        }

        # Perform updates for both components
        any_updated = False
        for component in (1, 2):
            updatesystem.targetComponent = component
            current_func = env.rewardComponents[f'rewardFunction{component}']
            new_function, updated = updatesystem.validateAndUpdate(current_func)
            if not updated:
                continue

            # Apply the update
            env.setComponentReward(component, new_function)
            reward_change_episodes.append(episode)
            updatesystem.lastUpdateEpisode = episode
            any_updated = True

            # Record update details
            component_updates.append({
                'episode': episode,
                'component': component,
                'old_function': function_source(current_func),
                'new_function': function_source(new_function)
            })

            print(f"✓ LLM update for component {component} at episode {episode}")

        # Only record performance metrics if any component was updated
        if any_updated:
            update_performance.append(pre_update_performance)
            # Everything recorded from this index on was produced under the
            # new reward. Using the list length (not the episode number) is
            # correct whether the trainer counts episodes from 0 or from 1.
            post_window_starts.append(len(episode_rewards))

    # Train the agent; all bookkeeping happens inside onEpisodeEnd
    agent, env, _ = trainDQLearning(
        agent=agent,
        env=env,
        numEpisodes=episodes,
        updateSystem=update_system,
        onEpisodeEnd=onEpisodeEnd
    )

    # Calculate post-update performance for each update
    for record, start in zip(update_performance, post_window_starts):
        post_update_rewards = episode_rewards[start:start + perf_window]
        post_update_distances = episode_distances[start:start + perf_window]

        # An update on the very last episode has no follow-up data: report NaN
        # explicitly instead of letting np.mean([]) emit a warning.
        record['post_avg_reward'] = np.mean(post_update_rewards) if post_update_rewards else float('nan')
        record['post_avg_distance'] = np.mean(post_update_distances) if post_update_distances else float('nan')
        record['reward_change_pct'] = pct_change(record['pre_avg_reward'], record['post_avg_reward'])
        record['distance_change_pct'] = pct_change(record['pre_avg_distance'], record['post_avg_distance'])

    # Store results
    results = {
        'rewards': episode_rewards,
        'distances': episode_distances,
        'reward_changes': reward_change_episodes,
        'component_weights': component_weights,
        'component_updates': component_updates,
        'update_performance': update_performance
    }

    # Print final metrics (NaN instead of a numpy warning when nothing was recorded)
    final_reward = np.mean(episode_rewards[-100:]) if episode_rewards else float('nan')
    final_distance = np.mean(episode_distances[-100:]) if episode_distances else float('nan')
    print(f"\nCompleted reward update experiment")
    print(f"Final average reward: {final_reward:.2f}")
    print(f"Final average distance: {final_distance:.2f}")
    print(f"Number of reward function updates: {len(reward_change_episodes)}")

    return results

In [None]:
def runAdaptationComparisonExperiment(episodes=2000, change_episode=1000, post_change_episodes=2000, seed=42, discretize_bins=3):
    """
    Run a comparative experiment with an environmental change, comparing adaptive
    and static reward functions.

    One agent is trained for ``episodes`` episodes with leg length 30. Its
    networks, replay memory, exploration rate and reward components are then
    snapshotted and two branches are trained for ``post_change_episodes``
    episodes each with leg length 50: an adaptive branch in which the reward
    update system may rewrite the reward components, and a static branch that
    uses a stand-in update system whose ``waitingTime`` always returns False.
    Each branch starts from its own private copy of the snapshot.

    Args:
        episodes: Number of initial training episodes before the split; the
            environmental change is applied at this point
        change_episode: Currently unused - the change always happens after
            ``episodes`` episodes. Kept so existing callers keep working.
        post_change_episodes: Number of episodes to run after the change
        seed: Random seed (the branches reset their environments with
            ``seed + 1`` and ``seed + 2``)
        discretize_bins: Number of bins for discretizing continuous actions

    Returns:
        dict: ``adaptive`` and ``static`` result dicts (shared initial phase
        followed by the branch data, with episode numbers of branch entries
        offset by ``episodes``), plus ``change_episode``, ``total_episodes``
        and ``leg_lengths``.
    """
    print(f"Starting BipedalWalker Adaptation Comparison Experiment with seed {seed}...")

    # Set random seeds
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    initial_leg_length = 30  # Start with short legs
    changed_leg_length = 50  # Change to long legs

    def build_env(reset_seed, leg_length):
        """Create, seed and wrap a BipedalWalker env with the given leg length."""
        base_env = gym.make('BipedalWalker-v3', hardcore=False, render_mode=None)
        base_env.action_space.seed(seed)
        base_env.observation_space.seed(seed)
        base_env.reset(seed=reset_seed)
        wrapped = CustomBipedalWalkerEnv(base_env, numComponents=2, discretize_bins=discretize_bins)
        wrapped.setEnvironmentParameters(
            leg_length=leg_length,
            terrain_roughness=0.5,  # Moderate terrain
            gravity=9.8
        )
        return wrapped

    def build_agent(target_env, start_epsilon):
        """Create a DQ-learning agent with the experiment's fixed hyper-parameters."""
        return DQLearningAgent(
            env=target_env,
            stateSize=state_size,
            actionSize=action_size,
            device=device,
            learningRate=0.0005,
            discountFactor=0.99,
            epsilon=start_epsilon,
            epsilonDecay=0.9995,
            epsilonMin=0.05,
            replayBufferSize=100000,
            batchSize=64,
            targetUpdateFreq=200
        )

    def tail_mean(values, window=100):
        """Mean of the last ``window`` values; NaN (without a warning) when empty."""
        return np.mean(values[-window:]) if values else float('nan')

    def make_recorder(header, rewards, distances, weight_log, episode_offset=0, after_record=None):
        """Build an episode-end callback that logs into the given lists.

        ``episode_offset`` is added to the episode number stored with each
        weight entry; ``after_record`` (if given) runs last with
        ``(env, updatesystem, episode)``.
        """
        def callback(env, updatesystem, episode, reward, steps):
            # Record metrics
            rewards.append(reward)
            hull = getattr(env.env.unwrapped, 'hull', None)
            distances.append(hull.position[0] if hull is not None else 0)

            # Collect component weights (queried once per episode)
            weights = env.getCurrentWeights() if hasattr(env, 'getCurrentWeights') else None
            if weights is not None:
                weight_log.append({
                    'episode': episode + episode_offset,
                    'stability': weights['stability'],
                    'efficiency': weights['efficiency']
                })

            # Print debug info periodically
            if episode % 100 == 0:
                print(f"\n{header} {episode}:")
                print(f"Recent Average Reward: {np.mean(rewards[-100:]):.2f}")
                print(f"Average Distance: {np.mean(distances[-100:]):.2f}")
                if weights is not None:
                    print(f"Component Weights - Stability: {weights['stability']:.2f}, "
                          f"Efficiency: {weights['efficiency']:.2f}")

            if after_record is not None:
                after_record(env, updatesystem, episode)

        return callback

    # ------------------------------------------------------------------
    # Initial phase
    # ------------------------------------------------------------------
    env = build_env(seed, initial_leg_length)

    # Sizes are read from the wrapped (discretized) environment and reused
    # for both branches
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = build_agent(env, 1.0)

    # Create update system
    update_system = RewardUpdateSystem(apiKey, modelName)

    # Initialize with component rewards
    env.setComponentReward(1, stabilityRewardBipedal)
    env.setComponentReward(2, efficiencyRewardBipedal)
    update_system.lastUpdateEpisode = 0

    # Storage for episode data
    episode_rewards = []
    episode_distances = []
    component_weights = []

    # Train the agent for initial episodes
    print(f"\nInitial training phase with leg length {initial_leg_length}...")
    agent, env, _ = trainDQLearning(
        agent=agent,
        env=env,
        numEpisodes=episodes,
        updateSystem=update_system,
        onEpisodeEnd=make_recorder("Metrics at Episode", episode_rewards,
                                   episode_distances, component_weights)
    )

    # ------------------------------------------------------------------
    # Snapshot at the split point
    # ------------------------------------------------------------------
    print(f"\nSaving state at episode {episodes} for branching...")

    model_state = copy.deepcopy(agent.model.state_dict())
    target_model_state = copy.deepcopy(agent.targetModel.state_dict())
    memory = copy.deepcopy(agent.memory)
    epsilon = agent.epsilon

    # Save environment reward components and weights
    reward_components = {}
    for i in range(1, 3):
        func_name = f'rewardFunction{i}'
        if func_name in env.rewardComponents:
            reward_components[func_name] = env.rewardComponents[func_name]

    component_weights_save = {}
    if hasattr(env, 'componentWeights'):
        component_weights_save = copy.deepcopy(env.componentWeights)

    def initial_results():
        """Fresh result container pre-filled with an independent copy of the shared phase."""
        return {
            'rewards': copy.deepcopy(episode_rewards),
            'distances': copy.deepcopy(episode_distances),
            'component_weights': copy.deepcopy(component_weights),
            'reward_changes': [],
            'environment_changes': [episodes]  # The change happens right after the initial training
        }

    adaptive_results = initial_results()
    static_results = initial_results()

    def start_branch(reset_seed):
        """Build a long-legged env and an agent restored from the snapshot."""
        branch_env = build_env(reset_seed, changed_leg_length)
        # Continue with the same exploration rate
        branch_agent = build_agent(branch_env, epsilon)

        # Restore saved state
        branch_agent.model.load_state_dict(model_state)
        branch_agent.targetModel.load_state_dict(target_model_state)
        # Each branch needs its OWN replay buffer and weights object: handing
        # the same objects to both would let whatever the first branch's
        # training does to them leak into the second branch's starting point.
        branch_agent.memory = copy.deepcopy(memory)

        # Restore reward components
        for name, func in reward_components.items():
            branch_env.setComponentReward(int(name[-1]), func)
        if component_weights_save:
            branch_env.componentWeights = copy.deepcopy(component_weights_save)

        return branch_env, branch_agent

    # ------------------------------------------------------------------
    # Branch 1: Adaptive branch - with reward function updates
    # ------------------------------------------------------------------
    print(f"\nStarting adaptive branch with leg length change to {changed_leg_length}...")
    env, agent = start_branch(seed + 1)  # Slightly different seed to prevent identical runs

    adaptive_episode_rewards = []
    adaptive_episode_distances = []
    adaptive_component_weights = []
    adaptive_reward_changes = []

    # Branch callbacks are offset by `episodes` when results are combined, i.e.
    # episode numbering restarts here, so the last-update marker restarts too.
    update_system.lastUpdateEpisode = 0

    def adaptive_llm_update(env, updatesystem, episode):
        """Ask the update system whether to rewrite the reward components, and do so."""
        if updatesystem is None:
            return

        recent_distances = adaptive_episode_distances[-100:]
        metrics = {
            'currentEpisode': episode,
            'recentRewards': adaptive_episode_rewards[-100:],
            'averageDistance': np.mean(recent_distances),
            'distanceVariance': np.var(recent_distances) if len(adaptive_episode_distances) > 1 else 0
        }
        if not updatesystem.waitingTime('adaptive', metrics, updatesystem.lastUpdateEpisode):
            return

        for component in (1, 2):
            updatesystem.targetComponent = component
            current_func = env.rewardComponents[f'rewardFunction{component}']
            new_function, updated = updatesystem.validateAndUpdate(current_func)
            if not updated:
                continue

            # Apply the update
            env.setComponentReward(component, new_function)
            adaptive_reward_changes.append(episode)
            updatesystem.lastUpdateEpisode = episode

            print(f"✓ Adaptive Branch - LLM update for component {component} at episode {episode}")

    # Train the adaptive branch
    trainDQLearning(
        agent=agent,
        env=env,
        numEpisodes=post_change_episodes,
        updateSystem=update_system,
        onEpisodeEnd=make_recorder("Adaptive Branch - Episode", adaptive_episode_rewards,
                                   adaptive_episode_distances, adaptive_component_weights,
                                   episode_offset=episodes, after_record=adaptive_llm_update)
    )

    # ------------------------------------------------------------------
    # Branch 2: Static branch - without reward function updates
    # ------------------------------------------------------------------
    print(f"\nStarting static branch with leg length change to {changed_leg_length}...")
    env, agent = start_branch(seed + 2)  # Different seed to prevent identical runs

    class NullUpdateSystem:
        """Stand-in update system whose waitingTime check always says no."""
        lastUpdateEpisode = 0
        targetComponent = 1

        def waitingTime(self, *args):
            return False

    static_episode_rewards = []
    static_episode_distances = []
    static_component_weights = []

    # Train the static branch
    trainDQLearning(
        agent=agent,
        env=env,
        numEpisodes=post_change_episodes,
        updateSystem=NullUpdateSystem(),
        onEpisodeEnd=make_recorder("Static Branch - Episode", static_episode_rewards,
                                   static_episode_distances, static_component_weights,
                                   episode_offset=episodes)
    )

    # ------------------------------------------------------------------
    # Combine results (branch data appended after the shared initial phase)
    # ------------------------------------------------------------------
    adaptive_results['rewards'].extend(adaptive_episode_rewards)
    adaptive_results['distances'].extend(adaptive_episode_distances)
    adaptive_results['component_weights'].extend(adaptive_component_weights)
    adaptive_results['reward_changes'].extend([episodes + ep for ep in adaptive_reward_changes])

    static_results['rewards'].extend(static_episode_rewards)
    static_results['distances'].extend(static_episode_distances)
    static_results['component_weights'].extend(static_component_weights)

    # Store combined results
    results = {
        'adaptive': adaptive_results,
        'static': static_results,
        'change_episode': episodes,
        'total_episodes': episodes + post_change_episodes,
        'leg_lengths': {
            'initial': initial_leg_length,
            'changed': changed_leg_length
        }
    }

    # Print final comparison
    print(f"\nComparison after {post_change_episodes} episodes post-change:")
    print(f"Adaptive Branch - Final average reward: {tail_mean(adaptive_episode_rewards):.2f}")
    print(f"Adaptive Branch - Final average distance: {tail_mean(adaptive_episode_distances):.2f}")
    print(f"Static Branch - Final average reward: {tail_mean(static_episode_rewards):.2f}")
    print(f"Static Branch - Final average distance: {tail_mean(static_episode_distances):.2f}")

    return results

In [None]:
def analyze_bipedal_results(results, bad_init_episode=500):
    """Analyze BipedalWalker robustness test results.

    Plots reward and distance curves (raw values plus a moving average) with
    markers for reward-function updates and environment changes, optionally
    plots the component-weight history, saves the figures through
    ``save_plot`` and prints recovery/robustness metrics.

    Args:
        results: Dictionary whose ``'adaptivereward'`` entry holds the
            sequences ``'rewards'``, ``'distances'``, ``'rewardChanges'`` and
            ``'environmentChanges'`` (episode numbers) and, optionally,
            ``'component_weights'`` (dicts with ``'episode'``, ``'stability'``
            and ``'efficiency'`` keys).
        bad_init_episode: Episode of the reward update that is treated as the
            recovery point from the initial reward function. When the history
            is longer than this, earlier episodes are also excluded from the
            stability metric. Defaults to 500, the previously hard-coded value.

    Returns:
        dict: ``'overall_stability'`` (minimum / average reward),
        ``'bad_init_recovery'`` (percentage reward change across
        ``bad_init_episode``, or ``None`` when it could not be computed) and
        ``'avg_reward'``.
    """
    # Extract data
    adaptive = results['adaptivereward']
    rewards = adaptive['rewards']
    distances = adaptive['distances']
    reward_changes = adaptive['rewardChanges']
    environment_changes = adaptive['environmentChanges']

    # Calculate moving averages
    window_size = 20
    rewards_ma = pd.Series(rewards).rolling(window=window_size).mean()
    distances_ma = pd.Series(distances).rolling(window=window_size).mean()

    # Create visualization
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

    # Plot rewards
    ax1.plot(rewards, alpha=0.3, color='blue', label='Episode Reward')
    ax1.plot(rewards_ma, linewidth=2, color='darkblue', label=f'{window_size}-Episode Moving Average')

    # Add vertical lines for reward function updates
    for episode in reward_changes:
        ax1.axvline(x=episode, color='green', linestyle='--', alpha=0.7)
        ax1.annotate("Reward Update", xy=(episode, ax1.get_ylim()[1]*0.9),
                    xytext=(0, -20), textcoords='offset points',
                    ha='center', va='bottom',
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

    # Add vertical lines for environment changes
    for episode in environment_changes:
        ax1.axvline(x=episode, color='red', linestyle='--', alpha=0.7)
        ax1.annotate("Leg Length Change", xy=(episode, ax1.get_ylim()[1]*0.7),
                    xytext=(0, -20), textcoords='offset points',
                    ha='center', va='bottom',
                    arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))

    ax1.set_title("Reward Over Episodes", fontsize=14)
    ax1.set_ylabel("Reward", fontsize=12)
    ax1.grid(True, alpha=0.3)
    ax1.legend(loc='upper left')

    # Plot distances
    ax2.plot(distances, alpha=0.3, color='purple', label='Distance Traveled')
    ax2.plot(distances_ma, linewidth=2, color='darkmagenta', label=f'{window_size}-Episode Moving Average')

    # Add vertical lines for reward function updates
    for episode in reward_changes:
        ax2.axvline(x=episode, color='green', linestyle='--', alpha=0.7)

    # Add vertical lines for environment changes
    for episode in environment_changes:
        ax2.axvline(x=episode, color='red', linestyle='--', alpha=0.7)

    ax2.set_title("Distance Over Episodes", fontsize=14)
    ax2.set_xlabel("Episode", fontsize=12)
    ax2.set_ylabel("Distance (m)", fontsize=12)
    ax2.grid(True, alpha=0.3)
    ax2.legend(loc='upper left')

    plt.tight_layout()
    save_plot(fig, "bipedal_performance")

    # Create component weights visualization if available
    component_data = adaptive.get('component_weights') if hasattr(adaptive, 'get') else None
    if component_data:
        episodes = [d['episode'] for d in component_data]
        stability_weights = [d['stability'] for d in component_data]
        efficiency_weights = [d['efficiency'] for d in component_data]

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(episodes, stability_weights, 'b-', label='Stability Weight')
        ax.plot(episodes, efficiency_weights, 'g-', label='Efficiency Weight')

        # Only the first line of each kind gets a legend entry. Using the loop
        # position (not list.index) keeps this linear and avoids duplicate
        # entries when the same episode appears more than once.
        for idx, ep in enumerate(reward_changes):
            ax.axvline(x=ep, color='g', linestyle='--', alpha=0.5,
                     label='Reward Update' if idx == 0 else None)

        for idx, ep in enumerate(environment_changes):
            ax.axvline(x=ep, color='r', linestyle='--', alpha=0.5,
                     label='Environment Change' if idx == 0 else None)

        ax.set_title('Evolution of Component Weights', fontsize=14)
        ax.set_xlabel('Episode', fontsize=12)
        ax.set_ylabel('Component Weight', fontsize=12)
        ax.legend()
        ax.grid(True)

        plt.tight_layout()
        save_plot(fig, "bipedal_component_weights")

    # Calculate and print robustness metrics
    print("\n=== BipedalWalker Robustness Analysis ===")

    # Recovery from bad initialization metrics. The result is kept in its own
    # variable because the environment-change loop below reuses the
    # pre_/post_ names for every change it inspects.
    bad_init_recovery = None
    if bad_init_episode in reward_changes:
        pre_change = slice(max(0, bad_init_episode - 100), bad_init_episode)
        post_change = slice(bad_init_episode, min(bad_init_episode + 100, len(rewards)))

        pre_reward = np.mean(rewards[pre_change])
        post_reward = np.mean(rewards[post_change])

        pre_distance = np.mean(distances[pre_change])
        post_distance = np.mean(distances[post_change])

        if pre_reward != 0:
            bad_init_recovery = ((post_reward / pre_reward) - 1) * 100

        # Calculate recovery time
        recovery_threshold = 0.9 * pre_reward if pre_reward > 0 else 2.0
        recovery_episode = None

        for i in range(bad_init_episode, min(bad_init_episode + 500, len(rewards))):
            # Trailing window over (at most) the last 11 episodes up to i.
            if i + 10 < len(rewards) and np.mean(rewards[max(0, i - 10):i + 1]) >= recovery_threshold:
                recovery_episode = i
                break

        # Compare against None so that an episode index of 0 still counts.
        recovery_time = recovery_episode - bad_init_episode if recovery_episode is not None else "Did not recover"

        print("\nRecovery from Bad Initialization:")
        print(f"Pre-change average reward: {pre_reward:.2f}")
        print(f"Post-change average reward: {post_reward:.2f}")
        print(f"Reward improvement: {bad_init_recovery:.2f}%" if bad_init_recovery is not None else "N/A")
        print(f"Pre-change average distance: {pre_distance:.2f}")
        print(f"Post-change average distance: {post_distance:.2f}")
        print(f"Distance improvement: {((post_distance/pre_distance)-1)*100:.2f}%" if pre_distance != 0 else "N/A")
        print(f"Recovery time: {recovery_time} episodes")

    # Environment change impact metrics
    print("\nResponse to Environment Changes:")
    for i, change_ep in enumerate(environment_changes):
        # Skip changes that do not have a full 100-episode window after them.
        if change_ep + 100 >= len(rewards):
            continue

        pre_change = slice(max(0, change_ep-100), change_ep)
        post_change = slice(change_ep, min(change_ep+100, len(rewards)))

        pre_reward = np.mean(rewards[pre_change])
        post_reward = np.mean(rewards[post_change])

        pre_distance = np.mean(distances[pre_change])
        post_distance = np.mean(distances[post_change])

        # Determine leg length (labels simply alternate with the change index)
        leg_desc = "Long legs" if i % 2 == 0 else "Short legs"

        print(f"\nChange {i+1} ({leg_desc}):")
        print(f"Pre-change average reward: {pre_reward:.2f}")
        print(f"Post-change average reward: {post_reward:.2f}")
        print(f"Reward change: {((post_reward/pre_reward)-1)*100:.2f}%" if pre_reward != 0 else "N/A")
        print(f"Pre-change average distance: {pre_distance:.2f}")
        print(f"Post-change average distance: {post_distance:.2f}")
        print(f"Distance change: {((post_distance/pre_distance)-1)*100:.2f}%" if pre_distance != 0 else "N/A")

        # Calculate recovery time
        recovery_threshold = 0.9 * pre_reward
        recovery_episode = None

        for ep in range(change_ep, min(change_ep + 500, len(rewards))):
            if np.mean(rewards[max(0, ep-10):ep+1]) >= recovery_threshold:
                recovery_episode = ep
                break

        # "is not None" rather than truthiness: episode 0 is a valid answer.
        if recovery_episode is not None:
            recovery_time = recovery_episode - change_ep
            print(f"Recovery time: {recovery_time} episodes")
        else:
            print("Recovery time: Did not fully recover within 500 episodes")

    # Calculate overall performance stability
    stable_rewards = rewards[bad_init_episode:] if len(rewards) > bad_init_episode else rewards
    if len(stable_rewards) > 0:
        min_reward = min(stable_rewards)
        avg_reward = np.mean(stable_rewards)
    else:
        # No episodes recorded: report NaN instead of crashing in min().
        min_reward = avg_reward = float('nan')
    stability = min_reward / avg_reward if len(stable_rewards) > 0 and avg_reward != 0 else 0

    print(f"\nOverall Performance Stability: {stability:.3f}")
    print("(Ratio of minimum to average reward, higher is better)")

    return {
        'overall_stability': stability,
        'bad_init_recovery': bad_init_recovery,
        'avg_reward': avg_reward
    }

In [None]:
def plotCombinedResults(results, save_path="BipedalWalkerRobustnessResults"):
    """
    Plot combined results from the adaptive and static branches.

    Two figures are produced and saved through ``save_plot``: the moving
    averages over the whole run, and a zoomed view from 200 episodes before to
    500 episodes after the environment change. The numeric comparison is
    delegated to ``calculateAdaptationMetrics``.

    Args:
        results: Dictionary containing results from both branches. Read keys:
            ``results['adaptive']`` / ``results['static']`` with ``'rewards'``
            and ``'distances'``, ``results['adaptive']['reward_changes']``,
            ``results['change_episode']`` and ``results['leg_lengths']`` with
            ``'initial'`` and ``'changed'``.
        save_path: Directory to save visualization outputs

    Returns:
        dict: Performance advantage metrics
    """
    # Extract data
    adaptive_rewards = results['adaptive']['rewards']
    adaptive_distances = results['adaptive']['distances']
    static_rewards = results['static']['rewards']
    static_distances = results['static']['distances']

    change_episode = results['change_episode']
    reward_changes = results['adaptive']['reward_changes']

    # Calculate window size based on number of episodes (clamped to 20..50)
    window_size = max(20, min(50, int(len(adaptive_rewards) / 100)))

    # Calculate moving averages
    adaptive_rewards_ma = pd.Series(adaptive_rewards).rolling(window=window_size).mean()
    adaptive_distances_ma = pd.Series(adaptive_distances).rolling(window=window_size).mean()
    static_rewards_ma = pd.Series(static_rewards).rolling(window=window_size).mean()
    static_distances_ma = pd.Series(static_distances).rolling(window=window_size).mean()

    # Create visualization with subplots for rewards and distances
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

    # Plot rewards
    ax1.plot(adaptive_rewards_ma, linewidth=2, color='blue', label='Adaptive Reward')
    ax1.plot(static_rewards_ma, linewidth=2, color='red', label='Static Reward')

    # Add vertical line for environment change
    ax1.axvline(x=change_episode, color='black', linestyle='--', alpha=0.7, label='Environment Change')

    # Add vertical lines for reward function updates; only the first one is
    # labelled so the legend shows a single "Reward Update" entry.
    for idx, episode in enumerate(reward_changes):
        ax1.axvline(x=episode, color='green', linestyle='--', alpha=0.7,
                    label='Reward Update' if idx == 0 else None)

    # Customize reward plot
    ax1.set_title("Reward Comparison Between Adaptive and Static Approaches", fontsize=14)
    ax1.set_ylabel("Reward", fontsize=12)
    ax1.grid(True, alpha=0.3)
    ax1.legend(loc='upper left')

    # Add annotation for leg length change
    leg_lengths = results['leg_lengths']
    ax1.annotate(f"Leg Length: {leg_lengths['initial']} → {leg_lengths['changed']}",
                xy=(change_episode, ax1.get_ylim()[1]*0.9),
                xytext=(10, 0), textcoords='offset points',
                ha='left', va='center',
                bbox=dict(boxstyle='round,pad=0.3', fc='yellow', alpha=0.3))

    # Plot distances
    ax2.plot(adaptive_distances_ma, linewidth=2, color='blue', label='Adaptive Reward')
    ax2.plot(static_distances_ma, linewidth=2, color='red', label='Static Reward')

    # Add vertical line for environment change
    ax2.axvline(x=change_episode, color='black', linestyle='--', alpha=0.7)

    # Add vertical lines for reward function updates
    for episode in reward_changes:
        ax2.axvline(x=episode, color='green', linestyle='--', alpha=0.7)

    # Customize distance plot
    ax2.set_title("Distance Traveled Comparison", fontsize=14)
    ax2.set_xlabel("Episode", fontsize=12)
    ax2.set_ylabel("Distance (m)", fontsize=12)
    ax2.grid(True, alpha=0.3)
    ax2.legend(loc='upper left')

    plt.tight_layout()
    save_plot(fig, "adaptive_static_comparison", folder=save_path)

    # Create additional plot to highlight the immediate post-change period
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8), sharex=True)

    # Calculate range to display (200 episodes before and 500 after change)
    pre_change_window = 200
    post_change_window = 500
    start_idx = max(0, change_episode - pre_change_window)
    end_idx = min(len(adaptive_rewards), change_episode + post_change_window)

    def _plot_zoomed(ax, series, color, label):
        # Each curve is drawn against its own index so that branches of
        # different length cannot trigger an x/y shape mismatch.
        zoomed = series.iloc[start_idx:end_idx]
        ax.plot(zoomed.index, zoomed.values, linewidth=2, color=color, label=label)

    # Reward updates that fall inside the zoomed window.
    visible_updates = [ep for ep in reward_changes if start_idx <= ep <= end_idx]

    # Plot rewards for the zoomed period
    _plot_zoomed(ax1, adaptive_rewards_ma, 'blue', 'Adaptive Reward')
    _plot_zoomed(ax1, static_rewards_ma, 'red', 'Static Reward')

    # Add vertical line for environment change
    ax1.axvline(x=change_episode, color='black', linestyle='--', alpha=0.7, label='Environment Change')

    # Label the first update that is actually visible; the very first update
    # of the run usually lies outside the zoomed window.
    for idx, episode in enumerate(visible_updates):
        ax1.axvline(x=episode, color='green', linestyle='--', alpha=0.7,
                    label='Reward Update' if idx == 0 else None)

    # Add shaded regions for pre-post comparison windows
    pre_change_start = max(start_idx, change_episode - 20)
    pre_change_end = change_episode
    post_change_start = change_episode
    post_change_end = min(end_idx, change_episode + 100)

    # Pre-change window
    ax1.axvspan(pre_change_start, pre_change_end, alpha=0.2, color='gray', label='Pre-Change Window')
    # Post-change window
    ax1.axvspan(post_change_start, post_change_end, alpha=0.2, color='yellow', label='Post-Change Window')

    # Customize reward plot
    ax1.set_title("Zoomed View of Performance During Environmental Change", fontsize=14)
    ax1.set_ylabel("Reward", fontsize=12)
    ax1.grid(True, alpha=0.3)
    ax1.legend(loc='upper left')

    # Plot distances for the zoomed period
    _plot_zoomed(ax2, adaptive_distances_ma, 'blue', 'Adaptive Reward')
    _plot_zoomed(ax2, static_distances_ma, 'red', 'Static Reward')

    # Add vertical line for environment change
    ax2.axvline(x=change_episode, color='black', linestyle='--', alpha=0.7)

    # Add vertical lines for reward function updates within range
    for episode in visible_updates:
        ax2.axvline(x=episode, color='green', linestyle='--', alpha=0.7)

    # Add shaded regions for pre-post comparison windows
    ax2.axvspan(pre_change_start, pre_change_end, alpha=0.2, color='gray')
    ax2.axvspan(post_change_start, post_change_end, alpha=0.2, color='yellow')

    # Customize distance plot
    ax2.set_title("Zoomed View of Distance Traveled", fontsize=14)
    ax2.set_xlabel("Episode", fontsize=12)
    ax2.set_ylabel("Distance (m)", fontsize=12)
    ax2.grid(True, alpha=0.3)
    ax2.legend(loc='upper left')

    plt.tight_layout()
    save_plot(fig, "adaptive_static_comparison_zoomed", folder=save_path)

    # Calculate performance advantage metrics
    advantage_metrics = calculateAdaptationMetrics(results)

    return advantage_metrics

In [None]:
def _drop_pct(before, after):
    """Return how far ``after`` fell below ``before`` as a percentage of ``|before|`` (0 if undefined)."""
    # The comparison is False for both 0 and NaN (mean of an empty window).
    if before > 0 or before < 0:
        return (before - after) / abs(before) * 100
    return 0


def _advantage_pct(candidate, reference):
    """Return by how many percent of ``|reference|`` ``candidate`` exceeds ``reference`` (0 if undefined)."""
    if reference > 0:
        return ((candidate / reference) - 1) * 100
    if reference < 0:
        # A ratio flips sign for negative references, so use the difference.
        return (candidate - reference) / abs(reference) * 100
    return 0


def _recovery_threshold(baseline):
    """Return the reward that lies 10% of ``|baseline|`` below ``baseline``."""
    return 0.9 * baseline if baseline >= 0 else 1.1 * baseline


def _find_recovery_episode(rewards, start, threshold, horizon, window):
    """Return the first episode in ``[start, start + horizon)`` whose forward ``window``-mean reaches ``threshold``."""
    for episode in range(start, min(start + horizon, len(rewards))):
        # Stop once a full forward window no longer fits into the history.
        if episode + window > len(rewards):
            break
        if np.mean(rewards[episode:episode + window]) >= threshold:
            return episode
    return None


def calculateAdaptationMetrics(results):
    """
    Calculate detailed performance metrics comparing adaptive and static approaches.

    Percentages are expressed relative to the absolute value of the baseline,
    so they stay meaningful when the baseline reward is negative. A baseline
    of exactly zero (or an empty window) yields 0.

    Args:
        results: Dictionary containing results from both branches. Read keys:
            ``results['adaptive']`` / ``results['static']`` with ``'rewards'``
            and ``'distances'``, ``results['adaptive']['reward_changes']`` and
            ``results['change_episode']``.

    Returns:
        dict: Detailed performance metrics with the sections ``'pre_change'``,
        ``'immediate_post_change'``, ``'performance_drop_pct'``,
        ``'recovery'`` (episode counts or the string ``"Did not recover"``),
        ``'final_performance'``, ``'performance_advantage_pct'`` and
        ``'updates'``.
    """
    # Extract data
    adaptive_rewards = results['adaptive']['rewards']
    adaptive_distances = results['adaptive']['distances']
    static_rewards = results['static']['rewards']
    static_distances = results['static']['distances']

    change_episode = results['change_episode']

    # Define windows for analysis
    pre_window = 20  # 20 episodes before change
    immediate_window = 20  # 20 episodes immediately after change
    recovery_window = 100  # Look ahead up to 100 episodes for recovery

    # Calculate pre-change performance (baseline)
    pre_change_start = max(0, change_episode - pre_window)
    pre_change_adaptive_reward = np.mean(adaptive_rewards[pre_change_start:change_episode])
    pre_change_static_reward = np.mean(static_rewards[pre_change_start:change_episode])
    pre_change_adaptive_distance = np.mean(adaptive_distances[pre_change_start:change_episode])
    pre_change_static_distance = np.mean(static_distances[pre_change_start:change_episode])

    # Calculate immediate post-change performance
    post_change_end = min(len(adaptive_rewards), change_episode + immediate_window)
    immediate_adaptive_reward = np.mean(adaptive_rewards[change_episode:post_change_end])
    immediate_static_reward = np.mean(static_rewards[change_episode:post_change_end])
    immediate_adaptive_distance = np.mean(adaptive_distances[change_episode:post_change_end])
    immediate_static_distance = np.mean(static_distances[change_episode:post_change_end])

    # Calculate performance drops
    adaptive_reward_drop_pct = _drop_pct(pre_change_adaptive_reward, immediate_adaptive_reward)
    static_reward_drop_pct = _drop_pct(pre_change_static_reward, immediate_static_reward)
    adaptive_distance_drop_pct = _drop_pct(pre_change_adaptive_distance, immediate_adaptive_distance)
    static_distance_drop_pct = _drop_pct(pre_change_static_distance, immediate_static_distance)

    # Calculate recovery times (episodes to get back within 10% of the
    # pre-change performance), using a 10-episode forward window for smoothing
    window_size = 10
    adaptive_recovery_episode = _find_recovery_episode(
        adaptive_rewards, change_episode,
        _recovery_threshold(pre_change_adaptive_reward), recovery_window, window_size)
    static_recovery_episode = _find_recovery_episode(
        static_rewards, change_episode,
        _recovery_threshold(pre_change_static_reward), recovery_window, window_size)

    # "is not None" rather than truthiness: episode 0 is a valid recovery point.
    adaptive_recovery_time = (adaptive_recovery_episode - change_episode) if adaptive_recovery_episode is not None else "Did not recover"
    static_recovery_time = (static_recovery_episode - change_episode) if static_recovery_episode is not None else "Did not recover"

    # Look at final performance (last 100 episodes)
    final_window = 100
    final_adaptive_reward = np.mean(adaptive_rewards[-final_window:])
    final_static_reward = np.mean(static_rewards[-final_window:])
    final_adaptive_distance = np.mean(adaptive_distances[-final_window:])
    final_static_distance = np.mean(static_distances[-final_window:])

    # Calculate performance advantage
    reward_advantage_pct = _advantage_pct(final_adaptive_reward, final_static_reward)
    distance_advantage_pct = _advantage_pct(final_adaptive_distance, final_static_distance)

    # Track adaptive reward updates that happened after the change
    reward_updates = results['adaptive']['reward_changes']
    updates_after_change = [ep for ep in reward_updates if ep > change_episode]
    num_updates = len(updates_after_change)
    first_update = min(updates_after_change, default=None)
    update_delay = (first_update - change_episode) if first_update is not None else None

    # Compile all metrics
    metrics = {
        'pre_change': {
            'adaptive_reward': pre_change_adaptive_reward,
            'static_reward': pre_change_static_reward,
            'adaptive_distance': pre_change_adaptive_distance,
            'static_distance': pre_change_static_distance
        },
        'immediate_post_change': {
            'adaptive_reward': immediate_adaptive_reward,
            'static_reward': immediate_static_reward,
            'adaptive_distance': immediate_adaptive_distance,
            'static_distance': immediate_static_distance
        },
        'performance_drop_pct': {
            'adaptive_reward': adaptive_reward_drop_pct,
            'static_reward': static_reward_drop_pct,
            'adaptive_distance': adaptive_distance_drop_pct,
            'static_distance': static_distance_drop_pct
        },
        'recovery': {
            'adaptive_episodes': adaptive_recovery_time,
            'static_episodes': static_recovery_time
        },
        'final_performance': {
            'adaptive_reward': final_adaptive_reward,
            'static_reward': final_static_reward,
            'adaptive_distance': final_adaptive_distance,
            'static_distance': final_static_distance
        },
        'performance_advantage_pct': {
            'reward': reward_advantage_pct,
            'distance': distance_advantage_pct
        },
        'updates': {
            'count': num_updates,
            'first_update_episode': first_update,
            'update_delay': update_delay
        }
    }

    # Print summary table
    print("\n=== Adaptation Performance Metrics ===\n")

    print("Performance Drop After Environmental Change:")
    print(f"Adaptive Reward: {adaptive_reward_drop_pct:.1f}%")
    print(f"Static Reward: {static_reward_drop_pct:.1f}%")
    print(f"Difference: {(static_reward_drop_pct - adaptive_reward_drop_pct):.1f} percentage points")

    print("\nRecovery Time:")
    print(f"Adaptive Reward: {adaptive_recovery_time if isinstance(adaptive_recovery_time, str) else f'{adaptive_recovery_time} episodes'}")
    print(f"Static Reward: {static_recovery_time if isinstance(static_recovery_time, str) else f'{static_recovery_time} episodes'}")

    print("\nFinal Performance Advantage:")
    print(f"Reward: Adaptive is {reward_advantage_pct:.1f}% higher")
    print(f"Distance: Adaptive is {distance_advantage_pct:.1f}% higher")

    print("\nReward Function Updates:")
    print(f"Number of updates after environmental change: {num_updates}")
    print(f"First update occurred at episode: {first_update if first_update is not None else 'N/A'}")
    print(f"Delay between environmental change and first update: {update_delay if update_delay is not None else 'N/A'} episodes")

    # Create a table format suitable for inclusion in a paper
    print("\nTable for Paper:")
    headers = ["Metric", "Adaptive Approach", "Static Approach", "Difference"]
    both_recovered = not isinstance(adaptive_recovery_time, str) and not isinstance(static_recovery_time, str)
    recovery_difference = (static_recovery_time - adaptive_recovery_time) if both_recovered else 'N/A'
    rows = [
        ["Performance Drop (%)", f"{adaptive_reward_drop_pct:.1f}%", f"{static_reward_drop_pct:.1f}%", f"{(static_reward_drop_pct - adaptive_reward_drop_pct):.1f} pp"],
        ["Recovery Time (eps)", str(adaptive_recovery_time), str(static_recovery_time), str(recovery_difference)],
        ["Final Reward", f"{final_adaptive_reward:.2f}", f"{final_static_reward:.2f}", f"{reward_advantage_pct:.1f}%"],
        ["Final Distance", f"{final_adaptive_distance:.2f}", f"{final_static_distance:.2f}", f"{distance_advantage_pct:.1f}%"]
    ]

    # Print the table
    print(f"{headers[0]:20} {headers[1]:20} {headers[2]:20} {headers[3]:20}")
    print("-" * 80)
    for row in rows:
        print(f"{row[0]:20} {row[1]:20} {row[2]:20} {row[3]:20}")

    return metrics

In [None]:
# Part 1: Run the Reward Update Experiment without environmental changes
# NOTE(review): runRewardUpdateExperiment is not defined in this part of the
# notebook; presumably it comes from an earlier cell - confirm before running.
print("Running Part 1: Reward Update Experiment without Environmental Changes")
update_results = runRewardUpdateExperiment(
    episodes=10000,
    update_interval=2000,
    seed=42,
    discretize_bins=3
)

# Part 2: Run the Comparative Experiment with Environmental Change
# NOTE(review): `episodes` and `change_episode` are both 5000 here; presumably
# the change is applied right after the initial training phase - confirm
# against runAdaptationComparisonExperiment.
print("\nRunning Part 2: Adaptive vs Static Comparison with Environmental Change")
comparison_results = runAdaptationComparisonExperiment(
    episodes=5000,            # Initial training episodes
    change_episode=5000,      # When to make environmental change
    post_change_episodes=2000, # Episodes to run after change
    seed=43,
    discretize_bins=3
)

# Visualize and analyze the comparative results: plotCombinedResults saves the
# comparison figures and returns the metrics from calculateAdaptationMetrics.
print("\nAnalyzing and Visualizing Comparison Results")
advantage_metrics = plotCombinedResults(
    comparison_results,
    save_path="BipedalWalkerRobustnessResults"
)

print("\nRobustness experiments completed successfully!")

In [None]:
# Run the BipedalWalker robustness experiment
# NOTE(review): update_interval equals episodes (10000), so a reward update may
# never trigger during this run - confirm this is intended.
results = runBipedalRobustnessTest(
    episodes=10000,
    update_interval=10000,
    leg_changes=[50, 50],  # Meant to alternate between short and long legs. NOTE(review): both entries are 50 - confirm the values
    terrain_roughness=1.0,
    gravity=9.8,
    seed=42,
    discretize_bins=3  # Discretize continuous action space into 3 values per dimension
)

# Analyze results: plots reward/distance curves, saves the figures and prints
# the robustness metrics; the returned summary dict is kept in `metrics`.
print("\n--- Creating Performance Visualizations ---")
metrics = analyze_bipedal_results(results)

In [None]:
# Analyze how component weights evolve with different leg lengths
if 'component_weights' in results['adaptivereward'] and results['adaptivereward']['component_weights']:
    # Extract component weight data
    component_data = results['adaptivereward']['component_weights']
    episodes = [d['episode'] for d in component_data]
    stability_weights = [d['stability'] for d in component_data]
    efficiency_weights = [d['efficiency'] for d in component_data]

    # Extract environment changes
    env_changes = results['adaptivereward']['environmentChanges']

    # Define segments between environment changes. Segment k starts at the
    # k-th boundary; the last segment runs up to and including the final
    # recorded episode. Labels alternate starting with "Short legs" for the
    # initial segment, and the closing segment follows the same parity (it
    # used to be inverted, which labelled two consecutive segments alike).
    boundaries = [0] + list(env_changes) + [episodes[-1] + 1]
    segments = [
        (boundaries[k], boundaries[k + 1], "Short legs" if k % 2 == 0 else "Long legs")
        for k in range(len(boundaries) - 1)
    ]

    # Calculate average weights per segment (segments without samples are skipped)
    segment_weights = []
    for start, end, label in segments:
        # Find weights in this segment
        segment_indices = [i for i, ep in enumerate(episodes) if start <= ep < end]
        if segment_indices:
            avg_stability = np.mean([stability_weights[i] for i in segment_indices])
            avg_efficiency = np.mean([efficiency_weights[i] for i in segment_indices])
            segment_weights.append((label, avg_stability, avg_efficiency))

    # Create a bar chart showing component weight distribution
    labels = [s[0] for s in segment_weights]
    stability_avgs = [s[1] for s in segment_weights]
    efficiency_avgs = [s[2] for s in segment_weights]

    x = np.arange(len(labels))
    width = 0.35

    fig, ax = plt.subplots(figsize=(10, 6))
    rects1 = ax.bar(x - width/2, stability_avgs, width, label='Stability Weight')
    rects2 = ax.bar(x + width/2, efficiency_avgs, width, label='Efficiency Weight')

    ax.set_ylabel('Average Weight')
    ax.set_title('Component Weight Distribution by Leg Length')
    ax.set_xticks(x)
    ax.set_xticklabels(labels)
    ax.legend()

    # Add text annotations above every bar of both groups
    for rect in list(rects1) + list(rects2):
        height = rect.get_height()
        ax.annotate(f'{height:.2f}',
                    xy=(rect.get_x() + rect.get_width()/2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')

    plt.tight_layout()
    save_plot(fig, "bipedal_component_weight_distribution")

    print("\nComponent Weight Analysis by Leg Length:")
    for label, stability, efficiency in segment_weights:
        print(f"{label}: Stability = {stability:.3f}, Efficiency = {efficiency:.3f}")
        if stability > efficiency:
            print(f"  Dominant component: Stability (+{stability-efficiency:.3f})")
        else:
            print(f"  Dominant component: Efficiency (+{efficiency-stability:.3f})")