Considering Adding:
- Confidence intervals for performance metrics (Most papers only really seem to go this far)
- Statistical significance tests between different approaches
- Variance analysis across multiple runs




In [1]:
!pip install optuna



In [2]:
# Show which Python interpreter this kernel is running on.
import sys
print(sys.executable)


# Show the installed optuna version.
import optuna
print(optuna.__version__)

/home/sd37/.conda/envs/thesis/bin/python
4.1.0


  from .autonotebook import tqdm as notebook_tqdm


### **This experiment compares the performance of an adaptive reward function against state-of-the-art reward functions in environments whose physical parameters change over time.**

-> Is there a statistically significant improvement in performance over time in this varying environment?


In [3]:
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import os
import random
import torch

import sys
from pathlib import Path


# Append the directory two levels above the current working directory to
# sys.path; presumably this is the repository root that holds the project
# packages imported below (verify if the notebook is moved).
current_dir = os.getcwd()  
project_root = str(Path(current_dir).parent.parent)
sys.path.append(project_root)


# Initialize environment and device
from AdaptiveRewardFunctionLearning.Prompts.prompts import device, apiKey,modelName

#CustomCartPoleEnv
from RLEnvironment.env import CustomCartPoleEnv
#RewardUpdateSystem
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCritic import RewardUpdateSystem
#DQLearningAgent
from RLEnvironment.training.agent import DQLearningAgent
from RLEnvironment.training.training import trainDQLearning

#DynamicRewardFunction
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCodeGeneration import stabilityReward, efficiencyReward, dynamicRewardFunction

#import
from AdaptiveRewardFunctionLearning.Visualisation.trainingTestFunctions import (
    runEpisode,
    detectJumps,
    analyzeRewardSensibility,
    performUpdate,
    updateCompositeRewardFunction,
    plotExperimentResults,
    savePlot
)

# Import new reward functions
from AdaptiveRewardFunctionLearning.RewardGeneration.cartpole_energy_reward import EnergyBasedRewardFunction


# from AdaptiveRewardFunctionLearning.RewardGeneration.cartpole_meta_learning import meta_learning_cartpole
# from AdaptiveRewardFunctionLearning.RewardGeneration.reward_meta_learning import RewardFunctionMetaLearner

**State of the Art Reward Functions with Reference to Papers**

**Potential-based Reward Shaping (PBRS):**
```python
def potentialBasedRewardShaping(observation, action):
    """Return the potential-based shaping term gamma * phi(s') - phi(s).

    Args:
        observation: Sequence ``(x, xDot, angle, angleDot)``.
        action: Unused; kept so the signature matches the other reward functions.

    Returns:
        The shaping reward as a float.
    """
    x, xDot, angle, angleDot = observation
    gamma = 0.99  # Example discount factor

    def phi(x, xDot, angle, angleDot):
        # Example potential function: zero at x == 0 and angle == 0, negative elsewhere
        return -abs(x) - abs(angle)

    current_potential = phi(x, xDot, angle, angleDot)
    # Simplified next state: advance position and angle by their velocities.
    # Arguments must follow phi's (x, xDot, angle, angleDot) order.
    next_potential = phi(x + xDot, xDot, angle + angleDot, angleDot)
    return float(gamma * next_potential - current_potential)
```

Paper: "Potential-based Shaping in Model-based Reinforcement Learning"

Link: https://cdn.aaai.org/AAAI/2008/AAAI08-096.pdf


**Parameterized Reward Shaping:**
```python
def parameterizedRewardShaping(observation, action):
    """Return the default reward plus a weighted shaping term.

    The result is ``base + weight * shaping`` where the example shaping term
    is ``-abs(angle)`` and the example weight is the constant 0.5.

    Args:
        observation: Sequence ``(x, xDot, angle, angleDot)``.
        action: Unused.

    Returns:
        The shaped reward as a float.
    """
    base_reward = 1.0  # Assuming default CartPole reward
    cart_pos, cart_vel, pole_angle, pole_vel = observation
    state = (cart_pos, cart_vel, pole_angle, pole_vel)

    def shaping_term(s):
        # Example shaping reward: penalise the magnitude of the pole angle
        return -abs(s[2])

    def shaping_weight(s):
        # Example shaping weight: state-independent constant
        return 0.5

    return float(base_reward + shaping_weight(state) * shaping_term(state))
```

Paper: "Learning to Utilize Shaping Rewards: A New Approach of Reward Shaping"

Link: http://arxiv.org/pdf/2011.02669.pdf


**Energy Based Reward Function - Physics Based**

```python
def energyBasedReward(observation, action):
    """Return a base reward of 1.0 minus 0.1 times an energy estimate.

    Args:
        observation: Sequence ``(x, xDot, angle, angleDot)``.
        action: Unused.

    Returns:
        The reward as a float.
    """
    import math  # local import: `cos` was previously referenced without being imported

    x, xDot, angle, angleDot = observation

    # Calculate kinetic and potential energy components
    kineticEnergy = 0.5 * (xDot**2 + angleDot**2)
    potentialEnergy = 9.8 * (1 + math.cos(angle))  # g * h, where h depends on angle

    # NOTE(review): potentialEnergy is largest at angle == 0, so this penalty is
    # strongest there; confirm the sign convention is intended.
    # Reward is inverse of total energy (less energy = more stable = better reward)
    energyPenalty = -(kineticEnergy + potentialEnergy)
    return float(1.0 + 0.1 * energyPenalty)  # Base reward plus energy term
```

Paper: "Energy-Based Control for Safe Robot Learning" (2019)

Link: https://ieeexplore.ieee.org/document/8794207


**Baseline Reward Function:**
```python
def baselineCartPoleReward(observation, action):
    """Return the constant reward 1.0; observation and action are ignored."""
    return 1.0
```

### **Performance Experiment**

In [4]:

# Initialize reward functions and meta-learners
# Module-level energy-based reward model; energyBasedReward() below delegates to it.
# runPerformanceComparisonTest() builds its own local instance under the same name.
energy_reward = EnergyBasedRewardFunction(mass_cart=1.0, mass_pole=0.1, length=0.5, gravity=9.8)
# meta_reward = RewardFunctionMetaLearner(state_dim=4, action_dim=1)  # CartPole has 4 state dims, 1 action dim

# def potentialBasedRewardShaping(observation, action):
#     """Advanced potential-based reward shaping using meta-learning"""
#     reward_func = meta_reward.generate_reward_function()
#     return float(reward_func(observation, action))

# def parameterizedRewardShaping(observation, action):
#     """Meta-learning based parameterized reward shaping"""
#     # Use meta-learning framework for parameter optimization
#     learner = meta_learning_cartpole()
#     return float(learner.parameterized_reward(observation, action))

def energyBasedReward(observation, action):
    """Evaluate the module-level ``energy_reward`` model and return a plain float."""
    raw_reward = energy_reward.compute_reward(observation, action)
    return float(raw_reward)


def potentialBasedReward(observation, action):
    """Static (non-adaptive) potential-based reward shaping for CartPole.

    Returns ``1.0 + gamma * phi(s') - phi(s)`` where ``s'`` is a simplified
    next state obtained by adding each velocity to its coordinate.

    Args:
        observation: Sequence ``(x, x_dot, theta, theta_dot)``.
        action: Unused.
    """
    discount = 0.99
    x, x_dot, theta, theta_dot = observation

    def phi(state):
        # Negative squared terms: the potential is zero when all four state
        # entries are zero and decreases as any of them grows; the angle
        # term carries twice the weight of the others.
        pos_term = -(state[0] ** 2)
        vel_term = -(state[1] ** 2)
        angle_term = -(state[2] ** 2)
        ang_vel_term = -(state[3] ** 2)
        return pos_term + 2*angle_term + vel_term + ang_vel_term

    predicted_state = [x + x_dot, x_dot, theta + theta_dot, theta_dot]
    phi_now = phi(observation)
    phi_next = phi(predicted_state)

    # PBRS formula: γΦ(s') - Φ(s), added to the base reward of 1.0
    return 1.0 + (discount * phi_next - phi_now)


def baselineReward(observation, action):
    """Standard baseline reward: a constant 1.0, independent of observation and action."""
    return 1.0

In [5]:
def runPerformanceComparisonTest(
    episodes=1000,
    changeInterval=500,
    lengthchanges=(0.5, 1.5),
    mass_cart=1.0,
    mass_pole=0.1,
    initial_length=0.5,
    gravity=9.8,
    seed=42
):
    """Train one DQ-learning agent per reward function under a changing pole length.

    The reward functions ('adaptivereward', 'energy_based', 'baseline', 'pbrs')
    are trained one after another on the same wrapped CartPole environment.
    Every ``changeInterval`` episodes (episode 0 excluded) the pole length
    advances cyclically through ``lengthchanges``. The schedule restarts from
    ``lengthchanges[0]`` for every reward function so that all of them face
    the same sequence of environments.

    Args:
        episodes: Number of training episodes per reward function.
        changeInterval: Episodes between pole-length changes.
        lengthchanges: Pole lengths to cycle through; index 0 is the start length.
        mass_cart: Cart mass passed to the environment and the energy model.
        mass_pole: Pole mass passed to the energy model.
        initial_length: Pole length the energy model is initialised with.
        gravity: Gravity passed to the environment and the energy model.
        seed: Seed applied to numpy, random, torch and the gym spaces.

    Returns:
        dict mapping reward name to a dict with keys 'rewards' and
        'balancetimes' (the per-episode values received by the episode
        callback) and 'rewardChanges' (episodes at which the reward
        function was updated).
    """
    print(f"Starting Performance Comparison Test with seed {seed}...")

    # Set all random seeds
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    # Initialize base environment with seed
    env = gym.make('CartPole-v1', max_episode_steps=2000, render_mode=None)
    env.action_space.seed(seed)
    env.observation_space.seed(seed)
    env.reset(seed=seed)

    env = CustomCartPoleEnv(env, numComponents=2)
    env.setEnvironmentParameters(masscart=mass_cart, length=lengthchanges[0], gravity=gravity)

    currentlengthidx = 0

    # Initialize energy-based reward function
    energy_reward = EnergyBasedRewardFunction(
        mass_cart=mass_cart,
        mass_pole=mass_pole,
        length=initial_length,
        gravity=gravity
    )

    # Define all reward functions to test
    rewardfunctions = {
        'adaptivereward': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': RewardUpdateSystem(apiKey, modelName),
            'rewardfunction': None,
            'update_method': 'llm'
        },
        'pbrs': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': None,
            'rewardfunction': potentialBasedReward,
            'update_method': None
        },
        'energy_based': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': energy_reward,
            'rewardfunction': energy_reward.compute_reward,
            'update_method': 'physics'
        },
        'baseline': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': None,
            'rewardfunction': baselineReward,
            'update_method': None
        }
    }

    results = {}

    # Create a specific order for testing
    test_order = ['adaptivereward', 'energy_based', 'baseline', 'pbrs']

    # Test each reward function in the specified order
    for rewardname in test_order:
        rewardinfo = rewardfunctions[rewardname]
        print(f"\nTesting reward function: {rewardname}")

        # Restart the pole-length schedule. Without this, each reward function
        # would inherit whatever length the previous one finished on and the
        # comparison would not be like-for-like.
        currentlengthidx = 0
        env.setEnvironmentParameters(masscart=mass_cart, length=lengthchanges[0], gravity=gravity)

        env.reset()
        if rewardname == 'adaptivereward':
            # Initialize both components for adaptive reward
            env.setComponentReward(1, stabilityReward)  # Set initial stability component
            env.setComponentReward(2, efficiencyReward)  # Set initial efficiency component
            rewardinfo['updatesystem'].lastUpdateEpisode = 0
        else:
            env.setRewardFunction(rewardinfo['rewardfunction'])

        episoderewards = []
        episodebalancetimes = []
        rewardchangeepisodes = []

        def onEpisodeEnd(env, updatesystem, episode, reward, steps):
            # Only the index is rebound here; the lists are mutated in place.
            nonlocal currentlengthidx

            # Record episode results
            episoderewards.append(reward)
            episodebalancetimes.append(steps)

            # Create metrics dictionary over the most recent 100 episodes
            recentbalance = episodebalancetimes[-100:]
            metrics = {
                'currentEpisode': episode,
                'recentRewards': episoderewards[-100:],
                'averageBalanceTime': np.mean(recentbalance),
                'balanceTimeVariance': np.var(recentbalance) if len(episodebalancetimes) > 1 else 0
            }

            # Debug print for metrics
            if episode % 1000 == 0:
                print(f"\nMetrics Debug at Episode {episode}:")
                print(f"Recent Average Reward: {np.mean(metrics['recentRewards']):.2f}")
                print(f"Average Balance Time: {metrics['averageBalanceTime']:.2f}")
                print(f"Balance Time Variance: {metrics['balanceTimeVariance']:.2f}")

                if rewardname == 'adaptivereward' and hasattr(env, 'getCurrentWeights'):
                    weights = env.getCurrentWeights()
                    print(f"Component Weights - Stability: {weights['stability']:.2f}, "
                          f"Efficiency: {weights['efficiency']:.2f}")

            # Episode 0 is excluded so the first change happens after a full interval.
            environmentchanges = episode > 0 and episode % changeInterval == 0
            nextlengthidx = (currentlengthidx + 1) % len(lengthchanges)

            # Handle LLM updates only for adaptive reward
            if rewardname == 'adaptivereward' and updatesystem is not None:
                # Only show major milestones
                if episode % 1000 == 0:
                    print(f"\nEpisode {episode} - Time Since Last Update: {episode - updatesystem.lastUpdateEpisode}")

                for component in range(1, 3):
                    updatesystem.targetComponent = component
                    if updatesystem.waitingTime(f'component_{component}', metrics, updatesystem.lastUpdateEpisode):
                        current_func = env.rewardComponents[f'rewardFunction{component}']
                        new_function, updated = updatesystem.validateAndUpdate(current_func)

                        if updated:
                            env.setComponentReward(component, new_function)
                            rewardchangeepisodes.append(episode)
                            updatesystem.lastUpdateEpisode = episode
                            print(f"✓ LLM update for component {component} at episode {episode}")

            # Handle physics-based updates
            elif rewardinfo['update_method'] == 'physics':
                if environmentchanges:
                    print(f"\nUpdating physics-based reward at episode {episode}")
                    # Use the length the environment is about to switch to,
                    # not the one it is leaving.
                    updatesystem.length = lengthchanges[nextlengthidx]
                    env.setRewardFunction(updatesystem.compute_reward)
                    rewardchangeepisodes.append(episode)
                    print("✓ Physics-based update completed")

            # Environment changes
            if environmentchanges:
                currentlengthidx = nextlengthidx
                newlength = lengthchanges[currentlengthidx]
                env.setEnvironmentParameters(length=newlength)
                print(f"\nChanged pole length to: {newlength}m at episode {episode}")

        # Train the agent; per-episode results are collected by the callback,
        # so only the returned env is kept.
        _, env, _ = trainDQLearning(
            agent=rewardinfo['agent'],
            env=env,
            numEpisodes=episodes,
            updateSystem=rewardinfo['updatesystem'],
            onEpisodeEnd=onEpisodeEnd
        )

        # Store results
        results[rewardname] = {
            'rewards': episoderewards,
            'balancetimes': episodebalancetimes,
            'rewardChanges': rewardchangeepisodes
        }

        # Print final performance metrics
        print(f"\nCompleted testing {rewardname}")
        print(f"Final average reward: {np.mean(episoderewards[-100:]):.2f}")
        print(f"Final average balance time: {np.mean(episodebalancetimes[-100:]):.2f}")

    return results

In [6]:
# Run Experiment

# Number of episodes between pole-length changes for the single-run call,
# which is currently commented out below.
changeInterval = 20000

# results = runPerformanceComparisonTest(
#     episodes=40000,  
#     changeInterval=changeInterval,
#     mass_cart=1.0,
#     lengthchanges=[0.3, 0.9]  
# )

In [7]:
def visualizePerformanceComparison(results, changeInterval, folder_path,
                                   lengthchanges=(0.3, 0.9), window=50):
    """Create and save performance comparison visualizations.

    Produces a three-panel figure (rolling reward, rolling balance time, and
    the pole-length schedule) and writes it as a timestamped PNG.

    Args:
        results: dict mapping reward name to a dict with 'rewards',
            'balancetimes' and optionally 'rewardChanges' sequences.
        changeInterval: Episodes between pole-length changes.
        folder_path: Directory the PNG is written to.
        lengthchanges: Pole lengths the schedule cycles through; the default
            matches the values that used to be hard-coded here.
        window: Rolling window size for the mean / standard deviation curves.
    """
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 15))

    # Color map for different reward functions (cycled if there are more)
    colors = ['b', 'g', 'r', 'c', 'm']

    # Episode count and change points are shared by all panels.
    numEpisodes = max((len(r['rewards']) for r in results.values()), default=0)
    change_episodes = range(changeInterval, numEpisodes, changeInterval)

    _plotRollingMetric(ax1, results, 'rewards', colors, window, change_episodes,
                       'Average Reward Over Time with Variance', 'Reward')
    _plotRollingMetric(ax2, results, 'balancetimes', colors, window, change_episodes,
                       'Average Balance Time Over Episodes with Variance', 'Steps')

    # Plot environment parameters: the length cycles every changeInterval episodes
    env_param_history = [
        lengthchanges[(episode // changeInterval) % len(lengthchanges)]
        for episode in range(numEpisodes)
    ]

    ax3.plot(env_param_history, label='Pole Length', color='purple')
    ax3.set_title('Environment Parameters Over Episodes')
    ax3.set_xlabel('Episode')
    ax3.set_ylabel('Pole Length (m)')
    ax3.grid(True)
    ax3.legend()

    plt.tight_layout()

    # Save plots
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(folder_path, f"performance_comparison_{timestamp}.png")
    fig.savefig(filepath, bbox_inches='tight', dpi=300)
    print(f"Saved plot: performance_comparison_{timestamp}.png in {folder_path}")
    plt.close(fig)


def _plotRollingMetric(ax, results, key, colors, window, change_episodes, title, ylabel):
    """Draw rolling mean ± std of ``results[name][key]`` and change markers on ``ax``."""
    for idx, (rewardname, rewardresults) in enumerate(results.items()):
        series = pd.Series(rewardresults[key])
        color = colors[idx % len(colors)]

        rolling = series.rolling(window=window)
        rolling_mean = rolling.mean()
        rolling_std = rolling.std()

        # Mean line with a shaded one-standard-deviation band
        ax.plot(rolling_mean, label=f'{rewardname}', linewidth=2, color=color)
        ax.fill_between(
            range(len(series)),
            rolling_mean - rolling_std,
            rolling_mean + rolling_std,
            color=color,
            alpha=0.2
        )

        # Vertical lines for reward function changes (green); de-duplicated so
        # several updates in one episode give a single line and legend entry.
        update_episodes = sorted(set(rewardresults.get('rewardChanges', [])))
        for i, ep in enumerate(update_episodes):
            ax.axvline(x=ep, color='g', linestyle='--', alpha=0.3,
                       label=f'{rewardname} Update' if i == 0 else None)

    # Vertical lines for environment changes (red), drawn once per panel
    # rather than once per reward function.
    for i, ep in enumerate(change_episodes):
        ax.axvline(x=ep, color='r', linestyle='--', alpha=0.3,
                   label='Environment Change' if i == 0 else None)

    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(True)
    
    
    
def calculateStability(rewards):
    """Return a stability score in [0, 1] for the last 100 episodes.

    The score is ``1 - std / |mean|`` (one minus the coefficient of
    variation) clamped to [0, 1]; lower relative variance gives a higher
    score. The absolute value of the mean is used so that reward functions
    with a negative mean are not automatically scored as perfectly stable.

    Args:
        rewards: Sequence of per-episode rewards.

    Returns:
        0.0 if fewer than 100 episodes are available or the mean is zero,
        otherwise the clamped score as a float.
    """
    if len(rewards) < 100:
        return 0.0

    last_hundred = np.asarray(rewards[-100:], dtype=float)
    mean_reward = last_hundred.mean()
    if mean_reward == 0:
        return 0.0

    # Coefficient of variation (standard deviation normalised by |mean|)
    stability = 1.0 - (last_hundred.std() / abs(mean_reward))
    return float(max(0.0, min(1.0, stability)))  # Clamp to [0, 1]

def calculateConvergenceTime(rewards, threshold=195, window=50):
    """Return the first episode index at which performance converges.

    An episode index counts as converged when the ``window``-episode rolling
    mean at that index is at least ``threshold`` and every rolling mean in
    the following slice of up to ``window`` entries stays at or above 90% of
    ``threshold``. The scan starts at index ``window``.

    Returns:
        The converged episode index, or ``len(rewards)`` when there are
        fewer than ``window`` episodes or convergence never happens.
    """
    total = len(rewards)
    if total < window:
        return total

    smoothed = pd.Series(rewards).rolling(window).mean().to_numpy()
    sustain_floor = threshold * 0.9

    for start in range(window, total):
        # Written as "not >=" so that a NaN mean is skipped, never accepted.
        if not smoothed[start] >= threshold:
            continue
        # Near the end of the series the slice is shorter than `window`.
        if (smoothed[start:start + window] >= sustain_floor).all():
            return start

    return total  # If never converged, return total episodes

def calculatePerformanceMetrics(results):
    """Summarise each reward type's run as one row of a DataFrame.

    Columns: 'finalavgreward' and 'finalavgbalance' (means over the last 100
    episodes), 'convergencetime' and 'stability' (both computed from the
    reward series). Rows are indexed by reward name.
    """
    def summarise(run):
        reward_series = run['rewards']
        balance_series = run['balancetimes']
        return {
            'finalavgreward': np.mean(reward_series[-100:]),
            'finalavgbalance': np.mean(balance_series[-100:]),
            'convergencetime': calculateConvergenceTime(reward_series),
            'stability': calculateStability(reward_series)
        }

    per_reward = {name: summarise(run) for name, run in results.items()}
    # Transpose so reward names become the row index.
    return pd.DataFrame(per_reward).T
    
    
    
    
    
def saveMetricsTable(metrics, filename, folder_path):
    """Render ``metrics`` as a table image and save it as a timestamped PNG.

    Cell values are rounded to three decimals via ``metrics.values.round(3)``.
    The file is written to ``folder_path`` as ``<filename>_<timestamp>.png``.
    """
    fig, ax = plt.subplots(figsize=(12, 4))
    # Fit the axes to the table and hide the axis frame and ticks
    for axis_mode in ('tight', 'off'):
        ax.axis(axis_mode)

    rounded_cells = metrics.values.round(3)
    table = ax.table(
        cellText=rounded_cells,
        colLabels=metrics.columns,
        rowLabels=metrics.index,
        cellLoc='center',
        loc='center'
    )

    # Fixed font size and a slightly enlarged cell grid
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1.2, 1.5)

    plt.title("Performance Metrics Comparison")

    # Save with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_name = f"{filename}_{timestamp}.png"
    plt.savefig(os.path.join(folder_path, output_name), bbox_inches='tight', dpi=300)
    print(f"Saved metrics table: {output_name} in {folder_path}")
    plt.close()



In [8]:

# Visualize the results
# visualizePerformanceComparison(results,changeInterval)


# # Calculate and display the metrics
# metrics = calculatePerformanceMetrics(results)
# print("\nPerformance Metrics:")
# print(metrics)
    
# Call the function
# saveMetricsTable(metrics)

### Multiple runs to generate confidence intervals

In [9]:
def createExperimentFolder():
    """Create a timestamped folder for experiment results.

    The folder is created as ``PerformanceExperiment/experiment_<timestamp>``
    relative to the current working directory.

    Returns:
        The relative path of the newly created folder.

    Raises:
        FileExistsError: If a folder for the same timestamp already exists,
            so two experiments never write into the same folder.
    """
    from datetime import datetime
    import os

    base_folder = "PerformanceExperiment"

    # exist_ok avoids the check-then-create race of testing os.path.exists first
    os.makedirs(base_folder, exist_ok=True)

    # Create timestamped subfolder
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_folder = os.path.join(base_folder, f"experiment_{timestamp}")
    os.makedirs(experiment_folder)

    return experiment_folder

def savePlot(fig, filename, folder_path):
    """Write ``fig`` to ``folder_path`` as ``<filename>_<timestamp>.png`` (300 dpi)."""
    from datetime import datetime
    import os

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_name = f"{filename}_{stamp}.png"
    fig.savefig(os.path.join(folder_path, output_name), bbox_inches='tight', dpi=300)
    print(f"Saved plot: {output_name} in {folder_path}")
    
def saveMetricsTable(metrics, filename, folder_path):
    """Render ``metrics`` as a table image and save it as a timestamped PNG.

    Cell values are passed to the table unchanged (no rounding). The file is
    written to ``folder_path`` as ``<filename>_<timestamp>.png``.
    """
    import matplotlib.pyplot as plt
    from datetime import datetime
    import os

    fig, ax = plt.subplots(figsize=(12, 4))
    # Fit the axes to the table and hide the axis frame and ticks
    for axis_mode in ('tight', 'off'):
        ax.axis(axis_mode)

    table = ax.table(
        cellText=metrics.values,
        colLabels=metrics.columns,
        rowLabels=metrics.index,
        cellLoc='center',
        loc='center',
    )

    # Fixed font size and a slightly enlarged cell grid
    table.auto_set_font_size(False)
    table.set_fontsize(9)
    table.scale(1.2, 1.5)

    plt.title("Performance Metrics Comparison")

    # Save with timestamp
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_name = f"{filename}_{stamp}.png"
    plt.savefig(os.path.join(folder_path, output_name), bbox_inches='tight', dpi=300)
    print(f"Saved metrics table: {output_name} in {folder_path}")
    plt.close()

def runMultipleExperiments(numRuns=4, episodes=40000, changeInterval=20000, baseSeed=42):
    """Run several independent experiments and save results in organized folders.

    Each run gets its own seed (``baseSeed + run index``). Reusing a single
    seed for every run would make the runs near-replicas and the confidence
    intervals below artificially narrow.

    Args:
        numRuns: Number of independent runs (at least 1).
        episodes: Training episodes per reward function in each run.
        changeInterval: Episodes between pole-length changes.
        baseSeed: Seed of the first run; run ``i`` (0-based) uses ``baseSeed + i``.

    Returns:
        Tuple ``(confidenceIntervals, allResults, resultsTable)``:
        ``confidenceIntervals`` maps reward type -> metric -> {'mean', 'ci'},
        ``allResults`` is the list of per-run result dicts, and
        ``resultsTable`` is a DataFrame of "mean ± ci" strings.

    Raises:
        ValueError: If ``numRuns`` is less than 1.
    """
    if numRuns < 1:
        raise ValueError("numRuns must be at least 1")

    # Create main experiment folder
    experiment_folder = createExperimentFolder()

    allResults = []
    allMetrics = []
    aggregatedMetrics = {}

    # Create run folders
    for run in range(numRuns):
        print(f"\nStarting Run {run + 1}/{numRuns}")

        # Create folder for this run
        run_folder = os.path.join(experiment_folder, f"run_{run + 1}")
        os.makedirs(run_folder)

        # Run experiment with a run-specific seed
        results = runPerformanceComparisonTest(
            episodes=episodes,
            changeInterval=changeInterval,
            mass_cart=1.0,
            lengthchanges=[0.3, 0.9],
            seed=baseSeed + run
        )

        # Calculate metrics for this run
        runMetrics = calculatePerformanceMetrics(results)

        # Store results
        allResults.append(results)
        allMetrics.append(runMetrics)

        # Store metrics by reward type
        for rewardType, row in runMetrics.iterrows():
            aggregatedMetrics.setdefault(rewardType, []).append(row.to_dict())

        # Visualize and save individual run results
        visualizePerformanceComparison(results, changeInterval, run_folder)
        saveMetricsTable(runMetrics, f"metrics_run_{run + 1}", run_folder)

    # Calculate confidence intervals (95%, normal approximation).
    # NOTE: with only a few runs the z-value 1.96 understates the interval
    # (a Student-t quantile would be wider); with a single run the sample
    # standard deviation is NaN, so the interval is reported as nan.
    metricLabels = {
        'finalavgreward': 'Average Reward',
        'finalavgbalance': 'Average Balance',
        'convergencetime': 'Convergence Time',
        'stability': 'Stability',
    }
    confidenceIntervals = {}
    tableRows = {}

    for rewardType, metrics_list in aggregatedMetrics.items():
        metrics_df = pd.DataFrame(metrics_list)
        means = metrics_df.mean()
        cis = 1.96 * metrics_df.std() / np.sqrt(len(metrics_df))

        # Formatted row for the summary table
        tableRows[rewardType] = {
            label: f"{means[key]:.2f} ± {cis[key]:.2f}"
            for key, label in metricLabels.items()
        }

        # Store the raw values for possible further analysis
        confidenceIntervals[rewardType] = {
            key: {'mean': means[key], 'ci': cis[key]} for key in metricLabels
        }

    resultsTable = pd.DataFrame.from_dict(tableRows, orient='index')

    # Save aggregate statistics (explicit UTF-8 because of the "±" character)
    statsPath = os.path.join(experiment_folder, "aggregate_statistics.txt")
    with open(statsPath, "w", encoding="utf-8") as f:
        f.write("Aggregate Statistics:\n")
        for rewardType, rewardStats in confidenceIntervals.items():
            f.write(f"\n{rewardType}:\n")
            for metric, values in rewardStats.items():
                f.write(f"{metric}: {values['mean']:.2f} ± {values['ci']:.2f}\n")

    # Save final results table
    saveMetricsTable(resultsTable, "final_results", experiment_folder)

    print(f"\nExperiment results saved in: {experiment_folder}")
    return confidenceIntervals, allResults, resultsTable

In [10]:
# confidenceIntervals, allResults, resultsTable = runMultipleExperiments(
#     numRuns=4,
#     episodes=10000,
#     changeInterval=5000
# )

# Reduced configuration: 2 runs of 10000 episodes each, with a pole-length
# change every 5000 episodes (the 4-run variant is commented out above).
confidenceIntervals, allResults, resultsTable = runMultipleExperiments(
    numRuns=2,
    episodes=10000,
    changeInterval=5000
)


Starting Run 1/2
Starting Performance Comparison Test with seed 42...
Environment parameters updated: masscart=1.0, length=0.3, gravity=9.8

Testing reward function: adaptivereward

Metrics Debug at Episode 0:
Recent Average Reward: 20.82
Average Balance Time: 25.00
Balance Time Variance: 0.00
Component Weights - Stability: 0.60, Efficiency: 0.40

Episode 0 - Time Since Last Update: 0

Episode 0/10000
Average Reward: 20.82
Average Steps: 25.00

Episode 100/10000
Average Reward: 21.85
Average Steps: 22.61

Episode 200/10000
Average Reward: 35.34
Average Steps: 35.92

Episode 300/10000
Average Reward: 56.94
Average Steps: 57.51

Episode 400/10000
Average Reward: 112.95
Average Steps: 114.72

Episode 500/10000
Average Reward: 145.42
Average Steps: 147.54

Episode 600/10000
Average Reward: 77.41
Average Steps: 78.38

Episode 700/10000
Average Reward: 110.22
Average Steps: 110.58

Episode 800/10000
Average Reward: 345.41
Average Steps: 346.15

Episode 900/10000
Average Reward: 317.99
Avera

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


✓ LLM update for component 1 at episode 1505

Episode 1600/10000
Average Reward: 42.10
Average Steps: 87.55

Episode 1700/10000
Average Reward: 4.12
Average Steps: 10.81

Episode 1800/10000
Average Reward: 39.02
Average Steps: 97.77

Episode 1900/10000
Average Reward: 65.32
Average Steps: 163.52

Metrics Debug at Episode 2000:
Recent Average Reward: 35.11
Average Balance Time: 88.00
Balance Time Variance: 119261.88
Component Weights - Stability: 0.60, Efficiency: 0.40

Episode 2000 - Time Since Last Update: 495

Episode 2000/10000
Average Reward: 35.11
Average Steps: 88.00

Episode 2100/10000
Average Reward: 24.41
Average Steps: 61.17

Episode 2200/10000
Average Reward: 25.90
Average Steps: 64.90

Episode 2300/10000
Average Reward: 11.27
Average Steps: 28.32

Episode 2400/10000
Average Reward: 13.85
Average Steps: 34.91

Episode 2500/10000
Average Reward: 12.47
Average Steps: 31.49

Episode 2600/10000
Average Reward: 10.82
Average Steps: 27.36

Episode 2700/10000
Average Reward: 11.55


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


✓ LLM update for component 1 at episode 1505

Episode 1600/10000
Average Reward: 42.10
Average Steps: 87.55

Episode 1700/10000
Average Reward: 4.12
Average Steps: 10.81

Episode 1800/10000
Average Reward: 39.02
Average Steps: 97.77

Episode 1900/10000
Average Reward: 65.32
Average Steps: 163.52

Metrics Debug at Episode 2000:
Recent Average Reward: 35.11
Average Balance Time: 88.00
Balance Time Variance: 119261.88
Component Weights - Stability: 0.60, Efficiency: 0.40

Episode 2000 - Time Since Last Update: 495

Episode 2000/10000
Average Reward: 35.11
Average Steps: 88.00

Episode 2100/10000
Average Reward: 24.41
Average Steps: 61.17

Episode 2200/10000
Average Reward: 25.90
Average Steps: 64.90

Episode 2300/10000
Average Reward: 11.27
Average Steps: 28.32

Episode 2400/10000
Average Reward: 13.85
Average Steps: 34.91

Episode 2500/10000
Average Reward: 12.47
Average Steps: 31.49

Episode 2600/10000
Average Reward: 10.82
Average Steps: 27.36

Episode 2700/10000
Average Reward: 11.55
