In [1]:
#Also Implement these Ideas: vvvv

# Have many possible starting points. Many starting reward functions. Test if they start to perform.

# This bit will be an exploration of the weaknesses of the architecture, in turn checking robustness.

# This could consist of creating many 'bad' initial reward functions and seeing if the system can re-adjust.

# I would also need to investigate how performance changes from one reward function to the next, and check over time whether each change actually improves performance consistently. Is it robust over many changes in this sense?


#Effective waiting time.
#Take the current implementation and change it to **measure balance time as a performance metric**

### **Here I am simply investigating the robustness of performance when adapting reward functions**

-> Does the performance have any unexpected jumps? Does the performance drop? Are the reward functions sensible in the context?

I do this in three stages of tests:
1. *Test a badly initialized composite reward function. See how it adaptively updates.*
2. *Start with good initial composite reward functions, but with varying environment states. See if it is robust to varying environment variables.*
3. *Both a bad initial set of composite reward functions and also varying environment variables.*

In [2]:
# Cell 1: Common Imports and Setup
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import sys
from pathlib import Path
from datetime import datetime

# plt.style.use('seaborn')
# sns.set_palette("husl")

current_dir = os.getcwd()  
project_root = str(Path(current_dir).parent.parent)
sys.path.append(project_root)

# Initialize environment and device
from AdaptiveRewardFunctionLearning.Prompts.prompts import device, apiKey,modelName

#CustomCartPoleEnv
from RLEnvironment.env import CustomCartPoleEnv
#RewardUpdateSystem
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCritic import RewardUpdateSystem
#DQLearningAgent
from RLEnvironment.training.agent import DQLearningAgent
from RLEnvironment.training.training import trainDQLearning

#DynamicRewardFunction
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCodeGeneration import dynamicRewardFunction

#import
from AdaptiveRewardFunctionLearning.Visualisation.trainingTestFunctions import (
    runEpisode,
    detectJumps,
    analyzeRewardSensibility,
    performUpdate,
    plotExperimentResults,
    savePlot
)

# Set bad initial reward function
def badReward(observation, action):
    """Deliberately poor reward function used to seed the adaptation experiments.

    The reward is dominated by a penalty on the first observation component,
    with only a small bonus from the cosine of the third component and a small
    penalty on the magnitude of the second one.

    Args:
        observation: Four-element sequence unpacked as
            (x, xDot, angle, angleDot) — presumably the CartPole state
            (cart position, cart velocity, pole angle, pole angular velocity).
        action: Action taken; accepted for interface compatibility but unused.

    Returns:
        The reward as a plain Python float.
    """
    cartPosition, cartVelocity, poleAngle, _ = observation

    positionPenalty = -5.0 * abs(cartPosition)
    uprightBonus = 0.1 * np.cos(poleAngle)
    velocityPenalty = 0.1 * abs(cartVelocity)

    # Summed in this order on purpose so the floating-point result is unchanged.
    return float(positionPenalty + uprightBonus - velocityPenalty)
    

### **Experiment 1: Bad Initialization Test**

In [4]:
# Cell 2: Experiment 1 - Bad Initialization Test
def runBadInitializationTest(episodes=1000):
    """Experiment 1: start from a deliberately bad reward function and log how training evolves.

    A CartPole-v1 environment is wrapped in ``CustomCartPoleEnv`` and given
    ``badReward`` as its reward function. A ``DQLearningAgent`` then plays
    ``episodes`` episodes; after each one ``updateSystem.waitingTime(episode)``
    decides whether ``performUpdate`` is called to revise the reward function.
    Rewards and balance times are plotted, saved via ``savePlot`` and shown.

    Args:
        episodes: Number of episodes to run.

    Returns:
        Tuple ``(rewards, metrics, balance_times)`` where ``rewards`` holds the
        summed reward per episode, ``balance_times`` the number of timesteps
        per episode, and ``metrics`` maps every 100th episode index to a
        snapshot dict with the keys ``jumps``, ``averageReward``,
        ``sensibility`` and ``averageBalanceTime``.
    """
    print("Starting Bad Initialization Test...")

    env = gym.make('CartPole-v1')
    env = CustomCartPoleEnv(env)
    env.setRewardFunction(badReward)

    updateSystem = RewardUpdateSystem(apiKey, modelName)
    agent = DQLearningAgent(env, 4, 2, device)

    rewards = []
    balance_times = []
    metrics = {}

    for episode in range(episodes):
        state = env.reset()[0]
        episode_reward = 0
        timesteps = 0
        done = False

        while not done:
            action = agent.chooseAction(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            episode_reward += reward
            timesteps += 1

            # NOTE(review): only remember() is called here; whether that also
            # triggers a learning step depends on DQLearningAgent — confirm.
            agent.remember(state, action, reward, next_state, done)
            state = next_state

        rewards.append(episode_reward)
        balance_times.append(timesteps)

        # Snapshot of the trailing 100 episodes every 100 episodes.
        if episode % 100 == 0:
            metrics[episode] = {
                'jumps': detectJumps(rewards),
                'averageReward': np.mean(rewards[-100:]),
                'sensibility': analyzeRewardSensibility(env.rewardFunction),
                'averageBalanceTime': np.mean(balance_times[-100:])
            }

        if updateSystem.waitingTime(episode):
            performUpdate(env, updateSystem, episode)

    def plotSeriesWithBlockAverages(ax, series, seriesLabel, title, yLabel):
        """Draw the raw series, its 50-episode moving average and 100-episode block means."""
        ax.plot(series, alpha=0.6, label=seriesLabel)
        ax.plot(pd.Series(series).rolling(50).mean(), label='50-Episode Moving Average', linewidth=2)

        # Block means are drawn in data coordinates. axhline's xmin/xmax are
        # axes fractions, which do not line up with episode indices once the
        # axis has margins and can run past 1 for a partial final block.
        for start in range(0, len(series), 100):
            block = series[start:start + 100]
            ax.hlines(y=np.mean(block), xmin=start, xmax=start + len(block) - 1,
                      colors='r', linestyles='--', alpha=0.5)

        ax.set_title(title)
        ax.set_xlabel('Episode')
        ax.set_ylabel(yLabel)
        ax.legend()
        ax.grid(True)

    # Plot results with two subplots
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
    plotSeriesWithBlockAverages(ax1, rewards, 'Episode Reward', 'Rewards Over Time', 'Total Reward')
    plotSeriesWithBlockAverages(ax2, balance_times, 'Balance Time', 'Balance Time Over Episodes', 'Timesteps')

    plt.tight_layout()

    savePlot(fig, "BadInitialization", plotType="training_results")

    plt.show()

    return rewards, metrics, balance_times

# Run Experiment 1 for 1000 episodes; keeps the per-episode rewards, the
# periodic metric snapshots and the per-episode balance times at notebook scope.
rewards, metrics, balance_times = runBadInitializationTest(1000)  

Starting Bad Initialization Test...


NameError: name 'CustomCartPoleEnv' is not defined

### **Experiment 2 - Environment Variation Test**


Every relationship between the environment variables can be expressed mathematically. So if I can show that the system adapts optimally to a change in one environment variable, this should generalize to changes in the other environment variables as well.

In [4]:
def runEnvironmentVariationTest(episodes=1000, changeInterval=200):
    """Experiment 2: train while the pole length is changed at fixed intervals.

    A CartPole-v1 environment wrapped in ``CustomCartPoleEnv`` starts with the
    first entry of a fixed list of pole lengths. Training is delegated to
    ``trainDQLearning``; a per-episode callback records rewards and balance
    times, notes episodes at which ``dynamicRewardFunction.compositeHistory``
    reports an update, and advances to the next pole length whenever the
    episode index is a positive multiple of ``changeInterval``. The resulting
    figure is written to a timestamped PNG in the working directory and closed.

    Args:
        episodes: Number of training episodes passed to ``trainDQLearning``.
        changeInterval: Number of episodes between pole-length changes; must
            be positive.

    Returns:
        Dict with the keys ``rewards``, ``balance_times``, ``metrics``
        (snapshot every 100th episode), ``length_history`` (pole length that
        was in effect during each recorded episode) and ``reward_changes``
        (episodes at which a reward-function update was detected).

    Raises:
        ValueError: If ``changeInterval`` is not positive.
    """
    if changeInterval <= 0:
        raise ValueError(f"changeInterval must be positive, got {changeInterval!r}")

    print("Starting Environment Variation Test...")

    # Pole lengths cycled through during training
    length_changes = [0.5, 0.75, 1.0, 1.25, 1.5]
    current_length_idx = 0

    # Setup environment with initial configuration
    env = gym.make('CartPole-v1')
    env = CustomCartPoleEnv(env)
    env.setEnvironmentParameters(masscart=1.0, length=length_changes[0], gravity=9.8)

    # Initialize update system and agent
    updateSystem = RewardUpdateSystem(apiKey, modelName)
    agent = DQLearningAgent(env, 4, 2, device)

    # Tracking state filled in by the callback
    metrics = {}
    episode_rewards = []
    episode_balance_times = []
    reward_change_episodes = []
    length_history = []

    def onEpisodeEnd(env, updateSystem, episode, reward, steps):
        """Per-episode hook: record results, detect reward updates, vary the pole length.

        NOTE(review): assumes trainDQLearning calls this once after every
        finished episode with a 0-based ``episode`` index — confirm.
        """
        # Only the index is rebound; the lists and dict are mutated in place.
        nonlocal current_length_idx

        episode_rewards.append(reward)
        episode_balance_times.append(steps)
        # Recorded before any change below, so this is the length the episode
        # that just finished actually ran with.
        length_history.append(length_changes[current_length_idx])

        # Check if any composite reward function was updated this episode
        if hasattr(dynamicRewardFunction, 'compositeHistory'):
            updated_now = any(
                update['episode'] == episode
                for update in dynamicRewardFunction.compositeHistory
            )
            if updated_now:
                reward_change_episodes.append(episode)
                print(f"\nReward function updated at episode {episode}")

        if episode % changeInterval == 0 and episode > 0:
            current_length_idx = (current_length_idx + 1) % len(length_changes)
            new_length = length_changes[current_length_idx]
            env.setEnvironmentParameters(length=new_length)
            print(f"\nChanged pole length to: {new_length}m at episode {episode}")

        # Record metrics every 100 episodes. 'currentLength' is the length
        # after any change applied just above, i.e. the one used from the
        # next episode on.
        if episode % 100 == 0:
            metrics[episode] = {
                'jumps': detectJumps(episode_rewards),
                'averageReward': np.mean(episode_rewards[-100:]),
                'sensibility': analyzeRewardSensibility(env.rewardFunction),
                'averageBalanceTime': np.mean(episode_balance_times[-100:]),
                'currentLength': length_changes[current_length_idx]
            }
            print(f"Episode {episode}: Avg Balance Time = {metrics[episode]['averageBalanceTime']:.2f}")

    # Train the agent; everything needed afterwards is collected by the callback,
    # so the returned (agent, env, rewards) tuple is not used.
    trainDQLearning(
        agent=agent,
        env=env,
        numEpisodes=episodes,
        updateSystem=updateSystem,
        onEpisodeEnd=onEpisodeEnd
    )

    def markChanges(ax):
        """Add red lines at pole-length changes and green lines at reward updates."""
        for ep in range(changeInterval, episodes, changeInterval):
            ax.axvline(x=ep, color='r', linestyle='--', alpha=0.3,
                       label='Length Change' if ep == changeInterval else None)
        for ep in reward_change_episodes:
            ax.axvline(x=ep, color='g', linestyle='--', alpha=0.3,
                       label='Reward Update' if ep == reward_change_episodes[0] else None)

    # Plot results
    fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(12, 15))

    # Plot rewards
    ax1.plot(episode_rewards, alpha=0.6, label='Episode Reward')
    ax1.plot(pd.Series(episode_rewards).rolling(50).mean(),
             label='50-Episode Moving Average', linewidth=2)
    markChanges(ax1)
    ax1.set_title('Rewards Over Time with Length and Reward Function Changes')
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Total Reward')
    ax1.legend()
    ax1.grid(True)

    # Plot balance times
    ax2.plot(episode_balance_times, alpha=0.6, label='Balance Time')
    ax2.plot(pd.Series(episode_balance_times).rolling(50).mean(),
             label='50-Episode Moving Average', linewidth=2)
    markChanges(ax2)
    ax2.set_title('Balance Time Over Episodes')
    ax2.set_xlabel('Episode')
    ax2.set_ylabel('Timesteps')
    ax2.legend()
    ax2.grid(True)

    # Plot pole length changes
    ax3.plot(length_history, label='Pole Length')
    ax3.set_title('Pole Length Over Episodes')
    ax3.set_xlabel('Episode')
    ax3.set_ylabel('Length (m)')
    ax3.grid(True)

    plt.tight_layout()

    # Save results with timestamp; act on this figure explicitly rather than
    # on whichever figure pyplot currently considers active.
    timestamp = datetime.now().strftime("%d%m%Y_%H%M%S")
    filename = f"length_variation_test_{timestamp}.png"
    fig.savefig(filename)
    plt.close(fig)

    print("\nExperiment completed!")

    return {
        'rewards': episode_rewards,
        'balance_times': episode_balance_times,
        'metrics': metrics,
        'length_history': length_history,
        'reward_changes': reward_change_episodes
    }

if __name__ == "__main__":
    # Experiment 2: 1000 episodes, pole length changed every 200 episodes.
    results = runEnvironmentVariationTest(episodes=1000, changeInterval=200)

    # Summarize the last 100 episodes of the run.
    final_avg_reward = np.mean(results['rewards'][-100:])
    final_avg_balance = np.mean(results['balance_times'][-100:])
    print(
        "\nFinal performance:",
        f"Average Reward: {final_avg_reward:.2f}",
        f"Average Balance Time: {final_avg_balance:.2f}",
        sep="\n",
    )

Starting Environment Variation Test...
Environment parameters updated: masscart=1.0, length=0.5, gravity=9.8
Episode 0: Avg Balance Time = 44.00
Episode 100: Avg Balance Time = 19.07
Environment parameters updated: masscart=1.0, length=0.75, gravity=9.8

Changed pole length to: 0.75m at episode 200
Episode 200: Avg Balance Time = 34.45
Episode 300: Avg Balance Time = 56.42
Environment parameters updated: masscart=1.0, length=1.0, gravity=9.8

Changed pole length to: 1.0m at episode 400
Episode 400: Avg Balance Time = 80.40
Episode 500: Avg Balance Time = 106.39
Environment parameters updated: masscart=1.0, length=1.25, gravity=9.8

Changed pole length to: 1.25m at episode 600
Episode 600: Avg Balance Time = 121.73
Episode 700: Avg Balance Time = 96.03
Environment parameters updated: masscart=1.0, length=1.5, gravity=9.8

Changed pole length to: 1.5m at episode 800
Episode 800: Avg Balance Time = 196.46
Episode 900: Avg Balance Time = 244.14

Experiment completed!

Final performance:
Av

### **Experiment 3 - Combined Challenge Test** - Currently Under Construction

In [None]:
# Cell 4: Experiment 3 - Combined Challenge Test
def runCombinedChallengeTest(episodes=1000):
    """Experiment 3: bad initial reward function combined with varied environment parameters.

    For each entry of a fixed list of environment configurations, a fresh
    CartPole-v1 environment (wrapped in ``CustomCartPoleEnv``) is configured,
    given ``badReward`` as its reward function, and run for ``episodes``
    episodes with a new ``DQLearningAgent`` and ``RewardUpdateSystem``. After
    each episode ``updateSystem.waitingTime(episode)`` decides whether
    ``performUpdate`` is called. One figure is shown per configuration,
    followed by a comparison figure across configurations.

    Args:
        episodes: Number of episodes to run per configuration.

    Returns:
        Dict keyed ``"config_<idx>"`` (0-based); each value is a dict with
        ``rewards``, ``balance_times``, ``metrics`` (snapshot every 100th
        episode) and the ``config`` that was used.
    """
    print("Starting Combined Challenge Test...")

    envConfigs = [
        {"masscart": 1.0, "length": 0.5, "gravity": 9.8},
        {"masscart": 2.0, "length": 1.0, "gravity": 9.8},
    ]

    def plotSeriesWithBlockAverages(ax, series, seriesLabel, title, yLabel):
        """Draw the raw series, its 50-episode moving average and 100-episode block means."""
        ax.plot(series, alpha=0.6, label=seriesLabel)
        ax.plot(pd.Series(series).rolling(50).mean(), label='50-Episode Moving Average', linewidth=2)

        # Block means are drawn in data coordinates. axhline's xmin/xmax are
        # axes fractions, which do not line up with episode indices once the
        # axis has margins and can run past 1 for a partial final block.
        for start in range(0, len(series), 100):
            block = series[start:start + 100]
            ax.hlines(y=np.mean(block), xmin=start, xmax=start + len(block) - 1,
                      colors='r', linestyles='--', alpha=0.5)

        ax.set_title(title)
        ax.set_xlabel('Episode')
        ax.set_ylabel(yLabel)
        ax.legend()
        ax.grid(True)

    results = {}

    for configIdx, config in enumerate(envConfigs):
        print(f"\nTesting configuration {configIdx + 1}")

        env = gym.make('CartPole-v1')
        env = CustomCartPoleEnv(env)
        env.setEnvironmentParameters(**config)

        # badReward is a plain function, so it is passed directly.
        env.setRewardFunction(badReward)

        updateSystem = RewardUpdateSystem(apiKey, modelName)
        agent = DQLearningAgent(env, 4, 2, device)

        rewards = []
        balance_times = []
        metrics = {}

        for episode in range(episodes):
            state = env.reset()[0]
            episode_reward = 0
            timesteps = 0
            done = False

            while not done:
                action = agent.chooseAction(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                episode_reward += reward
                timesteps += 1

                # NOTE(review): only remember() is called here; whether that
                # also triggers a learning step depends on DQLearningAgent — confirm.
                agent.remember(state, action, reward, next_state, done)
                state = next_state

            rewards.append(episode_reward)
            balance_times.append(timesteps)

            # Snapshot of the trailing 100 episodes every 100 episodes.
            if episode % 100 == 0:
                metrics[episode] = {
                    'jumps': detectJumps(rewards),
                    'averageReward': np.mean(rewards[-100:]),
                    'sensibility': analyzeRewardSensibility(env.rewardFunction),
                    'averageBalanceTime': np.mean(balance_times[-100:])
                }

            if updateSystem.waitingTime(episode):
                performUpdate(env, updateSystem, episode)

        results[f"config_{configIdx}"] = {
            "rewards": rewards,
            "balance_times": balance_times,
            "metrics": metrics,
            "config": config
        }

        # Plot individual configuration results
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
        plotSeriesWithBlockAverages(
            ax1, rewards, 'Episode Reward',
            f'Rewards Over Time - Combined Challenge Config {configIdx + 1}\n{config}',
            'Total Reward')
        plotSeriesWithBlockAverages(
            ax2, balance_times, 'Balance Time',
            'Balance Time Over Episodes', 'Timesteps')

        plt.tight_layout()
        plt.show()

    # Plot comparison across configurations (both metrics)
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

    # Legend entries use the same 1-based numbering as the per-configuration
    # titles above (results is insertion-ordered, matching envConfigs).
    for configNumber, data in enumerate(results.values(), start=1):
        ax1.plot(pd.Series(data['rewards']).rolling(50).mean(),
                 label=f"Config {configNumber}")
        ax2.plot(pd.Series(data['balance_times']).rolling(50).mean(),
                 label=f"Config {configNumber}")

    ax1.set_title("Combined Challenge - Reward Comparison Across Configurations")
    ax1.set_xlabel("Episode")
    ax1.set_ylabel("Average Reward (50-episode window)")
    ax1.legend()
    ax1.grid(True)

    ax2.set_title("Combined Challenge - Balance Time Comparison Across Configurations")
    ax2.set_xlabel("Episode")
    ax2.set_ylabel("Average Balance Time (50-episode window)")
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    plt.show()

    return results

# Run Experiment 3 with 500 episodes per configuration. The result is bound to
# a name so the notebook does not echo the whole results dict (every
# per-episode reward and balance time) as the cell's output.
combined_results = runCombinedChallengeTest(500)

: 