Considering Adding:
- Confidence intervals for performance metrics (Most papers only really seem to go this far)
- Statistical significance tests between different approaches
- Variance analysis across multiple runs




### **This experiment is investigating the performance of an adaptive reward function to state of the art reward functions in environments with environmentally variable changes.**

-> Is there a statisitcally significant improvement in performance over time in this varying environment.


In [28]:
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import os
from pathlib import Path

import sys
from pathlib import Path


current_dir = os.getcwd()  
project_root = str(Path(current_dir).parent.parent)
sys.path.append(project_root)


# Initialize environment and device
from AdaptiveRewardFunctionLearning.Prompts.prompts import device, apiKey,modelName

#Cu stomCartPoleEnv
from RLEnvironment.env import CustomCartPoleEnv
#RewardUpdateSystem
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCritic import RewardUpdateSystem
#DQLearningAgent
from RLEnvironment.training.agent import DQLearningAgent
from RLEnvironment.training.training import trainDQLearning

#DynamicRewardFunction
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCodeGeneration import dynamicRewardFunction

#import
from AdaptiveRewardFunctionLearning.Visualisation.trainingTestFunctions import (
    runEpisode,
    detectJumps,
    analyzeRewardSensibility,
    performUpdate,
    updateCompositeRewardFunction,
    plotExperimentResults,
    savePlot
)

**State of the Art Reward Functions with Reference to Papers**

**Potential-based Reward Shaping (PBRS):**
```python
def potentialBasedRewardShaping(observation, action):
    x, xDot, angle, angleDot = observation
    gamma = 0.99  # Example discount factor

    def phi(x, xDot, angle, angleDot):
        # Example potential function
        return -abs(x) - abs(angle)

    current_potential = phi(x, xDot, angle, angleDot)
    next_potential = phi(x + xDot, angle + angleDot, xDot, angleDot)  # Simplified next state
    return float(gamma * next_potential - current_potential)
```

Paper: "Potential-based Shaping in Model-based Reinforcement Learning"

Link: https://cdn.aaai.org/AAAI/2008/AAAI08-096.pdf


**Parameterized Reward Shaping:**
```python
def parameterizedRewardShaping(observation, action):
    x, xDot, angle, angleDot = observation
    original_reward = 1.0  # Assuming default CartPole reward

    def f(x, xDot, angle, angleDot):
        # Example shaping reward function
        return -abs(angle)

    def z_phi(x, xDot, angle, angleDot):
        # Example shaping weight function
        return 0.5

    shaping_reward = f(x, xDot, angle, angleDot)
    shaping_weight = z_phi(x, xDot, angle, angleDot)
    return float(original_reward + shaping_weight * shaping_reward)
```

Paper: "Learning to Utilize Shaping Rewards: A New Approach of Reward Shaping"

Link: http://arxiv.org/pdf/2011.02669.pdf


**Energy Based Reward Function - Physics Based**

```python
def energyBasedReward(observation, action):
    x, xDot, angle, angleDot = observation
    
    # Calculate kinetic and potential energy components
    kineticEnergy = 0.5 * (xDot**2 + angleDot**2)
    potentialEnergy = 9.8 * (1 + cos(angle))  # g * h, where h depends on angle
    
    # Reward is inverse of total energy (less energy = more stable = better reward)
    energyPenalty = -(kineticEnergy + potentialEnergy)
    return float(1.0 + 0.1 * energyPenalty)  # Base reward plus energy term
```

Paper: "Energy-Based Control for Safe Robot Learning" (2019)

Link: https://ieeexplore.ieee.org/document/8794207


**Baseline Reward Function:**
```python
def baselineCartPoleReward(observation, action):
    return 1.0
```

### **Performance Experiment**

In [29]:
# State of the art Reward Functions

def potentialBasedRewardShaping(observation, action):
    x, xDot, angle, angleDot = observation
    gamma = 0.99  # Example discount factor

    def phi(x, xDot, angle, angleDot):
        # Example potential function
        return -abs(x) - abs(angle)

    current_potential = phi(x, xDot, angle, angleDot)
    next_potential = phi(x + xDot, angle + angleDot, xDot, angleDot)  # Simplified next state
    return float(gamma * next_potential - current_potential)


def parameterizedRewardShaping(observation, action):
    x, xDot, angle, angleDot = observation
    original_reward = 1.0  # Assuming default CartPole reward

    def f(x, xDot, angle, angleDot):
        # Example shaping reward function
        return -abs(angle)

    def z_phi(x, xDot, angle, angleDot):
        # Example shaping weight function
        return 0.5

    shaping_reward = f(x, xDot, angle, angleDot)
    shaping_weight = z_phi(x, xDot, angle, angleDot)
    return float(original_reward + shaping_weight * shaping_reward)


def energyBasedReward(observation, action):
    x, xDot, angle, angleDot = observation
    
    # Calculate kinetic and potential energy components
    kineticEnergy = 0.5 * (xDot**2 + angleDot**2)
    potentialEnergy = 9.8 * (1 + np.cos(angle))  # g * h, where h depends on angle
    
    # Reward is inverse of total energy (less energy = more stable = better reward)
    energyPenalty = -(kineticEnergy + potentialEnergy)
    return float(1.0 + 0.1 * energyPenalty)  # Base reward plus energy term


def baselineCartPoleReward(observation, action):
    return 1.0

In [30]:
def runPerformanceComparisonTest(episodes=1000, changeInterval=200, lengthchanges=[0.5, 0.75, 1.0, 1.25, 1.5]):
    print("Starting Performance Comparison Test...")
    
    currentlengthidx = 0
    
    # Initialize environments and agents
    env = gym.make('CartPole-v1')
    env = CustomCartPoleEnv(env)
    env.setEnvironmentParameters(masscart=1.0, length=lengthchanges[0], gravity=9.8)
    
    # Define all reward functions
    rewardfunctions = {
        'adaptivereward': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': RewardUpdateSystem(apiKey, modelName),
            'rewardfunction': None  # Uses your existing adaptive reward
        },
        'baselinereward': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': None,
            'rewardfunction': baselineCartPoleReward
        },
        'pbrs': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': None,
            'rewardfunction': potentialBasedRewardShaping
        },
        'parameterized': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': None,
            'rewardfunction': parameterizedRewardShaping
        },
        'energybased': {
            'agent': DQLearningAgent(env, 4, 2, device),
            'updatesystem': None,
            'rewardfunction': energyBasedReward
        }
    }
    
    results = {}
    
    for rewardname, rewardinfo in rewardfunctions.items():
        print(f"\nTesting reward function: {rewardname}")
        
        # Reset environment for each test
        env.reset()
        
        # Set the reward function for this test
        if rewardname != 'adaptivereward':
            env.setRewardFunction(rewardinfo['rewardfunction'])
        
        # Training metrics
        episoderewards = []
        episodebalancetimes = []
        rewardchangeepisodes = [] if rewardname == 'adaptivereward' else None
        
        def onEpisodeEnd(env, updatesystem, episode, reward, steps):
            nonlocal episoderewards, episodebalancetimes, rewardchangeepisodes, currentlengthidx
            
            # Track metrics
            episoderewards.append(reward)
            episodebalancetimes.append(steps)
            
            # Handle adaptive reward updates
            if rewardname == 'adaptivereward':
                # Build metrics for composite reward update check
                metrics = {
                    'currentEpisode': episode,
                    'recentRewards': episoderewards[-100:] if len(episoderewards) > 100 else episoderewards,
                    'averageBalanceTime': np.mean(episodebalancetimes[-100:]) if episodebalancetimes else 0,
                    'balanceTimeVariance': np.var(episodebalancetimes[-100:]) if len(episodebalancetimes) > 1 else 0
                }
                
                # Check for composite reward updates
                updateCompositeRewardFunction(env, updatesystem, metrics, dynamicRewardFunction)
                
                # Check if any composite reward function was updated
                if hasattr(env.rewardFunction, 'compositeHistory'):
                    latest_updates = [
                        update['episode'] for update in env.rewardFunction.compositeHistory 
                        if update['episode'] == episode
                    ]
                    if latest_updates:
                        rewardchangeepisodes.append(episode)
                        print(f"\nReward function updated at episode {episode}")
            
            # Handle environment changes
            if episode % changeInterval == 0 and episode > 0:
                currentlengthidx = (currentlengthidx + 1) % len(lengthchanges)
                newlength = lengthchanges[currentlengthidx]
                env.setEnvironmentParameters(length=newlength)
                print(f"\nChanged pole length to: {newlength}m at episode {episode}")
            
            # Print progress
            if episode % 50 == 0:
                print(f"Episode {episode}: Average Balance Time = {np.mean(episodebalancetimes[-50:]):.2f}")

                print(f"Episode {episode}")
                print(f"  Steps: {steps}")
                print(f"  Total Reward: {reward}")
                print(f"  Average Balance Time = {np.mean(episodebalancetimes[-50:]):.2f}")
                print(f"  Average Reward = {np.mean(episoderewards[-50:]):.2f}")
        
        # Train using your existing function
        agent, env, rewards = trainDQLearning(
            agent=rewardinfo['agent'],
            env=env,
            numEpisodes=episodes,
            updateSystem=rewardinfo['updatesystem'],
            onEpisodeEnd=onEpisodeEnd
        )
        
        results[rewardname] = {
            'rewards': episoderewards,
            'balancetimes': episodebalancetimes,
            'rewardchanges': rewardchangeepisodes
        }
        
        print(f"\nCompleted testing {rewardname}")
        print(f"Final average reward: {np.mean(episoderewards[-100:]):.2f}")
        print(f"Final average balance time: {np.mean(episodebalancetimes[-100:]):.2f}")
    
    return results

In [31]:
# Run Experiment
results = runPerformanceComparisonTest(1000,200,lengthchanges = [0.5, 0.75, 1.0, 1.25, 1.5])

Starting Performance Comparison Test...
Environment parameters updated: masscart=1.0, length=0.5, gravity=9.8

Testing reward function: adaptivereward
Episode 0: Average Balance Time = 28.00
Episode 0
  Steps: 28
  Total Reward: 27.80975317955017
  Average Balance Time = 28.00
  Average Reward = 27.81


  if not isinstance(terminated, (bool, np.bool8)):


Episode 50: Average Balance Time = 21.60
Episode 50
  Steps: 11
  Total Reward: 10.926972568035126
  Average Balance Time = 21.60
  Average Reward = 21.48
Episode 100: Average Balance Time = 19.64
Episode 100
  Steps: 13
  Total Reward: 12.947589099407196
  Average Balance Time = 19.64
  Average Reward = 19.53
Episode 150: Average Balance Time = 15.30
Episode 150
  Steps: 8
  Total Reward: 7.9320149421691895
  Average Balance Time = 15.30
  Average Reward = 15.21

Checking stability component:
Time since update: 200
Short-term trend: -0.874
Medium-term trend: 0.597
Performance variance: 331.785

Checking efficiency component:
Time since update: 200
Short-term trend: -0.874
Medium-term trend: 0.597
Performance variance: 331.785

Checking time component:
Time since update: 200
Short-term trend: -0.874
Medium-term trend: 0.597
Performance variance: 331.785
Environment parameters updated: masscart=1.0, length=0.75, gravity=9.8

Changed pole length to: 0.75m at episode 200
Episode 200: Aver

In [35]:
def visualizePerformanceComparison(results):
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))
    
    # Color map for different reward functions
    colors = ['b', 'g', 'r', 'c', 'm']  # Different color for each function
    
    # Plot rewards for each reward function with variance
    for idx, (rewardname, rewardresults) in enumerate(results.items()):
        rewards = pd.Series(rewardresults['rewards'])
        
        # Calculate rolling mean and standard deviation for rewards
        window = 50
        rolling_mean_rewards = rewards.rolling(window=window).mean()
        rolling_std_rewards = rewards.rolling(window=window).std()
        
        # Plot mean line for rewards
        ax1.plot(rolling_mean_rewards, 
                label=f'{rewardname}', 
                linewidth=2, 
                color=colors[idx])
        
        # Plot variance area for rewards
        ax1.fill_between(
            range(len(rewards)),
            rolling_mean_rewards - rolling_std_rewards,
            rolling_mean_rewards + rolling_std_rewards,
            color=colors[idx],
            alpha=0.2
        )
        
        # Add vertical lines for environment changes (red)
        for ep in range(200, len(rewards), 200):  # Every 200 episodes
            ax1.axvline(x=ep, color='r', linestyle='--', alpha=0.3,
                       label='Environment Change' if ep == 200 else None)
        
        # Add reward function change markers for adaptive model (green)
        if rewardname == 'adaptivereward' and rewardresults['rewardchanges']:
            for episode in rewardresults['rewardchanges']:
                ax1.axvline(x=episode, color='g', linestyle='--', alpha=0.3,
                          label='Reward Update' if episode == rewardresults['rewardchanges'][0] else None)
    
    ax1.set_title('Average Reward Over Time with Variance')
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Reward')
    ax1.legend()
    ax1.grid(True)
    
    # Plot balance times with variance (same structure as rewards)
    for idx, (rewardname, rewardresults) in enumerate(results.items()):
        balancetimes = pd.Series(rewardresults['balancetimes'])
        
        rolling_mean_balance = balancetimes.rolling(window=window).mean()
        rolling_std_balance = balancetimes.rolling(window=window).std()
        
        ax2.plot(rolling_mean_balance,
                label=f'{rewardname}', 
                linewidth=2, 
                color=colors[idx])
        
        ax2.fill_between(
            range(len(balancetimes)),
            rolling_mean_balance - rolling_std_balance,
            rolling_mean_balance + rolling_std_balance,
            color=colors[idx],
            alpha=0.2
        )
        
        # Add same vertical lines to balance time plot
        for ep in range(200, len(balancetimes), 200):
            ax2.axvline(x=ep, color='r', linestyle='--', alpha=0.3,
                       label='Environment Change' if ep == 200 else None)
            
        if rewardname == 'adaptivereward' and rewardresults['rewardchanges']:
            for episode in rewardresults['rewardchanges']:
                ax2.axvline(x=episode, color='g', linestyle='--', alpha=0.3,
                          label='Reward Update' if episode == rewardresults['rewardchanges'][0] else None)
    
    ax2.set_title('Average Balance Time Over Episodes with Variance')
    ax2.set_xlabel('Episode')
    ax2.set_ylabel('Steps')
    ax2.legend()
    ax2.grid(True)
    
    plt.tight_layout()
    savePlot(fig, "performancecomparison_with_variance", "PerformanceExperiment")
    plt.close()

def calculateStability(rewards):
    """
    Calculate stability score based on reward variance in the last 100 episodes
    Lower variance = higher stability
    """
    if len(rewards) < 100:
        return 0.0
    
    last_hundred = rewards[-100:]
    mean_reward = np.mean(last_hundred)
    if mean_reward == 0:
        return 0.0
        
    # Calculate coefficient of variation (normalized standard deviation)
    stability = 1 - (np.std(last_hundred) / mean_reward)
    return max(0, min(1, stability))  # Normalize between 0 and 1

def calculateConvergenceTime(rewards, threshold=195, window=50):
    """
    Calculate the number of episodes needed to reach and maintain a certain performance
    threshold for a given window of episodes
    """
    if len(rewards) < window:
        return len(rewards)
    
    rolling_mean = pd.Series(rewards).rolling(window).mean()
    
    for episode in range(window, len(rewards)):
        if rolling_mean[episode] >= threshold:
            # Check if performance is maintained
            maintained = all(avg >= threshold * 0.9 for avg in rolling_mean[episode:episode+window])
            if maintained:
                return episode
    
    return len(rewards)  # If never converged, return total episodes

def calculatePerformanceMetrics(results):
    metrics = {}
    for rewardname, rewardresults in results.items():
        metrics[rewardname] = {
            'finalavgreward': np.mean(rewardresults['rewards'][-100:]),
            'finalavgbalance': np.mean(rewardresults['balancetimes'][-100:]),
            'convergencetime': calculateConvergenceTime(rewardresults['rewards']),
            'stability': calculateStability(rewardresults['rewards'])
        }
    return pd.DataFrame(metrics).T

In [36]:

# Visualize the results
visualizePerformanceComparison(results)


# Calculate and display the metrics
metrics = calculatePerformanceMetrics(results)
print("\nPerformance Metrics:")
print(metrics)

Saved plot: performancecomparison_with_variance_results_27112024_104214.png in PerformanceExperiment

Performance Metrics:
                finalavgreward  finalavgbalance  convergencetime  stability
adaptivereward       86.131570            86.49           1000.0   0.000000
baselinereward      142.150000           142.15           1000.0   0.641348
pbrs                 -3.205471            33.76           1000.0   1.000000
parameterized       129.429630           133.99           1000.0   0.623931
energybased         -11.004419             9.87           1000.0   1.000000
