Considering Adding:
- Confidence intervals for performance metrics (Most papers only really seem to go this far)
- Statistical significance tests between different approaches
- Variance analysis across multiple runs




### **This experiment compares the performance of an adaptive reward function against state-of-the-art reward functions in environments with changing environmental variables.**

-> Is there a statistically significant improvement in performance over time in this varying environment?


In [2]:
import gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import os
from pathlib import Path

import sys
from pathlib import Path


# Put the directory two levels above the current working directory on sys.path
# so the project-local imports below can be resolved (presumably the repository
# root — confirm for your checkout).
current_dir = os.getcwd()  
project_root = str(Path(current_dir).parent.parent)
sys.path.append(project_root)


# Initialize environment and device
from AdaptiveRewardFunctionLearning.Prompts.prompts import device, apiKey,modelName

#CustomCartPoleEnv
from RLEnvironment.env import CustomCartPoleEnv
#RewardUpdateSystem
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCritic import RewardUpdateSystem
#DQLearningAgent
from RLEnvironment.training.agent import DQLearningAgent
from RLEnvironment.training.training import trainDQLearning

#DynamicRewardFunction
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCodeGeneration import dynamicRewardFunction

#import
from AdaptiveRewardFunctionLearning.Visualisation.trainingTestFunctions import (
    runEpisode,
    detectJumps,
    analyzeRewardSensibility,
    performUpdate,
    updateCompositeRewardFunction,
    plotExperimentResults,
    savePlot
)

**State of the Art Reward Functions with Reference to Papers**

**Potential-based Reward Shaping (PBRS):**
```python
def potentialBasedRewardShaping(observation, action):
    """
    Potential-based shaping reward: gamma * phi(next state) - phi(current state).

    Args:
        observation: Four values unpacked as (x, xDot, angle, angleDot).
        action: Unused; kept so every reward function shares one signature.

    Returns:
        float: The shaping term computed from a one-step look-ahead of the state.
    """
    x, xDot, angle, angleDot = observation
    gamma = 0.99  # Example discount factor

    def phi(x, xDot, angle, angleDot):
        # Example potential function: penalise distance from x == 0 and angle == 0
        return -abs(x) - abs(angle)

    current_potential = phi(x, xDot, angle, angleDot)
    # Simplified next state: advance position and angle by their velocities.
    # The arguments must follow phi's (x, xDot, angle, angleDot) order,
    # otherwise the cart velocity is read as the pole angle.
    next_potential = phi(x + xDot, xDot, angle + angleDot, angleDot)
    return float(gamma * next_potential - current_potential)

Paper: "Potential-based Shaping in Model-based Reinforcement Learning"

Link: https://cdn.aaai.org/AAAI/2008/AAAI08-096.pdf


**Parameterized Reward Shaping:**
```python
def parameterizedRewardShaping(observation, action):
    """
    Base reward plus a weighted shaping term.

    Args:
        observation: Four values unpacked as (x, xDot, angle, angleDot).
        action: Unused by this reward function.

    Returns:
        float: 1.0 + 0.5 * (-|angle|).
    """
    cartPos, cartVel, poleAngle, poleVel = observation
    state = (cartPos, cartVel, poleAngle, poleVel)

    def shapingTerm(x, xDot, angle, angleDot):
        # Example shaping reward: penalise the magnitude of the angle
        return -abs(angle)

    def shapingWeight(x, xDot, angle, angleDot):
        # Example shaping weight: constant, independent of the state
        return 0.5

    baseReward = 1.0  # Assuming default CartPole reward
    return float(baseReward + shapingWeight(*state) * shapingTerm(*state))

Paper: "Learning to Utilize Shaping Rewards: A New Approach of Reward Shaping"

Link: http://arxiv.org/pdf/2011.02669.pdf


**Energy Based Reward Function - Physics Based**

```python
def energyBasedReward(observation, action):
    """
    Base reward of 1.0 minus a tenth of an energy-like quantity of the state.

    Args:
        observation: Four values unpacked as (x, xDot, angle, angleDot).
        action: Unused by this reward function.

    Returns:
        float: 1.0 - 0.1 * (kinetic term + potential term).

    NOTE(review): the potential term 9.8 * (1 + cos(angle)) is largest at
    angle == 0, so the penalty is strongest when the angle is zero (presumably
    the upright pole in CartPole) — confirm this is the intended shaping.
    """
    # Local import: no visible import brings a bare `cos` into scope, so the
    # original call raised NameError.
    import math

    x, xDot, angle, angleDot = observation

    # Calculate kinetic and potential energy components
    kineticEnergy = 0.5 * (xDot**2 + angleDot**2)
    potentialEnergy = 9.8 * (1 + math.cos(angle))  # g * h, where h depends on angle

    # Total energy enters the reward with a negative sign
    energyPenalty = -(kineticEnergy + potentialEnergy)
    return float(1.0 + 0.1 * energyPenalty)  # Base reward plus energy term

Paper: "Energy-Based Control for Safe Robot Learning" (2019)

Link: https://ieeexplore.ieee.org/document/8794207


**Baseline Reward Function:**
```python
def baselineCartPoleReward(observation, action):
    """Return a constant reward of 1.0, ignoring both the observation and the action."""
    constantReward = 1.0
    return constantReward
```

### **Performance Experiment**

In [3]:
# State of the art Reward Functions

def potentialBasedRewardShaping(observation, action):
    """
    Potential-based shaping reward: gamma * phi(next state) - phi(current state).

    Args:
        observation: Four values unpacked as (x, xDot, angle, angleDot).
        action: Unused; kept so every reward function shares one signature.

    Returns:
        float: The shaping term computed from a one-step look-ahead of the state.
    """
    x, xDot, angle, angleDot = observation
    gamma = 0.99  # Example discount factor

    def phi(x, xDot, angle, angleDot):
        # Example potential function: penalise distance from x == 0 and angle == 0
        return -abs(x) - abs(angle)

    current_potential = phi(x, xDot, angle, angleDot)
    # Simplified next state: advance position and angle by their velocities.
    # The arguments must follow phi's (x, xDot, angle, angleDot) order,
    # otherwise the cart velocity is read as the pole angle.
    next_potential = phi(x + xDot, xDot, angle + angleDot, angleDot)
    return float(gamma * next_potential - current_potential)


def parameterizedRewardShaping(observation, action):
    """
    Base reward plus a weighted shaping term.

    Args:
        observation: Four values unpacked as (x, xDot, angle, angleDot).
        action: Unused by this reward function.

    Returns:
        float: 1.0 + 0.5 * (-|angle|).
    """
    cartPos, cartVel, poleAngle, poleVel = observation
    state = (cartPos, cartVel, poleAngle, poleVel)

    def shapingTerm(x, xDot, angle, angleDot):
        # Example shaping reward: penalise the magnitude of the angle
        return -abs(angle)

    def shapingWeight(x, xDot, angle, angleDot):
        # Example shaping weight: constant, independent of the state
        return 0.5

    baseReward = 1.0  # Assuming default CartPole reward
    return float(baseReward + shapingWeight(*state) * shapingTerm(*state))


def energyBasedReward(observation, action):
    """
    Base reward of 1.0 minus a tenth of an energy-like quantity of the state.

    Args:
        observation: Four values unpacked as (x, xDot, angle, angleDot).
        action: Unused by this reward function.

    Returns:
        float: 1.0 - 0.1 * (kinetic term + potential term).

    NOTE(review): the potential term 9.8 * (1 + cos(angle)) is largest at
    angle == 0, so the penalty is strongest when the angle is zero (presumably
    the upright pole in CartPole) — confirm this is the intended shaping.
    """
    # Local import: no visible import brings a bare `cos` into scope, so the
    # original call raised NameError.
    import math

    x, xDot, angle, angleDot = observation

    # Calculate kinetic and potential energy components
    kineticEnergy = 0.5 * (xDot**2 + angleDot**2)
    potentialEnergy = 9.8 * (1 + math.cos(angle))  # g * h, where h depends on angle

    # Total energy enters the reward with a negative sign
    energyPenalty = -(kineticEnergy + potentialEnergy)
    return float(1.0 + 0.1 * energyPenalty)  # Base reward plus energy term


def baselineCartPoleReward(observation, action):
    """Return a constant reward of 1.0, ignoring both the observation and the action."""
    constantReward = 1.0
    return constantReward

In [4]:
def runPerformanceComparisonTest(episodes=1000,changeInterval=50):
    """
    Train one agent per reward function and collect per-episode metrics.

    Args:
        episodes: Number of episodes passed to trainDQLearning for every
            reward function.
        changeInterval: Progress is printed whenever the episode number is a
            multiple of this value.

    Returns:
        dict: reward-function name -> {'rewards', 'balancetimes',
        'rewardchanges'}. 'rewardchanges' is a list only for
        'adaptivereward' and None for every other entry.
    """
    print("Starting Performance Comparison Test...")

    # A single wrapped CartPole environment is shared by every configuration
    env = CustomCartPoleEnv(gym.make('CartPole-v1'))

    def newAgent():
        return DQLearningAgent(env, 4, 2, device)

    # Fixed (non-adaptive) reward functions, in the order they are trained
    fixedRewards = [
        ('baselinereward', baselineCartPoleReward),
        ('pbrs', potentialBasedRewardShaping),
        ('parameterized', parameterizedRewardShaping),
        ('energybased', energyBasedReward),
    ]

    # The adaptive entry goes first and is the only one with an update system;
    # its reward function is left as whatever the environment already uses.
    configurations = {
        'adaptivereward': {
            'agent': newAgent(),
            'updatesystem': RewardUpdateSystem(apiKey, modelName),
            'rewardfunction': None,
        },
    }
    for fixedName, fixedFunction in fixedRewards:
        configurations[fixedName] = {
            'agent': newAgent(),
            'updatesystem': None,
            'rewardfunction': fixedFunction,
        }

    results = {}

    for label, config in configurations.items():
        print(f"\nTesting reward function: {label}")

        # Reset environment for each test
        env.reset()

        # Only the fixed reward functions are installed explicitly
        if label != 'adaptivereward':
            env.setRewardFunction(config['rewardfunction'])

        # Per-run metric logs; the callback below appends to them in place
        rewardLog, balanceLog = [], []
        changeLog = [] if label == 'adaptivereward' else None

        def onEpisodeEnd(env, updatesystem, episode, reward, steps):
            rewardLog.append(reward)
            balanceLog.append(steps)

            # Record the episode if the composite history holds an entry for it
            if label == 'adaptivereward' and hasattr(env.rewardFunction, 'compositeHistory'):
                matches = [
                    entry['episode'] for entry in env.rewardFunction.compositeHistory
                    if entry['episode'] == episode
                ]
                if matches:
                    changeLog.append(episode)

            if episode % changeInterval == 0:
                print(f"Episode {episode}: Average Balance Time = {np.mean(balanceLog[-50:]):.2f}")

        # The environment returned by training is reused for the next run
        _, env, _ = trainDQLearning(
            agent=config['agent'],
            env=env,
            numEpisodes=episodes,
            updateSystem=config['updatesystem'],
            onEpisodeEnd=onEpisodeEnd,
        )

        results[label] = {
            'rewards': rewardLog,
            'balancetimes': balanceLog,
            'rewardchanges': changeLog,
        }

        print(f"\nCompleted testing {label}")
        print(f"Final average reward: {np.mean(rewardLog[-100:]):.2f}")
        print(f"Final average balance time: {np.mean(balanceLog[-100:]):.2f}")

    return results

In [5]:
# Run Experiment: 1000 training episodes per reward function, with progress
# printed every 50 episodes
results = runPerformanceComparisonTest(1000,50)

Starting Performance Comparison Test...

Testing reward function: adaptivereward
Episode 0: Average Balance Time = 11.00


  if not isinstance(terminated, (bool, np.bool8)):


Episode 50: Average Balance Time = 19.86
Episode 100: Average Balance Time = 17.44
Episode 150: Average Balance Time = 13.16
Episode 200: Average Balance Time = 14.08
Episode 250: Average Balance Time = 17.66
Episode 300: Average Balance Time = 11.40
Episode 350: Average Balance Time = 16.64
Episode 400: Average Balance Time = 19.16
Episode 450: Average Balance Time = 32.12
Episode 500: Average Balance Time = 48.74
Episode 550: Average Balance Time = 66.12
Episode 600: Average Balance Time = 66.88
Episode 650: Average Balance Time = 81.70
Episode 700: Average Balance Time = 108.12
Episode 750: Average Balance Time = 51.98
Episode 800: Average Balance Time = 70.70
Episode 850: Average Balance Time = 62.54
Episode 900: Average Balance Time = 89.70
Episode 950: Average Balance Time = 81.88

Completed testing adaptivereward
Final average reward: 130.09
Final average balance time: 130.54

Testing reward function: baselinereward
Episode 0: Average Balance Time = 36.00
Episode 50: Average Bal

In [None]:
def visualizePerformanceComparison(results):
    """
    Plot 50-episode rolling means of reward and balance time per reward function.

    The figure has two stacked axes (rewards on top, balance times below) and
    is handed to savePlot before being closed.

    Args:
        results: Mapping of reward-function name to a dict with 'rewards' and
            'balancetimes' sequences and, optionally, 'rewardchanges' (episode
            numbers drawn as dashed vertical lines for 'adaptivereward').
    """
    fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10))

    # Color map for different reward functions; cycled so that more than five
    # entries no longer raise IndexError
    colors = ['b', 'g', 'r', 'c', 'm']

    for idx, (rewardname, rewardresults) in enumerate(results.items()):
        color = colors[idx % len(colors)]

        # Rolling mean of rewards
        ax1.plot(pd.Series(rewardresults['rewards']).rolling(50).mean(),
                 label=f'{rewardname}', linewidth=2, color=color)

        # Add reward function change markers for adaptive model
        changes = rewardresults.get('rewardchanges')
        if rewardname == 'adaptivereward' and changes:
            for episode in changes:
                ax1.axvline(x=episode, color='g', linestyle='--', alpha=0.3)

        # Rolling mean of balance times
        ax2.plot(pd.Series(rewardresults['balancetimes']).rolling(50).mean(),
                 label=f'{rewardname}', linewidth=2, color=color)

    ax1.set_title('Average Reward Over Time')
    ax1.set_xlabel('Episode')
    ax1.set_ylabel('Reward')
    ax1.legend()
    ax1.grid(True)

    ax2.set_title('Average Balance Time Over Episodes')
    ax2.set_xlabel('Episode')
    ax2.set_ylabel('Steps')
    ax2.legend()
    ax2.grid(True)

    plt.tight_layout()
    savePlot(fig, "performancecomparison", "PerformanceExperiment")
    # Close this specific figure rather than whichever one happens to be current
    plt.close(fig)

def calculateStability(rewards):
    """
    Score how steady the last 100 episode rewards are, on a 0-1 scale.

    The score is 1 minus the coefficient of variation (standard deviation
    divided by the absolute mean) of the final 100 rewards, clamped to [0, 1].
    Lower relative spread = higher stability.

    Args:
        rewards: Sequence of per-episode rewards.

    Returns:
        float: Stability in [0.0, 1.0]. 0.0 is returned when fewer than 100
        rewards are available or when their mean is exactly zero.
    """
    if len(rewards) < 100:
        return 0.0

    last_hundred = rewards[-100:]
    mean_reward = np.mean(last_hundred)
    if mean_reward == 0:
        return 0.0

    # Normalise by |mean|: with a negative mean the ratio would be negative
    # and the score would always clamp to 1.0 regardless of the spread.
    stability = 1 - (np.std(last_hundred) / abs(mean_reward))
    return float(max(0.0, min(1.0, stability)))  # Clamp between 0 and 1

def calculateConvergenceTime(rewards, threshold=195, window=50):
    """
    Find the first episode index at which the rolling-mean reward reaches
    `threshold` and then stays at or above 90% of it.

    Args:
        rewards: Sequence of per-episode rewards.
        threshold: Rolling-mean value that counts as converged.
        window: Length of the rolling mean and of the look-ahead check.

    Returns:
        int: The episode index of convergence, or len(rewards) when there are
        fewer than `window` rewards or no episode qualifies.

    NOTE(review): the look-ahead slice is shorter than `window` near the end
    of the series, so late episodes are checked against fewer values —
    confirm this leniency is intended.
    """
    totalEpisodes = len(rewards)
    if totalEpisodes < window:
        return totalEpisodes

    smoothed = pd.Series(rewards).rolling(window).mean().to_numpy()
    maintainFloor = threshold * 0.9

    for candidate in range(window, totalEpisodes):
        if not smoothed[candidate] >= threshold:
            continue
        # Check if performance is maintained over the following values
        lookahead = smoothed[candidate:candidate + window]
        if all(value >= maintainFloor for value in lookahead):
            return candidate

    # If never converged, return total episodes
    return totalEpisodes

def calculatePerformanceMetrics(results):
    """
    Summarise every reward function's run as one row of a DataFrame.

    Args:
        results: Mapping of reward-function name to a dict holding 'rewards'
            and 'balancetimes' sequences.

    Returns:
        pandas.DataFrame: One row per reward function with the columns
        'finalavgreward', 'finalavgbalance', 'convergencetime' and 'stability'.

    NOTE(review): convergence and stability are computed from 'rewards', whose
    scale differs between reward functions, while calculateConvergenceTime
    defaults to a threshold of 195 — confirm whether 'balancetimes' was meant.
    """
    def summarise(run):
        episodeRewards = run['rewards']
        return {
            # Means over the final 100 episodes
            'finalavgreward': np.mean(episodeRewards[-100:]),
            'finalavgbalance': np.mean(run['balancetimes'][-100:]),
            'convergencetime': calculateConvergenceTime(episodeRewards),
            'stability': calculateStability(episodeRewards),
        }

    perFunction = {name: summarise(run) for name, run in results.items()}
    # Transpose so reward functions become rows and metrics become columns
    return pd.DataFrame(perFunction).T