In [1]:
import os
import time
import numpy as np
import gymnasium as gym
import random
from datetime import datetime, timedelta
import torch  # Make sure this is imported

import sys
from pathlib import Path

# Put the directory two levels above the current working directory on
# sys.path so the project packages imported below can be resolved.
current_dir = os.getcwd()
project_root = os.fspath(Path(current_dir).parent.parent)
sys.path.append(project_root)

# Import your custom components
from RLEnvironment.env import CustomCartPoleEnv
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCritic import RewardUpdateSystem
from RLEnvironment.training.agent import DQLearningAgent
from AdaptiveRewardFunctionLearning.RewardGeneration.rewardCodeGeneration import stabilityReward, efficiencyReward
from AdaptiveRewardFunctionLearning.Prompts.prompts import apiKey, modelName

def _load_pretrained_weights(agent, model_path):
    """Load saved network weights into ``agent``; return True on success.

    On any loading error the problem is reported on stdout and False is
    returned so the caller can fall back to pre-training from scratch.
    """
    print(f"Loading pre-trained model from {model_path}")
    try:
        # The checkpoint is a plain state dict written by torch.save below, so
        # weights_only=True is sufficient and avoids unpickling arbitrary
        # objects. map_location matches the "cpu" device the agent is built on.
        state_dict = torch.load(model_path, map_location="cpu", weights_only=True)
        agent.model.load_state_dict(state_dict)
        # Keep the target network in sync with the freshly loaded weights.
        agent.targetModel.load_state_dict(agent.model.state_dict())
    except Exception as e:
        # Deliberately broad: any failure here just means "train a new model".
        print(f"Error loading model: {e}")
        print("Will pre-train a new model instead.")
        return False
    print("Pre-trained model loaded successfully!")
    return True


def _pretrain_agent(env, agent, episodes):
    """Run ``episodes`` training episodes, replaying once after each one."""
    print("Pre-training agent...")
    for episode in range(episodes):
        state = env.reset()[0]
        episode_reward = 0
        done = False
        while not done:
            action = agent.chooseAction(state)
            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            episode_reward += reward
        agent.replay(64)  # Larger batch for better learning

        # Print progress occasionally; the total now reflects the real count.
        if (episode + 1) % 50 == 0:
            print(f"Pre-training episode {episode + 1}/{episodes}, Reward: {episode_reward}")


def _update_reward_components(env, update_system):
    """Ask the update system to validate/replace reward components 1 and 2."""
    for component in (1, 2):
        update_system.targetComponent = component
        current_func = env.rewardComponents[f'rewardFunction{component}']
        new_function, updated = update_system.validateAndUpdate(current_func)
        if updated:
            env.setComponentReward(component, new_function)


def run_continuous_cartpole_demo(use_pretrained=True, save_model=True, pretrain_episodes=5000):
    """
    Run a continuous CartPole demonstration where the pole length
    randomly changes every 2 minutes of real-life time.

    The agent keeps storing experience and replaying after every finished
    episode while the demo runs. After each pole-length change the reward
    components are passed through the reward update system and the
    environment is reset. The loop runs until interrupted with Ctrl+C; the
    environment is closed on every exit path, including setup errors.

    Args:
        use_pretrained: Whether to load a pre-trained model if available
        save_model: Whether to save the model after pre-training
        pretrain_episodes: Number of pre-training episodes to run when no
            pre-trained model is loaded
    """
    print("Setting up CartPole for Open Day demonstration...")

    # Initialize environment with human rendering
    env = gym.make('CartPole-v1', render_mode='human')
    env = CustomCartPoleEnv(env)

    try:
        # Set initial pole length
        possible_lengths = [0.5, 3.0, 1.5, 0.5, 2.5]
        current_length = random.choice(possible_lengths)
        env.setEnvironmentParameters(length=current_length)
        print(f"Starting with pole length: {current_length}m")

        # Set up adaptive reward function
        env.setComponentReward(1, stabilityReward)  # Stability component
        env.setComponentReward(2, efficiencyReward)  # Efficiency component

        # Initialize update system
        update_system = RewardUpdateSystem(apiKey, modelName)

        # Create agent
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n
        agent = DQLearningAgent(env, state_size, action_size, "cpu", epsilon=0.01)

        # Define model save path
        model_path = os.path.join(current_dir, "cartpole_pretrained_model.pt")

        # Use the saved weights only if requested, present and loadable.
        loaded = (
            use_pretrained
            and os.path.exists(model_path)
            and _load_pretrained_weights(agent, model_path)
        )

        if not loaded:
            # NOTE(review): pre-training uses the same render_mode='human'
            # env, which presumably renders every step; a headless env would
            # likely be much faster - confirm before changing.
            _pretrain_agent(env, agent, pretrain_episodes)

            # Save the trained model if requested
            if save_model:
                print(f"Saving pre-trained model to {model_path}")
                torch.save(agent.model.state_dict(), model_path)
                print("Model saved for future demonstrations!")

        # Set up timing for pole length changes
        change_interval_seconds = 120  # 2 minutes
        next_change_time = datetime.now() + timedelta(seconds=change_interval_seconds)

        print("\n🎮 DEMO STARTED! Pole length will change every 2 minutes")
        print(f"Next change scheduled at: {next_change_time.strftime('%H:%M:%S')}")
        print("Press Ctrl+C to stop the demonstration")

        # Reset environment to start demonstration
        state = env.reset()[0]
        episode_steps = 0
        episodes_completed = 0

        try:
            # Main demonstration loop - runs until interrupted
            while True:
                # Check if it's time to change the pole length
                current_time = datetime.now()

                if current_time >= next_change_time:
                    # Choose a new length that's different from the current one
                    new_lengths = [l for l in possible_lengths if l != current_length]
                    new_length = random.choice(new_lengths)

                    print("\n🚨 ENVIRONMENT CHANGE 🚨")
                    print(f"Changing pole length from {current_length}m to {new_length}m")
                    env.setEnvironmentParameters(length=new_length)
                    current_length = new_length

                    # Update timing
                    next_change_time = current_time + timedelta(seconds=change_interval_seconds)
                    print(f"Next change scheduled at: {next_change_time.strftime('%H:%M:%S')}")

                    # Trigger reward function update after length change
                    print("🧠 Updating reward function to adapt to new length...")
                    _update_reward_components(env, update_system)

                    # Reset the environment for a clean start with new length
                    state = env.reset()[0]
                    episode_steps = 0

                # Choose action and step environment
                action = agent.chooseAction(state)
                next_state, reward, terminated, truncated, _ = env.step(action)

                # Track progress
                episode_steps += 1

                # Store the transition so the agent keeps learning during the demo
                done = terminated or truncated
                agent.remember(state, action, reward, next_state, done)

                # Update state
                state = next_state

                # Reset if done, then learn from the stored experience
                if done:
                    episodes_completed += 1
                    print(f"Episode completed: #{episodes_completed}, Steps: {episode_steps}, Length: {current_length}m")
                    state = env.reset()[0]
                    episode_steps = 0
                    agent.replay(32)

                # Small sleep to prevent CPU overuse
                time.sleep(0.001)

        except KeyboardInterrupt:
            print("\nDemonstration stopped by user")
    finally:
        # Always release the render window, even if setup or training failed.
        env.close()
        print("Environment closed. Demo ended.")

# Entry point: start the continuous demonstration when executed directly.
if __name__ == "__main__":
    # Load saved weights if a checkpoint exists; otherwise pre-train and save.
    run_continuous_cartpole_demo(save_model=True, use_pretrained=True)

Setting up CartPole for Open Day demonstration...
Environment parameters updated: masscart=1.0, length=2.5, gravity=9.8
Starting with pole length: 2.5m
Loading pre-trained model from /home/sd37/BachelorsThesis/Using-LLMs-to-Generate-Reward-Functions-from-Natural-Language-in-RL-Environments/ExtraNotebooksCodeExamples/Final Report Graph Generations/cartpole_pretrained_model.pt
Pre-trained model loaded successfully!

🎮 DEMO STARTED! Pole length will change every 2 minutes
Next change scheduled at: 09:13:56
Press Ctrl+C to stop the demonstration


  agent.model.load_state_dict(torch.load(model_path))


Episode completed: #1, Steps: 88, Length: 2.5m


  states = torch.tensor([t[0] for t in minibatch], dtype=torch.float32).to(self.device)


Episode completed: #2, Steps: 92, Length: 2.5m
Episode completed: #3, Steps: 64, Length: 2.5m
Episode completed: #4, Steps: 48, Length: 2.5m
Episode completed: #5, Steps: 39, Length: 2.5m
Episode completed: #6, Steps: 87, Length: 2.5m
Episode completed: #7, Steps: 57, Length: 2.5m
Episode completed: #8, Steps: 39, Length: 2.5m
Episode completed: #9, Steps: 35, Length: 2.5m
Episode completed: #10, Steps: 27, Length: 2.5m
Episode completed: #11, Steps: 27, Length: 2.5m
Episode completed: #12, Steps: 23, Length: 2.5m
Episode completed: #13, Steps: 23, Length: 2.5m
Episode completed: #14, Steps: 10, Length: 2.5m
Episode completed: #15, Steps: 20, Length: 2.5m
Episode completed: #16, Steps: 9, Length: 2.5m
Episode completed: #17, Steps: 10, Length: 2.5m
Episode completed: #18, Steps: 10, Length: 2.5m
Episode completed: #19, Steps: 10, Length: 2.5m
Episode completed: #20, Steps: 10, Length: 2.5m
Episode completed: #21, Steps: 13, Length: 2.5m
Episode completed: #22, Steps: 11, Length: 2.5m
E

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)



Generating new efficiency reward function...
Update count: 0/3

Proposed Function:
Here's a modified version of the reward function with detailed inline comments, focusing on efficiency aspects while adhering to the given constraints and requirements:

```python
def reward_function(observation, action):
    # Extract state variables from observation
    cart_position = observation[0]
    cart_velocity = observation[1]
    pole_angle = observation[2]
    pole_angular_velocity = observation[3]

    # Primary reward component: Pole angle stability (highest positive weight)
    angle_stability_reward = 2.0 * (1 - abs(pole_angle))  # Weight: 2.0, decreases as angle deviates from 0

    # Secondary penalty: Cart position (smaller negative weight)
    position_penalty = -0.2 * abs(cart_position)  # Weight: -0.2, increases as cart moves from center

    # Secondary penalty: Cart velocity (smaller negative weight)
    velocity_penalty = -0.1 * abs(cart_velocity)  # Weight: -0.1, increases with