In [48]:
# Import necessary libraries
import gym
import numpy as np
import pandas
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, ExtraTreesRegressor

import warnings


def warn(*_ignored_args, **_ignored_kwargs):
    """No-op stand-in for ``warnings.warn``: accepts anything, returns ``None``."""


# Swap the module-level hook for the no-op so direct calls to
# ``warnings.warn`` are discarded ...
warnings.warn = warn

# ... and also install a catch-all "ignore" filter for warnings raised
# through the regular filter machinery.
warnings.filterwarnings(action="ignore")

## Code to Collect and Analyze Memory in CartPole Environment

In [49]:
# Collect experience from a purely random policy in CartPole.
# Each row of `memory_df` is one transition: the observation the agent acted
# on, the action it took, the step reward, the episode index and the total
# reward of the episode that transition belongs to.

# Number of episodes to run
num_episodes = 100

# Flat list of per-step records across all episodes
life_memory = []

# Create a new CartPole environment
env = gym.make('CartPole-v1')
try:
    for i in range(num_episodes):
        # Start a new episode; reset() returns (observation, info)
        old_observation, _ = env.reset()
        done = False     # True once the episode has ended for any reason
        tot_reward = 0   # Sum of step rewards for this episode
        ep_memory = []   # Records of the current episode only

        while not done:
            # Validate the observation *before* acting on it, so that
            # `tot_reward` always matches the rows actually recorded.
            if len(old_observation) != 4:
                print(f"Unexpected shape of old_observation: {old_observation}")
                break

            # Take a random action from the action space
            new_action = env.action_space.sample()

            # step() returns separate `terminated` and `truncated` flags;
            # the episode is over as soon as either one is set. Watching only
            # the first flag would keep stepping an already-finished episode.
            observation, reward, terminated, truncated, info = env.step(new_action)
            done = terminated or truncated

            tot_reward += reward

            # Record the transition, keyed by the observation acted upon
            ep_memory.append({
                "obs0": old_observation[0],  # Position of the cart
                "obs1": old_observation[1],  # Velocity of the cart
                "obs2": old_observation[2],  # Angle of the pole
                "obs3": old_observation[3],  # Angular velocity of the pole
                "action": new_action,        # Action taken (0 or 1)
                "reward": reward,            # Reward received for this step
                "episode": i,                # Current episode number
            })

            old_observation = observation

        # The episode total is only known at the end, so back-fill it
        for ep_mem in ep_memory:
            ep_mem["tot_reward"] = tot_reward

        life_memory.extend(ep_memory)
finally:
    # Release the environment even if a rollout raised
    env.close()

# Convert the collected memory into a Pandas DataFrame
memory_df = pd.DataFrame(life_memory)

# Average of the per-episode reward sums
average_reward = memory_df.groupby("episode").reward.sum().mean()

print(f"Average reward over {num_episodes} episodes: {average_reward}")

Average reward over 100 episodes: 21.15


## Training Multiple Regression Models on Reinforcement Learning Memory Data

In [50]:
# Candidate regressors that score a (state, action) pair.
# Two separately constructed ExtraTrees ensembles sit on either side of one
# AdaBoost ensemble; every ensemble uses 50 estimators.
models = [
    ExtraTreesRegressor(n_estimators=50),
    AdaBoostRegressor(n_estimators=50),
    ExtraTreesRegressor(n_estimators=50),
]

# Regression target: the episode's total reward plus half of the step reward.
memory_df["comb_reward"] = memory_df["tot_reward"] + memory_df["reward"] * 0.5

# Inputs are the four state observations followed by the action taken.
feature_columns = ["obs0", "obs1", "obs2", "obs3", "action"]

# Fit every candidate on the same features and target.
for model in models:
    model.fit(memory_df[feature_columns], memory_df["comb_reward"])
    # Once fitted, the model can be used to predict rewards or guide decisions.

## Evaluation of Models in the CartPole Environment

In [51]:
# Evaluate each fitted model as a greedy policy: at every step the model
# scores both actions for the current observation and the higher-scoring
# action is played.
num_episodes = 100  # Number of episodes to evaluate each model
random_per = 0      # Probability of taking a random action (0 means always using the model's prediction)
results = {}        # Average per-episode reward, keyed by "Model <n>"

for model_idx, model in enumerate(models):
    # Per-step records for this model only
    life_memory = []

    # Each model gets its own environment, closed in `finally` below so that
    # no environment is left open when moving on to the next model.
    env = gym.make('CartPole-v1')
    try:
        for i in range(num_episodes):
            # reset() returns (observation, info)
            old_observation, _ = env.reset()
            done = False     # True once the episode has ended for any reason
            tot_reward = 0   # Sum of step rewards for this episode
            ep_memory = []   # Records of the current episode only

            while not done:
                # Ensure the observation is a flat 1D array
                old_observation = np.array(old_observation).flatten()

                if np.random.rand() < random_per:
                    # Exploration: random action with probability `random_per`
                    new_action = env.action_space.sample()
                else:
                    # One input row per candidate action (0 and 1): the
                    # current observation with the action appended.
                    pred_in = np.array(
                        [list(old_observation) + [action] for action in range(2)]
                    )
                    # Row index of the best prediction is the action itself;
                    # cast to a plain int before handing it to the env.
                    new_action = int(np.argmax(model.predict(pred_in)))

                # step() returns separate `terminated` and `truncated` flags;
                # stop on either, otherwise a time-limited episode would keep
                # running and inflate the measured reward.
                observation, reward, terminated, truncated, info = env.step(new_action)
                done = terminated or truncated
                tot_reward += reward

                # Save the transition, keyed by the observation acted upon
                ep_memory.append({
                    "obs0": old_observation[0],
                    "obs1": old_observation[1],
                    "obs2": old_observation[2],
                    "obs3": old_observation[3],
                    "action": new_action,
                    "reward": reward,
                    "episode": i,
                })

                old_observation = observation

            # The episode total is only known at the end, so back-fill it
            for ep_mem in ep_memory:
                ep_mem["tot_reward"] = tot_reward

            life_memory.extend(ep_memory)
    finally:
        # Release this model's environment even if a rollout raised
        env.close()

    # All transitions collected while this model was in control
    memory_df2 = pd.DataFrame(life_memory)

    # Step reward plus episode total. Note: this uses weight 1.0 on the step
    # reward and is not used in the average computed below.
    memory_df2["comb_reward"] = memory_df2.reward + memory_df2.tot_reward

    # Average of the per-episode reward sums for this model
    avg_reward = memory_df2.groupby("episode").reward.sum().mean()
    results[f"Model {model_idx + 1}"] = avg_reward

    print(f"Model {model_idx + 1}: Average reward over {num_episodes} episodes: {avg_reward}")

Model 1: Average reward over 100 episodes: 122.88
Model 2: Average reward over 100 episodes: 9.29
Model 3: Average reward over 100 episodes: 99.82
