In [4]:
from pyvirtualdisplay import Display
import gymnasium as gym
from stable_baselines3 import A2C

# Create the CartPole environment
env = gym.make('CartPole-v1')

# Create a model using Stable Baselines3
model_a2c = A2C('MlpPolicy', env, tensorboard_log="./tensorboard_logs/")

# Train the model for 10000 timesteps
model_a2c.learn(total_timesteps=50000)

env.close()

In [5]:
# Start virtual display with Xvfb
display = Display(visible=0, size=(1400, 900))
display.start()

env_test = gym.make('CartPole-v1', render_mode="human")

# Evaluate the model
total_reward = 0
episodes = 10
for episode in range(episodes):
    obs, info = env_test.reset()  # Corrected: Unpack reset() correctly
    done = False
    episode_reward = 0

    while not done:
        action, _ = model_a2c.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env_test.step(action)  # Correct unpacking
        episode_reward += reward

        # Render the environment to watch the agent play
        #env_test.render(mode="human")  # Rendering for human visualization

    total_reward += episode_reward
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

print(f"Average reward over {episodes} episodes: {total_reward / episodes}")

Episode 1: Total Reward = 2145.0


KeyboardInterrupt: 

In [8]:
from pyvirtualdisplay import Display
import gymnasium as gym
from stable_baselines3 import PPO

# Start virtual display
#display = Display(visible=0, size=(1400, 900))
#display.start()

# Create the CartPole environment
env = gym.make('CartPole-v1')

# Create a model using Stable Baselines3 PPO
model_ppo = PPO('MlpPolicy', env, tensorboard_log="./tensorboard_logs/")

# Train the model for 50000 timesteps
model_ppo.learn(total_timesteps=50000)

env.close()

In [9]:
env = gym.make('CartPole-v1', render_mode='human')

# Evaluate the model
total_reward = 0
episodes = 10
for episode in range(episodes):
    obs, info = env.reset()  # Corrected: Unpack reset() correctly
    done = False
    episode_reward = 0

    while not done:
        action, _ = model_ppo.predict(obs, deterministic=True)
        obs, reward, done, truncated, info = env.step(action)  # Correct unpacking
        episode_reward += reward

        # Render the environment to watch the agent play
        #env.render()

    total_reward += episode_reward
    print(f"Episode {episode + 1}: Total Reward = {episode_reward}")

print(f"Average reward over {episodes} episodes: {total_reward / episodes}")

KeyboardInterrupt: 

In [None]:
from pyvirtualdisplay import Display
import gymnasium as gym
from stable_baselines3 import PPO

# Create the CartPole environment
env = gym.make('CartPole-v1')

# Create a model using Stable Baselines3 PPO
model_ppo = PPO('MlpPolicy', env, tensorboard_log="./tensorboard_logs/")

# Train the model for 50000 timesteps
model_ppo.learn(total_timesteps=50000)

env.close()