### Explore all the environments!

In [1]:
import gymnasium as gym
import math
import ale_py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import random
from tqdm import tqdm

# Seed PyTorch and Python's `random` module with one shared value so the
# notebook is repeatable from a fresh kernel. `seed` is also reused below to
# seed each environment's action space.
seed = 42
torch.manual_seed(seed)
random.seed(seed)

### Environments

In [2]:
# Create the CartPole environment and seed its action space with the shared
# notebook seed.
env_cartpole = gym.make('CartPole-v0')
env_cartpole.action_space.seed(seed)

# Report the interface an agent has to work with.
for label, value in (
    ('Action space:', env_cartpole.action_space),
    ('Observation space:', env_cartpole.observation_space),
    ('Max episode steps:', env_cartpole.spec.max_episode_steps),
):
    print(label, value)

Action space: Discrete(2)
Observation space: Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
Max episode steps: 200


  logger.deprecation(


In [3]:
# Create the LunarLander environment and seed its action space with the shared
# notebook seed.
env_lunar_lander = gym.make('LunarLander-v3')
env_lunar_lander.action_space.seed(seed)

# Report the interface an agent has to work with.
for label, value in (
    ('Action space:', env_lunar_lander.action_space),
    ('Observation space:', env_lunar_lander.observation_space),
    ('Max episode steps:', env_lunar_lander.spec.max_episode_steps),
):
    print(label, value)

Action space: Discrete(4)
Observation space: Box([ -2.5        -2.5       -10.        -10.         -6.2831855 -10.
  -0.         -0.       ], [ 2.5        2.5       10.        10.         6.2831855 10.
  1.         1.       ], (8,), float32)
Max episode steps: 1000


In [4]:
# Create the MountainCar environment and seed its action space with the shared
# notebook seed.
env_mountain_car = gym.make('MountainCar-v0')
env_mountain_car.action_space.seed(seed)

# Report the interface an agent has to work with.
for label, value in (
    ('Action space:', env_mountain_car.action_space),
    ('Observation space:', env_mountain_car.observation_space),
    ('Max episode steps:', env_mountain_car.spec.max_episode_steps),
):
    print(label, value)

Action space: Discrete(3)
Observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Max episode steps: 200


The observation space for Pong differs from the other environments: each observation is an image of the current game frame rather than a small vector of physical quantities.

In [5]:
# Create the Pong (Atari) environment and seed its action space with the
# shared notebook seed.
env_pong = gym.make("ALE/Pong-v5")
env_pong.action_space.seed(seed)

# Report the interface an agent has to work with.
for label, value in (
    ('Action space:', env_pong.action_space),
    ('Observation space:', env_pong.observation_space),
    ('Max episode steps:', env_pong.spec.max_episode_steps),
):
    print(label, value)

A.L.E: Arcade Learning Environment (version 0.10.1+unknown)
[Powered by Stella]


Action space: Discrete(6)
Observation space: Box(0, 255, (210, 160, 3), uint8)
Max episode steps: None


### Random Agent to explore the environments

In [6]:
class RandAgent:
    """Baseline agent that ignores observations and samples its actions.

    Attributes are set from the environment passed to the constructor:
    ``env`` is the environment itself and ``action_space`` is its action space.
    """

    def __init__(self, env):
        """Store ``env`` and keep a direct handle on its action space."""
        self.env = env
        self.action_space = env.action_space

    def select_action(self):
        """Return a fresh sample drawn from the stored action space."""
        sampled_action = self.action_space.sample()
        return sampled_action

In [7]:
def evaluate(agent, env, n_trajectories=100, seed=42):
    """Roll out ``agent`` in ``env`` and collect the total reward of each episode.

    Args:
        agent: Object exposing ``select_action()`` (called with no arguments)
            whose return value is passed to ``env.step``.
        env: Environment following the Gymnasium API: ``reset(seed=...)``
            returns ``(observation, info)`` and ``step(action)`` returns
            ``(observation, reward, terminated, truncated, info)``.
        n_trajectories: Number of episodes to run.
        seed: Base seed. Episode ``i`` is reset with ``seed + i`` so episodes
            differ from each other while the evaluation as a whole stays
            repeatable. Pass ``None`` to reset every episode without a seed.

    Returns:
        A list holding one undiscounted total reward per episode, in the
        order the episodes were played (empty when ``n_trajectories`` is 0).
    """
    rewards_per_trajectory = []

    for i in tqdm(range(n_trajectories)):
        # `seed + i` would raise TypeError for seed=None, so only offset real seeds.
        episode_seed = None if seed is None else seed + i
        env.reset(seed=episode_seed)

        total_reward = 0
        done = False
        while not done:
            action = agent.select_action()
            # The observation is discarded: the agent interface takes no state.
            _, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            # An episode ends on a terminal state or when the environment truncates it.
            done = terminated or truncated

        rewards_per_trajectory.append(total_reward)

    return rewards_per_trajectory
    

In [8]:
# One random agent per environment, built in the order the environments were created.
rand_agent_1, rand_agent_2, rand_agent_3, rand_agent_4 = (
    RandAgent(environment)
    for environment in (env_cartpole, env_lunar_lander, env_mountain_car, env_pong)
)

In [9]:
# Evaluate every random agent in its own environment, keeping the per-episode
# rewards and printing the mean for each environment.
all_rewards = []
for agent, env in zip(
    (rand_agent_1, rand_agent_2, rand_agent_3, rand_agent_4),
    (env_cartpole, env_lunar_lander, env_mountain_car, env_pong),
):
    rewards = evaluate(agent, env)
    all_rewards.append(rewards)
    print(f'Average reward for {env.spec.id}: {sum(rewards) / len(rewards)}')

100%|██████████| 100/100 [00:00<00:00, 1254.25it/s]


Average reward for CartPole-v0: 25.22


100%|██████████| 100/100 [00:00<00:00, 133.66it/s]


Average reward for LunarLander-v3: -174.82355641274222


100%|██████████| 100/100 [00:00<00:00, 182.39it/s]


Average reward for MountainCar-v0: -200.0


100%|██████████| 100/100 [01:34<00:00,  1.06it/s]

Average reward for ALE/Pong-v5: -20.34





### Observations
Source: [Info on environments](https://github.com/openai/gym/wiki/Leaderboard)

- In the CartPole environment, a reward of +1 is provided for every timestep that the pole remains upright. The environment (v0) is considered solved when the average reward reaches around 195 out of a maximum of 200. The random agent clearly fails at this task: on average it keeps the pole upright for only about 25 timesteps.
- In the LunarLander environment, firing the main engine gives a reward of -0.3 per frame, crashing gives -100, landing safely gives +100, and each leg touching the ground gives +10. The average reward of about -175 clearly shows that the random agent almost always crashes.
- In the MountainCar environment, a reward of -1 is provided for every timestep until the car reaches the flag, and an episode is cut off after 200 timesteps. An average reward of exactly -200 therefore means the random agent never reached the flag in any of the 100 episodes.
- In the Pong environment, a reward of +1 is provided for every point the agent scores and -1 for every point the opponent scores. A game is played to 21 points. With an average reward of -20.34, the random agent occasionally gets lucky and scores a point, but it usually loses by a huge margin.
