# Introduction to Gymnasium in Colab

This is a quick lab to learn how to use [`gymnasium`](https://gymnasium.farama.org/), a Python module that wraps many environments behind the same API.

We provide a class to record videos, so that environment runs can be rendered easily in Colaboratory and Jupyter Lab.

## Installing gymnasium and the Atari ROMs

In [None]:
!pip install gymnasium
!pip install gymnasium[accept-rom-license]



In [None]:
import gymnasium as gym
from IPython.display import clear_output, HTML, display
import matplotlib.pyplot as plt
%matplotlib notebook

In [None]:
#@title Wrapper for recording an environment into a video

from __future__ import annotations

from copy import deepcopy
from typing import Any, SupportsFloat

from gymnasium.core import ActType, ObsType, RenderFrame, WrapperActType, WrapperObsType
from gymnasium.error import DependencyNotInstalled

class RecordVideo(gym.Wrapper):
    """Collect every rendered frame of an environment and replay them as a video.

    Adapted from https://github.com/Farama-Foundation/Gymnasium/blob/main/gymnasium/experimental/wrappers/rendering.py#L87

    A frame is captured after every :meth:`reset` and :meth:`step`. Frames are
    drawn on a figure owned by the wrapper, and :meth:`video` turns them into
    an HTML5 video that can be displayed in Colaboratory or Jupyter Lab.
    """

    def __init__(self, env):
        """Initialize a :class:`RecordVideo` instance.

        Args:
            env: The environment that is being wrapped. Its ``render_mode``
                must be ``'rgb_array'`` or ``'rgb_array_list'``.
        """
        super().__init__(env)
        assert env.render_mode in [
            "rgb_array",
            "rgb_array_list",
        ], f"Expected env.render_mode to be one of 'rgb_array' or 'rgb_array_list' but got '{env.render_mode}'"

        # Fall back to 24 frames per second when the environment does not
        # declare a frame rate of its own.
        if "render_fps" not in env.metadata:
            env.metadata["render_fps"] = 24

        # Advertise the 'human' mode on a private copy of the metadata so the
        # wrapped environment's own metadata is left untouched.
        if "human" not in self.metadata["render_modes"]:
            self.metadata = deepcopy(self.env.metadata)
            self.metadata["render_modes"].append("human")

        # One inner list of artists per video frame (ArtistAnimation format).
        self.artists = []
        # Created lazily when the first frame is captured.
        self.figure = None

    @property
    def render_mode(self):
        """Always returns ``'human'``."""
        return "human"

    def step(
        self, action: WrapperActType
    ) -> tuple[WrapperObsType, SupportsFloat, bool, bool, dict]:
        """Perform a step in the base environment and record the resulting frame."""
        result = super().step(action)
        self._render_frame()
        return result

    def reset(
        self, *, seed: int | None = None, options: dict[str, Any] | None = None
    ) -> tuple[WrapperObsType, dict[str, Any]]:
        """Reset the base environment and record the resulting frame."""
        result = super().reset(seed=seed, options=options)
        self._render_frame()
        return result

    def video(self):
        """Render all frames collected so far as an HTML5 video.

        Returns:
            An ``IPython.display.HTML`` object embedding the video, or ``None``
            when no frame has been captured yet.
        """
        if self.figure is None:
            return None

        from IPython.display import HTML
        import matplotlib.animation

        animation = matplotlib.animation.ArtistAnimation(
            self.figure,
            self.artists,
            interval=1000 // self.metadata["render_fps"],
            blit=True,
            repeat=True,
            repeat_delay=2000,
        )
        return HTML(animation.to_html5_video())

    def _render_frame(self):
        """Fetch the latest frame(s) from the base environment and store them.

        Raises:
            DependencyNotInstalled: If matplotlib or numpy cannot be imported.
            Exception: If the wrapped environment's render mode is neither
                ``'rgb_array'`` nor ``'rgb_array_list'``.
        """
        try:
            import matplotlib.pyplot as plt
            import numpy as np
        except ImportError as err:
            raise DependencyNotInstalled(
                "matplotlib and numpy are required, run `pip install matplotlib numpy`"
            ) from err

        if self.env.render_mode == "rgb_array_list":
            rgb_arrays = self.env.render()
        elif self.env.render_mode == "rgb_array":
            rgb_arrays = [self.env.render()]
        else:
            raise Exception(
                f"Wrapped environment must have mode 'rgb_array' or 'rgb_array_list', actual render mode: {self.env.render_mode}"
            )

        assert isinstance(rgb_arrays, list)
        for rgb_array in rgb_arrays:
            assert isinstance(rgb_array, np.ndarray)

        if self.figure is None:
            self.figure = plt.figure()
            self.figure.gca().axis("off")

        # Draw on this wrapper's own axes rather than on whatever axes happens
        # to be current, so other figures opened in the notebook are unaffected.
        axes = self.figure.gca()

        # ArtistAnimation shows each inner list as ONE frame, so every image
        # gets its own list; this matters when 'rgb_array_list' returns
        # several frames at once.
        self.artists.extend([axes.imshow(rgb_array)] for rgb_array in rgb_arrays)

    def close(self):
        """Close the wrapped environment.

        Returns:
            The video built from the frames recorded so far (see
            :meth:`video`), or ``None`` if nothing was recorded.
        """
        result = self.video()
        super().close()

        return result

## Toy text environments

In [None]:
# Wrap FrozenLake so that every rendered frame is recorded.
env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array"))
env.reset()

# Play 100 randomly sampled actions, starting a new episode whenever one ends.
for i in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)

    done = terminated or truncated
    if done:
        env.reset()

# Show the recorded frames as a video in the notebook.
display(env.video())

<IPython.core.display.Javascript object>

## Classic control environments

In [None]:
# Wrap CartPole so that every rendered frame is recorded.
env = RecordVideo(gym.make("CartPole-v1", render_mode="rgb_array"))
env.reset()

# Play 100 randomly sampled actions, starting a new episode whenever one ends.
for i in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)

    done = terminated or truncated
    if done:
        env.reset()

# Show the recorded frames as a video in the notebook.
display(env.video())

<IPython.core.display.Javascript object>

## Atari environments

In [None]:
!pip install gymnasium[atari,accept-rom-license]
import ale_py


Collecting ale-py>=0.9 (from gymnasium[accept-rom-license,atari])
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ale-py
Successfully installed ale-py-0.10.1


In [None]:
# Wrap Atari Breakout so that every rendered frame is recorded.
env = RecordVideo(gym.make("ALE/Breakout-v5", render_mode="rgb_array"))
env.reset()

# Play 100 randomly sampled actions, starting a new episode whenever one ends.
for i in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)

    done = terminated or truncated
    if done:
        env.reset()

# Show the recorded frames as a video in the notebook.
display(env.video())

<IPython.core.display.Javascript object>

## 2D physics environments

In [None]:
!pip install "gymnasium[box2d]"



In [None]:
!pip install swig



In [None]:
!pip install "gymnasium[box2d]"
# Wrap LunarLander so that every rendered frame is recorded.
env = RecordVideo(gym.make("LunarLander-v3", render_mode="rgb_array"))
env.reset()

# Play 100 randomly sampled actions, starting a new episode whenever one ends.
for i in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)

    done = terminated or truncated
    if done:
        env.reset()

# Show the recorded frames as a video in the notebook.
display(env.video())

Collecting box2d-py==2.3.5 (from gymnasium[box2d])
  Using cached box2d-py-2.3.5.tar.gz (374 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: box2d-py
  Building wheel for box2d-py (setup.py) ... [?25l[?25hdone
  Created wheel for box2d-py: filename=box2d_py-2.3.5-cp310-cp310-linux_x86_64.whl size=2376133 sha256=4e385e0c93612f4f6ea14cc12cc56eef745a723160e684d502c8d04802b9954f
  Stored in directory: /root/.cache/pip/wheels/db/8f/6a/eaaadf056fba10a98d986f6dce954e6201ba3126926fc5ad9e
Successfully built box2d-py
Installing collected packages: box2d-py
Successfully installed box2d-py-2.3.5


<IPython.core.display.Javascript object>

# The `gym` API

In [None]:
# Create FrozenLake without a render mode; only its spaces are inspected here.
env = gym.make("FrozenLake-v1")

# Print the action and observation spaces exposed by the environment.
print("Action space: ", env.action_space)
print("Observation space: ", env.observation_space)

Action space:  Discrete(4)
Observation space:  Discrete(16)


### Exercise 1: Explore the attributes of the environment (e.g. slipperiness). What does each one do?


In [None]:
# Exercise 1: record FrozenLake with the `is_slippery` option explicitly enabled.
env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery = True))
env.reset()

# Play 100 randomly sampled actions, starting a new episode whenever one ends.
for i in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)

    done = terminated or truncated
    if done:
        env.reset()

# Show the recorded frames as a video in the notebook.
display(env.video())

<IPython.core.display.Javascript object>

### Exercise 2: Modify other environment attributes (e.g. map size)

map_name = "8x8"

is_slippery = False

desc=["SFFF", "FHFH", "FFFH", "HFFG"]

render_mode="human"

reward_threshold

In [None]:
# Exercise 2: record FrozenLake on the map selected by map_name="8x8".
env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array", is_slippery = True,map_name = "8x8" ))
env.reset()

# Play 100 randomly sampled actions, starting a new episode whenever one ends.
for i in range(100):
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)

    done = terminated or truncated
    if done:
        env.reset()

# Show the recorded frames as a video in the notebook.
display(env.video())

<IPython.core.display.Javascript object>

### Exercise 3: Modify other environment attributes (e.g. map size)

In [None]:
# Exercise 3: record FrozenLake on a custom map passed through `desc`.
env = RecordVideo(gym.make("FrozenLake-v1", render_mode="rgb_array", desc=["SG"],is_slippery = True ))
env.reset()

for i in range(100):
    action = env.action_space.sample() # take a randomly sampled action
    obs, reward, terminated, truncated, info = env.step(action) # keep the observation, reward, etc.

    done = terminated or truncated # when terminated or truncated, reset and start over
    if done:
        env.reset()

# Show the recorded frames as a video in the notebook.
display(env.video())

# This agent has no policy; it simply takes random actions.

<IPython.core.display.Javascript object>

Now let's access the internal dynamics of the environment

Reset the environment to the standard 4x4 FrozenLake.

`env.env.P` (accessed as `env.unwrapped.P` in the cells below) is a dictionary containing the following:

```
{state: {action: [(probability, next_state, reward, is_next_state_terminal) for each possible action outcome]}}
```

### Exercise 4: Which are the terminal states of this environment?

In [None]:
# Recreate FrozenLake with its default map and put it in its initial state.
env = gym.make("FrozenLake-v1", render_mode="rgb_array")
env.reset()

(0, {'prob': 1})

P = env.unwrapped.P

{
    state: {

        action: [

            (probability, next_state, reward, is_terminal),
            ...

        ]

    }
}

`print(P)` shows every state (16), every action (4) and every outcome (3).

In [None]:
# Transition table of the unwrapped environment:
# {state: {action: [(probability, next_state, reward, is_terminal), ...]}}
P = env.unwrapped.P
print(P)

{0: {0: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False)], 1: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False)], 2: [(0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False)], 3: [(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False)]}, 1: {0: [(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 5, 0.0, True)], 1: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 2, 0.0, False)], 2: [(0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 1, 0.0, False)], 3: [(0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False)]}, 2: {0: [(0.3333333333333333, 2, 0.0, False), (0.3333333333333333

`P[state]` shows every action for one specific state; in this case, every action at a terminal state.

In [None]:
# `state` must be bound before P can be indexed. State 5 is used because the
# transition table marks every transition into it as terminal.
state = 5
print(P[state])

NameError: name 'state' is not defined

In [None]:
# Both indices must be bound before P can be indexed: state 5 (transitions
# into it are marked terminal in the table) and the first action, 0.
state, action = 5, 0
print(P[state][action])

NameError: name 'state' is not defined

In [None]:
# Print, state by state, the possible outcomes of every action.
P = env.unwrapped.P
for state in range(16):  # a 4x4 grid has 16 states in total
    print(f"State {state}:")
    for action, outcomes in P[state].items():
        print(f"  Action {action}: {outcomes}")

State 0:
  Action 0: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False)]
  Action 1: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False)]
  Action 2: [(0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False)]
  Action 3: [(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False)]
State 1:
  Action 0: [(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 5, 0.0, True)]
  Action 1: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 2, 0.0, False)]
  Action 2: [(0.3333333333333333, 5, 0.0, True), (0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 1, 0.0, False)]
  Action 3: [(0.3333333333333333, 2, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, F

### Exercise 5: Show the reward for each state transition

In [None]:
# Exercise 5: list the reward attached to every outcome of every
# (state, action) pair in the transition table.
for state in range(16):
    print(f"state{state}:")
    for action, outcomes in P[state].items():
        print(f"action{action}:")
        for prob, next_state, reward, is_terminal in outcomes:
            print(f"next_state : {next_state},Reward : {reward}")

state0:
action0:
next_state : 0,Reward : 0.0
next_state : 0,Reward : 0.0
next_state : 4,Reward : 0.0
action1:
next_state : 0,Reward : 0.0
next_state : 4,Reward : 0.0
next_state : 1,Reward : 0.0
action2:
next_state : 4,Reward : 0.0
next_state : 1,Reward : 0.0
next_state : 0,Reward : 0.0
action3:
next_state : 1,Reward : 0.0
next_state : 0,Reward : 0.0
next_state : 0,Reward : 0.0
state1:
action0:
next_state : 1,Reward : 0.0
next_state : 0,Reward : 0.0
next_state : 5,Reward : 0.0
action1:
next_state : 0,Reward : 0.0
next_state : 5,Reward : 0.0
next_state : 2,Reward : 0.0
action2:
next_state : 5,Reward : 0.0
next_state : 2,Reward : 0.0
next_state : 1,Reward : 0.0
action3:
next_state : 2,Reward : 0.0
next_state : 1,Reward : 0.0
next_state : 0,Reward : 0.0
state2:
action0:
next_state : 2,Reward : 0.0
next_state : 1,Reward : 0.0
next_state : 6,Reward : 0.0
action1:
next_state : 1,Reward : 0.0
next_state : 6,Reward : 0.0
next_state : 3,Reward : 0.0
action2:
next_state : 6,Reward : 0.0
next_stat

In [None]:
# Pretty-print the transition table so the nesting
# {state: {action: [outcomes]}} is easier to read.
from pprint import pprint
pprint(env.unwrapped.P)

{0: {0: [(0.3333333333333333, 0, 0.0, False),
         (0.3333333333333333, 0, 0.0, False),
         (0.3333333333333333, 4, 0.0, False)],
     1: [(0.3333333333333333, 0, 0.0, False),
         (0.3333333333333333, 4, 0.0, False),
         (0.3333333333333333, 1, 0.0, False)],
     2: [(0.3333333333333333, 4, 0.0, False),
         (0.3333333333333333, 1, 0.0, False),
         (0.3333333333333333, 0, 0.0, False)],
     3: [(0.3333333333333333, 1, 0.0, False),
         (0.3333333333333333, 0, 0.0, False),
         (0.3333333333333333, 0, 0.0, False)]},
 1: {0: [(0.3333333333333333, 1, 0.0, False),
         (0.3333333333333333, 0, 0.0, False),
         (0.3333333333333333, 5, 0.0, True)],
     1: [(0.3333333333333333, 0, 0.0, False),
         (0.3333333333333333, 5, 0.0, True),
         (0.3333333333333333, 2, 0.0, False)],
     2: [(0.3333333333333333, 5, 0.0, True),
         (0.3333333333333333, 2, 0.0, False),
         (0.3333333333333333, 1, 0.0, False)],
     3: [(0.3333333333333333,

### Exercise 6: Build the transition matrix assuming a uniform policy.

The transition matrix has shape 16×16 because, from each of the 16 states, the agent may in principle move to any state from 0 to 15.
Row `s` therefore holds 16 transition probabilities, one for each possible next state.

In [None]:
import numpy as np

# Exercise 6: state-to-state transition matrix under a uniform random policy.
env = gym.make("FrozenLake-v1", desc=None, is_slippery=True)
P = env.unwrapped.P
n_states = env.observation_space.n
n_actions = env.action_space.n

T_m = np.zeros((n_states, n_states))
re_v = np.zeros(n_states)

for state in range(n_states):
    for action in P[state]:
        for prob, next_state, reward, is_terminal in P[state][action]:
            # Terminal self-loops are skipped here; presumably so they do not
            # overwrite the reward recorded when the state was entered.
            if not (is_terminal and next_state == state):
                re_v[next_state] = reward
            # Each action is picked with probability 1 / n_actions, so the
            # action-conditional probabilities are averaged over the actions.
            T_m[state, next_state] += prob * (1 / n_actions)

print("Transition Matrix:")
print(T_m)



Transition Matrix:
[[0.5  0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.25 0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.25 0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.25 0.5  0.   0.   0.   0.25 0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.25 0.   0.   0.   0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.   0.25 0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.25 0.   0.   0.   0.25 0.25 0.   0.   0.25 0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.   0.25
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.
  0.25 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.  

### Exercise 7: Solve the direct Bellman equation formulation.

There are 3 possible situations:
1. The agent steps to a different state.
2. The agent stays in the state it was already in.
3. The agent reaches a hole or the goal, and the episode terminates.

In [None]:
# Expected immediate reward of each state under the uniform random policy:
#     R[s] = sum_a pi(a|s) * sum_{s'} p(s'|s,a) * r(s,a,s')
# The reward is credited to the state the agent LEAVES. Indexing it by the
# state it enters would let an absorbing goal state collect its reward again
# on every step of the linear Bellman solve.
rew = np.zeros(n_states)
for state in range(n_states):
    action_prob = 1 / len(P[state])  # uniform policy over the available actions
    for action in P[state]:
        for prob, next_state, reward, is_terminal in P[state][action]:
            rew[state] += action_prob * prob * reward
print(rew)



0 0 0.0 1
0 0 0.0 4
0 0 0.0 1
0 0 0.0 4
0 4 0.0 4
0 0 0.0 1
0 0 0.0 4
0 4 0.0 4
0 1 0.0 4
0 4 0.0 4
0 1 0.0 4
0 0 0.0 1
0 0 0.0 4
0 1 0.0 4
0 0 0.0 1
0 0 0.0 4
0 0 0.0 1
0 0 0.0 4
1 1 0.0 1
1 1 0.0 4
1 0 0.0 4
1 5 0.0 2
1 5 0.0 4
1 0 0.0 4
1 5 0.0 2
1 5 0.0 4
1 2 0.0 4
1 5 0.0 2
1 5 0.0 4
1 2 0.0 4
1 1 0.0 1
1 1 0.0 4
1 2 0.0 4
1 1 0.0 1
1 1 0.0 4
1 0 0.0 4
2 2 0.0 1
2 2 0.0 4
2 1 0.0 4
2 6 0.0 4
2 1 0.0 4
2 6 0.0 4
2 3 0.0 4
2 6 0.0 4
2 3 0.0 4
2 2 0.0 1
2 2 0.0 4
2 3 0.0 4
2 2 0.0 1
2 2 0.0 4
2 1 0.0 4
3 3 0.0 1
3 3 0.0 4
3 2 0.0 4
3 7 0.0 2
3 7 0.0 4
3 2 0.0 4
3 7 0.0 2
3 7 0.0 4
3 3 0.0 1
3 3 0.0 4
3 7 0.0 2
3 7 0.0 4
3 3 0.0 1
3 3 0.0 4
3 3 0.0 1
3 3 0.0 4
3 3 0.0 1
3 3 0.0 4
3 3 0.0 1
3 3 0.0 4
3 2 0.0 4
4 0 0.0 4
4 4 0.0 1
4 4 0.0 4
4 8 0.0 4
4 4 0.0 1
4 4 0.0 4
4 8 0.0 4
4 5 0.0 2
4 5 0.0 4
4 8 0.0 4
4 5 0.0 2
4 5 0.0 4
4 0 0.0 4
4 5 0.0 2
4 5 0.0 4
4 0 0.0 4
4 4 0.0 1
4 4 0.0 4
5 5 0 1
5 5 0 2
5 5 0 3
5 5 0 1
5 5 0 2
5 5 0 3
5 5 0 1
5 5 0 2
5 5 0 3
5 5 0 1
5 5 0 2
5 5 0 3
6 2 

In [None]:
# Display the number of states (size of the observation space).
n_states

16

In [None]:
# Direct solution of the Bellman expectation equation V = R + gamma * P V,
# i.e. the linear system (I - gamma * P) V = R.
P_m = T_m
print(T_m)
R_m = rew
print(rew)

# Sanity check: every row of a transition matrix must sum to 1.
for row in T_m:
    print(f"Sum of probabilities: {sum(row)}")

gamma = 0.9
I = np.eye(P_m.shape[0])
# Solve the system directly instead of forming the explicit inverse, which is
# slower and numerically less accurate.
V = np.linalg.solve(I - gamma * P_m, R_m)

print(V)



[[0.5  0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.25 0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.25 0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.25 0.5  0.   0.   0.   0.25 0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.25 0.   0.   0.   0.25 0.25 0.   0.   0.25 0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.   0.25 0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.   0.   0.   0.   0.25 0.   0.   0.   0.25 0.25 0.   0.   0.25 0.
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.   0.25
  0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.25 0.   0.   0.25 0.   0.25 0.   0.
  0.25 0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.
  0.   0.  ]
 [

### Exercise 8: Move the agent to an arbitrary position

The state is stored in `env.env.s`



In [None]:
# Exercise 8: place the agent on an arbitrary tile by overwriting the state.
# A render mode is required for render() to produce anything; "ansi" returns
# the grid as text, which is printed at the end of the cell.
env = gym.make("FrozenLake-v1", desc=None, is_slippery=True, render_mode="ansi")
P = env.unwrapped.P
env.reset()
# Work on the unwrapped environment so the state attribute `s` is reachable.
env = env.unwrapped

print("Initial state", env.s)

env.s = 5

print("Current state", env.s)

# Show the grid with the agent at its new position.
print(env.render())

### Exercise 9: Print the information provided by each state. How do the probabilities change between the two slippery options?