In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np

In [2]:
class FrozenLake_nojala(gym.Env):
    def __init__(self, size=5, trap_prob=0.1, slip_prob=0.1):
        super(FrozenLake_nojala, self).__init__()
        self.size = size
        self.trap_prob = trap_prob
        self.slip_prob = slip_prob
        self.observation_space = spaces.Discrete(size * size)
        self.action_space = spaces.Discrete(4)  
        self.start_pos = (0, 0)
        self.goal_pos = (size - 1, size - 1)
        self.state = self.start_pos
        self.traps = set()
        self.steps = 0

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.state = self.start_pos
        self.steps = 0
        self._sample_traps()
        return self._get_obs(), {}

    def step(self, action):
        self.steps += 1

        if np.random.rand() < self.slip_prob:
            action = self.action_space.sample()

        # movimiento
        x, y = self.state
        if action == 0 and x > 0:
            x -= 1
        elif action == 1 and y < self.size - 1:
            y += 1
        elif action == 2 and x < self.size - 1:
            x += 1
        elif action == 3 and y > 0:
            y -= 1

        self.state = (x, y)
        terminated = False
        reward = 0.0

        if self.state in self.traps:
            terminated = True
        elif self.state == self.goal_pos:
            reward = 1.0
            terminated = True

        truncated = False
        return self._get_obs(), reward, terminated, truncated, {}

    def _get_obs(self):
        return self.state[0] * self.size + self.state[1]

    def _sample_traps(self):
        self.traps = set()
        num_cells = self.size * self.size
        for i in range(num_cells):
            if np.random.rand() < self.trap_prob:
                cell = (i // self.size, i % self.size)
                if cell != self.start_pos and cell != self.goal_pos:
                    self.traps.add(cell)

    def render(self):
        grid = np.full((self.size, self.size), '.', dtype=str)
        for trap in self.traps:
            grid[trap] = 'X'
        grid[self.goal_pos] = 'G'
        grid[self.state] = 'A'
        print("\n".join([" ".join(row) for row in grid]) + "\n")



In [3]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

env = FrozenLake_nojala(size=6, trap_prob=np.random.rand(), slip_prob=np.random.rand())
check_env(env)

model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=100000)

obs, _ = env.reset()
for _ in range(50):
    action, _ = model.predict(obs)
    obs, reward, done, _, _ = env.step(action)
    env.render()
    if done:
        break


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 6.62     |
|    ep_rew_mean     | 0        |
| time/              |          |
|    fps             | 1354     |
|    iterations      | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 6.17        |
|    ep_rew_mean          | 0           |
| time/                   |             |
|    fps                  | 783         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.015302431 |
|    clip_fraction        | 0.102       |
|    clip_range           | 0.2         |
|    entropy_loss   