In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import gymnasium as gym

# from bettermdptools.algorithms.planner import Planner
from planner import Planner
from plots import Plots

from bettermdptools.utils.test_env import TestEnv
from gymnasium.envs.toy_text.frozen_lake import generate_random_map

from bettermdptools.utils.grid_search import GridSearch
from bettermdptools.utils.blackjack_wrapper import BlackjackWrapper
from bettermdptools.algorithms.rl import RL
import itertools

# Explore the setup

In [58]:
MAP_SIZE = 16 #
P_FROZEN = 0.8 # Probability of tile being frozne. if this 0 then there is no path!
N_ITERS = 100000 # all iterations may not be used
THETA = 1e-10 # Convergence threshold
GAMMA = 0.9 # Discount factor
RANDOM_SEED = 42

In [59]:
# in generate_random_map, `S` denotes the start state, `F` denotes the frozen state, `H` denotes the hole state, and `G` denotes the goal state
# `p` is the probability of a tile being frozen
# A path is always guaranteed.
generate_random_map(size=MAP_SIZE, p=P_FROZEN)

['SFFFFFFFFHFFFHFF',
 'FFFFFHFHFFFFFFFF',
 'FFHHFFFHFFFFFFFF',
 'FFHFFFFFHFFFFFHF',
 'FFFFFFFHHFFFFFFH',
 'FHFFHFFFFFFFHFFF',
 'HFHFFHFHHFFFFFHF',
 'FHFHFHFFFFFFFFFF',
 'FFHHFHFFFFFFHFFH',
 'FFFFHFFFFFFFFFFF',
 'HFFFFFHFHFHFFHHF',
 'FFFFHFFFFFFFFFFH',
 'FFHFFHHFFFFFFFFH',
 'FFFFFFHFHFHHFFHF',
 'FHFFFFFFFFFFFFHF',
 'FFFFHFFFFFFFHFFG']

In [76]:
# `desc` parameter is a list of strings, where each string corresponds to a row in the grid
# `is_slippery` parameter determines whether the environment is slippery or not
# P(move left)=1/3
# P(move up)=1/3
# P(move down)=1/3

frozen_lake = gym.make('FrozenLake-v1', desc=generate_random_map(size=MAP_SIZE, p=P_FROZEN), is_slippery=False, render_mode="rgb_array")

# desc = ['SFFF', 'FFFF', 'FFHF', 'FFHG']
# frozen_lake = gym.make('FrozenLake-v1', desc=desc, is_slippery=False, render_mode="rgb_array")

In [77]:
# There are 4 actions we can take: up, down, left, right
frozen_lake.action_space

Discrete(4)

In [78]:
# The total number of states we can be in - MAP_SIZE * MAP_SIZE
frozen_lake.observation_space

Discrete(256)

In [79]:
s_0, info = frozen_lake.reset(seed=RANDOM_SEED)
print("State: {s_0}, Info: {info}")

State: {s_0}, Info: {info}


In [82]:
frozen_lake.unwrapped.

(0, 1)

In [85]:
print(frozen_lake.unwrapped.desc.shape)
frozen_lake.unwrapped.desc

(16, 16)


array([[b'S', b'F', b'F', b'H', b'F', b'H', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'F', b'F', b'H'],
       [b'F', b'F', b'H', b'H', b'F', b'F', b'F', b'F', b'F', b'H', b'F',
        b'F', b'H', b'F', b'H', b'F'],
       [b'F', b'H', b'F', b'H', b'F', b'F', b'F', b'H', b'F', b'F', b'F',
        b'F', b'F', b'F', b'H', b'H'],
       [b'F', b'F', b'F', b'F', b'F', b'F', b'H', b'F', b'F', b'F', b'F',
        b'F', b'H', b'F', b'F', b'H'],
       [b'H', b'F', b'H', b'F', b'F', b'F', b'F', b'H', b'F', b'F', b'F',
        b'F', b'H', b'H', b'F', b'F'],
       [b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'F', b'F', b'H'],
       [b'F', b'F', b'H', b'F', b'F', b'H', b'H', b'F', b'F', b'F', b'F',
        b'F', b'F', b'F', b'F', b'H'],
       [b'H', b'F', b'H', b'H', b'H', b'F', b'F', b'F', b'F', b'F', b'H',
        b'F', b'F', b'H', b'F', b'H'],
       [b'F', b'F', b'F', b'H', b'F', b'H', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'

In [8]:
# What is frozen_lake.P? - The reward matrix
# The keys are the states, the values are the actions we can take from that state
# The values are a list of tuples, where each tuple is (probability, next_state, reward, done)
frozen_lake.P  

{0: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 16, 0.0, False)],
  2: [(1.0, 1, 0.0, False)],
  3: [(1.0, 0, 0.0, False)]},
 1: {0: [(1.0, 0, 0.0, False)],
  1: [(1.0, 17, 0.0, False)],
  2: [(1.0, 2, 0.0, False)],
  3: [(1.0, 1, 0.0, False)]},
 2: {0: [(1.0, 1, 0.0, False)],
  1: [(1.0, 18, 0.0, True)],
  2: [(1.0, 3, 0.0, False)],
  3: [(1.0, 2, 0.0, False)]},
 3: {0: [(1.0, 2, 0.0, False)],
  1: [(1.0, 19, 0.0, True)],
  2: [(1.0, 4, 0.0, True)],
  3: [(1.0, 3, 0.0, False)]},
 4: {0: [(1.0, 4, 0, True)],
  1: [(1.0, 4, 0, True)],
  2: [(1.0, 4, 0, True)],
  3: [(1.0, 4, 0, True)]},
 5: {0: [(1.0, 4, 0.0, True)],
  1: [(1.0, 21, 0.0, True)],
  2: [(1.0, 6, 0.0, False)],
  3: [(1.0, 5, 0.0, False)]},
 6: {0: [(1.0, 5, 0.0, False)],
  1: [(1.0, 22, 0.0, False)],
  2: [(1.0, 7, 0.0, False)],
  3: [(1.0, 6, 0.0, False)]},
 7: {0: [(1.0, 6, 0.0, False)],
  1: [(1.0, 23, 0.0, True)],
  2: [(1.0, 8, 0.0, False)],
  3: [(1.0, 7, 0.0, False)]},
 8: {0: [(1.0, 7, 0.0, False)],
  1: [(1.0, 24, 0.0

In [9]:
# All the states
frozen_lake.P .keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,

In [10]:
# What is stored here?
# 0: left, 1: down, 2: right, 3: up
# From state 254 going right will take us to state 255 (Goal) with a reward of  1.  all other cells are 0
frozen_lake.P[254]

{0: [(1.0, 254, 0, True)],
 1: [(1.0, 254, 0, True)],
 2: [(1.0, 254, 0, True)],
 3: [(1.0, 254, 0, True)]}

In [11]:
# What is stored here?
V, V_track, pi, n_i = Planner(frozen_lake.P).value_iteration(gamma=GAMMA, n_iters=N_ITERS, theta=THETA)

runtime = 0.02 seconds


In [12]:
frozen_lake.P[3]

{0: [(1.0, 2, 0.0, False)],
 1: [(1.0, 19, 0.0, True)],
 2: [(1.0, 4, 0.0, True)],
 3: [(1.0, 3, 0.0, False)]}

In [13]:
# show the map of the environment
frozen_lake.unwrapped.desc

array([[b'S', b'F', b'F', b'F', b'H', b'F', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'F', b'F', b'F'],
       [b'F', b'F', b'H', b'H', b'H', b'H', b'F', b'H', b'H', b'F', b'F',
        b'F', b'H', b'F', b'H', b'F'],
       [b'H', b'F', b'H', b'F', b'F', b'F', b'F', b'F', b'H', b'F', b'F',
        b'F', b'F', b'F', b'F', b'H'],
       [b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'F', b'F', b'F'],
       [b'H', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'H',
        b'F', b'F', b'F', b'F', b'H'],
       [b'H', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'F', b'F', b'F'],
       [b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'H', b'F', b'F',
        b'F', b'F', b'H', b'F', b'F'],
       [b'F', b'F', b'F', b'H', b'F', b'F', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'F', b'F', b'F'],
       [b'H', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F', b'F',
        b'F', b'F', b'

In [14]:
# We take a step and see the outcome
# TODO: This is what is needed to be done in the RL algorithm - define the correct step function. based on Policy iteration or Value iteration
a_t = frozen_lake.action_space.sample()
s_t_next, r_t, d_t, truncated,info = frozen_lake.step(a_t)

# Tuncated means that the episode has ended
# terminated means that the robot has reached goal or fallen into a hole


print(f"Action taken: {a_t}, Next state: {s_t_next}, Reward: {r_t}, Terminated: {d_t}, Truncated: {truncated}, Info: {info}")

Action taken: 3, Next state: 0, Reward: 0.0, Terminated: False, Truncated: False, Info: {'prob': 1.0}


  if not isinstance(terminated, (bool, np.bool8)):
