In [56]:
import gym
import matplotlib
import matplotlib.animation as animation  # Also required for creating animations
import matplotlib.pyplot as plt
import numpy as np
import pygame
import seaborn as sns
from IPython.display import HTML
from pygame import gfxdraw
# Create the FrozenLake-v1 environment with is_slippery=True; this module-level
# `env` is the object the later cells pass to (or read from inside) the helpers.
env = gym.make("FrozenLake-v1", is_slippery=True)


In [6]:
# Reset the environment and draw its current state.
# NOTE(review): the captured output of this cell shows a gym deprecation notice
# suggesting gym.make('EnvName', render_mode='human') instead of calling
# render() -- confirm against the installed gym version before changing it.
env.reset()
env.render()

If you want to render in human mode, initialize the environment in this way: gym.make('EnvName', render_mode='human') and don't call the render method.
See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


In [44]:
def value_iteration(env, gamma=0.9, no_of_iterations=1000, threshold=1e-20):
    """Compute state values of a tabular MDP with synchronous value iteration.

    Parameters
    ----------
    env :
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        transition table ``P`` such that ``P[state][action]`` is an iterable of
        4-tuples ``(probability, next_state, reward, _)``. The fourth element
        is ignored. If ``env`` has an ``unwrapped`` attribute, ``P`` is read
        from it.
    gamma : float
        Discount factor applied to the value of the next state.
    no_of_iterations : int
        Maximum number of sweeps over the state space.
    threshold : float
        The loop stops as soon as the summed absolute change of the value
        table over one sweep is ``<= threshold``. The default is so small that
        it effectively waits for the values to stop changing in floating point.

    Returns
    -------
    numpy.ndarray
        One value per state. If the iteration cap is reached first, the values
        of the last sweep are returned without any message.
    """
    # Read the model from the base environment when one is exposed: attribute
    # forwarding through wrappers is not guaranteed across gym versions.
    model = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    value_table = np.zeros(n_states)
    for i in range(no_of_iterations):
        # Snapshot of the previous sweep: every backup below reads from it, so
        # the update is synchronous rather than in-place.
        previous_values = np.copy(value_table)
        for state in range(n_states):
            q_values = []
            for action in range(n_actions):
                backups = [
                    trans_prob * (reward + gamma * previous_values[next_state])
                    for trans_prob, next_state, reward, _ in model[state][action]
                ]
                q_values.append(np.sum(backups))
            # Bellman optimality backup: keep the best action value.
            value_table[state] = max(q_values)

        if np.sum(np.fabs(previous_values - value_table)) <= threshold:
            print('Value-iteration converged at iteration# %d.' % (i + 1))
            break

    return value_table

In [45]:
# Run value iteration on the module-level env with discount factor 0.9.
optimal_value_function = value_iteration(env=env,gamma=0.9)

Value-iteration converged at iteration# 267.


In [46]:
def extract_policy(value_table, gamma=0.9, env=None):
    """Derive the greedy policy implied by a state-value table.

    Parameters
    ----------
    value_table : sequence of float
        Value of every state, indexed by state id.
    gamma : float
        Discount factor applied to the value of the next state.
    env : optional
        Environment exposing ``observation_space.n``, ``action_space.n`` and a
        transition table ``P`` where ``P[state][action]`` yields 4-tuples
        ``(probability, next_state, reward, _)``. When omitted, the
        module-level ``env`` is used, which is how this function was
        originally called.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per state holding the index of the action
        with the highest one-step look-ahead value (ties resolve to the lowest
        index, as ``np.argmax`` does). Cast entries with ``int`` before passing
        them to ``env.step``.

    Raises
    ------
    NameError
        If ``env`` is not given and no module-level ``env`` exists.
    """
    if env is None:
        # Backward compatibility with callers that rely on the notebook global.
        try:
            env = globals()["env"]
        except KeyError:
            raise NameError("name 'env' is not defined") from None

    # Read the model from the base environment when one is exposed: attribute
    # forwarding through wrappers is not guaranteed across gym versions.
    model = getattr(env, "unwrapped", env).P
    n_states = env.observation_space.n
    n_actions = env.action_space.n

    # Kept as a float array so the returned dtype matches earlier behaviour.
    policy = np.zeros(n_states)
    for state in range(n_states):
        q_values = np.zeros(n_actions)
        for action in range(n_actions):
            for trans_prob, next_state, reward, _ in model[state][action]:
                q_values[action] += trans_prob * (reward + gamma * value_table[next_state])
        policy[state] = np.argmax(q_values)
    return policy

In [47]:
# Derive the greedy policy from the converged value function (discount 0.9).
optimal_policy = extract_policy(optimal_value_function, gamma=0.9)

In [48]:
# Show the chosen action index for every state (stored as floats).
print(optimal_policy)

[0. 3. 0. 3. 0. 0. 0. 0. 3. 1. 0. 0. 0. 2. 1. 0.]


In [60]:
def display_video(frames, interval=50):
    """Render a sequence of image frames as an inline HTML5 video.

    Parameters
    ----------
    frames : sequence
        Non-empty, indexable sequence of images accepted by ``Axes.imshow``.
    interval : int
        Delay between frames passed to ``FuncAnimation`` (milliseconds per the
        matplotlib documentation).

    Returns
    -------
    IPython.display.HTML
        HTML object wrapping the output of ``Animation.to_html5_video``.

    Raises
    ------
    ValueError
        If ``frames`` is empty.
    """
    if len(frames) == 0:
        raise ValueError("display_video needs at least one frame")

    # Create the figure under the Agg backend, then switch back -- presumably
    # to keep the static figure from being shown next to the video.
    orig_backend = matplotlib.get_backend()
    matplotlib.use('Agg')
    try:
        fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    finally:
        # Restore the caller's backend even if figure creation fails.
        matplotlib.use(orig_backend)

    # Let the image fill the whole figure with no axes decoration.
    ax.set_axis_off()
    ax.set_aspect('equal')
    ax.set_position([0, 0, 1, 1])
    im = ax.imshow(frames[0])

    def update(frame):
        im.set_data(frame)
        return [im]

    anim = animation.FuncAnimation(fig=fig, func=update, frames=frames,
                                   interval=interval, blit=True, repeat=False)
    try:
        return HTML(anim.to_html5_video())
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


def test_agent(env, policy, episodes=10):
    """Roll out ``policy`` in ``env`` and return the recording as a video.

    For each of ``episodes`` episodes the environment is reset, one
    ``rgb_array`` frame is captured, and then the action ``int(policy[state])``
    is applied until ``env.step`` reports done, capturing a frame after every
    step. Frames from all episodes are concatenated and handed to
    ``display_video``, whose result is returned.
    """
    recording = []
    for _ in range(episodes):
        observation = env.reset()
        recording.append(env.render(mode="rgb_array"))
        finished = False
        while not finished:
            # The policy stores action indices as floats, hence the cast.
            chosen_action = int(policy[observation])
            observation, _reward, finished, _info = env.step(chosen_action)
            recording.append(env.render(mode="rgb_array"))
    return display_video(recording)



In [61]:
# Roll out the extracted policy for the default number of episodes; the
# returned video object is the cell's displayed result.
test_agent(env, optimal_policy)

See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(
