# Frozen Lake - V0
The agent controls the movement of a character in a grid world. Some tiles of the grid are walkable, and others lead to the agent falling into the water. Additionally, the movement direction of the agent is uncertain and only partially depends on the chosen direction. The agent is rewarded for finding a walkable path to a goal tile. For more information visit https://gym.openai.com/envs/FrozenLake-v0/.

## 1. Import libraries

In [1]:
# Import libraries
import gym
import time
import random
import numpy as np
from tqdm.auto import tqdm, trange
from IPython.display import clear_output

## 2. Parameters

In [2]:
# --- Training schedule ---
episodes = 20_000  # Number of training episodes
steps = 100        # Upper bound on the steps taken within a single episode

# --- Q-learning hyperparameters ---
lr = 0.1      # Learning rate: weight of the new target in each Q-table update
gamma = 0.99  # Discount rate applied to the best next-state Q-value

# --- Epsilon-greedy exploration ---
exp_rate = 1       # Epsilon: exploration rate (chance of taking a random action)
exp_decay = 0.001  # Decay constant of the exponential exploration-rate schedule

## 3. Initialization

In [3]:
# Build the FrozenLake environment.
env = gym.make("FrozenLake-v0")

# Sizes of the observation and action spaces, read from their `.n` attributes.
state_space = env.observation_space.n  # Number of states
action_space = env.action_space.n  # Number of possible actions

# Q-table with one row per state and one column per action, all zeros to start.
q_table = np.zeros(shape=(state_space, action_space))

## 4. Training (Iterative Q-Learning)

In [4]:
 # Tabular Q-learning with an epsilon-greedy behaviour policy.
 #
 # Reads the configuration (episodes, steps, lr, gamma, exp_rate, exp_decay),
 # updates `q_table` in place and fills `rewards` with the total reward of
 # every episode. The exploration rate is tracked in the local `epsilon`, so
 # the configured `exp_rate` is never overwritten and re-running this cell
 # starts from the same exploration schedule each time.
 min_exp_rate = 0.01  # Floor that the exploration rate decays towards
 epsilon = exp_rate  # Current exploration rate, starting at the configured value
 rewards = []  # Total reward collected in each episode

 for e in trange(episodes):
     state = env.reset()
     episode_reward = 0

     for _ in range(steps):
         # Epsilon-greedy: exploit the Q-table with probability 1 - epsilon,
         # otherwise explore by sampling a random action.
         if random.uniform(0, 1) > epsilon:
             action = np.argmax(q_table[state, :])
         else:
             action = env.action_space.sample()

         # Take a step
         new_state, new_reward, done, info = env.step(action)

         # Q(s, a) <- (1 - lr) * Q(s, a) + lr * (r + gamma * max_a' Q(s', a'))
         q_table[state, action] = q_table[state, action] * (1 - lr) + lr * (
             new_reward + gamma * np.max(q_table[new_state, :])
         )

         state = new_state
         episode_reward += new_reward

         if done:
             break

     # Decay the exploration rate exponentially from exp_rate towards
     # min_exp_rate; the value computed here is used by the next episode.
     epsilon = min_exp_rate + (exp_rate - min_exp_rate) * np.exp(-exp_decay * e)
     rewards.append(episode_reward)

100%|██████████| 20000/20000 [00:15<00:00, 1306.63it/s]


## 5. Analysis and Visualization

In [5]:
# Report the mean episode reward over consecutive chunks of training.
# The chunks are stored under their own name so `rewards` keeps the raw
# per-episode totals and this cell gives the same output when re-run.
n_chunks = 10
# array_split (unlike split) accepts an episode count not divisible by n_chunks.
reward_chunks = np.array_split(np.asarray(rewards), n_chunks)

episodes_seen = 0
for chunk in reward_chunks:
    if len(chunk) == 0:
        # Fewer episodes than chunks: nothing to average here.
        continue
    episodes_seen += len(chunk)
    print(f"Reward for {episodes_seen}: {np.mean(chunk)}")

Reward for 2000: 0.121
Reward for 4000: 0.4745
Reward for 6000: 0.6385
Reward for 8000: 0.671
Reward for 10000: 0.6725
Reward for 12000: 0.663
Reward for 14000: 0.6615
Reward for 16000: 0.6835
Reward for 18000: 0.679
Reward for 20000: 0.6585


In [6]:
# Replay six episodes, always acting greedily on the current Q-table.
for e in range(6):
    state = env.reset()
    done = False
    print(f"> EPISODE {e+1}")
    time.sleep(1)

    for _ in range(steps):
        # Redraw the board in place, pausing so each frame can be seen.
        clear_output(wait=True)
        env.render()
        time.sleep(0.5)

        # Pure exploitation: take the action with the highest Q-value.
        action = np.argmax(q_table[state])
        new_state, reward, done, info = env.step(action)

        if not done:
            # Episode still running: move on to the next state.
            state = new_state
            continue

        # Episode finished: show the last frame and the outcome, then clear.
        clear_output(wait=True)
        env.render()
        if reward:
            print("Yay!")
        else:
            print("Game over...")
        time.sleep(2)
        clear_output(wait=True)
        break

env.close()

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m
Yay!
