In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
from IPython.display import clear_output
import time
from tqdm import tqdm

In [2]:
from gym.envs.registration import register

try:

    register(
        id='FrozenLakeNotSlippery-v0', 
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '8x8', 'is_slippery': False},
        max_episode_steps=400,
        reward_threshold=.8196, 
    )
except:
    print('Already registered')

In [3]:
env = gym.make('FrozenLakeNotSlippery-v0')

In [4]:
epochs = 8
for epoch in range(epochs):
    score = 0
    state = env.reset()
    done = False
    while not done:
        env.render()
        action = env.action_space.sample()
        state,reward,done,info = env.step(action)
        score += reward
        clear_output(wait=True)
        time.sleep(0.2)
    print(f"Episode: {epoch + 1} Score: {score}")

Episode: 8 Score: 0.0


In [5]:
num_states = env.observation_space.n
num_actions = env.action_space.n

In [6]:
num_states,num_actions

(64, 4)

In [7]:
Q_table = np.zeros((num_states,num_actions))

In [8]:
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],


In [9]:
def policy(state,epsilon=0.2):
    if np.random.random()<epsilon:
        action = np.random.randint(num_actions)
    else:
        av = Q_table[state]
        action = np.random.choice(np.flatnonzero(av==av.max()))
    return action

In [10]:
def sarsa(Q_table,policy,epochs,alpha=0.1,gamma=0.99,epsilon=0.2):
    for epoch in tqdm(range(1,epochs+1)):
        state = env.reset()
        action = policy(state,epsilon)
        done = False
        
        while not done:
            next_state,reward,done,_ = env.step(action)
            next_action = policy(next_state,epsilon)
            
            qsa = Q_table[state,action]
            next_qsa = Q_table[next_state,next_action]
            Q_table[state,action] = qsa + alpha*(reward + gamma*next_qsa - qsa)
            state = next_state
            action = next_action

In [11]:
sarsa(Q_table,policy,epochs = 20000)

100%|██████████| 20000/20000 [00:30<00:00, 650.55it/s]


In [15]:
"""The Agent achieves the goal all the time after training"""
for step in range(8):
    state = env.reset()
    done = False
    score = 0

    for step in range(20):
        env.render()
        action = np.argmax(Q_table[state,:])
        #action = policy(state)
        next_state,rew,done,info = env.step(action)
        time.sleep(0.5)
        #clear_output(wait=True)
        #score += reward
        state = next_state
    #print(f"Episode: {epoch+1} Score: {score}")
env.close()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFF[41mF[0mFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFF[41mF[0mFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFF[41mF[0mFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFF[41mF[0mF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFF[41mF[0mF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFFFFFFF
FFFFFFF[41mF[0m
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFF[41mF[0m
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHF[41mF[0m
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Down)
SF

  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFH[41mF[0m
FHFFHFHF
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFH[41mF[0m
FFFHFFFG
  (Down)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m
  (Left)
SFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFF[41mG[0m

[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
S[41mF[0mFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SF[41mF[0mFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG
  (Right)
SFF[4