In [28]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import time
from IPython.display import clear_output

In [4]:
"""Setting is_slippery = False"""
from gym.envs.registration import register
try:
    register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.78, # optimum = .8196
)
except:
    print("Already Registered")

In [5]:
env = gym.make('FrozenLakeNotSlippery-v0')

In [7]:
env.observation_space.n,env.action_space.n

(16, 4)

In [12]:
Q_table = np.zeros((env.observation_space.n,env.action_space.n))

In [13]:
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [16]:
def policy(state,epsilon=0.2):
    if np.random.random()<epsilon:
        action = np.random.randint(env.action_space.n)
    else:
        av = Q_table[state]
        action = np.random.choice(np.flatnonzero(av == av.max()))
    return action

In [21]:
def sarsa(Q_table,policy,epochs,alpha=0.1,gamma=0.99,epsilon=0.2):
    for epoch in range(epochs):
        state = env.reset()
        action = policy(state,epsilon)
        done =False
        
        while not done:
            next_state,reward,done,_ = env.step(action)
            next_action = policy(next_state,epsilon)
            qsa = Q_table[state,action]
            next_qsa = Q_table[next_state,next_action]
            Q_table[state,action] = qsa + alpha *(reward + gamma *next_qsa - qsa)
            state = next_state
            action = next_action

In [22]:
sarsa(Q_table,policy,10000)

In [23]:
Q_table

array([[0.65936643, 0.75593393, 0.55735137, 0.65541865],
       [0.6517528 , 0.        , 0.50538146, 0.57402497],
       [0.56224314, 0.36977132, 0.03132374, 0.11756628],
       [0.20295307, 0.        , 0.        , 0.        ],
       [0.6960299 , 0.80959808, 0.        , 0.66970049],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.93996767, 0.        , 0.26510294],
       [0.        , 0.        , 0.        , 0.        ],
       [0.69430539, 0.        , 0.85764958, 0.66885071],
       [0.8009572 , 0.80586129, 0.91361086, 0.        ],
       [0.83056768, 0.96769257, 0.        , 0.86521862],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.91549921, 0.96466697, 0.89652374],
       [0.92939635, 0.98422772, 1.        , 0.74657103],
       [0.        , 0.        , 0.        , 0.        ]])

In [30]:
"""Playing the Game After Training. The agent succeeds all thee time after training"""
state = env.reset()
for step in range(20):
    env.render()
    action = np.argmax(Q_table[state,:])
    next_state,rew,done,info = env.step(action)
    time.sleep(0.5)
    clear_output(wait=True)
    if done:
        env.reset()
    state=next_state
env.close()

  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
