In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt 
import time
import gym
from IPython.display import clear_output
from tqdm import tqdm
from utils import test_agent

In [2]:
from gym.envs.registration import register

try:

    register(
        id='FrozenLakeNotSlippery-v0', 
        entry_point='gym.envs.toy_text:FrozenLakeEnv',
        kwargs={'map_name' : '4x4', 'is_slippery': False},
        max_episode_steps=100,
        reward_threshold=.8196, 
    )
except:
    print('Already registered')

In [3]:
"""Random Agent->Visualizing the virtual environment"""
env = gym.make('FrozenLakeNotSlippery-v0')
env.reset()
for step in range(10):
    env.render()
    action=env.action_space.sample()
    obs,reward,done,info = env.step(action)
    time.sleep(0.5)
    clear_output(wait = True)
    if done:
        env.reset()
env.close()



[41mS[0mFFF
FHFH
FFFH
HFFG


In [4]:
num_states = env.observation_space.n
num_actions = env.action_space.n

In [5]:
print(f"Number of states: {num_states} Number of actions: {num_actions}")

Number of states: 16 Number of actions: 4


In [6]:
Q_table = np.zeros(shape=(num_states,num_actions))

In [7]:
Q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [8]:
def policy(state,epsilon):
    if np.random.random() < epsilon:
        action = np.random.randint(num_actions)
    else:
        av = Q_table[state]
        action = np.random.choice(np.flatnonzero(av == av.max()))
    return action

In [15]:
def reduce_epsilon(epsilon,epoch,min_epsilon,max_epsilon,decay_rate):
    return min_epsilon + (max_epsilon-min_epsilon)*np.exp(-decay_rate*epoch)

In [21]:
def n_step_sarsa(Q_table,policy,epochs,alpha=0.2,gamma=0.99,epsilon=1.0,max_epsilon=1.0,
                 min_epsilon=0.01,decay_rate=0.001,n=8):
    for epoch in tqdm(range(1,epochs+1)):
        state = env.reset()
        action = policy(state,epsilon)
        transitions = []
        done = False
        t = 0
        while t-n < len(transitions):
            if not done:
                next_state,reward,done,_ = env.step(action)
                next_action = policy(next_state,epsilon)
                transitions.append([state,action,reward])
            if t >= n:
                G = (1-done)*Q_table[next_state,next_action]
                for state_t,action_t,reward_t in reversed(transitions[t-n:]):
                    G = reward_t + gamma*G
                Q_table[state_t,action_t] += alpha*(G-Q_table[state_t,action_t])
                
            t += 1
            state = next_state
            action = next_action
            epsilon = reduce_epsilon(epsilon,epoch,min_epsilon,max_epsilon,decay_rate)
            

In [22]:
n_step_sarsa(Q_table,policy,10000)

100%|██████████| 10000/10000 [00:04<00:00, 2369.90it/s]


In [23]:
Q_table

array([[0.91190806, 0.9509899 , 0.78835377, 0.7126138 ],
       [0.53431323, 0.        , 0.96056188, 0.31532597],
       [0.42227349, 0.97026453, 0.27197896, 0.64866345],
       [0.33746401, 0.        , 0.18682901, 0.10161457],
       [0.27787443, 0.96059594, 0.        , 0.71490053],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.98006518, 0.        , 0.74868721],
       [0.        , 0.        , 0.        , 0.        ],
       [0.42237953, 0.        , 0.97029892, 0.91271457],
       [0.93001137, 0.98009996, 0.8546472 , 0.        ],
       [0.96990849, 0.98996605, 0.        , 0.94968827],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.77775466, 0.98999996, 0.74763974],
       [0.97940327, 0.98999753, 1.        , 0.97226195],
       [0.        , 0.        , 0.        , 0.        ]])

In [27]:
"""Playing the Game After Training"""
state = env.reset()
for steps in range(100):
    env.render()
    #action = np.argmax(Q_table[state,:])
    action = policy(state,epsilon=0.01)
    state,reward,done,info = env.step(action)
    time.sleep(0.75)
    clear_output(wait=True)
    if done:
        break
env.close()

  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
