In [1]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output




### 4X4 GRID

In [2]:
# Build the slippery 4x4 FrozenLake environment used by the cells below.
env = gym.make(
    "FrozenLake-v1",
    desc=None,
    map_name="4x4",
    is_slippery=True,
)

In [3]:
# Size the Q-table from the environment: one row per state, one column per
# action, with every entry starting at zero.
state_space = env.observation_space.n
action_space = env.action_space.n
q_table = np.zeros(shape=(state_space, action_space))

In [4]:
# --- Training budget ---
num_episodes = 10000       # number of training episodes
max_steps_ep = 100         # upper bound on steps taken within one episode

# --- Q-learning update coefficients ---
lr = 0.1                   # learning rate: weight given to the new estimate
dr = 0.99                  # discount rate applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
max_expr_rate = 1          # exploration probability at the start of training
min_expr_rate = 0.01       # floor the exploration probability decays towards
expr_decay_rate = 0.001    # exponential decay constant, applied per episode
expr_rate = max_expr_rate  # current exploration probability (starts at the max)

In [5]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads: env, q_table, num_episodes, max_steps_ep, lr, dr and the
#        exploration-schedule constants defined in the cells above.
# Writes: q_table (updated in place), expr_rate, all_rewards,
#         rewards_per_thousand.
all_rewards = []
for episode in range(num_episodes):
    if episode%2000 == 0:
        print(episode," processing")

    # reset() returns a bare observation on older gym releases and an
    # (observation, info) tuple on newer ones; accept both.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False
    rewards_curr = 0

    for step in range(max_steps_ep):
        # Exploit the current estimates with probability (1 - expr_rate),
        # otherwise sample a random action to keep exploring.
        if random.uniform(0,1) > expr_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()

        # step() yields (obs, reward, done, info) or
        # (obs, reward, terminated, truncated, info); any end flag set
        # means the episode is over.
        new_state, reward, *end_flags, info = env.step(action)
        done = any(end_flags)

        # Q-learning update: blend the old estimate with the bootstrapped
        # target reward + dr * max_a' Q(new_state, a').
        best_next = np.max(q_table[new_state,:])
        q_table[state,action] = q_table[state,action]*(1 - lr) + \
            lr*(reward + dr*best_next)

        state = new_state
        rewards_curr += reward
        if done:
            break

    # Exponentially decay the exploration rate towards min_expr_rate.
    expr_rate = min_expr_rate + (max_expr_rate - min_expr_rate)* \
        np.exp(-expr_decay_rate*episode)
    all_rewards.append(rewards_curr)

# Report the mean episode reward over consecutive windows of 1000 episodes.
# Slicing (instead of np.split with a float section count) keeps every window
# at 1000 episodes and tolerates a shorter final window.
rewards_arr = np.array(all_rewards)
rewards_per_thousand = [rewards_arr[start:start + 1000]
                        for start in range(0, len(rewards_arr), 1000)]
count = 0
print("Avg reward per thousand eps\n")
for r in rewards_per_thousand:
    count += len(r)
    # ndarray.mean() avoids the float noise of builtin sum() over r/1000.
    print(count, ": ",str(r.mean()))
print("\n\n Q-Table\n")
print(q_table)

0  processing
2000  processing
4000  processing
6000  processing
8000  processing
Avg reward per thousand eps

1000 :  0.05300000000000004
2000 :  0.21000000000000016
3000 :  0.4100000000000003
4000 :  0.5500000000000004
5000 :  0.6300000000000004
6000 :  0.6610000000000005
7000 :  0.6830000000000005
8000 :  0.6950000000000005
9000 :  0.6860000000000005
10000 :  0.6750000000000005


 Q-Table

[[0.53184115 0.49097931 0.51119436 0.50428542]
 [0.37726233 0.32869003 0.32006548 0.49359124]
 [0.40112169 0.37752708 0.39586975 0.46920416]
 [0.33183291 0.29995638 0.28053628 0.4456123 ]
 [0.55777753 0.39017337 0.44091821 0.27177551]
 [0.         0.         0.         0.        ]
 [0.25172546 0.16534112 0.21811238 0.14313956]
 [0.         0.         0.         0.        ]
 [0.46249299 0.41523555 0.37662811 0.59116805]
 [0.35466161 0.63057536 0.48316393 0.41609131]
 [0.62025451 0.41762332 0.31378225 0.41103318]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.   

### 8X8 GRID

In [2]:
# Build the slippery 8x8 FrozenLake environment used by the cells below.
env = gym.make(
    "FrozenLake-v1",
    desc=None,
    map_name="8x8",
    is_slippery=True,
)

In [3]:
# Size a fresh Q-table from the current environment: one row per state,
# one column per action, with every entry starting at zero.
state_space = env.observation_space.n
action_space = env.action_space.n
q_table = np.zeros(shape=(state_space, action_space))

In [4]:
# --- Training budget ---
num_episodes = 50000       # number of training episodes
max_steps_ep = 1000        # upper bound on steps taken within one episode

# --- Q-learning update coefficients ---
lr = 0.1                   # learning rate: weight given to the new estimate
dr = 0.99                  # discount rate applied to the best next-state value

# --- Epsilon-greedy exploration schedule ---
max_expr_rate = 1          # exploration probability at the start of training
min_expr_rate = 0.01       # floor the exploration probability decays towards
expr_decay_rate = 0.0001   # exponential decay constant, applied per episode
expr_rate = max_expr_rate  # current exploration probability (starts at the max)

In [5]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Reads: env, q_table, num_episodes, max_steps_ep, lr, dr and the
#        exploration-schedule constants defined in the cells above.
# Writes: q_table (updated in place), expr_rate, all_rewards,
#         rewards_per_thousand.
all_rewards = []
for episode in range(num_episodes):
    if episode%5000 == 0:
        print(episode," processing")

    # reset() returns a bare observation on older gym releases and an
    # (observation, info) tuple on newer ones; accept both.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    done = False
    rewards_curr = 0

    for step in range(max_steps_ep):
        # Exploit the current estimates with probability (1 - expr_rate),
        # otherwise sample a random action to keep exploring.
        if random.uniform(0,1) > expr_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()

        # step() yields (obs, reward, done, info) or
        # (obs, reward, terminated, truncated, info); any end flag set
        # means the episode is over.
        new_state, reward, *end_flags, info = env.step(action)
        done = any(end_flags)

        # Q-learning update: blend the old estimate with the bootstrapped
        # target reward + dr * max_a' Q(new_state, a').
        best_next = np.max(q_table[new_state,:])
        q_table[state,action] = q_table[state,action]*(1 - lr) + \
            lr*(reward + dr*best_next)

        state = new_state
        rewards_curr += reward
        if done:
            break

    # Exponentially decay the exploration rate towards min_expr_rate.
    expr_rate = min_expr_rate + (max_expr_rate - min_expr_rate)* \
        np.exp(-expr_decay_rate*episode)
    all_rewards.append(rewards_curr)

# Report the mean episode reward over consecutive windows of 1000 episodes.
# Slicing (instead of np.split with a float section count) keeps every window
# at 1000 episodes and tolerates a shorter final window.
rewards_arr = np.array(all_rewards)
rewards_per_thousand = [rewards_arr[start:start + 1000]
                        for start in range(0, len(rewards_arr), 1000)]
count = 0
print("Avg reward per thousand eps\n")
for r in rewards_per_thousand:
    count += len(r)
    # ndarray.mean() avoids the float noise of builtin sum() over r/1000.
    print(count, ": ",str(r.mean()))
print("\n\n Q-Table\n")
# Printing of the table itself is disabled; re-enable the next line to inspect it.
#print(q_table)

0  processing
5000  processing
10000  processing
15000  processing
20000  processing
25000  processing
30000  processing
35000  processing
40000  processing
45000  processing
Avg reward per thousand eps

1000 :  0.002
2000 :  0.005
3000 :  0.016000000000000007
4000 :  0.010000000000000002
5000 :  0.03300000000000002
6000 :  0.03200000000000002
7000 :  0.035000000000000024
8000 :  0.05100000000000004
9000 :  0.06800000000000005
10000 :  0.07700000000000005
11000 :  0.11400000000000009
12000 :  0.1230000000000001
13000 :  0.1420000000000001
14000 :  0.1460000000000001
15000 :  0.18700000000000014
16000 :  0.18300000000000013
17000 :  0.22400000000000017
18000 :  0.2460000000000002
19000 :  0.2440000000000002
20000 :  0.2570000000000002
21000 :  0.2840000000000002
22000 :  0.32700000000000023
23000 :  0.33000000000000024
24000 :  0.36300000000000027
25000 :  0.36100000000000027
26000 :  0.3730000000000003
27000 :  0.36600000000000027
28000 :  0.4170000000000003
29000 :  0.4380000000000003

### DISPLAY RESULT

In [None]:
# Replay three episodes using the greedy policy read from q_table, redrawing
# the environment after every move so the agent can be watched in the output.
for episode in range(3):
    # reset() returns a bare observation on older gym releases and an
    # (observation, info) tuple on newer ones; accept both.
    reset_out = env.reset()
    state = reset_out[0] if isinstance(reset_out, tuple) else reset_out
    print("******EPISODE ",episode+1,"******\n\n")
    time.sleep(3)
    for step in range(max_steps_ep):
        clear_output(wait = True)
        env.render()
        time.sleep(0.3)

        # Pure exploitation: always take the highest-valued action.
        action = np.argmax(q_table[state,:])
        # step() yields (obs, reward, done, info) or
        # (obs, reward, terminated, truncated, info); any end flag set
        # means the episode is over.
        new_state, reward, *end_flags, info = env.step(action)
        done = any(end_flags)

        if done:
            clear_output(wait = True)
            env.render()
            if reward == 1:
                print("you have reached the goal")
                time.sleep(3)
            else:
                # NOTE(review): every episode that ends without reward is
                # reported as a hole; if the environment can also end an
                # episode for another reason (e.g. a step limit), this
                # message would be inaccurate -- confirm.
                print("you fell through a hole")
            break
        state = new_state

# Close the environment once, after all episodes have been replayed. Closing
# inside the loop would leave later episodes running on a closed environment.
env.close()