## Imports and environment setup

In [1]:
import time
import numpy as np
import gym
import random
import pygame
from IPython.display import clear_output

In [2]:
# Create the FrozenLake-v1 environment instance that the training loop below steps through.
env = gym.make("FrozenLake-v1")

## Initializing Q-Table

In [3]:
# Zero-filled Q-table: one row per environment state, one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

q_table = np.zeros(shape=(state_space_size, action_space_size))
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Initializing parameters 

In [4]:
# Training length: total number of episodes and the step cap inside each episode.
nbr_episdoes, max_steps_per_ep = 10_000, 100

# Q-learning step size and discount factor used in the Q-table update.
learning_rate, discount_rate = 0.1, 0.99

# Exploration (epsilon) schedule: the rate starts at its maximum and is decayed
# towards the minimum after every training episode.
max_explo_rate, min_explo_rate = 1, 0.01
explo_rate = max_explo_rate
explo_decay_rate = 0.001

## Q-learning training loop (epsilon-greedy exploration)

In [5]:
# Train the Q-table with Q-learning, selecting actions epsilon-greedily.
# reward_all_ep collects the total reward obtained in each episode.
reward_all_ep = []

# Restart the exploration schedule so that re-running this cell begins with
# full exploration instead of the decayed rate left behind by a previous run.
explo_rate = max_explo_rate

for episode in range(nbr_episdoes):

    # Older gym versions return just the observation from reset(); newer ones
    # return an (observation, info) tuple. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    done = False  # set by the environment once the episode is over
    reward_current_episode = 0

    for step in range(max_steps_per_ep):

        # Epsilon-greedy choice: exploit the best known action when the random
        # draw exceeds the exploration rate, otherwise explore with a random action.
        exploration_rate_thresh = random.uniform(0, 1)
        if exploration_rate_thresh > explo_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        # step() yields 4 values in older gym versions and 5 in newer ones,
        # where `done` is split into `terminated` and `truncated`.
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            new_state, reward, done, info = step_result

        # Q-learning update: blend the old estimate with the learned value
        # reward + discount_rate * (best Q-value reachable from new_state).
        learned_value = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action]
            + learning_rate * learned_value
        )

        # Move on to the next state and accumulate this episode's reward.
        state = new_state
        reward_current_episode += reward

        if done:
            break

    reward_all_ep.append(reward_current_episode)

    # Exponentially decay the exploration rate towards min_explo_rate to shift
    # the exploration/exploitation trade-off as training progresses.
    explo_rate = min_explo_rate + (max_explo_rate - min_explo_rate) * np.exp(-explo_decay_rate * episode)
    
    

## Average reward per 1000 episodes

In [6]:
# Report the average reward over consecutive chunks of episodes, followed by
# the learned Q-table.
episodes_per_chunk = 1000
rewards = np.array(reward_all_ep)

# Split at integer multiples of the chunk size. Every chunk holds
# episodes_per_chunk episodes except possibly the last one, so this also works
# when the number of episodes is not a multiple of the chunk size.
split_points = np.arange(episodes_per_chunk, len(rewards), episodes_per_chunk)
reward_per_thousand_ep = np.split(rewards, split_points)

count = 0  # number of episodes covered so far; used as the row label

print("***** Avg Reward *****")
print("")

for r in reward_per_thousand_ep:
    if r.size == 0:  # nothing recorded (e.g. training has not been run)
        continue
    count += len(r)
    # r.mean() divides by the actual chunk length and avoids the rounding noise
    # of summing many pre-divided floats one by one.
    print(count, " : ", str(r.mean()))

print("")
print("***** Q-Table *****")
print("")
print(q_table)

***** Avg Reward *****

1000  :  0.04000000000000003
2000  :  0.19700000000000015
3000  :  0.4200000000000003
4000  :  0.6040000000000004
5000  :  0.6060000000000004
6000  :  0.6580000000000005
7000  :  0.7100000000000005
8000  :  0.7030000000000005
9000  :  0.6380000000000005
10000  :  0.6980000000000005

***** Q-Table *****

[[0.55926814 0.50790806 0.51979522 0.5117063 ]
 [0.3443446  0.34892395 0.20102642 0.50300599]
 [0.41252651 0.4119806  0.39861745 0.46507587]
 [0.26589432 0.22689194 0.23053896 0.45064583]
 [0.57849991 0.39250938 0.46584799 0.35970723]
 [0.         0.         0.         0.        ]
 [0.20566265 0.14992995 0.40446188 0.15257737]
 [0.         0.         0.         0.        ]
 [0.43600873 0.36893373 0.47107293 0.6166963 ]
 [0.47091761 0.66986049 0.42097783 0.43748201]
 [0.71761458 0.38174585 0.3655922  0.35315819]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.48944075 0.46647762 0.74887869 0.53402452]
 [0.72570087 

## Watch the trained agent play Frozen Lake

In [7]:
# Let the trained agent play three episodes, always taking the greedy action
# from the Q-table. One environment is created for all episodes and released
# in `finally`, so it is closed even if the cell is interrupted or fails.
env = gym.make("FrozenLake-v1")
try:
    for episode in range(3):
        # Older gym versions return just the observation from reset(); newer
        # ones return an (observation, info) tuple. Accept both.
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False

        print("***** ", episode+1," *****")
        time.sleep(1)

        for step in range(max_steps_per_ep):
            clear_output(wait=True)
            env.render()

            # Pure exploitation: pick the highest-valued action for this state.
            action = np.argmax(q_table[state, :])

            # step() yields 4 values in older gym versions and 5 in newer ones,
            # where `done` is split into `terminated` and `truncated`.
            step_result = env.step(action)
            if len(step_result) == 5:
                new_state, reward, terminated, truncated, info = step_result
                done = terminated or truncated
            else:
                new_state, reward, done, info = step_result

            if done:
                # Show the final frame, then report the outcome of the episode.
                clear_output(wait=True)
                env.render()
                time.sleep(0.3)
                if reward == 1:
                    print( "***** Congratulation you reached your goal *****")
                else:
                    print("***** GAME OVER *****")
                time.sleep(3)
                clear_output(wait=True)
                break
            state = new_state
finally:
    env.close()
    pygame.quit()

***** Better Luck Next Time *****
