## Imports and environment setup

In [1]:
import time
import numpy as np
import gym
import random
import pygame
from IPython.display import clear_output

In [2]:
# Create the FrozenLake-v1 environment; later cells read its action/observation
# space sizes and step through it during training.
env = gym.make("FrozenLake-v1")

## Initializing Q-Table

In [3]:
# Q-table: one row per environment state, one column per action, all zeros
# to start with (no prior knowledge about any state/action pair).
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

q_table = np.zeros(shape=(state_space_size, action_space_size))
q_table

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

## Initializing parameters 

In [4]:
# --- Episode budget ---------------------------------------------------------
nbr_episdoes = 10_000      # number of training episodes
max_steps_per_ep = 100     # upper bound on steps taken within one episode

# --- Q-learning update ------------------------------------------------------
learning_rate = 0.1        # weight given to the new estimate in each update
discount_rate = 0.99       # weight given to the best next-state Q-value

# --- Epsilon (exploration rate) schedule ------------------------------------
max_explo_rate = 1         # starting value of the decay schedule
min_explo_rate = 0.01      # floor the schedule decays towards
explo_decay_rate = 0.001   # exponential decay constant, applied per episode
explo_rate = max_explo_rate  # current exploration rate; starts fully exploratory

## Q-Learning with an Epsilon-Greedy Policy

In [5]:
# Train the Q-table with Q-learning, picking actions epsilon-greedily.
# Updates the notebook-level `q_table` and `explo_rate` in place and records
# the total reward of every episode in `reward_all_ep`.
reward_all_ep = []
for episode in range(nbr_episdoes):

    state = env.reset()  # reset the environment and take its starting state
    done = False  # becomes True once env.step() reports the episode is over
    reward_current_episode = 0  # reward accumulated during this episode

    for step in range(max_steps_per_ep):

        # Epsilon-greedy choice: draw a number in [0, 1]. If it is ABOVE the
        # current exploration rate we exploit, otherwise we explore.
        exploration_rate_thresh = random.uniform(0, 1)

        if exploration_rate_thresh > explo_rate:
            # Exploit: take the action with the highest Q-value for this state
            action = np.argmax(q_table[state, :])
        else:
            # Explore: take a random action
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Q-learning update for the (state, action) pair: blend the old
        # estimate with the target "reward + discounted best next Q-value".
        td_target = reward + discount_rate * np.max(q_table[new_state, :])
        q_table[state, action] = (
            (1 - learning_rate) * q_table[state, action] + learning_rate * td_target
        )

        # Move on to the next state and accumulate the episode reward
        state = new_state
        reward_current_episode += reward

        if done:
            break

    # Record the total reward obtained in this episode
    reward_all_ep.append(reward_current_episode)

    # Exponentially decay the exploration rate from max_explo_rate towards
    # min_explo_rate, shifting from exploration to exploitation over time
    explo_rate = min_explo_rate + (max_explo_rate - min_explo_rate) * np.exp(-explo_decay_rate * episode)
    
    

## Average reward per 1000 episodes

In [6]:
# Report the average reward over each consecutive chunk of episodes, then
# print the learned Q-table.
episodes_per_chunk = 1000

# np.split expects an integer number of sections, so use floor division
# rather than passing the float produced by `/`.
reward_per_thousand_ep = np.split(np.array(reward_all_ep), nbr_episdoes // episodes_per_chunk)

count = episodes_per_chunk

print("***** Avg Reward *****")
print("")

for r in reward_per_thousand_ep:
    # Sum first, divide once: avoids the rounding noise that builds up when
    # every element is divided before being added.
    print(count, " : ", str(np.sum(r) / episodes_per_chunk))
    count += episodes_per_chunk

print("")
print("***** Q-Table *****")
print("")
print(q_table)

***** Avg Reward *****

1000  :  0.04100000000000003
2000  :  0.22000000000000017
3000  :  0.4130000000000003
4000  :  0.5380000000000004
5000  :  0.6180000000000004
6000  :  0.6460000000000005
7000  :  0.6660000000000005
8000  :  0.6710000000000005
9000  :  0.6790000000000005
10000  :  0.6640000000000005

***** Q-Table *****

[[0.46514625 0.45039169 0.4479397  0.44934754]
 [0.24982514 0.38093247 0.23049441 0.42927368]
 [0.38638233 0.38783025 0.35953183 0.41433772]
 [0.36845956 0.35483315 0.24261976 0.41020976]
 [0.48145382 0.39774804 0.34674918 0.28161305]
 [0.         0.         0.         0.        ]
 [0.16478781 0.10368202 0.2435314  0.12137558]
 [0.         0.         0.         0.        ]
 [0.3964354  0.48141111 0.44278172 0.51360857]
 [0.48063138 0.56173916 0.50228109 0.34416442]
 [0.36459917 0.57198758 0.33893232 0.35355604]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.53300453 0.58345247 0.70128871 0.38546802]
 [0.72967101 

## Agent plays Frozen Lake

In [12]:
# Watch the trained agent play three episodes, always taking the greedy
# (highest Q-value) action for the current state.
# One environment is created and reused for every episode, and it is closed
# in `finally` so the window is released even if the cell is interrupted.
env = gym.make("FrozenLake-v1")
try:
    for episode in range(3):
        state = env.reset()
        done = False

        print("***** ", episode+1," *****")
        time.sleep(1)

        for step in range(max_steps_per_ep):
            # Redraw the board in place before every move
            clear_output(wait=True)
            env.render()
            action = np.argmax(q_table[state, :])
            new_state, reward, done, info = env.step(action)
            if done:
                # Show the final board, then the outcome message
                clear_output(wait=True)
                env.render()
                time.sleep(0.3)
                if reward == 1:
                    print( "***** Congratulation you reached your goal *****")
                else:
                    print("***** Better Luck Next Time *****")
                time.sleep(3)  # leave the message visible before clearing it
                clear_output(wait=True)
                break
            state = new_state
finally:
    env.close()
    pygame.quit()

***** Congratulation you reached your goal *****
