In [2]:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output

# Environment Creation

In [5]:
# Build the FrozenLake-v1 environment through gym's registry. The resulting
# `env` object is shared by the later cells, which read its spaces, reset and
# step it during training, and render it during the playback loop.
env = gym.make("FrozenLake-v1")

# Creating Q Table & Initialization

In [6]:
# Q-table dimensions are taken from the `.n` attribute of the environment's
# spaces: one row per observation (state), one column per action.
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

In [7]:
# Every Q(state, action) estimate starts at zero; rows index states and
# columns index actions.
q_table = np.zeros(shape=(state_space_size, action_space_size))
print(q_table)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


# Parameters for Learning

In [10]:
# --- Training schedule ---
num_episodes = 10000             # number of episodes to train for
max_steps_per_episode = 100      # cap on steps taken within one episode

# --- Q-learning update coefficients ---
learning_rate = 0.1              # weight given to the new estimate in each update
discount_rate = 0.99             # weight given to the best next-state value

# --- Epsilon-greedy exploration schedule ---
# The exploration rate starts at its maximum and is decayed towards the
# minimum by the training cell after every episode.
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
exploration_rate = max_exploration_rate

In [11]:
rewards_all_episodes = []

# Q-learning training loop.
#
# NOTE: this cell updates `q_table` and `exploration_rate` in place, so running
# it a second time continues from the current values instead of starting over.
# Re-run the Q-table and parameter cells first for a fresh training run.
#
# The loop relies on the classic gym API: `env.reset()` returns the state and
# `env.step()` returns the 4-tuple (new_state, reward, done, info).
for episode in range(num_episodes):
    state = env.reset()

    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):

        # Exploration-exploitation trade-off (epsilon-greedy): draw a uniform
        # number in [0, 1]; exploit the current Q-table when it exceeds the
        # exploration rate, otherwise take a random action.
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # Update the Q-table entry Q(s, a): blend the old estimate with the
        # observed reward plus the discounted best value of the next state.
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay: exponential in the episode index, moving from
    # max_exploration_rate towards min_exploration_rate.
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)


# Average reward per thousand episodes
#
# Slice the reward history into consecutive chunks of `episodes_per_chunk`
# instead of calling np.split with a float section count: slicing also works
# when num_episodes is not a multiple of 1000 (the last chunk is just shorter).
episodes_per_chunk = 1000
rewards_array = np.array(rewards_all_episodes)
rewards_per_thousand_episodes = [
    rewards_array[start:start + episodes_per_chunk]
    for start in range(0, len(rewards_array), episodes_per_chunk)
]
count = 0  # running number of episodes covered by the chunks printed so far

print("--------Average reward per thousand episodes--------\n")
for r in rewards_per_thousand_episodes:
    count += len(r)
    # r.mean() divides by the real chunk length and avoids the float noise of
    # summing pre-divided values (e.g. 0.04000000000000003).
    print(count, ": ", str(r.mean()))



--------Average reward per thousand episodes--------

1000 :  0.04000000000000003
2000 :  0.21300000000000016
3000 :  0.4020000000000003
4000 :  0.5590000000000004
5000 :  0.6420000000000005
6000 :  0.7100000000000005
7000 :  0.6770000000000005
8000 :  0.6630000000000005
9000 :  0.7060000000000005
10000 :  0.6580000000000005


In [15]:
# Show the learned action values: banner, blank line, then the table itself.
print("------- updated Q-table------\n", q_table, sep="\n")

------- updated Q-table------

[[0.59569426 0.51230323 0.51288354 0.50750226]
 [0.25744645 0.3647274  0.3026097  0.52129895]
 [0.40568347 0.41416226 0.41335174 0.48822076]
 [0.27145833 0.30797598 0.3567334  0.45182715]
 [0.61294595 0.33839684 0.39067767 0.3366957 ]
 [0.         0.         0.         0.        ]
 [0.37000427 0.09266115 0.18882527 0.1012663 ]
 [0.         0.         0.         0.        ]
 [0.36469119 0.35379447 0.4642201  0.64809152]
 [0.50141391 0.69117461 0.50142785 0.46786012]
 [0.62826376 0.39032551 0.39161086 0.32824592]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.35664254 0.58794056 0.7817631  0.52048468]
 [0.71182553 0.89408652 0.71818521 0.73354311]
 [0.         0.         0.         0.        ]]


# Watch Agent Play

In [17]:
# Play three episodes acting greedily on the learned Q-table, redrawing the
# environment in the cell output after every move.
for episode in range(3):
    state = env.reset()
    done = False

    print("------EPISODE ",episode+1, " -------\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        # Replace the previous frame with the current one and pause briefly
        # so the movement is visible.
        clear_output(wait=True)
        env.render()
        time.sleep(0.3)

        # Always pick the highest-valued action; no exploration here.
        action = np.argmax(q_table[state,:])
        new_state, reward, done, info = env.step(action)

        if not done:
            state = new_state
            continue

        # Episode finished: draw the final frame, report the outcome
        # (a reward of 1 is treated as reaching the goal), then stop.
        clear_output(wait=True)
        env.render()
        print("---Goal Reached---" if reward == 1 else "--- Failed---")
        time.sleep(3 if reward == 1 else 1)

        clear_output(wait=True)
        break

env.close()

---Goal Reached---
