In [1]:
!pip install numpy
!pip install gym



In [2]:
import numpy as np
import gym
import random

In [14]:
# Build the FrozenLake environment used by every later cell.
env = gym.make("FrozenLake-v0")
# Reset before rendering: the Gym API expects reset() to be the first call
# made on a freshly created environment.
env.reset()
# Show the starting grid.
env.render()


[41mS[0mFFF
FHFH
FFFH
HFFG


In [15]:
# Read the sizes of the discrete observation and action spaces from the
# environment; they give the Q-table its number of rows and columns.
state_space = env.observation_space.n
action_space = env.action_space.n

print("There are ", state_space, " possible states")
print("There are ", action_space, " possible actions")

There are  16  possible states
There are  4  possible actions


In [16]:
# Zero-initialised Q-table: one row per state (state_space) and one column
# per action (action_space).
Q = np.zeros(shape=(state_space, action_space))

# Show the table followed by its shape, each on its own line.
print(Q, Q.shape, sep="\n")

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
(16, 4)


In [68]:
# Hyperparameters for tabular Q-learning

# --- Episode budget ---
total_episodes = 10000        # number of training episodes
total_test_episodes = 200     # number of evaluation episodes
max_steps = 200               # cap on the steps taken within one episode

# --- Learning ---
learning_rate = 0.1           # step size of the Q-value update
gamma = 0.96                  # discount applied to the next state's value

# --- Epsilon-greedy exploration schedule ---
max_epsilon = 1.0             # exploration probability at the start
min_epsilon = 0.01            # floor for the exploration probability
decay_rate = 0.01             # exponential decay rate, applied per episode
epsilon = max_epsilon         # current exploration rate (starts fully exploratory)

In [69]:
# Defining the epsilon greedy policy

def epsilon_greedy_policy(Q, state, eps=None):
  """Pick an action for `state` with an epsilon-greedy rule.

  Args:
    Q: Q-table indexed as Q[state], holding one value per action.
    state: index of the current state.
    eps: exploration threshold for this call. When None (the default) the
      notebook-level `epsilon` is read, so existing two-argument calls
      behave exactly as before.

  Returns:
    The argmax of Q[state] when a uniform draw from [0, 1] is greater than
    the threshold; otherwise the result of `env.action_space.sample()`.
  """
  if eps is None:
    # Fall back to the global exploration rate maintained by the notebook.
    eps = epsilon

  # Draw greater than the threshold --> exploitation: best known action.
  if random.uniform(0, 1) > eps:
    return np.argmax(Q[state])

  # Otherwise --> exploration: random action from the environment.
  return env.action_space.sample()

In [70]:
for episode in range(total_episodes):
    # Every episode starts from a freshly reset environment.
    state = env.reset()
    step, done = 0, False

    # Anneal the exploration rate exponentially from max_epsilon towards
    # min_epsilon as the episode index grows.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    for step in range(max_steps):
        # Choose an action with the epsilon-greedy policy and apply it.
        action = epsilon_greedy_policy(Q, state)
        new_state, reward, done, info = env.step(action)

        # Q(s,a) <- Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
        best_next_value = np.max(Q[new_state])
        Q[state, action] += learning_rate * (
            reward + gamma * best_next_value - Q[state, action]
        )

        # The episode is over once the environment reports done.
        if done:
            break

        # Otherwise continue from the state we just reached.
        state = new_state

In [71]:
# Show the learned Q-table under a heading line.
print("New Q-values:", Q, sep="\n")

New Q-values:
[[0.24358275 0.19448257 0.181718   0.19720396]
 [0.12811402 0.08160377 0.11361442 0.20256416]
 [0.17506591 0.13446365 0.13750879 0.1400366 ]
 [0.11526041 0.10774993 0.11048938 0.13318918]
 [0.26923606 0.16471407 0.12063481 0.12548958]
 [0.         0.         0.         0.        ]
 [0.16504408 0.07856005 0.08734179 0.03489358]
 [0.         0.         0.         0.        ]
 [0.1623719  0.18730233 0.20499773 0.3119479 ]
 [0.23353753 0.43161072 0.25983124 0.19402667]
 [0.37467206 0.28722407 0.22832211 0.15078816]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.29809447 0.41627497 0.52197875 0.36451443]
 [0.53083148 0.73144935 0.58946274 0.57402418]
 [0.         0.         0.         0.        ]]


In [74]:
# Evaluate the greedy policy derived from Q over total_test_episodes episodes.
rewards = []

for episode in range(total_test_episodes):
    state = env.reset()
    done = False
    total_rewards = 0
    print("****************************************************")
    print("EPISODE ", episode)
    for step in range(max_steps):
        # NOTE: rendering every step produces a very large cell output.
        env.render()
        # Take the action (index) with the highest Q-value for this state.
        action = np.argmax(Q[state])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break
        state = new_state

    # Record every episode, including one that used up max_steps without the
    # environment reporting done, so that len(rewards) always matches the
    # total_test_episodes divisor used for the score below.
    rewards.append(total_rewards)

env.close()
print ("Score over time: " +  str(sum(rewards)/total_test_episodes))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Right)
SFFF
FHFH
FFFH
HF[41mF[0mG
****************************************************
EPISODE  174

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Left)
SFFF
FHFH
FFFH
HF[41mF

In [None]:
####### FOR WATCHING THE AGENT PLAY ######

# import time
# from IPython.display import clear_output
# rewards = []

# for episode in range(total_test_episodes):
#     state = env.reset()
#     step = 0
#     done = False
#     total_rewards = 0
#     print("****************************************************")
#     print("EPISODE ", episode)
#     time.sleep(0.5)
#     for step in range(max_steps):
#         clear_output(wait=True)
#         env.render()     
#         time.sleep(0.5)
#         # Take the action (index) that have the maximum expected future reward given that state
#         action = np.argmax(Q[state][:])
#         new_state, reward, done, info = env.step(action)
#         total_rewards += reward
        
#         if done:
#             clear_output(wait=True)
#             env.render()
#             if(reward==1):
#                 print("*****You reached the goal!*****")
#                 time.sleep(0.5)
#             else:
#                 print("*****You fell through a hole!*****")
#                 time.sleep(0.5)
#             rewards.append(total_rewards)
#             clear_output(wait=True)
#             print ("Score", total_rewards)
#             break
#         state = new_state

# env.close()
# print ("Score over time: " +  str(sum(rewards)/total_test_episodes))