In [25]:
!pip install pip --upgrade --user -q --no-warn-script-location
!pip install numpy pandas sklearn matplotlib gym==0.17.3 --user -q --no-warn-script-location


[0m

In [26]:
#import the required libraries.
import numpy as np
import gym
import random

In [27]:
pip install gym[toy_text]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [28]:
pip install pygame

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0m

In [31]:
import os
os.environ["SDL_VIDEODRIVER"] = "dummy"

In [33]:
#create the environment usign OpenAI Gym
env = gym.make("FrozenLake-v1")

  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


#**Creating and initailizing Q-Table**

In [34]:
# Get the dimensions for Q-table
action_size = env.action_space.n
state_size = env.observation_space.n

print(f"Action Space : {action_size} | State Space: {state_size}")

Action Space : 4 | State Space: 16


In [35]:
# Creating a Q-table
qtable = np.zeros((state_size, action_size))
print('Shape of Q-Table',qtable.shape)
print(qtable)

Shape of Q-Table (16, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


#**Initializing required Hyperparameters**

In [36]:
total_episodes = 20000        # Total episodes
learning_rate = 0.01           # Learning rate
max_steps = 99                # Max steps per episode
gamma = 0.95                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.01            # Minimum exploration probability 
decay_rate = 0.005             # Exponential decay rate for exploration prob

#**Q-Learning Algorithm**

In [37]:
# List of rewards
rewards = []

#until learning is stopped
for episode in range(total_episodes):
    # Reset the environment
    state = env.reset()
    step = 0
    done = False
    total_rewards = 0
    
    for step in range(max_steps):
        #Choose an action a in the current world state (s)
        exp_exp_tradeoff = random.uniform(0, 1)
        
        ## If this number > greater than epsilon --> exploitation (taking the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state,:])

        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = qtable[state, action] + learning_rate * \
                                                        (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        
        total_rewards += reward
        
        # Our new state is state
        state = new_state
        
        # If done (if we're dead) : finish episode
        if done == True: 
            break
        
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode) 

    rewards.append(total_rewards)

print("Score over time: " +  str(sum(rewards)/total_episodes))
print(qtable)

Score over time: 0.4377
[[1.62723407e-01 1.39898374e-01 1.44513280e-01 1.31705323e-01]
 [6.97051326e-02 7.52103310e-02 6.72947656e-02 1.41445283e-01]
 [1.42187802e-01 6.61073862e-02 7.20446005e-02 6.09621056e-02]
 [3.91046877e-02 2.87880026e-09 0.00000000e+00 0.00000000e+00]
 [1.85458085e-01 1.34583504e-01 1.09887784e-01 9.58488088e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.74259820e-01 7.69178211e-02 1.04642382e-01 1.98051930e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.10631125e-01 1.64509792e-01 1.43121046e-01 2.38574985e-01]
 [1.19651857e-01 3.60503303e-01 1.51857986e-01 1.09207127e-01]
 [3.76026980e-01 2.40661290e-01 1.49642461e-01 7.71510227e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [1.28787193e-01 1.55914936e-01 4.70068993e-01 1.11469350e-01]
 [2.31693531e-01 3.26467761e-01 6.58888882e-01 3.22632985e-01]
 [0.00000000e+00 0.00000000e+00

#**Getting the Result**

In [38]:
for episode in range(5):
    state = env.reset()
    step = 0
    done = False
    print("****************************************************")
    print("EPISODE ", episode)

    for step in range(max_steps):
        
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(qtable[state,:])
        
        new_state, reward, done, info = env.step(action)
        
        if done:
            # Here, we decide to only print the last state (to see if our agent is on the goal or fall into an hole)
            env.render()
            
            # We print the number of step it took.
            print("Number of steps", step)
            break
        state = new_state
env.close()

****************************************************
EPISODE  0
Number of steps 12
****************************************************
EPISODE  1
Number of steps 62
****************************************************
EPISODE  2
Number of steps 7
****************************************************
EPISODE  3
****************************************************
EPISODE  4
Number of steps 50
