In [1]:
#Dependencies: Gym, Numpy, Random
#Installing Gym in Linux using conda https://anaconda.org/akode/gym

#Install the dependencies using
!python -m pip install pyvirtualdisplay
!apt-get install xvfb
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

E: Could not open lock file /var/lib/dpkg/lock-frontend - open (13: Permission denied)
E: Unable to acquire the dpkg frontend lock (/var/lib/dpkg/lock-frontend), are you root?


In [18]:
import gym
import sys
import itertools
import numpy as np
import random

import matplotlib.pyplot as plt
%matplotlib inline   

#This will lead to static images of your plot embedded in the notebook

In [9]:
# Create  environment of Frozen-Lake-v0
env = gym.make('FrozenLake-v0')
print('observation space:', env.observation_space)
print('action space:', env.action_space)
# Every environment comes with an action_space and an observation_space. These attributes are of type Space
#The Discrete space allows a fixed range of non-negative numbers, so in this case, observations are 16 positive
#numbers, actions are 4 positive numbers 


observation space: Discrete(16)
action space: Discrete(4)


In [19]:
observation = env.reset()
# env.render(mode='rgb_array', close=True)
print('initial observation:', observation)

action = env.action_space.sample() # Select random action
print('random action:', action)

observation, reward, is_finished, info = env.step(action)  
#used to take action and observe reward & next state
#observation: a next observation
#reaward: a scalar reward
#is_finished: a boolean value indicating whether the current state is terminal or not
#info: additional information

print('next observation:', observation)
print('reward:', reward)
print('is_finished:', is_finished)
print(random.random())

initial observation: 0
random action: 2
next observation: 0
reward: 0.0
is_finished: False
0.353898273377075


In [23]:
def q_learning(env, num_episodes, alpha=0.85, discount_factor=0.99):
    """
    Q learning algorithm, off-polics TD control. Finds optimal gready policies
    Args:
    - env: Given environment to solve
    - num_episodes: Number of episodes to learn
    - alpha: learning rate
    - discount factor: weight/importance given to future rewards
    - epsilon: probability of taking random action. 
             We are using decaying epsilon, 
             i.e high randomness at beginning and low towards end
    Returns:
    - Optimal Q
    """
     
    # decaying epsilon, i.e we will divide num of episodes passed
    epsilon = 1.0

    # create a numpy array filled with zeros 
    # rows = number of observations & cols = possible actions

    action_size = env.action_space.n
    print("Action size ", action_size)

    state_size = env.observation_space.n
    print("State size ", state_size)
    
    q_table = np.zeros([env.observation_space.n, env.action_space.n]) 
    print(q_table)
    
    for i_episode in range(num_episodes):
            # reset the env
            state = env.reset()
            # itertools.count() has similar to 'while True:'
            for t in itertools.count():
                # generate a random num between 0 and 1 e.g. 0.35, 0.73 etc..
                # if the generated num is smaller than epsilon, we follow exploration policy 
                if random.uniform(0, 1) <epsilon:
                    action = env.step(env.action_space.sample())   
                    # select a random action from set of all actions
                else:
                    action = np.argmax(q_table[state])
                    # if the generated num is greater than epsilon, we follow exploitation policy
                    # select an action with highest value for current state
                
                next_state, reward, is_finished, info = env.step(action)
                # apply selected action, collect values for next_state and reward
                old_value = q_table[state, action]
                next_max  = np.max(q_table[next_state])
                new_value = old_value+alpha*(reward+(discount_factor*next_max)-old_value)
                # Update the Q table, alpha is the learning rate
                q_table[state,action] = new_value
                
                # break if done, i.e. if end of this episode
                if done:
                    break
                # make the next_state into current state as we go for next iteration
                state = next_state
            # gradualy decay the epsilon
            if epsilon > 0.1:
                epsilon -= 1.0/num_episodes
    
    return Q    # return optimal Q

In [24]:
def test_algorithm(env, Q):
    """
    Test script for Q function
    Args:
    - env: Given environment to test Q function
    - Q: Q function to verified
    Returns:
    - Total rewards for one episode
    """
    
    state = env.reset()
    total_reward = 0
    
    while True:
        # selection the action with highest values i.e. best action
        action = np.argmax(Q[state, :])
        # apply selected action
        next_state, reward, done, _ = env.step(action)
        # render environment
        env.render()
        # calculate total reward
        total_reward += reward
        
        if done:
            print(total_reward)
            break
            
        state = next_state
    
    return total_reward 

In [22]:
Q = q_learning(env, 20000)

Action size  4
State size  16
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


UnboundLocalError: local variable 'random' referenced before assignment