## Import Packages

In [14]:
import numpy as np
import gym 
import random 

## Init Environment

"This task was introduced in [Dietterich2000] to illustrate some issues in hierarchical reinforcement learning. There are 4 locations (labeled by different letters) and your job is to pick up the passenger at one location and drop him off in another. You receive +20 points for a successful dropoff, and lose 1 point for every timestep it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions."

https://gym.openai.com/envs/Taxi-v2/

In [40]:
# Create the Taxi-v2 environment used by every later cell in this notebook.
env = gym.make('Taxi-v2')
# Start an episode so there is a current state, then print the grid as text.
env.reset()
env.render()

+---------+
|[35mR[0m: | :[43m [0m:G|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



## Init Q-Table

In [12]:
# Sizes of the discrete action and observation spaces; these fix the
# dimensions of the Q-table built in the next cell.
num_actions, num_states = env.action_space.n, env.observation_space.n

print('Number of actions: ', num_actions)
print('Number of states: ', num_states)

Number of actions:  6
Number of states:  500


In [17]:
# One row per state, one column per action; every estimate starts at zero.
qtable = np.zeros(shape=(num_states, num_actions), dtype=float)
qtable

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## Hyper Params

In [32]:
# --- Q-learning update ------------------------------------------------------
lr = 0.7     # learning rate: weight given to each new TD error
gamma = 0.6  # discount factor applied to the next state's best Q-value

# --- Epsilon-greedy exploration ---------------------------------------------
# After every training episode epsilon is recomputed as
#   min_eps + (max_eps - min_eps) * exp(-decay * episode)
max_eps = 1.0   # upper bound of the exploration rate
min_eps = 0.01  # lower bound the exploration rate decays towards
decay = 0.01    # exponential decay rate per episode
epsilon = 1.0   # exploration rate used for the first episode

# --- Episode budget ---------------------------------------------------------
max_steps = 99             # maximum number of steps allowed per episode
total_episodes = 50000     # number of training episodes
total_test_episodes = 100  # number of evaluation episodes

## Q-Learning: Training

In [33]:
# Tabular Q-learning with an epsilon-greedy behaviour policy.
#
# This cell updates `qtable` in place and overwrites `epsilon`, so running it
# again continues from the current table. Re-run the Q-table and hyper-parameter
# cells first if you want a fresh training run.
for episode in range(total_episodes):
    # reset env at the start of each episode; the returned observation is used
    # directly as the row index into qtable
    state = env.reset()
    done = False  # flag for breaking

    for step in range(max_steps):
        # explore with probability epsilon, otherwise exploit current estimates
        eps_compare = random.uniform(0, 1)
        if eps_compare > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            action = env.action_space.sample()  # sample from available actions

        # take a step to the next state
        obs, reward, done, info = env.step(action)

        # A terminal transition has no successor to bootstrap from, so its
        # target is the immediate reward alone.
        # NOTE(review): if the environment also reports done on a step limit,
        # that case is treated as terminal here too -- confirm max_steps stays
        # below any such limit.
        if done:
            td_target = reward
        else:
            td_target = reward + gamma * np.max(qtable[obs, :])

        # move the current estimate towards the target by a fraction lr
        q_curr = qtable[state, action]
        qtable[state, action] = q_curr + lr * (td_target - q_curr)

        # update to next state
        state = obs

        # stop this episode once a terminal state is reached
        if done:
            break

    # decay the exploration rate exponentially from max_eps towards min_eps
    epsilon = min_eps + (max_eps - min_eps) * np.exp(-decay * episode)

In [34]:
# Display the Q-table after training (rows index states, columns index actions).
qtable

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [ -1.97090084,  -1.61605725,  -1.9709196 ,  -1.61817729,
         -1.030336  , -10.61813873],
       [ -1.02657094,  -0.05053118,  -1.03031482,  -0.05055815,
          1.5824    ,  -9.05055864],
       ...,
       [  1.94620115,   4.304     ,   2.33739302,   0.31877055,
         -6.6629789 ,  -6.66217757],
       [ -2.0967502 ,  -1.85466724,  -2.38706533,  -1.6181262 ,
        -10.91824342, -10.89042629],
       [ -0.7       ,  -0.7       ,  11.6354    ,  29.        ,
          5.5454    ,   0.        ]])

## Q-Learning: Testing

Let the agent act deterministically, always choosing the action with the highest Q-value.

In [35]:
# Evaluate the greedy policy read from `qtable`: no exploration and no
# learning, just argmax over the current state's row at every step.

# cumulative reward of each test episode, one entry per episode
rewards = []

for test_episode in range(total_test_episodes):
    state = env.reset()  # returns the initial observation
    done = False  # flag for breaking
    total_rewards = 0

    # logging
    print ("******************************************")
    print ("EPISODE: \t", test_episode)

    for test_step in range(max_steps):
        env.render()
        # choose the best known action and act on it
        best_action = np.argmax(qtable[state, :])
        obs, reward, done, info = env.step(best_action)
        # add to the reward accumulated in this episode
        total_rewards += reward
        state = obs

        if done:
            break

    # Record every episode, including ones cut off at max_steps. Appending only
    # on `done` would drop unfinished episodes from the sum while the average
    # below still divides by total_test_episodes.
    rewards.append(total_rewards)

env.close()
# prints the average cumulative reward across all test episodes
print ("Score over time: " + str(sum(rewards)/total_test_episodes))
        

******************************************
EPISODE: 	 0
+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[42mG[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
| : : : :[42m_[0m|
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : :[42m_[0m|
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (South)
************************************