## Imports

In [None]:
import numpy as np
from tqdm import tqdm
import random
import gym
import matplotlib.pyplot as plt
from IPython.display import clear_output
from time import sleep
import seaborn as sns

## Environment

Defining the Taxi-v3 environment

In [None]:
# Single shared Taxi-v3 environment instance; the training cells below pass it to the learners.
env = gym.make("Taxi-v3")

Since the observation is an integer encoding information about the state, we need a function to decode the information as below:

In [None]:
def decodeObservation(state):
  """Unpack an integer Taxi-v3 observation into its four components.

  The observation is laid out as
  ((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination.

  Returns:
    (row, col, pick_location, drop_location)
  """
  # The low 20 values carry passenger/destination; the rest is the grid cell.
  position, pick_drop_information = divmod(state, 20)
  row, col = divmod(position, 5)
  pick_location, drop_location = divmod(pick_drop_information, 4)
  return row, col, pick_location, drop_location

In [None]:
# Round-trip check: hand-encode a known state and display what the decoder returns.
taxi_row, taxi_col = 4, 2
passenger_location, destination = 4, 3
encoded_state = ((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination
decodeObservation(encoded_state)

## Epsilon-Greedy

In [None]:
def epsilon_greedy(q_values, epsilon):
  """Pick an action index from `q_values` with an epsilon-greedy rule.

  The greedy (argmax) action is taken only when at least one Q-value is
  non-zero and a uniform draw exceeds `epsilon`; otherwise an index is
  sampled uniformly. An all-zero row therefore always yields a random action.
  """
  # `and` short-circuits: no random draw is consumed for an all-zero row.
  exploit = q_values.any() and np.random.rand() > epsilon
  if not exploit:
    return np.random.choice(len(q_values))
  return np.argmax(q_values)

## Options and Actions

We define four options here:


*   Go to R (0)
*   Go to G (1)
*   Go to Y (2)
*   Go to B (3)



In [None]:
# (row, col) goal cell of each option, indexed by option number
# (0 = go to R, 1 = go to G, 2 = go to Y, 3 = go to B).
colour_states_positions = [(0,0), (0,4), (4,0), (4,3)]

In [None]:
def OptionGiven(state, q_table_option_policies, epsilon, goal_row, goal_column, option_number):
  """Query one option in `state`: does it terminate, and which primitive action does it take?

  Returns:
    (optdone, optact): `optdone` is True when the taxi in `state` already sits on
    (goal_row, goal_column); `optact` is an epsilon-greedy action drawn from this
    option's Q-values for `state`. An action is returned even when `optdone` is True.
  """
  taxi_row, taxi_column = decodeObservation(state)[:2]
  optdone = bool(taxi_row == goal_row and taxi_column == goal_column)

  option_q_values = q_table_option_policies[option_number][state]
  optact = epsilon_greedy(option_q_values, epsilon)

  return optdone, optact

## SMDP Q-Learning

Function to run SMDP Q-Learning algorithm on the environment.



In [None]:
def SMDP_Q_Learning(env, gamma, alpha, epsilon_start, epsilon_end, epsilon_decay, num_episodes, Option, q_table_option_policies, q_table_SMDP, final_options, frames):
  """Run SMDP Q-learning over options, learning the option policies at the same time.

  Args:
    env: environment used through `reset()`, `step(action)` (unpacked as a
      4-tuple) and `render(mode='ansi')`.
    gamma: discount factor.
    alpha: learning rate, shared by the option tables and the SMDP table.
    epsilon_start, epsilon_end, epsilon_decay: epsilon is multiplied by
      `epsilon_decay` at the start of every episode and floored at `epsilon_end`.
    num_episodes: number of training episodes.
    Option: callable `(state, q_table_option_policies, epsilon, goal_row,
      goal_column, option_number) -> (optdone, optact)`.
    q_table_option_policies: per-option Q-tables indexed [option][state][action];
      updated in place.
    q_table_SMDP: Q-table over options indexed [state][option]; updated in place.
    final_options: list that receives the options chosen during the final episode.
    frames: list that receives one dict per primitive step of the final episode.

  Returns:
    List with the undiscounted total reward of each episode.

  Side effects:
    Prints the number of reward-20 steps as a percentage of `num_episodes`.
  """
  rewards = []
  epsilon = epsilon_start
  successes = 0
  episode = 0  # 1-based once incremented inside the loop
  for _ in tqdm(range(num_episodes)):
    episode+=1
    epsilon = max(epsilon_end, epsilon*epsilon_decay)
    state = env.reset()
    done = False
    total_reward = 0

    while not done:

      # Every entry of the SMDP table is an option, so `action` is an option index.
      action = epsilon_greedy(q_table_SMDP[state], epsilon)

      # Record the last episode, the same one whose frames are captured below.
      if(episode==num_episodes):
        final_options.append(action)

      reward_bar = 0       # discounted reward accumulated while the option runs
      gamma_option = 1     # gamma ** (steps executed so far by the option)
      curr_state = state   # state in which the option was initiated

      optdone = False
      option_number = action
      goal_row, goal_column = colour_states_positions[option_number]
      while (optdone == False and done == False):

        # `optdone` reflects the state *before* the step, so the option still
        # executes one primitive action on its goal cell before terminating.
        optdone,optact = Option(state, q_table_option_policies, epsilon, goal_row, goal_column, option_number)
        next_state, reward, done, _ = env.step(optact)
        if(episode==num_episodes):
          frames.append({
              'frame': env.render(mode='ansi'),
              'state': state,
              'action': action,
              'reward': reward
              }
          )
        total_reward+= reward

        # A reward of 20 is counted as a success (Taxi-v3's drop-off reward).
        if(reward == 20):
          successes+=1

        reward_bar = reward_bar + gamma_option*reward
        gamma_option*=gamma
        # One-step Q-learning update of the executing option's own policy.
        q_table_option_policies[option_number][state][optact] += alpha * (reward + gamma * np.max(q_table_option_policies[option_number][next_state]) - q_table_option_policies[option_number][state][optact])

        state = next_state

      # SMDP update: discounted option return plus gamma**k times the best option value afterwards.
      q_table_SMDP[curr_state][action] += alpha * (reward_bar + gamma_option * np.max(q_table_SMDP[state]) - q_table_SMDP[curr_state][action])

    rewards.append(total_reward)

  # Guard the percentage against a zero-episode call.
  print(successes/num_episodes * 100 if num_episodes else 0.0)
  return rewards

The code below performs 5 independent training runs.

In [None]:
# Results gathered across runs: per-episode returns and the learned tables.
reward_vals, q_tables_option_policies, q_tables_SMDP = [], [], []

for i in range(5):
  # Every run starts from empty recorders and zero-initialised tables.
  final_options, frames = [], []
  q_table_option_policies = np.zeros((4, 500, 6)) # num_options x num_states x num_primitive_actions
  q_table_SMDP = np.zeros((500, 4)) # num_states x num_options

  rewards = SMDP_Q_Learning(
      env,
      gamma=0.9,
      alpha=0.1,
      epsilon_start=1,
      epsilon_end=0.0001,
      epsilon_decay=0.99,
      num_episodes=10000,
      Option=OptionGiven,
      q_table_option_policies=q_table_option_policies,
      q_table_SMDP=q_table_SMDP,
      final_options=final_options,
      frames=frames,
  )

  reward_vals.append(rewards)
  q_tables_option_policies.append(q_table_option_policies)
  q_tables_SMDP.append(q_table_SMDP)

### Heatmaps of Q Tables

Averaging the q tables for further inference.

In [None]:
# Stack the per-run option-policy tables and average over the run axis (axis 0).
q_tables_option_policies = np.array(q_tables_option_policies)
q_table_option_policies_avgd = q_tables_option_policies.mean(axis=0)

# Same for the SMDP tables over options.
q_tables_SMDP = np.array(q_tables_SMDP)
q_table_SMDP_avgd = q_tables_SMDP.mean(axis=0)

Plotting the pickup and drop phase option policy heatmaps.

In [None]:
passenger_loc = 2
drop_location = 3
labels = {0:'south',1:'north', 2:'east', 3:'west', 4:'pickup', 5:'drop'}

heatmap_pick = np.zeros((5,5))
heatmap_drop = np.zeros((5,5))

# One grid per phase: passenger still waiting at `passenger_loc`, then passenger
# index 4 (in the taxi under Taxi-v3's encoding).
# NOTE(review): both grids read option `drop_location`'s table; confirm the pickup
# grid is not meant to show option `passenger_loc` instead.
for wanted_pick, grid in ((passenger_loc, heatmap_pick), (4, heatmap_drop)):
  for state in range(500):
    row, col, pick, drop = decodeObservation(state)
    if pick != wanted_pick or drop != drop_location:
      continue
    # Greedy primitive action of the averaged option policy in this cell.
    grid[row,col] = np.argmax(q_table_option_policies_avgd[drop_location][state])

  plt.imshow(grid, cmap='viridis', interpolation='nearest')
  for (i, j), greedy_action in np.ndenumerate(grid):
    plt.text(j, i, labels[int(greedy_action)], ha='center', va='center', color='white')
  plt.show()

In [None]:
passenger_loc = 3 # B
drop_location = 2 # Y
labels = {0:'gotoR',1:'gotoG', 2:'gotoY', 3:'gotoB'}

# Greedy option of the averaged SMDP table for every taxi cell, with the
# passenger and destination fixed to the values above.
heatmap = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  if (pick, drop) == (passenger_loc, drop_location):
    heatmap[row,col] = np.argmax(q_table_SMDP_avgd[state])

plt.imshow(heatmap, cmap='viridis', interpolation='nearest')
for (i, j), greedy_option in np.ndenumerate(heatmap):
  plt.text(j, i, labels[int(greedy_option)], ha='center', va='center', color='white')

plt.show()

### Reward Plots

In [None]:
# Mean return of each episode across the training runs.
# Sizes come from `reward_vals` itself instead of hard-coded run and episode
# counts, and no variable named `sum` is created, so the builtin `sum` stays
# usable in later cells. All runs must contain the same number of episodes.
avg_rewards = np.mean(np.asarray(reward_vals, dtype=float), axis=0).tolist()

In [None]:
# Smooth the run-averaged returns with a sliding window before plotting.
window = 100
avg_rews = [np.average(avg_rewards[i:i+window]) for i in range(len(avg_rewards)-window)]
# The x axis follows the number of smoothed points, so the cell keeps working
# if the number of episodes or the window changes.
x = np.arange(len(avg_rews))
t = [5] * len(avg_rews)  # horizontal reference level
plt.figure(figsize = (10,5))
plt.plot(x, np.array(avg_rews), label=f'{window}-episode moving average')
plt.plot(x, t, label='reference (5)')
plt.xlabel('Episode')
plt.ylabel('Average total reward')
plt.title('SMDP Q-Learning')
plt.legend()
plt.show()

### Visualizing the agent's actions

In [None]:
def print_frames(frames):
    """Replay recorded frames one at a time, pausing 0.7 s between them.

    Each frame is a dict with the keys 'frame', 'state', 'action' and 'reward';
    the cell output is cleared before every frame is printed.
    """
    for timestep, frame in enumerate(frames, start=1):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        for label, key in (("State", 'state'), ("Action", 'action'), ("Reward", 'reward')):
            print(f"{label}: {frame[key]}")
        sleep(.7)

# `frames` is reset at the start of every training run and only filled during a
# run's final episode, so this replays the final episode of the last run.
print_frames(frames)

## Intra Option Q-Learning

Function to run Intra Option Q-Learning algorithm on the environment.


In [None]:
def IntraOption_Q_Learning(env, gamma, alpha, epsilon_start, epsilon_end, epsilon_decay, num_episodes, Option, q_table_option_policies, q_table_io, final_options, frames):
  """Run intra-option Q-learning over options, learning the option policies at the same time.

  After every primitive step the option-value table is updated for the executing
  option and for every other option whose greedy policy would have taken the
  same primitive action in that state.

  Args:
    env: environment used through `reset()`, `step(action)` (unpacked as a
      4-tuple) and `render(mode='ansi')`.
    gamma: discount factor.
    alpha: learning rate, shared by the option tables and the option-value table.
    epsilon_start, epsilon_end, epsilon_decay: epsilon is multiplied by
      `epsilon_decay` at the start of every episode and floored at `epsilon_end`.
    num_episodes: number of training episodes.
    Option: callable `(state, q_table_option_policies, epsilon, goal_row,
      goal_column, option_number) -> (optdone, optact)`.
    q_table_option_policies: per-option Q-tables indexed [option][state][action];
      updated in place.
    q_table_io: Q-table over options indexed [state][option]; updated in place
      and used to select options.
    final_options: list that receives the options chosen during the final episode.
    frames: list that receives one dict per primitive step of the final episode.

  Returns:
    List with the undiscounted total reward of each episode.

  Side effects:
    Prints the number of reward-20 steps as a percentage of `num_episodes`.
  """
  rewards = []
  epsilon = epsilon_start
  successes = 0
  episode = 0  # 1-based once incremented inside the loop
  num_options = len(colour_states_positions)
  for _ in tqdm(range(num_episodes)):
    episode+=1
    epsilon = max(epsilon_end, epsilon*epsilon_decay)
    state = env.reset()
    done = False
    total_reward = 0

    while not done:

      # Select options from the table being learned here, not from a global
      # table left over from another experiment.
      action = epsilon_greedy(q_table_io[state], epsilon)

      # Record the last episode, the same one whose frames are captured below.
      if(episode==num_episodes):
        final_options.append(action)

      optdone = False
      option_number = action
      goal_row, goal_column = colour_states_positions[option_number]
      while (optdone == False and done == False):

        # `optdone` reflects the state *before* the step, so the option still
        # executes one primitive action on its goal cell before terminating.
        optdone,optact = Option(state, q_table_option_policies, epsilon, goal_row, goal_column, option_number)
        next_state, reward, done, _ = env.step(optact)
        if(episode==num_episodes):
          frames.append({
              'frame': env.render(mode='ansi'),
              'state': state,
              'action': action,
              'reward': reward
              }
          )
        total_reward+= reward

        # A reward of 20 is counted as a success (Taxi-v3's drop-off reward).
        if(reward == 20):
          successes+=1

        # One-step Q-learning update of the executing option's own policy.
        q_table_option_policies[option_number][state][optact] += alpha * (reward + gamma * np.max(q_table_option_policies[option_number][next_state]) - q_table_option_policies[option_number][state][optact])

        taxi_position = tuple(decodeObservation(state)[:2])
        for opt_num in range(num_options):
          if opt_num == option_number:
            # The executing option took `optact` by construction; reuse the
            # termination flag reported by `Option`.
            terminates = optdone
          else:
            # Other options are updated only if their greedy policy agrees with
            # the executed action. An all-zero row has no greedy preference yet.
            other_q_values = q_table_option_policies[opt_num][state]
            if not (other_q_values.any() and np.argmax(other_q_values) == optact):
              continue
            # Assumes every option terminates on its own goal cell, as OptionGiven
            # does; confirm if a different `Option` callable is supplied.
            terminates = taxi_position == tuple(colour_states_positions[opt_num])

          # Value on arrival: keep following the option, or re-select greedily
          # if the option terminates after this step.
          if terminates:
            continuation = np.max(q_table_io[next_state])
          else:
            continuation = q_table_io[next_state][opt_num]
          q_table_io[state][opt_num] += alpha*(reward + gamma*continuation - q_table_io[state][opt_num])

        state = next_state

    rewards.append(total_reward)

  # Guard the percentage against a zero-episode call.
  print(successes/num_episodes * 100 if num_episodes else 0.0)
  return rewards

The cell below considers 5 runs of the algorithm.

In [None]:
# Results gathered across runs: per-episode returns and the learned tables.
reward_vals, q_tables_option_policies, q_tables_io = [], [], []

for i in range(5):
  # Every run starts from empty recorders and zero-initialised tables.
  final_options, frames = [], []
  q_table_option_policies = np.zeros((4, 500, 6)) # num_options x num_states x num_primitive_actions
  q_table_io = np.zeros((500, 4)) # num_states x num_options

  rewards = IntraOption_Q_Learning(
      env,
      gamma=0.9,
      alpha=0.1,
      epsilon_start=1,
      epsilon_end=0.001,
      epsilon_decay=0.99,
      num_episodes=10000,
      Option=OptionGiven,
      q_table_option_policies=q_table_option_policies,
      q_table_io=q_table_io,
      final_options=final_options,
      frames=frames,
  )

  reward_vals.append(rewards)
  q_tables_option_policies.append(q_table_option_policies)
  q_tables_io.append(q_table_io)

### Heatmaps

Averaging q tables for further inference.

In [None]:
# Stack the per-run option-policy tables and average over the run axis (axis 0).
q_tables_option_policies = np.array(q_tables_option_policies)
q_table_option_policies_avgd = q_tables_option_policies.mean(axis=0)

# Same for the intra-option tables over options.
q_tables_io = np.array(q_tables_io)
q_table_io_avgd = q_tables_io.mean(axis=0)

Plotting option policy q table heatmap

In [None]:
passenger_loc = 0
drop_location = 3
labels = {0:'south',1:'north', 2:'east', 3:'west', 4:'pickup', 5:'drop'}

heatmap_pick = np.zeros((5,5))
heatmap_drop = np.zeros((5,5))

# One grid per phase: passenger still waiting at `passenger_loc`, then passenger
# index 4 (in the taxi under Taxi-v3's encoding).
# NOTE(review): both grids read option `drop_location`'s table; confirm the pickup
# grid is not meant to show option `passenger_loc` instead.
for wanted_pick, grid in ((passenger_loc, heatmap_pick), (4, heatmap_drop)):
  for state in range(500):
    row, col, pick, drop = decodeObservation(state)
    if pick != wanted_pick or drop != drop_location:
      continue
    # Greedy primitive action of the averaged option policy in this cell.
    grid[row,col] = np.argmax(q_table_option_policies_avgd[drop_location][state])

  plt.imshow(grid, cmap='viridis', interpolation='nearest')
  for (i, j), greedy_action in np.ndenumerate(grid):
    plt.text(j, i, labels[int(greedy_action)], ha='center', va='center', color='white')
  plt.show()

Plotting heatmap for Intra-Option q table

In [None]:
# NOTE(review): index 1 is G in this notebook's option numbering; if Y was intended, use 2.
passenger_loc = 1 # G
drop_location = 3 # B
labels = {0:'gotoR',1:'gotoG', 2:'gotoY', 3:'gotoB'}

# Greedy option of the averaged intra-option table for every taxi cell, with the
# passenger and destination fixed to the values above.
heatmap_pick = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  if (pick, drop) == (passenger_loc, drop_location):
    heatmap_pick[row,col] = np.argmax(q_table_io_avgd[state])

plt.imshow(heatmap_pick, cmap='viridis', interpolation='nearest')
for (i, j), greedy_option in np.ndenumerate(heatmap_pick):
  plt.text(j, i, labels[int(greedy_option)], ha='center', va='center', color='white')

plt.show()

### Reward Plot

In [None]:
# Mean return of each episode across the training runs.
# Sizes come from `reward_vals` itself instead of hard-coded run and episode
# counts, and no variable named `sum` is created, so the builtin `sum` stays
# usable in later cells. All runs must contain the same number of episodes.
avg_rewards = np.mean(np.asarray(reward_vals, dtype=float), axis=0).tolist()

In [None]:
# Smooth the run-averaged returns with a sliding window before plotting.
window = 100
avg_rews2 = [np.average(avg_rewards[i:i+window]) for i in range(len(avg_rewards)-window)]
# The x axis follows the number of smoothed points, so the cell keeps working
# if the number of episodes or the window changes.
x = np.arange(len(avg_rews2))
t = [9.7] * len(avg_rews2)  # horizontal reference level
plt.figure(figsize = (10,5))
plt.plot(x, np.array(avg_rews2), label=f'{window}-episode moving average')
plt.plot(x, t, label='reference (9.7)')
plt.xlabel('Episode')
plt.ylabel('Average total reward')
plt.title('Intra-Option Q-Learning')
plt.legend()
plt.show()

Visualizing agent's action

In [None]:
def print_frames(frames):
    """Replay recorded frames one at a time, pausing 0.7 s between them.

    Each frame is a dict with the keys 'frame', 'state', 'action' and 'reward';
    the cell output is cleared before every frame is printed.

    Behaves the same as the `print_frames` defined in the SMDP section, so this
    redefinition is redundant when the notebook is run top to bottom.
    """
    for timestep, frame in enumerate(frames, start=1):
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        for label, key in (("State", 'state'), ("Action", 'action'), ("Reward", 'reward')):
            print(f"{label}: {frame[key]}")
        sleep(.7)

# `frames` is reset at the start of every training run and only filled during a
# run's final episode, so this replays the final episode of the last run.
print_frames(frames)