## Imports

In [1]:
import numpy as np
from tqdm import tqdm
import random
import gym
import matplotlib.pyplot as plt

## Environment

Defining the Taxi-v3 environment

In [2]:
# Single shared Taxi-v3 instance; the option helpers defined later in this
# notebook step this global `env` directly.
env = gym.make("Taxi-v3")

  deprecation(
  deprecation(


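Each Taxi-v3 observation is a single integer that encodes the taxi position, the passenger location and the destination. The function below decodes it, assuming the layout `((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination`: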

In [3]:
def decodeObservation(state):
  """Split an encoded state into (row, col, pick_location, drop_location).

  The encoding is treated as a mixed-radix number: the low 20 values hold the
  pick/drop pair (pick * 4 + drop) and the remaining part holds the taxi cell
  (row * 5 + col).
  """
  position, pick_drop_information = divmod(state, 20)
  row, col = divmod(position, 5)
  pick_location, drop_location = divmod(pick_drop_information, 4)
  return row, col, pick_location, drop_location

## Epsilon-Greedy

In [4]:
def epsilon_greedy(q_values, epsilon):
  """Pick an action index from `q_values` with epsilon-greedy exploration.

  A uniformly random index is returned when every entry of `q_values` is zero
  or when the random draw is not greater than `epsilon`; otherwise the index
  of the largest entry is returned.
  """
  # `and` short-circuits, so no random number is drawn for an all-zero row.
  exploit = q_values.any() and np.random.rand() > epsilon
  if not exploit:
    return np.random.choice(len(q_values))
  return np.argmax(q_values)

In [6]:
# (row, col) grid cells of the four pick-up/drop-off landmarks, indexed by the
# pick/drop location index returned by decodeObservation.
colour_states_positions = [(0,0), (0,4), (4,0), (4,3)]

## Option Function:
We define an alternate option function that differs from the previous option set.

In [14]:
def OptionAlternate(state, q_table_option_policies, epsilon, goal_row, goal_column, option_number):
  """Execute one epsilon-greedy step of an option's policy in the global `env`.

  `goal_row` and `goal_column` are accepted but not read; termination is
  decided from the decoded next state instead.

  Returns:
    (optdone, optact, next_state, reward, done) where `optdone` is True when
    the option reached its target cell on this step.
  """
  optact = epsilon_greedy(q_table_option_policies[option_number][state], epsilon)
  next_state, reward, done, _ = env.step(optact)
  taxi_row, taxi_col, pick, drop = decodeObservation(next_state)
  taxi_position = (taxi_row, taxi_col)

  if option_number == 0:
    # Option 0 ends on the landmark where the passenger is waiting.
    optdone = bool(pick < 4 and taxi_position == colour_states_positions[pick])
  elif option_number == 1:
    # Option 1 ends on the destination landmark once pick == 4.
    optdone = bool(pick == 4 and taxi_position == colour_states_positions[drop])
  else:
    optdone = False

  return optdone,optact,next_state,reward,done

In [15]:
def SMDP_Q_Learning(env, gamma, alpha, epsilon_start, epsilon_end, epsilon_decay, num_episodes, Option, q_table_option_policies, q_table_SMDP, final_options, frames):
  """Run SMDP Q-learning over two multi-step options and two primitive actions.

  Top-level actions 0 and 1 run a multi-step option through `Option`;
  top-level actions 2 and 3 are sent to the environment as primitive
  actions 4 and 5. After training, the percentage of episodes in which a
  reward of 20 was observed is printed.

  Args:
    env: environment with the classic gym API (`reset()` returning a state,
      `step(a)` returning a 4-tuple, `render(mode='ansi')`).
    gamma: discount factor.
    alpha: learning rate used for every Q update.
    epsilon_start: initial exploration rate.
    epsilon_end: lower bound of the exploration rate.
    epsilon_decay: factor applied to epsilon after every top-level decision.
    num_episodes: number of training episodes.
    Option: callable `(state, q_table_option_policies, epsilon, goal_row,
      goal_column, option_number)` returning
      `(optdone, optact, next_state, reward, done)`.
    q_table_option_policies: per-option Q-tables indexed
      `[option][state][move]`; updated in place.
    q_table_SMDP: top-level Q-table indexed `[state][top_level_action]`;
      updated in place.
    final_options: list receiving every top-level action chosen during the
      last episode.
    frames: list receiving one render record per environment step of the
      last episode.

  Returns:
    List with the undiscounted total reward of each episode.
  """
  rewards = []
  epsilon = epsilon_start
  successes = 0
  for episode in tqdm(range(1, num_episodes + 1)):
    is_last_episode = episode == num_episodes

    state = env.reset()
    done = False
    total_reward = 0

    while not done:

      action = epsilon_greedy(q_table_SMDP[state], epsilon)
      # Epsilon decays once per top-level decision, not once per episode.
      epsilon = max(epsilon_end, epsilon*epsilon_decay)
      if is_last_episode:
        final_options.append(action)
      curr_state = state

      if action > 1:
        # Primitive pick-up / drop-off: top-level 2, 3 -> env actions 4, 5.
        next_state, reward, done, _ = env.step(action+2)
        q_table_SMDP[curr_state][action] += alpha * (reward + gamma* np.max(q_table_SMDP[next_state]) - q_table_SMDP[curr_state][action])
        state = next_state
        total_reward += reward
        # The success reward must be checked here: the options below only
        # issue movement actions (indices 0-3 of the option Q-tables).
        if reward == 20:
          successes += 1
        # Logged only after a real step, so `reward` is never stale or unbound.
        if is_last_episode:
          frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward
            }
            )
      else:
        # Navigation option: run it until it terminates or the episode ends.
        reward_bar = 0      # discounted reward accumulated inside the option
        gamma_option = 1    # gamma ** (steps taken so far in the option)
        optdone = False
        option_number = action
        # Kept for the `Option` call signature only.
        goal_row, goal_column = colour_states_positions[option_number]
        while not optdone and not done:
          # Use the injected `Option` callable rather than a global name.
          optdone, optact, next_state, reward, done = Option(state, q_table_option_policies, epsilon, goal_row, goal_column, option_number)
          if is_last_episode:
            frames.append({
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward
                }
            )
          total_reward += reward
          if reward == 20:
            successes += 1
          reward_bar = reward_bar + gamma_option*reward
          gamma_option *= gamma
          # One-step Q-learning update of the option's own policy.
          q_table_option_policies[option_number][state][optact] += alpha * (reward + gamma * np.max(q_table_option_policies[option_number][next_state]) - q_table_option_policies[option_number][state][optact])
          state = next_state
        # SMDP update: bootstrap with gamma ** k for the k steps the option ran.
        q_table_SMDP[curr_state][action] += alpha * (reward_bar + gamma_option * np.max(q_table_SMDP[state]) - q_table_SMDP[curr_state][action])
    rewards.append(total_reward)
  # Guard against division by zero when no episodes were requested.
  print(successes/num_episodes * 100 if num_episodes else 0.0)
  return rewards

## Averaging over 5 runs:

In [None]:
# Train five independent SMDP agents and keep each run's reward curve and
# learned tables; the tables are then averaged element-wise across runs.
reward_vals, q_tables_option_policies, q_tables_SMDP = [], [], []
for i in range(5):
  # Fresh buffers and zero-initialised tables per run. After the loop these
  # names still refer to the objects of the last run.
  final_options, frames = [], []
  q_table_option_policies = np.zeros((2, 500, 4)) # num_options x num_states x num_primitive_actions_for_moving
  q_table_SMDP = np.zeros((500, 4)) # num_states x num_options
  rewards = SMDP_Q_Learning(
      env,
      gamma=0.9,
      alpha=0.1,
      epsilon_start=1,
      epsilon_end=0.0001,
      epsilon_decay=0.99,
      num_episodes=10000,
      Option=OptionAlternate,
      q_table_option_policies=q_table_option_policies,
      q_table_SMDP=q_table_SMDP,
      final_options=final_options,
      frames=frames,
  )
  reward_vals.append(rewards)
  q_tables_option_policies.append(q_table_option_policies)
  q_tables_SMDP.append(q_table_SMDP)
q_table_option_policies_avg = np.mean(q_tables_option_policies, axis=0)
q_table_SMDP_avg = np.mean(q_tables_SMDP, axis=0)

## For plotting:

In [132]:
# Per-episode reward averaged across all runs in `reward_vals`.
# Computed with numpy so the sizes follow the data instead of hard-coded
# 10000 x 5, and so the builtin `sum` is not shadowed by a loop accumulator.
avg_rewards = np.mean(reward_vals, axis=0).tolist()

In [None]:
# Averaged reward per episode, with a constant reference line at 10.
# Lengths are taken from the data so the plot follows the training length.
x = np.arange(len(avg_rewards))
t = [10] * len(avg_rewards)
plt.figure(figsize = (10,5))
plt.plot(x, np.array(avg_rewards))
plt.plot(x, t)

## For animation:

In [None]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    """Replay recorded frames in the cell output, pausing 0.7 s between them."""
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        # One print call; the newline separator yields the same text as
        # printing each field on its own line.
        print(
            frame['frame'],
            f"Timestep: {i + 1}",
            f"State: {frame['state']}",
            f"Action: {frame['action']}",
            f"Reward: {frame['reward']}",
            sep="\n",
        )
        sleep(.7)

print_frames(frames)

## Running Average plot:

In [None]:
# 100-episode running average of the averaged reward, with a constant
# reference line at 8.5.
avg_rews2 = [np.average(avg_rewards[i:i+100]) for i in range(len(avg_rewards)-100)]
# x is sized from the data so it always matches avg_rews2 and t.
x = np.arange(len(avg_rews2))
t = [8.5] * len(avg_rews2)
plt.figure(figsize = (10,5))
plt.plot(x, np.array(avg_rews2))
plt.plot(x, t)


## For plotting option policy Heatmap

In [None]:
passenger_loc = 0
drop_location = 2
# Landmark (row, col) -> pick/drop location index. Named `location_index`
# rather than `map` so the builtin `map` is not shadowed in the kernel.
location_index={(0,0):0,(0,4):1,(4,0):2,(4,3):3}
heatmap_pick = np.zeros((5,5))# gridsize
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # The destination is not filtered here, so for each cell the value written
  # last (the highest matching state index) is the one displayed.
  if pick == passenger_loc: #and drop == drop_location:
    heatmap_pick[row,col] = np.argmax(q_table_option_policies_avg[0][state])
    if ((row,col) in location_index and location_index[(row,col)]==passenger_loc):
      # On the passenger's landmark show the top-level choice instead.
      # NOTE(review): +2 maps top-level 2/3 to labels 4/5, but a top-level
      # argmax of 0/1 would be rendered as 'east'/'west' - confirm.
      heatmap_pick[row,col]=np.argmax(q_table_SMDP_avg[state])+2

plt.imshow(heatmap_pick, cmap='viridis', interpolation='nearest')
labels = {0:'south',1:'north', 2:'east', 3:'west', 4:'pickup', 5:'drop'}

# Write the action name on each cell (x is the column, y is the row).
for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_pick[i, j])], ha='center', va='center', color='white')

plt.show()

heatmap_drop = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # pick == 4 selects the states handled by option 1 (go to destination).
  if pick == 4 and drop == drop_location:
    heatmap_drop[row,col] = np.argmax(q_table_option_policies_avg[1][state])
    if ((row,col) in location_index and location_index[(row,col)]==drop_location):
      heatmap_drop[row,col]=np.argmax(q_table_SMDP_avg[state])+2

plt.imshow(heatmap_drop, cmap='viridis', interpolation='nearest')
labels = {0:'south',1:'north', 2:'east', 3:'west', 4:'pickup', 5:'drop'}

for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_drop[i, j])], ha='center', va='center', color='white')

plt.show()

## For plotting overall policy heatmap:

In [None]:
# Top-level (SMDP) policy per grid cell. The landmark lookup dict that used to
# be assigned to `map` here was never read in this cell and shadowed the
# builtin, so it is no longer created.
passenger_loc = 0
drop_location = 3
heatmap_pick = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # The destination is not filtered here, so for each cell the value written
  # last (the highest matching state index) is the one displayed.
  if pick == passenger_loc: #and drop == drop_location:
    heatmap_pick[row,col] = np.argmax(q_table_SMDP_avg[state])

plt.imshow(heatmap_pick, cmap='viridis', interpolation='nearest')
labels = {0:"fetch",1:"dest",2:"pick",3:"drop"}

# Write the chosen top-level action on each cell (x is the column, y is the row).
for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_pick[i, j])], ha='center', va='center', color='white')

plt.show()

heatmap_drop = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # States with pick == 4 and the chosen destination.
  if pick == 4 and drop == drop_location:
    heatmap_drop[row,col] = np.argmax(q_table_SMDP_avg[state])

plt.imshow(heatmap_drop, cmap='viridis', interpolation='nearest')
labels = {0:"fetch",1:"dest",2:"pick",3:"drop"}

for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_drop[i, j])], ha='center', va='center', color='white')

plt.show()

# Intra Option Q learning:

### Option function for intra-option Q-learning:

In [18]:
def OptionAlternate(state, q_table_option_policies, epsilon, option_number,optdone):
  """Execute one epsilon-greedy step of an option's policy in the global `env`.

  The incoming `optdone` flag is passed through unchanged unless the option
  reaches its target cell on this step, in which case it becomes True.

  Returns:
    (optdone, optact, next_state, reward, done, taxi_row, taxi_col, pick, drop)
    where the last four values are the decoded next state.
  """
  optact = epsilon_greedy(q_table_option_policies[option_number][state], epsilon)
  next_state, reward, done, _info = env.step(optact)
  taxi_row, taxi_col, pick, drop = decodeObservation(next_state)
  taxi_position = (taxi_row, taxi_col)

  if option_number == 0 and pick != 4:
    # Option 0 targets the landmark where the passenger is waiting.
    goal_reached = taxi_position == colour_states_positions[pick]
  elif option_number == 1 and pick == 4:
    # Option 1 targets the destination landmark once pick == 4.
    goal_reached = taxi_position == colour_states_positions[drop]
  else:
    goal_reached = False

  if goal_reached:
    optdone = True

  return optdone,optact,next_state,reward,done,taxi_row,taxi_col,pick,drop

  and should_run_async(code)


### Intra-option Q-learning function

In [19]:
def IntraOption_Q_Learning(env, gamma, alpha, epsilon_start, epsilon_end, epsilon_decay, num_episodes, Option, q_table_option_policies, q_table_io, final_options, frames):
  """Train a top-level Q-table with intra-option updates on every option step.

  Top-level actions 0 and 1 run a multi-step option through `Option`;
  top-level actions 2 and 3 are sent to the environment as primitive
  actions 4 and 5. While an option runs, the top-level values of both
  options are updated after every single environment step.

  Args:
    env: environment with the classic gym API (`reset()` returning a state,
      `step(a)` returning a 4-tuple, `render(mode='ansi')`).
    gamma: discount factor.
    alpha: learning rate used for every Q update.
    epsilon_start: initial exploration rate.
    epsilon_end: lower bound of the exploration rate.
    epsilon_decay: factor applied to epsilon before every top-level decision.
    num_episodes: number of training episodes.
    Option: callable `(state, q_table_option_policies, epsilon, option,
      optdone)` returning `(optdone, optact, next_state, reward, done,
      taxi_row, taxi_col, pick, drop)`.
    q_table_option_policies: per-option Q-tables indexed
      `[option][state][move]`; updated in place.
    q_table_io: top-level Q-table indexed `[state][top_level_action]`;
      updated in place.
    final_options: accepted for signature parity with SMDP_Q_Learning; not
      used by this function.
    frames: list receiving one render record per environment step of the
      last episode.

  Returns:
    List with the undiscounted total reward of each episode.
  """
  rewards = []
  epsilon = epsilon_start
  for episode in tqdm(range(1, num_episodes + 1)):
    is_last_episode = episode == num_episodes
    state = env.reset()
    done = False
    total_reward = 0
    while not done:
      epsilon = max(epsilon_end, epsilon*epsilon_decay)
      option = epsilon_greedy(q_table_io[state], epsilon)
      if option > 1:
        # Primitive pick-up / drop-off: top-level 2, 3 -> env actions 4, 5.
        next_state, reward, done, _ = env.step(option+2)
        total_reward += reward
        q_table_io[state][option] += alpha*(reward + gamma*np.max(q_table_io[next_state])-q_table_io[state][option])
        if is_last_episode:  # for rendering purposes
          frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': option,
            'reward': reward
            }
            )
        state = next_state
      else:
        # Navigation option: run it until it terminates or the episode ends.
        optdone = False
        while not optdone:
          # Use the injected `Option` callable rather than a global name.
          optdone, optact, next_state, reward, done, taxi_row, taxi_col, pick, drop = Option(state, q_table_option_policies, epsilon, option, optdone)
          if is_last_episode:  # for rendering purposes
            frames.append({
              'frame': env.render(mode='ansi'),
              'state': state,
              'action': option,
              'reward': reward
              }
              )
          total_reward += reward
          taxi_position = (taxi_row, taxi_col)

          # Update the running option's own policy exactly once per step.
          # It used to sit in both branches of the loop below and therefore
          # ran twice per step.
          q_table_option_policies[option][state][optact] += alpha*(reward+gamma*np.max(q_table_option_policies[option][next_state])-q_table_option_policies[option][state][optact])

          # Intra-option update of the top-level values of both options.
          # NOTE(review): both options are updated regardless of whether
          # option i's own policy would have chosen `optact`; textbook
          # intra-option Q-learning restricts this to consistent options -
          # confirm this is intended.
          for i in range(2):
            terminates_here = (
                (pick != 4 and i == 0 and taxi_position == colour_states_positions[pick])
                or (pick == 4 and i == 1 and taxi_position == colour_states_positions[drop])
            )
            if terminates_here:
              # Option i would stop in next_state: bootstrap from the best value.
              target = np.max(q_table_io[next_state])
            else:
              # Option i would continue: bootstrap from its own value.
              target = q_table_io[next_state][i]
            q_table_io[state][i] += alpha*(reward+gamma*target-q_table_io[state][i])

          state = next_state
          if done:
            break
    rewards.append(total_reward)
  return rewards

## Averaging over 5 runs

In [None]:
# Train five independent intra-option agents and keep each run's reward curve
# and learned tables; the tables are then averaged element-wise across runs.
reward_vals, option_tables, ioql_tables = [], [], []
for i in range(5):
  # Fresh buffers and zero-initialised tables per run. After the loop these
  # names still refer to the objects of the last run.
  final_options, frames = [], []
  q_table_option_policies = np.zeros((2, 500, 4)) # num_options x num_states x num_primitive_actions_for_moving
  q_table_io = np.zeros((500, 4)) # num_states x num_options
  rewards = IntraOption_Q_Learning(
      env,
      gamma=0.9,
      alpha=0.1,
      epsilon_start=0.1,
      epsilon_end=0.0001,
      epsilon_decay=0.99,
      num_episodes=10000,
      Option=OptionAlternate,
      q_table_option_policies=q_table_option_policies,
      q_table_io=q_table_io,
      final_options=final_options,
      frames=frames,
  )
  option_tables.append(q_table_option_policies)
  ioql_tables.append(q_table_io)
  reward_vals.append(rewards)
option_avg = np.mean(option_tables, axis=0)
ioql_avg = np.mean(ioql_tables, axis=0)

## For plotting purposes:

In [22]:
# Per-episode reward averaged across all runs in `reward_vals`.
# Computed with numpy so the sizes follow the data instead of hard-coded
# 10000 x 5, and so the builtin `sum` is not shadowed by a loop accumulator.
avg_rewards = np.mean(reward_vals, axis=0).tolist()

In [None]:
# Averaged reward per episode, with a constant reference line at 10.
# Lengths are taken from the data so the plot follows the training length.
x = np.arange(len(avg_rewards))
t = [10] * len(avg_rewards)
plt.figure(figsize = (10,5))
plt.plot(x, np.array(avg_rewards))
plt.plot(x, t)

In [None]:
# 100-episode running average of the averaged reward, with a constant
# reference line at 8.5.
avg_rews2 = [np.average(avg_rewards[i:i+100]) for i in range(len(avg_rewards)-100)]
# x is sized from the data so it always matches avg_rews2 and t.
x = np.arange(len(avg_rews2))
t = [8.5] * len(avg_rews2)
plt.figure(figsize = (10,5))
plt.plot(x, np.array(avg_rews2))
plt.plot(x, t)

## Animation:

In [None]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    """Replay recorded frames in the cell output, pausing 0.7 s between them.

    Same behaviour as the `print_frames` defined in the SMDP animation cell
    earlier in this notebook.
    """
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        lines = (
            frame['frame'],
            f"Timestep: {i + 1}",
            f"State: {frame['state']}",
            f"Action: {frame['action']}",
            f"Reward: {frame['reward']}",
        )
        # Newline-separated output matches printing each field separately.
        print(*lines, sep="\n")
        sleep(.7)

print_frames(frames)

## Option policy heatmap

In [None]:
passenger_loc = 2
drop_location = 0
# Landmark (row, col) -> pick/drop location index. Named `location_index`
# rather than `map` so the builtin `map` is not shadowed in the kernel.
location_index={(0,0):0,(0,4):1,(4,0):2,(4,3):3}
heatmap_pick = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # The destination is not filtered here, so for each cell the value written
  # last (the highest matching state index) is the one displayed.
  if pick == passenger_loc: #and drop == drop_location:
    heatmap_pick[row,col] = np.argmax(option_avg[0][state])
    if ((row,col) in location_index and location_index[(row,col)]==passenger_loc):
      # On the passenger's landmark show the top-level choice instead.
      # NOTE(review): +2 maps top-level 2/3 to labels 4/5, but a top-level
      # argmax of 0/1 would be rendered as 'east'/'west' - confirm.
      heatmap_pick[row,col]=np.argmax(ioql_avg[state])+2

plt.imshow(heatmap_pick, cmap='viridis', interpolation='nearest')
labels = {0:'south',1:'north', 2:'east', 3:'west', 4:'pickup', 5:'drop'}

# Write the action name on each cell (x is the column, y is the row).
for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_pick[i, j])], ha='center', va='center', color='white')

plt.show()

heatmap_drop = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # pick == 4 selects the states handled by option 1 (go to destination).
  if pick == 4 and drop == drop_location:
    heatmap_drop[row,col] = np.argmax(option_avg[1][state])
    if ((row,col) in location_index and location_index[(row,col)]==drop_location):
      heatmap_drop[row,col]=np.argmax(ioql_avg[state])+2

plt.imshow(heatmap_drop, cmap='viridis', interpolation='nearest')
labels = {0:'south',1:'north', 2:'east', 3:'west', 4:'pickup', 5:'drop'}

for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_drop[i, j])], ha='center', va='center', color='white')

plt.show()

## Overall policy heatmap

In [None]:
# Top-level (intra-option) policy per grid cell. The landmark lookup dict that
# used to be assigned to `map` here was never read in this cell and shadowed
# the builtin, so it is no longer created.
passenger_loc = 0
drop_location = 1
heatmap_pick = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # The destination is not filtered here, so for each cell the value written
  # last (the highest matching state index) is the one displayed.
  if pick == passenger_loc: #and drop == drop_location:
    heatmap_pick[row,col] = np.argmax(ioql_avg[state])


plt.imshow(heatmap_pick, cmap='viridis', interpolation='nearest')
labels = {0:"fetch",1:"dest",2:"pick",3:"drop"}

# Write the chosen top-level action on each cell (x is the column, y is the row).
for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_pick[i, j])], ha='center', va='center', color='white')

plt.show()

heatmap_drop = np.zeros((5,5))
for state in range(500):
  row, col, pick, drop = decodeObservation(state)
  # States with pick == 4 and the chosen destination.
  if pick == 4 and drop == drop_location:
    heatmap_drop[row,col] = np.argmax(ioql_avg[state])


plt.imshow(heatmap_drop, cmap='viridis', interpolation='nearest')
labels = {0:"fetch",1:"dest",2:"pick",3:"drop"}

for i in range(5):
    for j in range(5):
        plt.text(j, i, labels[int(heatmap_drop[i, j])], ha='center', va='center', color='white')

plt.show()