In this notebook we will try to understand and solve the *OpenAI Gym* Taxi environment, where the cab has to pick up the passenger and drop them off at the correct location.

In [1]:
import gym
import numpy as np
import pickle, os

In [3]:
env = gym.make("Taxi-v3") # create the Taxi-v3 environment

In [4]:
state = env.reset() # reset the environment and keep the state it returns


In [5]:
state

51

In [6]:
env.render() # display the environment

+---------+
|R: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



In [7]:
n_states = env.observation_space.n # number of possible states in the environment
n_actions = env.action_space.n # number of possible actions in any state of the environment

In [8]:
n_actions

6

In [9]:
n_states

500

In [10]:
env.step(3) # take one step with the fixed action 3 (not a random one); the render below labels it "West"

(51, -1, False, {'prob': 1.0})

In [11]:
env.render()

+---------+
|R: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)


In [14]:
# Run the environment with purely random actions until a step returns the
# +20 reward (the reward for a successful drop-off in Taxi), counting the
# steps taken and the reward accumulated along the way.

state = env.reset()
steps = 0   # total number of random steps taken
g = 0       # total reward accumulated over all steps
reward = None
done = False

while reward != 20:
  if done:
    # The episode ended without the +20 reward (for example because the
    # environment's step limit was reached). Gym expects reset() to be called
    # before step() is used again, so start a fresh episode and keep the
    # running totals.
    state = env.reset()
  state, reward, done, info = env.step(env.action_space.sample())
  steps += 1
  g += reward

print(f"Solved in {steps} Steps with a total reward of {g}")

Solved in 6116 Steps with a total reward of -24203


## Q-Learning

In [15]:
Q = np.zeros([n_states, n_actions]) # initialise the Q-table with all zeros: one row per state, one column per action

In [17]:
episodes = 500 # number of episodes to run
rewardTracker = [] # total reward of every episode, in the order the episodes were run
G = 0 # total reward of the current episode
alpha = 0.618 # learning rate (step size of the Q-value update)

for episode in range(1,episodes+1): # for all episodes
  done = False
  G, reward = 0,0
  state = env.reset() # reset the environment
  while not done:
    action = np.argmax(Q[state]) # greedy: take the action with the maximum Q-value
    state2, reward, done, info = env.step(action)
    # Q-learning update: move Q[state, action] towards reward + max_a Q[state2, a].
    # No discount factor is applied to the bootstrapped value here.
    Q[state,action] += alpha * ((reward + (np.max(Q[state2]))  - Q[state,action])) # update Q-value
    G += reward
    state = state2

  rewardTracker.append(G) # record this episode's total reward
  if episode % 100 == 0:
    print('Episode {} Total Reward: {}'.format(episode,G))

Episode 100 Total Reward: 11
Episode 200 Total Reward: 5
Episode 300 Total Reward: 8
Episode 400 Total Reward: 10
Episode 500 Total Reward: 7


In [18]:
# Now use the learnt Q-values to solve the environment by acting greedily
# with respect to the Q-table (no further learning happens in this cell).
counter = 0 # number of steps taken in this rollout
state = env.reset()
done = False

while not done:
  # We simply take the action with the highest Q Value
  action = np.argmax(Q[state])
  state, reward, done, info = env.step(action)
  counter += 1

In [20]:
print(f"Solved in {counter} Steps")

Solved in 15 Steps


In [None]:
# # saving and loading the learnt table

# with open("smartTaxi_qTable.pkl", 'wb') as f:
#   pickle.dump(Q, f)

# with open("smartTaxi_qTable.pkl", 'rb') as f:
#   Qtest = pickle.load(f)

## SARSA (State Action Reward State Action)

In [21]:
# Exploration vs Exploitation
def choose_action(state):
  """Pick an action for `state` with an epsilon-greedy rule.

  A uniform random number in [0, 1) is drawn; if it is below the global
  `epsilon`, a random action is sampled from `env.action_space`
  (exploration). Otherwise the action with the largest value in the global
  Q-table row `Q[state, :]` is returned (exploitation).
  """
  explore = np.random.uniform(0, 1) < epsilon
  if explore:
    return env.action_space.sample()
  return np.argmax(Q[state, :])

In [22]:
# Update the Q-Table
def learn(state, stateNext, reward, action, actionNext):
  """Apply one SARSA update to the global Q-table, in place.

  The entry Q[state, action] is moved towards the target
  reward + gamma * Q[stateNext, actionNext] by a fraction `alpha` of the
  difference. `alpha` and `gamma` are read from the notebook globals.
  Nothing is returned.
  """
  current = Q[state, action]
  td_target = reward + gamma * Q[stateNext, actionNext]
  Q[state, action] = current + alpha * (td_target - current)

In [24]:
total_episodes = 10000 # number of training episodes
epsilon = 0.05 # probability of taking a random action in choose_action
alpha = 0.618 # learning rate used in learn
gamma = 0.9 # discount factor applied to the next state-action value in learn

# Start from a fresh all-zero Q-table; this replaces any table learnt earlier.
Q = np.zeros((env.observation_space.n, env.action_space.n))

In [25]:
# Train with SARSA: the next action is chosen before the update and is then
# the action that is actually executed on the following step.
rewards = 0 # NOTE(review): never read or updated in this cell
for episode in range(total_episodes):
  counter = 0 # NOTE(review): reset every episode but never incremented in this cell
  state = env.reset()
  action = choose_action(state) # epsilon-greedy choice for the first state
  done = False
  while not done:
    stateNext, reward, done, info = env.step(action)
    actionNext = choose_action(stateNext) # used in the update below and executed on the next iteration
    learn(state, stateNext, reward, action, actionNext)
    state = stateNext
    action = actionNext

In [26]:
# Use the learnt Q-table: run one episode acting greedily and render every step.
state = env.reset()
done = False

while not done:
  # We simply take the action with the highest Q Value
  action = np.argmax(Q[state])
  state, reward, done, info = env.step(action)
  env.render() # print the grid after each step

+---------+
|[34;1mR[0m: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (South)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
+---------+
|[42mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :G|
|[42m