In [2]:
import gym 
import numpy as np
import random
# Build the Taxi-v3 environment; the cells below drive it through `streets`.
streets = gym.make('Taxi-v3')

In [3]:
# Show how many actions and states the environment exposes.
print(f"Action space {streets.action_space}")
print(f"State space {streets.observation_space}")

# Start an episode and remember the state it begins in for the cells below.
initial_state = streets.reset()
print(f"initial State {initial_state}")

Action space Discrete(6)
State space Discrete(500)
initial State 427


In [4]:
# Unpack the integer state into its four components:
# taxi row, taxi column, passenger location, destination location.
print("Decode State :", [*streets.env.decode(initial_state)])
streets.render()

Decode State : [4, 1, 1, 3]
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |[35mB[0m: |
+---------+



In [5]:
# Look up the transition/reward table entry for the initial state.
# The result maps each action id to a list of transition tuples; per the Gym
# toy-text convention each tuple reads (probability, next_state, reward, done).
# The full table holds one such list per (state, action) pair, so its size is
# state_space_size * action_space_size.
streets.env.P[initial_state]

{0: [(1.0, 427, -1, False)],
 1: [(1.0, 327, -1, False)],
 2: [(1.0, 447, -1, False)],
 3: [(1.0, 427, -1, False)],
 4: [(1.0, 427, -10, False)],
 5: [(1.0, 427, -10, False)]}

In [8]:
# Q-table: one row per state, one column per action, initialised to zero.
q_table = np.zeros((streets.observation_space.n, streets.action_space.n))

# Hyperparameters. 0.5 is the commented-out alternative for both
# learning_rate and exploration if you want to experiment.
learning_rate = 0.1
discount_factor = 0.6
exploration = 0.1
epochs = 10000

for taxi_run in range(epochs):
    state = streets.reset()
    done = False

    while not done:
        # Epsilon-greedy selection: with probability `exploration` take a
        # random action, otherwise take the best-known action for this state.
        explore = random.uniform(0, 1) < exploration
        action = streets.action_space.sample() if explore else np.argmax(q_table[state])

        next_state, reward, done, info = streets.step(action)

        # Blend the old estimate with the one-step target
        # (immediate reward plus discounted best value of the next state).
        old_estimate = q_table[state, action]
        best_next = np.max(q_table[next_state])
        q_table[state, action] = (1 - learning_rate) * old_estimate + learning_rate * (reward + discount_factor * best_next)

        state = next_state

In [9]:
# Learned Q-values for the initial state, one entry per action.
q_table[initial_state]

array([-2.46267266, -2.4625419 , -2.46267969, -2.46259001, -5.16349214,
       -7.19623884])

In [10]:
from IPython.display import clear_output
from time import sleep

# Number of trips to replay with the learned table; swap in 500 for a longer run.
numTrips = 10
totalTripSteps = 0

for tripnum in range(1, numTrips + 1):
    state = streets.reset()
    done = False
    trip_length = 0

    # Keep stepping until the episode reports done or 20 steps have been taken.
    while not (done or trip_length >= 20):
        # Always act greedily here: no exploration during the replay.
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)

        # Redraw the board in place so the output animates instead of scrolling.
        clear_output(wait=True)
        print("Trip number {} Step {}".format(tripnum, trip_length))
        print(streets.render(mode='ansi'))
        sleep(0.5)

        state = next_state
        trip_length += 1

    totalTripSteps += trip_length
    # Pause on the final frame before the next trip starts.
    sleep(2)

avgStepsPerTrip = totalTripSteps / numTrips
print("Average Steps Per Trip: {}".format(avgStepsPerTrip))

Trip number 10 Step 9
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Average Steps Per Trip: 12.0
