In [None]:
import gym
import random

# One seed shared by every random source used in this notebook.
SEED = 1234
random.seed(SEED)  # controls the random.uniform() draws used for exploration

streets = gym.make("Taxi-v3").env
# random.seed() does not reach Gym's own generators, so seed them as well;
# otherwise reset() and action_space.sample() differ from run to run.
streets.seed(SEED)
streets.action_space.seed(SEED)
streets.reset()  # draw the starting state from the freshly seeded generator
streets.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [None]:
# Build one specific starting state by hand and force the environment into it.
# NOTE(review): the arguments are presumably (taxi row, taxi column, passenger
# location, destination) -- confirm against TaxiEnv.encode.
streets.s = initial_state = streets.encode(2, 3, 2, 0)

streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [None]:
import numpy as np

# Q-table: one row per state, one column per action, every entry starting at 0.
q_table = np.zeros([streets.observation_space.n, streets.action_space.n])

# Hyperparameters of this training run.
learning_rate = 0.1
discount_factor = 0.6
exploration = 0.1
epochs = 10000

for taxi_run in range(epochs):
    state = streets.reset()
    done = False

    while not done:
        # With probability `exploration` try a random action, otherwise take
        # the action with the highest Q-value for the current state.
        explore = random.uniform(0, 1) < exploration
        action = streets.action_space.sample() if explore else np.argmax(q_table[state])

        next_state, reward, done, info = streets.step(action)

        # Blend the old estimate with the newly observed target:
        # reward plus the discounted best Q-value of the next state.
        target = reward + discount_factor * np.max(q_table[next_state])
        q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * target

        state = next_state

In [None]:
# Show the learned Q-values (one per action) for the hand-built initial_state.
q_table[initial_state]

array([-2.41305672, -2.39907636, -2.41582812, -2.3639511 , -9.22785691,
       -6.37749466])

In [None]:
# Show the learned Q-values (one per action) for a second hand-encoded state.
# NOTE(review): only the first two encode() arguments differ from initial_state,
# presumably the taxi's row and column -- confirm against TaxiEnv.encode.
q_table[streets.encode(1,0,2,0)]

array([-2.12208743, -2.2782286 , -2.22903207, -2.20664945, -7.26694056,
       -6.75732291])

In [None]:
from IPython.display import clear_output
from time import sleep
# Animate ten trips driven by the trained table and report their mean length.
lengths = []
for tripnum in range(1, 11):
    state = streets.reset()
    done = False
    trip_length = 0

    # Each trip is cut off after 25 steps even if the episode has not ended.
    while trip_length < 25 and not done:
        # Always take the best-known action here; there is no exploration.
        state_after, reward, done, info = streets.step(np.argmax(q_table[state]))
        clear_output(wait=True)
        print(f"Trip number {tripnum} Step {trip_length}")
        print(streets.render(mode='ansi'))
        sleep(.2)
        state, trip_length = state_after, trip_length + 1

    lengths.append(trip_length)
    sleep(.2)

avg_len = sum(lengths) / 10
print(avg_len)

Trip number 10 Step 12
+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

15.4


In [None]:
def q_learning(learning_rate, discount_factor, exploration, epochs, env=None):
    """Train a fresh Q-table with epsilon-greedy tabular Q-learning.

    Args:
        learning_rate: Weight given to the new target when updating a Q-value.
        discount_factor: Weight given to the best Q-value of the next state.
        exploration: Probability of taking a random action instead of the
            greedy one.
        epochs: Number of episodes to run; each one lasts until the
            environment reports ``done``.
        env: Environment to train on. Defaults to the module-level ``streets``.
            It must provide ``observation_space.n``, ``action_space.n``,
            ``action_space.sample()``, ``reset()`` and a ``step()`` that
            returns ``(next_state, reward, done, info)``.

    Returns:
        The trained table as a NumPy array with one row per state and one
        column per action.

    Side effects:
        Rebinds the module-level ``q_table`` to the trained table. Previously
        the table was a local that was thrown away, so callers that ignore the
        return value and then evaluate the global ``q_table`` were silently
        scoring an older table.
    """
    global q_table

    if env is None:
        env = streets

    # One row per state, one column per action, every entry starting at 0.
    table = np.zeros([env.observation_space.n, env.action_space.n])

    for _ in range(epochs):
        state = env.reset()
        done = False

        while not done:
            if random.uniform(0, 1) < exploration:
                action = env.action_space.sample()  # explore a random action
            else:
                action = np.argmax(table[state])  # exploit the best-known action

            next_state, reward, done, info = env.step(action)

            # Move the old estimate toward reward + discounted best next value.
            prev_q = table[state, action]
            next_max_q = np.max(table[next_state])
            table[state, action] = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)

            state = next_state

    q_table = table
    return table



def average_trip_length(table=None, env=None, num_trips=10, max_steps=25):
    """Return the mean number of steps per trip when acting greedily.

    Args:
        table: Q-table indexed as ``table[state]``. Defaults to the
            module-level ``q_table``.
        env: Environment to drive. Defaults to the module-level ``streets``.
        num_trips: Number of trips to average over (default 10).
        max_steps: Step cap per trip (default 25). A trip that hits the cap is
            counted with length ``max_steps``.

    Returns:
        The average trip length as a float.

    Raises:
        ValueError: If ``num_trips`` is smaller than 1.
    """
    if num_trips < 1:
        raise ValueError("num_trips must be at least 1")
    if table is None:
        table = q_table
    if env is None:
        env = streets

    lengths = []
    for _ in range(num_trips):
        state = env.reset()
        done = False
        trip_length = 0

        # Nothing is printed here, so there is no output to clear between steps.
        while not done and trip_length < max_steps:
            action = np.argmax(table[state])
            state, reward, done, info = env.step(action)
            trip_length += 1

        lengths.append(trip_length)

    # Divide by the real number of trips rather than a hard-coded 10.
    return sum(lengths) / len(lengths)

In [None]:
# Sweep the discount factor with the other hyperparameters held fixed.
learning_rate = 0.1
discount_factor = [0.5, 0.6, 0.7, 0.8, 0.9]
exploration = 0.1
epochs = 1000
# Repetitions per setting. The same number drives the loop and the divisor;
# the old range(1, 10) ran only 9 repetitions while dividing by 10.
num_runs = 10

difdis = [0] * len(discount_factor)  # summed average trip length per setting
for _ in range(num_runs):
    for i, gamma in enumerate(discount_factor):
        q_learning(learning_rate, gamma, exploration, epochs)
        difdis[i] += average_trip_length()

print(np.array(difdis) / num_runs)

[11.99 12.7  13.19 12.22 12.4 ]


In [None]:
# Sweep the learning rate with the other hyperparameters held fixed.
learning_rate = [0.1, 0.2, 0.3, 0.4, 0.5]
discount_factor = 0.9
exploration = 0.1
epochs = 1000
# Repetitions per setting. The same number drives the loop and the divisor;
# the old range(1, 10) ran only 9 repetitions while dividing by 10.
num_runs = 10

difdis = [0] * len(learning_rate)  # summed average trip length per setting
for _ in range(num_runs):
    for i, alpha in enumerate(learning_rate):
        q_learning(alpha, discount_factor, exploration, epochs)
        difdis[i] += average_trip_length()

print(np.array(difdis) / num_runs)

[12.91 13.22 12.96 11.94 13.34]


In [None]:
# Sweep the exploration rate with the other hyperparameters held fixed.
learning_rate = 0.5
discount_factor = 0.5
exploration = [0.1, 0.2, 0.3, 0.4]
epochs = 1000
# Repetitions per setting. The same number drives the loop and the divisor;
# the old range(1, 10) ran only 9 repetitions while dividing by 10.
num_runs = 10

difdis = [0] * len(exploration)  # summed average trip length per setting
for _ in range(num_runs):
    for i, epsilon in enumerate(exploration):
        q_learning(learning_rate, discount_factor, epsilon, epochs)
        difdis[i] += average_trip_length()

print(np.array(difdis) / num_runs)

[13.06 12.64 12.89 12.72]


In [None]:
# Repeat training and evaluation with the chosen hyperparameters and report
# the mean of the per-run average trip lengths.
learning_rate = 0.4
discount_factor = 0.5
exploration = 0.3
epochs = 1000
num_runs = 10  # the old range(1, 10) ran only 9 repetitions

difdis = []
for _ in range(num_runs):
    q_learning(learning_rate, discount_factor, exploration, epochs)
    difdis.append(average_trip_length())

# Divide by the number of collected results so loop and divisor cannot drift apart.
print(sum(difdis) / len(difdis))

12.49


In [None]:
from IPython.display import clear_output
from time import sleep
# Animate six trips driven by the current q_table and report their mean length.
lengths = []
for tripnum in range(1, 7):
    state = streets.reset()

    done = False
    trip_length = 0

    # Each trip is cut off after 25 steps even if the episode has not ended.
    while not done and trip_length < 25:
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render(mode='ansi'))
        sleep(1)
        state = next_state
        trip_length += 1
    lengths.append(trip_length)

    sleep(.2)

# Six trips are run, so divide by the number recorded instead of a hard-coded 10.
avg_len = sum(lengths) / len(lengths)
print(avg_len)

Trip number 6 Step 16
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

9.6
