# Your Challenge
Modify the block above to keep track of the total number of time steps, and use that as a metric of how well our Q-learning system performs. You might want to increase the number of simulated trips and remove the sleep() calls so that you can run over more samples.

Now, try experimenting with the hyperparameters. How low can the number of epochs go before our model starts to suffer? Can you come up with better learning rates, discount factors, or exploration factors to make the training more efficient? The exploration vs. exploitation rate in particular is interesting to experiment with.

In [36]:
# Importando as bibliotecas usadas
import numpy as np
import gym
import random

In [37]:
# Create the Taxi-v3 environment. `.env` takes the inner environment object
# from what gym.make returns (presumably bypassing gym's wrappers — confirm
# against the installed gym version).
streets = gym.make("Taxi-v3").env
# Reset to obtain a starting state; as the last expression of the cell, the
# returned state is echoed as the cell output.
streets.reset()

np.int64(389)

In [38]:
# Define the initial state.
# NOTE(review): the four encode() arguments are presumably taxi row, taxi
# column, passenger location index and destination index — confirm against
# the Taxi-v3 documentation.
initial_state = streets.encode(2, 3, 2, 0)
# Assign the encoded value to the environment's `s` attribute (presumably its
# current state) so the environment starts from this configuration.
streets.s = initial_state

In [39]:
# Initialize the Q-table with zeros: one row per observation (state) and one
# column per action.
q_table = np.zeros([streets.observation_space.n, streets.action_space.n])

In [40]:
# Hyperparameters for Q-learning.
epochs = 10_000          # number of training episodes
exploration = 0.1        # probability of picking a random action
learning_rate = 0.1      # weight given to each newly observed estimate
discount_factor = 0.6    # weight given to the best next-state value

In [41]:
# Training: run `epochs` episodes, choosing actions epsilon-greedily and
# updating the Q-table after every step.
for taxi_run in range(epochs):
    state, done = streets.reset(), False

    while not done:
        # Exploit the best known action most of the time; with probability
        # `exploration`, try a random action instead.
        if random.random() >= exploration:
            action = q_table[state].argmax()
        else:
            action = streets.action_space.sample()

        next_state, reward, done, info = streets.step(action)

        # Blend the old estimate with the newly observed target value.
        old_estimate = q_table[state, action]
        target = reward + discount_factor * q_table[next_state].max()
        q_table[state, action] = (
            (1 - learning_rate) * old_estimate + learning_rate * target
        )

        state = next_state

In [42]:
# Inspect the learned Q-values for the initial state (one value per action);
# as the last expression of the cell, the row is echoed as the cell output.
q_table[initial_state]

array([-2.42545836, -2.40357978, -2.4164343 , -2.3639511 , -6.86630598,
       -7.2792723 ])

In [None]:
from IPython.display import clear_output
from time import sleep

# Evaluation settings, kept in one place so the loop and the metrics below
# can never disagree about how many trips were run.
num_trips = 10      # number of simulated trips
max_steps = 25      # give up on a trip after this many steps
pause_seconds = 2   # pause after each trip so its last frame stays visible

total_steps = 0
successful_trips = 0

# Test the trained model: always take the highest-valued action
# (no exploration).
for tripnum in range(1, num_trips + 1):
    state = streets.reset()
    done = False
    trip_length = 0

    while not done and trip_length < max_steps:
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)
        clear_output(wait=True)
        print(f"Trip number {tripnum} Step {trip_length}")
        # render() draws the frame itself and returns None; wrapping it in
        # print() appended a stray "None" line after every frame.
        streets.render()
        state = next_state
        trip_length += 1

    # A trip that hit the step cap still contributes its (capped) length.
    total_steps += trip_length
    if done:
        successful_trips += 1

    sleep(pause_seconds)

# Performance metrics
average_steps = total_steps / num_trips
success_rate = (successful_trips / num_trips) * 100
print(f"Average steps per trip: {average_steps:.1f}")
print(f"Success rate: {success_rate:.0f}%")

Trip number 3 Step 13
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
None
