In [1]:
import gym
import numpy as np
import pandas as pd 
from random import random
import time

env = gym.make("Taxi-v3")
env.reset()
env.render()

# Info actions
# Down = 1
# Right = 2
# Up = 3
# Left = 4

actions = env.action_space.n
print(f"Total possible actions: {actions}")

states = env.observation_space.n
print(f"Total states: {states}")


+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|Y| : |B: |
+---------+

Total possible actions: 6
Total states: 500


In [2]:
# Create Q table of rewards defined to 0, for each actions on each step
q_table = np.zeros((states, actions))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [3]:
def play_to_taxi(environment, q_table, epsilon=1.0, epsilon_min=0.01, epsilon_max=1.0, learning_rate=0.02, reward_discount_rate=0.618, decay_rate = 0.01, episodes=100000, max_steps=99, training=True, reset=False):
    t = time.process_time()
    training_time = 0
    won_episode = 0
    # total_rewards = 0
    # total_steps = 0
    # avg_rewards = 0
    total_steps = []
    total_rewards = []
    

    if reset == True:
        q_table = np.zeros((states, actions))

    done = False
    
    for episode in range(episodes):
        environment.reset()
        state = 0
        total_episode_rewards = 0
        total_episode_steps = 0
        done = False
        
        for step in range(max_steps):
            # epsilon-greedy
            if training == True and random() < epsilon: # Exploration
                action = environment.action_space.sample() # get random action
                
            else: # Exploitation
                possibilities = q_table[state,:] # possibilities from current state
                action = np.argmax(possibilities) # get the best direction depending on the reward value
            
            # Move to direction
            next_state, reward, done, info = environment.step(action)

            if training == True:
                # Update Q table with value function
                # V(s) = V(s) + (lr x (V(s') - V(s)))
                # state_value = state_value + alpha x (reward + gamma x next_state_value - state_value)
                q_table[state, action] = q_table[state, action] + learning_rate * (reward + reward_discount_rate * np.max(q_table[next_state, :]) - q_table[state, action])


            state = next_state
            total_episode_steps = step + 1
            
            # Update statistics
            total_episode_rewards += reward


            if done:
                total_rewards.append(total_episode_rewards)
                won_episode += 1
                # print(f"Score: {total_episode_rewards}")
                break


        # game is ended  
        # total_rewards += total_episode_rewards
        # total_steps += total_episode_steps
        total_steps.append(total_episode_steps)

         # epsilon decay to maintain trade-off between exploration-exploitation
        epsilon = epsilon_min + (epsilon_max - epsilon_min) * np.exp(-decay_rate * episode)
                
    
    print(f'Total episodes : {episodes}')
    # print(f'Total rewards : {total_rewards}')
    # print(f'Total steps : {total_steps}')
    
    avg_rewards = round(sum(total_rewards) / len(total_rewards), 2)
    avg_steps = round(sum(total_steps) / len(total_steps), 2)

    training_time = round(time.process_time() - t, 2)
    
    print(f'Average rewards : {avg_rewards}')
    print(f'Average steps : {avg_steps}')
    print(f'Won episode : {won_episode} ({round(won_episode / episodes * 100, 2)}%)')

    if training == True:
        print(f'Training time : {training_time} seconds')


In [5]:
nb_episodes = 10000             # Number of games to be played
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # Learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01 

print('---------------------- TRAINING ----------------------')

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

print('')
print('---------------------- TESTING ----------------------')
play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)


---------------------- TRAINING ----------------------
Total episodes : 10000
Average rewards : 5.41
Average steps : 14.66
Won episode : 9979 (99.79%)
Training time : 3.52 seconds

---------------------- TESTING ----------------------
Total episodes : 10000
Average rewards : 6.86
Average steps : 14.14
Won episode : 10000 (100.0%)


In [6]:
nb_episodes = 50000             # Number of games to be played
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01 

print('---------------------- TRAINING ----------------------')

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

print('')
print('---------------------- TESTING ----------------------')
play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)


---------------------- TRAINING ----------------------
Total episodes : 50000
Average rewards : 6.23
Average steps : 14.29
Won episode : 49981 (99.96%)
Training time : 16.78 seconds

---------------------- TESTING ----------------------
Total episodes : 50000
Average rewards : 7.04
Average steps : 13.96
Won episode : 50000 (100.0%)


In [6]:
nb_episodes = 30000             # Number of games to be played
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # Learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01 

print('---------------------- TRAINING ----------------------')

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

print('')
print('---------------------- TESTING ----------------------')
play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)


---------------------- TRAINING ----------------------
Total episodes : 30000
Average rewards : 6.082960837947828
Average steps : 14.3574
Won episode : 29978 (99.92666666666666%)
Training time : 10.015625 seconds

---------------------- TESTING ----------------------
Total episodes : 30000
Average rewards : 7.034666666666666
Average steps : 13.965333333333334
Won episode : 30000 (100.0%)
