In [2]:
import gym
import numpy as np
import pandas as pd 
from random import random
import time
from tqdm import tqdm

env = gym.make("Taxi-v3")
env.reset()
env.render()

# Info actions
# Down = 1
# Right = 2
# Up = 3
# Left = 4

actions = env.action_space.n
print(f"Total possible actions: {actions}")

states = env.observation_space.n
print(f"Total states: {states}")


+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[43m [0m: |B: |
+---------+

Total possible actions: 6
Total states: 500


In [3]:
# Create Q table of rewards defined to 0, for each actions on each step
q_table = np.zeros((states, actions))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [4]:
def play_to_taxi(environment, q_table, epsilon=1.0, epsilon_min=0.01, epsilon_max=1.0, learning_rate=0.02, reward_discount_rate=0.618, decay_rate = 0.01, episodes=100000, training=True, reset=False):
    t = time.process_time()
    training_time = 0
    won_episode = 0
    total_rewards = 0
    total_steps = 0
    avg_rewards = 0
    avg_steps = []

    if reset == True:
        q_table = np.zeros((states, actions))

    done = False
    
    for episode in tqdm(range(episodes)):
        environment.reset()
        state = 0
        total_reward = 0
        steps = 0

        while not done:
            # epsilon-greedy
            if training == True and random() < epsilon: # Exploration
                action = environment.action_space.sample() # get random action
                
            else: # Exploitation
                possibilities = q_table[state,:] # possibilities from current state
                action = np.argmax(possibilities) # get the best direction depending on the reward value
            
            # Move to direction
            next_state, reward, done, info = environment.step(action)

            if training == True:
                # Update Q table with value function
                # V(s) = V(s) + (lr x (V(s') - V(s)))
                # state_value = state_value + alpha x (reward + gamma x next_state_value - state_value)
                q_table[state, action] = q_table[state, action] + learning_rate * (reward + reward_discount_rate * np.max(q_table[next_state, :]) - q_table[state, action])

                # epsilon decay to maintain trade-off between exploration-exploitation
                epsilon = epsilon_min + (epsilon_max - epsilon_min) * np.exp(-decay_rate * episode)
                
            state = next_state
            
            # Update statistics
            steps += 1
            total_reward += reward

            if state == 15:
                won_episode += 1 


        # game is over    
        # avg_rewards += total_reward
        
        avg_steps.append(steps)

        total_rewards += total_reward
        total_steps += steps
             
        # if training == True:
        #     if episode % 10000 == 0:
        #         avg_rewards /= 10000
        #         avg_steps /= 10000
        #         print(f"Episode {episode} : average steps = {avg_steps}, reward = {avg_rewards}")
        #         avg_rewards = 0
        #         # avg_steps = 0
                
        #     if episode % 100000 == 0:
        #         learning_rate = learning_rate * 1.1


    print(f'Total episodes : {episodes}')
    print(f'Total rewards : {total_rewards}')
    print(f'Total steps : {total_steps}')
    
    avg_rewards = total_rewards / episodes
    avg_steps = sum(avg_steps) / len(avg_steps)

    training_time = time.process_time() - t
    
    print(f'Average rewards : {avg_rewards}')
    print(f'Average steps : {avg_steps}')
    print(f'Won episode : {won_episode} ({won_episode / episodes * 100}%)')
    if training == True:
        print(f'Training time : {training_time} seconds')


In [5]:
nb_episodes = 50000 # number of games to be played
epsilon_rate = 1.0 # exploration vs exploitation
learning_rate = 0.7 # 
epsilon_max = 1.0            # Exploration probability at the start
epsilon_min = 0.01           # Minimum exploration probability
reward_discount_rate = 0.618                # Discounting rate for rewards
decay_rate = 0.01 

print('---------------------- TRAINING ----------------------')

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

print('')
print('---------------------- TESTING ----------------------')
play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)


---------------------- TRAINING ----------------------


100%|██████████| 50000/50000 [00:00<00:00, 116822.96it/s]


Total episodes : 50000
Total rewards : -758
Total steps : 200
Average rewards : -0.01516
Average steps : 0.004
Won episode : 0 (0.0%)
Training time : 0.421875 seconds

---------------------- TESTING ----------------------


100%|██████████| 50000/50000 [00:00<00:00, 109649.27it/s]

Total episodes : 50000
Total rewards : -200
Total steps : 200
Average rewards : -0.004
Average steps : 0.004
Won episode : 0 (0.0%)



