In [1]:
import gym
import numpy as np
import pandas as pd 
from random import random
import time
import csv

env = gym.make("Taxi-v3")
env.reset()
env.render()

actions = env.action_space.n
print(f"Total possible actions: {actions}")

states = env.observation_space.n
print(f"Total states: {states}")


+---------+
|[43mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+

Total possible actions: 6
Total states: 500


In [2]:
# Create Q table of rewards defined to 0, for each actions on each step
q_table = np.zeros((states, actions))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [3]:
def play_to_taxi(environment, q_table, epsilon=1.0, epsilon_min=0.01, epsilon_max=1.0, learning_rate=0.02, reward_discount_rate=0.618, decay_rate = 0.01, episodes=100000, max_steps=99, training=True, reset=False, show_print=True):
    t = time.process_time()
    training_time = 0
    won_episode = 0
    # total_rewards = 0
    # total_steps = 0
    # avg_rewards = 0
    total_steps = []
    total_rewards = []

    if show_print:
        if training:
            print('')
            print('---------------------- TRAINING ----------------------')
        else:
            print('')
            print('---------------------- TESTING ----------------------')
    

    if reset == True:
        q_table = np.zeros((states, actions))

    done = False
    
    for episode in range(episodes):
        environment.reset()
        state = 0
        total_episode_rewards = 0
        total_episode_steps = 0
        done = False
        
        for step in range(max_steps):
            # epsilon-greedy
            if training == True and random() < epsilon: # Exploration
                action = environment.action_space.sample() # get random action
                
            else: # Exploitation
                possibilities = q_table[state,:] # possibilities from current state
                action = np.argmax(possibilities) # get the best direction depending on the reward value
            
            # Move to direction
            next_state, reward, done, info = environment.step(action)

            if training == True:
                # Update Q table with value function
                # V(s) = V(s) + (lr x (V(s') - V(s)))
                # state_value = state_value + alpha x (reward + gamma x next_state_value - state_value)
                q_table[state, action] = q_table[state, action] + learning_rate * (reward + reward_discount_rate * np.max(q_table[next_state, :]) - q_table[state, action])


            state = next_state
            total_episode_steps = step + 1
            
            # Update statistics
            total_episode_rewards += reward


            if done:
                total_rewards.append(total_episode_rewards)
                won_episode += 1
                # print(f"Score: {total_episode_rewards}")
                break


        # game is ended  
        # total_rewards += total_episode_rewards
        # total_steps += total_episode_steps
        total_steps.append(total_episode_steps)

         # epsilon decay to maintain trade-off between exploration-exploitation
        epsilon = epsilon_min + (epsilon_max - epsilon_min) * np.exp(-decay_rate * episode)
                
    
    # print(f'Total rewards : {total_rewards}')
    # print(f'Total steps : {total_steps}')
    
    avg_rewards = round(sum(total_rewards) / len(total_rewards), 2)
    avg_steps = round(sum(total_steps) / len(total_steps), 2)
    won_rate = round(won_episode / episodes * 100, 2)
    training_time = round(time.process_time() - t, 2)

    if show_print:
        print(f'Total episodes : {episodes}')
        print(f'Average rewards : {avg_rewards}')
        print(f'Average steps : {avg_steps}')
        print(f'Won episode : {won_episode} ({won_rate}%)')

        if training == True:
            print(f'Training time : {training_time} seconds')

    return {
        'avg_rewards': avg_rewards,
        'avg_steps': avg_steps,
    }


In [4]:
nb_episodes = 10000             # Number of games to be played
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # Learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01 

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)



---------------------- TRAINING ----------------------
Total episodes : 10000
Average rewards : 5.39
Average steps : 16.35
Won episode : 9826 (98.26%)
Training time : 3.86 seconds

---------------------- TESTING ----------------------
Total episodes : 10000
Average rewards : 7.03
Average steps : 13.97
Won episode : 10000 (100.0%)


100.0

In [6]:
nb_episodes = 50000             # Number of games to be played
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01 

print('---------------------- TRAINING ----------------------')

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

print('')
print('---------------------- TESTING ----------------------')
play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)


---------------------- TRAINING ----------------------
Total episodes : 50000
Average rewards : 6.23
Average steps : 14.29
Won episode : 49981 (99.96%)
Training time : 16.78 seconds

---------------------- TESTING ----------------------
Total episodes : 50000
Average rewards : 7.04
Average steps : 13.96
Won episode : 50000 (100.0%)


In [7]:
nb_episodes = 30000             # Number of games to be played
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # Learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01 

print('---------------------- TRAINING ----------------------')

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

print('')
print('---------------------- TESTING ----------------------')
play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)


---------------------- TRAINING ----------------------
Total episodes : 30000
Average rewards : 6.06
Average steps : 14.35
Won episode : 29982 (99.94%)
Training time : 10.08 seconds

---------------------- TESTING ----------------------
Total episodes : 30000
Average rewards : 6.84
Average steps : 14.16
Won episode : 30000 (100.0%)


In [6]:
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # Learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01 

nb_episodes_list = [i for i in range(20001, 70001) if (i)/10000 % 1 == 0]
learning_rate_list = [0.5, 0.6, 0.7, 0.8, 0.9]
reward_discount_rate_list = [0.9,  0.786, 0.618, 0.5, 0.382, 0.2, 0.1]
decay_rate_list = [0.01, 0.03, 0.05, 0.07, 0.1]

best_avg_rewards = 0
best_nb_episodes = 0
best_learning_rate = 0
best_reward_discount_rate = 0
best_decay_rate = 0

for nb_episodes in nb_episodes_list:
    for learning_rate in learning_rate_list:
        for reward_discount_rate in reward_discount_rate_list:
            for decay_rate in decay_rate_list:
                # print('')
                # print(nb_episodes)
                training_result = play_to_taxi(environment=env, 
                        q_table=q_table, 
                        epsilon=epsilon_rate, 
                        epsilon_min=epsilon_min,
                        epsilon_max=epsilon_max,
                        learning_rate=learning_rate, 
                        episodes=nb_episodes,
                        reward_discount_rate=reward_discount_rate, 
                        decay_rate=decay_rate,
                        training=True,
                        show_print=False
                        )

                test_result = play_to_taxi(environment=env, 
                        q_table=q_table, 
                        epsilon=epsilon_rate, 
                        epsilon_min=epsilon_min,
                        epsilon_max=epsilon_max,
                        learning_rate=learning_rate, 
                        episodes=nb_episodes,
                        reward_discount_rate=reward_discount_rate, 
                        decay_rate=decay_rate,
                        training=False,
                        show_print=False
                        )

                with open('q-learning.csv', 'a', encoding='UTF8', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow([nb_episodes, epsilon_rate, epsilon_min, epsilon_max, learning_rate, reward_discount_rate, decay_rate, test_result['avg_rewards'], test_result['avg_steps']])
                
                if test_result['avg_rewards'] > best_avg_rewards:
                    best_avg_rewards = test_result['avg_rewards']
                    best_nb_episodes = nb_episodes
                    best_learning_rate = learning_rate
                    best_reward_discount_rate = reward_discount_rate
                    best_decay_rate = decay_rate
    

In [7]:
print(nb_episodes_list)

[30000, 40000, 50000, 60000, 70000]


In [5]:
print(best_nb_episodes)
print(best_avg_rewards)
print(best_learning_rate)
print(best_reward_discount_rate)
print(best_decay_rate)

10000
11.23
0.6
0.1
0.03


In [7]:
nb_episodes = 10000             # Number of games to be played
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.6             # Learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.1      # Discounting rate for rewards
decay_rate = 0.03 

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=True)

play_to_taxi(environment=env, 
             q_table=q_table, 
             epsilon=epsilon_rate, 
             epsilon_min=epsilon_min,
             epsilon_max=epsilon_max,
             learning_rate=learning_rate, 
             episodes=nb_episodes,
             reward_discount_rate=reward_discount_rate, 
             decay_rate=decay_rate,
             training=False)




---------------------- TRAINING ----------------------
Total episodes : 10000
Average rewards : 7.76
Average steps : 94.0
Won episode : 576 (5.76%)
Training time : 20.64 seconds

---------------------- TESTING ----------------------
Total episodes : 10000
Average rewards : 8.73
Average steps : 92.0
Won episode : 807 (8.07%)


{'avg_rewards': 8.73, 'avg_steps': 92.0}

In [10]:

header = ['nb_episodes', 'epsilon_rate', 'epsilon_min', 'epsilon_max', 'learning_rate', 'reward_discount_rate', 'decay_rate', 'avg_rewards', 'avg_steps']
data = ['Afghanistan', 652090, 'AF', 'AFG']


with open('q-learning.csv', 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    # writer.writerow(header)
    writer.writerow(data)