In [26]:
import gym
import numpy as np
import pandas as pd 
import time, pickle, os, csv, random

# Build the Taxi environment that every later cell shares.
# NOTE(review): the "Taxi-v2" id is presumably only registered in older gym releases — confirm against the installed version.
env = gym.make("Taxi-v2")

def open_dataset(file_name):
    '''
    Read a comma-separated, UTF-8 encoded file into a DataFrame.

    The first line of the file is used as the column header.
    '''
    return pd.read_csv(file_name, sep=",", header=0, encoding="utf-8")

# Put the environment into a new, random state and draw it.
env.reset()
env.render()

# Number of discrete actions and of discrete states exposed by the environment.
actions, states = env.action_space.n, env.observation_space.n

print(f"Action Space {actions}")
print(f"State Space {states}")

+---------+
|R: | : :[34;1mG[0m|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

Action Space 6
State Space 500


In [27]:
# Inspect the transition table of state 328.
# Layout: {action: [(probability, nextstate, reward, done)]}

env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [28]:
# Create the Q-table, initialised to 0, with one row per state and one column per action
q_table = np.zeros((states, actions))
print(q_table)

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]


In [29]:
def eps_greedy(q_table, s, epsilon=0.1, training=False):
    '''
    Epsilon-greedy action selection.

    While training, a random action index is returned with probability
    `epsilon`; in every other case the greedy action for state `s` is
    returned.
    '''
    explore = training == True and np.random.uniform(0, 1) < epsilon
    if not explore:
        # Exploit: defer to the greedy policy
        return greedy(q_table, s)

    # Explore: any column index of the Q-table
    n_actions = q_table.shape[1]
    return np.random.randint(n_actions)


def greedy(q_table, s):
    '''
    Greedy policy
    return the index of the largest action value stored for state `s`
    '''
    action_values = q_table[s]
    return np.argmax(action_values)

In [30]:
def take_taxi_with_sarsa(environment, q_table, epsilon=1.0, epsilon_min=0.01, epsilon_max=1.0, learning_rate=0.02, reward_discount_rate=0.618, decay_rate = 0.01, episodes=100000, max_steps=99, training=True, reset=False, show_print=True):
    '''
    Run `episodes` episodes on `environment` with an epsilon-greedy policy,
    applying the SARSA update to `q_table` when `training` is true.

    Parameters
    ----------
    environment : environment exposing `reset()` (returning the initial state),
        `step(action)` (returning `(next_state, reward, done, info)`) and, when
        `reset` is used, `observation_space.n` / `action_space.n`.
    q_table : array indexed as `q_table[state, action]`. Unless `reset` is
        true it is updated in place while training.
    epsilon : exploration probability used during the first episode.
    epsilon_min, epsilon_max, decay_rate : after each episode epsilon becomes
        `epsilon_min + (epsilon_max - epsilon_min) * exp(-decay_rate * episode)`.
    learning_rate : step size of the SARSA update.
    reward_discount_rate : discount applied to the next state-action value.
    episodes : number of episodes to run.
    max_steps : maximum number of steps per episode.
    training : when true, explore with probability epsilon and update the
        table; otherwise act greedily and leave the table untouched.
    reset : when true (and `episodes > 0`), start from a fresh all-zero table
        sized from the environment instead of the one passed in.
    show_print : when true, print a banner before and a summary after the run.

    Returns
    -------
    dict with keys `q_table`, `avg_rewards` (mean total reward per episode,
    episodes that hit `max_steps` included), `avg_steps`, `won_rate`
    (percentage of episodes that ended with `done`) and `duration`
    (process time in seconds).
    '''
    started_at = time.process_time()
    won_episode = 0
    total_steps = []
    total_rewards = []

    if episodes > 0:
        if reset == True:
            # Size the new table from the environment rather than from notebook globals
            q_table = np.zeros((environment.observation_space.n, environment.action_space.n))

        if show_print:
            print('')
            if training:
                print('---------------------- TRAINING ----------------------')
            else:
                print('---------------------- TESTING ----------------------')

    for episode in range(episodes):
        # The state returned by reset() is the real starting state of the episode
        state = environment.reset()
        action = eps_greedy(q_table, state, epsilon, training)
        episode_reward = 0
        episode_steps = 0

        for step in range(max_steps):
            next_state, reward, done, _ = environment.step(action)
            next_action = eps_greedy(q_table, next_state, epsilon, training)

            if training == True:
                # SARSA: Q(s,a) <- Q(s,a) + lr * (r + gamma * Q(s',a') - Q(s,a))
                # A terminal next state contributes no future value.
                next_value = 0.0 if done else q_table[next_state, next_action]
                td_target = reward + reward_discount_rate * next_value
                q_table[state, action] += learning_rate * (td_target - q_table[state, action])

            # On-policy: the action evaluated in the update is the one executed next
            state, action = next_state, next_action
            episode_steps = step + 1
            episode_reward += reward

            if done:
                won_episode += 1
                break

        # Record every episode, finished or not, so the averages are not biased
        total_rewards.append(episode_reward)
        total_steps.append(episode_steps)

        # epsilon decay to maintain trade-off between exploration-exploitation
        epsilon = epsilon_min + (epsilon_max - epsilon_min) * np.exp(-decay_rate * episode)

    # Statistics are computed once, after the last episode
    total_time = time.process_time() - started_at
    avg_rewards = round(sum(total_rewards) / len(total_rewards), 2) if total_rewards else 0
    avg_steps = round(sum(total_steps) / len(total_steps), 2) if total_steps else 0
    won_rate = round(won_episode / episodes * 100, 2) if episodes > 0 else 0
    duration = round(total_time, 2) if total_time > 0 else 0

    if show_print and episodes > 0:
        print(f'Total episodes : {episodes}')
        print(f'Average rewards : {avg_rewards}')
        print(f'Average steps : {avg_steps}')
        print(f'Won episode : {won_episode} ({won_rate}%)')

    return {
        'q_table': q_table,
        'avg_rewards': avg_rewards,
        'avg_steps': avg_steps,
        'won_rate': won_rate,
        'duration': duration,
    }

In [None]:
epsilon_rate = 1.0              # Exploration vs exploitation
learning_rate = 0.7             # Learning rate
epsilon_max = 1.0               # Exploration probability at the start
epsilon_min = 0.01              # Minimum exploration probability
reward_discount_rate = 0.618    # Discounting rate for rewards
decay_rate = 0.01               # Exponential decay rate of the exploration probability

# Hyper-parameter grid. NOTE: the loops below reuse the names learning_rate,
# reward_discount_rate and decay_rate, overwriting the defaults set above.
nb_episodes_list = list(range(10000, 50001, 10000))
learning_rate_list = [0.5, 0.6, 0.7, 0.8, 0.9]
reward_discount_rate_list = [0.9,  0.786, 0.618, 0.5, 0.382, 0.2, 0.1]
decay_rate_list = [0.01, 0.03, 0.05, 0.07, 0.1]

# Placeholders for the best configuration; this cell never updates them.
best_avg_rewards = 0
best_nb_episodes = 0
best_learning_rate = 0
best_reward_discount_rate = 0
best_decay_rate = 0
start = time.time()

results_path = 'SARSA.csv'

# Write the header only when the file is new or empty: appending it on every
# run would put header rows in the middle of the data and break parsing.
write_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0
with open(results_path, 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    if write_header:
        writer.writerow(["nb_episodes", "epsilon_rate", "epsilon_min", "epsilon_max", "learning_rate", "reward_discount_rate", "decay_rate", 'avg_rewards', 'avg_steps', "duration"])

for nb_episodes in nb_episodes_list:
    for learning_rate in learning_rate_list:
        for reward_discount_rate in reward_discount_rate_list:
            for decay_rate in decay_rate_list:
                # Arguments shared by the training run and the evaluation run
                run_kwargs = dict(
                    environment=env,
                    epsilon=epsilon_rate,
                    epsilon_min=epsilon_min,
                    epsilon_max=epsilon_max,
                    learning_rate=learning_rate,
                    episodes=nb_episodes,
                    reward_discount_rate=reward_discount_rate,
                    decay_rate=decay_rate,
                    show_print=False,
                )

                # Train from a fresh table for this configuration
                training_result = take_taxi_with_sarsa(q_table=q_table, training=True, reset=True, **run_kwargs)

                # Evaluate the learned table without exploring or updating it
                test_result = take_taxi_with_sarsa(q_table=training_result['q_table'], training=False, reset=False, **run_kwargs)

                # Append one row per configuration so finished results survive an interruption.
                # The last column is the wall-clock time elapsed since `start`.
                with open(results_path, 'a', encoding='UTF8', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow([nb_episodes, epsilon_rate, epsilon_min, epsilon_max, learning_rate, reward_discount_rate, decay_rate, test_result['avg_rewards'], test_result['avg_steps'], time.time() - start])
                

In [None]:
# Load the grid-search results and rank the rows by average reward, highest first
sarsa_data = (
    open_dataset('SARSA.csv')
    .sort_values(by='avg_rewards', ascending=False)
    .reset_index(drop=True)
)
sarsa_data.head(5)