In [1]:
# Load OpenAI Gym and other necessary packages
import gym
import random
import numpy as np
import time
import matplotlib.pyplot as plt

In [2]:
# Environment: the "Taxi-v3" task created through OpenAI Gym. `env` is the
# module-level instance that train() and test() below read as a global.
env = gym.make("Taxi-v3")

In [3]:
def train(environment=None, alpha=0.1, gamma=0.9, num_of_episodes=10000,
          num_of_steps=100, max_exploration_rate=1.0, min_exploration_rate=0.01,
          exploration_decay_rate=0.001):
    """Learn a Q-table with epsilon-greedy tabular Q-learning.

    Called with no arguments this behaves like the original notebook cell:
    it trains on the module-level ``env`` with the same hyper-parameters.

    Parameters
    ----------
    environment : optional
        Object exposing ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()``, ``reset()`` and ``step(action)``.
        Defaults to the module-level ``env``.
    alpha : float
        Learning rate of the Q update.
    gamma : float
        Discount factor applied to the best value of the next state.
    num_of_episodes : int
        Number of training episodes.
    num_of_steps : int
        Maximum number of steps per episode.
    max_exploration_rate, min_exploration_rate : float
        Upper and lower bound of the exploration probability (epsilon).
    exploration_decay_rate : float
        Exponential decay constant applied to epsilon once per episode.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(n_states, n_actions)`` holding the
        learned action values.
    """
    task = env if environment is None else environment

    q_table = np.zeros((task.observation_space.n, task.action_space.n),
                       dtype=np.float64)
    exploration_rate = max_exploration_rate

    for episode in range(num_of_episodes):
        state = task.reset()
        # Newer Gym / Gymnasium releases return (observation, info) from
        # reset(); older ones return the bare observation.
        if isinstance(state, tuple):
            state = state[0]

        for _ in range(num_of_steps):
            # Exploration-exploitation trade-off: exploit the table when the
            # random draw exceeds epsilon, otherwise take a random action.
            if random.uniform(0, 1) > exploration_rate:
                action = np.argmax(q_table[state, :])
            else:
                action = task.action_space.sample()

            outcome = task.step(action)
            if len(outcome) == 5:
                # Newer API: (obs, reward, terminated, truncated, info).
                new_state, reward, terminated, truncated, _ = outcome
                done = terminated or truncated
            else:
                # Older API: (obs, reward, done, info).
                new_state, reward, done, _ = outcome

            # Move Q(s, a) a fraction alpha towards the bootstrapped target.
            td_target = reward + gamma * np.max(q_table[new_state, :])
            q_table[state, action] = (
                q_table[state, action]
                + alpha * (td_target - q_table[state, action])
            )
            state = new_state
            if done:
                break

        # Decay epsilon exponentially with the episode index, never going
        # below min_exploration_rate.
        exploration_rate = min_exploration_rate + (
            max_exploration_rate - min_exploration_rate
        ) * np.exp(-exploration_decay_rate * episode)
    return q_table

In [4]:
def test(Q_reward,render):
    state = env.reset()
    total_reward = 0
    total_action = 0

    for t in range(50):
        action = np.argmax(Q_reward[state,:])
        state, reward, done, info = env.step(action)
        total_reward += reward
        total_action +=1
        if render:
            env.render()
            time.sleep(1)
        if done:
            break        
            
    return total_action,total_reward

In [5]:
# Run Q-learning and keep the resulting Q-table (one row per state, one
# column per action) for the evaluation cells below.
q_reward = train()

In [7]:
# Play ten greedy evaluation episodes without rendering; every call to test()
# yields a (steps taken, summed reward) pair.
episode_results = [test(q_reward, render=False) for _ in range(10)]
total_action = [steps for steps, _ in episode_results]
total_reward = [score for _, score in episode_results]

print('Average total reward over 10 episodes:', np.mean(total_reward))
print('Average total actions over 10 episodes:', np.mean(total_action))

Average total reward over 10 episodes: 7.5
Average total actions over 10 episodes: 13.5


In [19]:
# Visualization for one test run: the environment is rendered after every
# step with a one-second pause between frames, and the cell's displayed value
# is the returned (total_action, total_reward) tuple.
test(q_reward,render = True)

+---------+
|R: | : :[35mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[42mB[0m: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | 

(13, 8)