In [1]:
import gymnasium as gym
import numpy as np
import random 

In [2]:
# Create the environment
# env = gym.make('Taxi-v3', render_mode="human")
env = gym.make('Taxi-v3', render_mode="rgb_array")
# env = gym.make("LunarLander-v2", render_mode="human")

In [3]:
# Initialize the Q-table to a zero matrix of size (state_space x action_space)
ns = env.observation_space.n
na = env.action_space.n
q_table = np.zeros([ns, na], dtype=int)
policy = np.full([ns, na], fill_value = 1 / na)

In [4]:
q_table

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])

In [5]:
# Parameters for Q-learning
num_episodes = 10000
max_steps_per_episode = 2000
learning_rate = 0.2
discount_rate = 0.99

# Exploration parameters
exploration_rate = 0.6

In [6]:
env.metadata["render_fps"]
env.reset()

(314, {'prob': 1.0, 'action_mask': array([1, 1, 0, 0, 0, 0], dtype=int8)})

In [7]:
# Training
import time

# The Q-learning algorithm
rewards_all_episodes = []

for episode in range(num_episodes):
    state, prob = env.reset()
    done = False
    rewards_current_episode = 0
    for step in range(max_steps_per_episode):
        
        # Exploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0, 1)

        if exploration_rate_threshold > exploration_rate:
            # Choose best Q
            action = np.argmax(q_table[state,:])
        else:
            # Choose random action
            action = env.action_space.sample()
            
        new_state, reward, terminated, info, done = env.step(action)
            
        # Update Q-table for Q(s,a)        
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        state = new_state
        rewards_current_episode += reward
        if terminated == True:
            break
        
    # Print average reward every 1000 episodes    
    if (episode + 1) % 1000 == 0:
        print(f'Episode: {episode + 1}')
        print(q_table)


Episode: 1000
[[ 0  0  0  0  0  0]
 [ 0  0  0  0  0 -4]
 [ 0  0  0  0  0  0]
 ...
 [ 0  0  0  0 -6 -6]
 [ 0  0  0  0 -6 -6]
 [ 0  0  0 10 -5 -3]]
Episode: 2000
[[ 0  0  0  0  0  0]
 [ 0  0  0  0  0 -6]
 [ 0  0  0  0  0 -6]
 ...
 [ 0  0  0  0 -6 -6]
 [ 0  0  0  0 -6 -6]
 [ 1  0  1 10 -3 -3]]
Episode: 3000
[[ 0  0  0  0  0  0]
 [ 0  0  0  0  0 -6]
 [ 0  0  0  0  0 -6]
 ...
 [ 0  0  0  0 -6 -6]
 [ 0  0  0  0 -6 -6]
 [ 2  0  3 10 -1 -3]]
Episode: 4000
[[ 0  0  0  0  0  0]
 [ 0  0  0  0  0 -6]
 [ 0  0  0  0  0 -6]
 ...
 [ 0  0  0  0 -6 -6]
 [ 0  0  0  0 -6 -6]
 [ 4  0  4 10  0 -1]]
Episode: 5000
[[ 0  0  0  0  0  0]
 [ 0  0  0  0  0 -6]
 [ 0  0  0  0  0 -6]
 ...
 [ 0  0  0  0 -6 -6]
 [ 0  0  0  0 -6 -6]
 [ 4  0  4 10  0  0]]
Episode: 6000
[[ 0  0  0  0  0  0]
 [ 0  0  0  0  0 -6]
 [ 0  0  0  0  0 -6]
 ...
 [ 0  0  0  0 -6 -6]
 [ 0  0  0  0 -6 -6]
 [ 4  0  4 10  0  0]]
Episode: 7000
[[ 0  0  0  0  0  0]
 [ 0  0  0  0  0 -6]
 [ 0  0  0  0  0 -6]
 ...
 [ 0  0  0  0 -6 -6]
 [ 0  0  0  0 -6 -6]


In [11]:
q_table

array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, -6],
       [ 0,  0,  0,  0,  0, -6],
       ...,
       [ 0,  0,  0,  0, -6, -6],
       [ 0,  0,  0,  0, -6, -6],
       [ 4,  0,  4, 10,  0,  0]])

In [12]:
# Play Time
import time

env = gym.make('Taxi-v3', render_mode="human")
state, prob = env.reset()
done = False
for step in range(100):
    
    action = np.argmax(q_table[state,:])    
    new_state, reward, terminated, info, done = env.step(action)
    state = new_state
    
    if terminated == True:
        break

In [13]:
env.close()