# What is reinforcement learning?
Reinforcement learning is a way of training a model (the *agent*) by letting it interact with an environment. The agent is rewarded for good actions and penalised for bad ones. Over time, it learns which actions earn the most reward.

# What is Q-Learning?
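Q-Learning is a type of reinforcement learning. It uses a table, called a Q-table, to remember how much reward each action is expected to give in each situation (state). The agent updates this table as it learns, so that it can choose the best action later. After taking action $a$ in state $s$, receiving reward $r$ and arriving in state $s'$, the entry is updated as $Q(s, a) \leftarrow (1 - \alpha)\,Q(s, a) + \alpha\,\bigl(r + \gamma \max_{a'} Q(s', a')\bigr)$, where $\alpha$ is the learning rate and $\gamma$ is the discount factor.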

In [None]:
# Import libraries
import gymnasium as gym

# Create environment.
# The render mode has to be chosen when the environment is built; without it
# env.render() returns None. "ansi" makes render() return the board as text.
# `.env` drops the outermost wrapper added by gym.make (presumably the
# time-limit wrapper, so episodes are never truncated -- TODO confirm).
env = gym.make("Taxi-v3", render_mode="ansi").env

# Reset the environment before doing anything
state, info = env.reset()

# Show the initial board (render() returns a string in "ansi" mode)
print(env.render())

In [None]:
# Start a fresh episode from a new, random state
env.reset()
env.render()

# Report the size of the action and observation (state) spaces
print(f"Action Space {env.action_space}")
print(f"State Space {env.observation_space}")

Action Space Discrete(6)
State Space Discrete(500)


In [None]:
# Build one specific state by hand and load it into the environment
state = env.unwrapped.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

# `env` is a wrapper around the Taxi environment, so the state has to be
# written on the underlying (unwrapped) env. A plain `env.s = state` would
# only create a new attribute on the wrapper and leave the simulation as is.
env.unwrapped.s = state
env.render()

State: 328


In [None]:
# Look at the transition table for state 328: a dict keyed by action (0-5),
# each value a list of (probability, next_state, reward, done) tuples
# (tuple field meanings per the Gymnasium Taxi docs -- confirm for your version)
env.unwrapped.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [None]:
# Run a purely random agent from a known state and record every step.
#
# The state is written on the underlying (unwrapped) env: `env` is a wrapper,
# so `env.s = 328` would only set an attribute on the wrapper and the rollout
# would silently start from whatever state the last reset() produced.
env.unwrapped.s = 328  # set environment to specific state

epochs = 0
penalties, reward = 0, 0

frames = []  # for animation

terminated = truncated = False

while not (terminated or truncated):
    # No learning here: every action is sampled uniformly at random
    action = env.action_space.sample()
    state, reward, terminated, truncated, info = env.step(action)

    # A reward of -10 marks an illegal pick-up / drop-off attempt
    if reward == -10:
        penalties += 1

    # Keep a snapshot of this step so the episode can be replayed later
    frames.append({
        'frame': env.render(),
        'state': state,
        'action': action,
        'reward': reward
    })

    epochs += 1

print("Timesteps taken:", epochs)
print("Penalties incurred:", penalties)

Timesteps taken: 2970
Penalties incurred: 976


In [None]:
# Animation
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=0.1):
    """Replay recorded steps one at a time in the cell output.

    Args:
        frames: iterable of dicts with the keys 'frame', 'state', 'action'
            and 'reward'; each dict describes one environment step.
        delay: seconds to pause after each frame. Defaults to 0.1, the
            original fixed speed; pass a smaller value (or 0) to get
            through long episodes without interrupting the kernel.
    """
    for timestep, frame in enumerate(frames, start=1):
        # wait=True delays clearing until new output arrives, avoiding flicker
        clear_output(wait=True)
        # The frame is printed as-is (it is already a plain value, not a buffer)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

# Replay every step stored in `frames` (interrupt the kernel to stop early)
print_frames(frames)

None
Timestep: 1466
State: 258
Action: 2
Reward: -1


KeyboardInterrupt: 

In [None]:
# Q-table: one row per state, one column per action, all values start at zero
import numpy as np
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

In [None]:
# Q-Learning Training Loop for Taxi-v3 Environment
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state, info = env.reset()  # unpack reset
    epochs, penalties, reward = 0, 0, 0
    terminated = truncated = False
    
    while not (terminated or truncated):
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # explore
        else:
            action = np.argmax(q_table[state])  # exploit

        next_state, reward, terminated, truncated, info = env.step(action)  # unpack step
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 17.2 s, sys: 2.58 s, total: 19.8 s
Wall time: 19.6 s


In [None]:
# Learned action values for state 328 (one entry per action)
q_table[328]

array([ -2.40631996,  -2.27325184,  -2.41279344,  -2.35857342,
       -10.51970824, -10.99770375])

In [None]:
# Evaluate the trained Q-learning agent with a purely greedy policy
"""Evaluate agent's performance after Q-learning"""

episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(episodes):
    state, info = env.reset()  # fresh random start for every episode
    epochs = penalties = reward = 0

    terminated = truncated = False

    while not terminated and not truncated:
        # Always take the action with the highest learned value (no exploration)
        action = np.argmax(q_table[state])
        next_state, reward, terminated, truncated, info = env.step(action)

        # A reward of -10 counts as one penalty
        penalties += int(reward == -10)

        epochs += 1
        state = next_state

    total_epochs += epochs
    total_penalties += penalties

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 100 episodes:
Average timesteps per episode: 12.58
Average penalties per episode: 0.0
