In [1]:
import gym # openAi gym
from gym import envs


In [2]:
env = gym.make('Taxi-v3',mode = 'ansi')
env.reset()
env.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [3]:
env.reset() # reset environment to a new, random state
env.render()

print("Action Space {}".format(env.action_space))
print("State Space {}".format(env.observation_space))

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [4]:
state = env.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

env.s = state
env.render()

State: 328
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [5]:
import numpy as np
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [6]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1
gamma = 0.6
epsilon = 0.1

# For plotting metrics
all_epochs = []
all_penalties = []

for i in range(1, 100001):
    state = env.reset()

    epochs, penalties, reward, = 0, 0, 0
    done = False
    
    while not done:
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = np.argmax(q_table[state]) # Exploit learned values

        next_state, reward, done, info = env.step(action) 
        
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 46.1 s, sys: 11.1 s, total: 57.2 s
Wall time: 48.5 s


In [7]:
q_table[328]

array([ -2.40656558,  -2.27325184,  -2.40552262,  -2.36056127,
       -10.19914187, -10.59652385])

In [24]:
"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
episodes = 100

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print("Results after episodes:",episodes)
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after episodes: 100
Average timesteps per episode: 13.31
Average penalties per episode: 0.0


In [6]:
import gym
import numpy as np

env = gym.make("Taxi-v3")

state_size = env.observation_space.n
action_size = env.action_space.n

# Set hyperparameters
alpha_actor = 0.1  # actor learning rate
alpha_critic = 0.2  # critic learning rate
gamma = 0.99  # discount factor

# Initialize weights for actor and critic
actor_weights = 1e-4 * np.random.rand(state_size, action_size)
critic_weights = 1e-4 * np.random.rand(state_size)

def softmax(z):
    return np.exp(z)/((np.exp(z)).sum())

# Function to compute the policy for the given state using the actor weights
def policy(state, actor_weights):
    return softmax(actor_weights[state])

#Function to sample an action from the policy given a state
def sample_action(state, actor_weights):
    probs = policy(state, actor_weights)
    action = np.random.choice(np.arange(len(probs)), p=probs)
    return action

# Function to update the actor and critic weights
def update_weights(state, action, next_state, reward, done, actor_weights, critic_weights):
    td_error = reward + gamma * critic_weights[next_state] - critic_weights[state]
    actor_weights[state, action] += alpha_actor * td_error
    critic_weights[state] += alpha_critic * td_error
    if done:
        actor_weights[state, action] -= alpha_actor * td_error
        critic_weights[state] -= alpha_critic * td_error
    return actor_weights, critic_weights

# Training loop
def train_actor_critic(episodes):
    all_rewards = []
    global actor_weights, critic_weights
    for i in range(1, episodes+1):
        state, info = env.reset()
        total_reward = 0
        done = False
        while not done:
            action = sample_action(state, actor_weights)
            next_state, reward, done, truncated, info = env.step(action)
            actor_weights, critic_weights = update_weights(state, action, next_state, reward, done, actor_weights, critic_weights)
            state = next_state
            total_reward += reward
        all_rewards.append(total_reward)
        if i % 100 == 0:
            print(f"Episode: {i}")
    print("Training finished.\n")
    return actor_weights

# Evaluation function
def evaluate_actor_critic(actor_weights, num_episodes, max_steps=200):
    total_rewards = []
    total_epochs = []
    for i in range(1, num_episodes+1):
        state, info = env.reset()
        episode_reward = 0
        done = False
        epochs = 0
        while not done and epochs < max_steps:
            action = np.argmax(actor_weights[state])
            next_state, reward, done, truncated, info = env.step(action)
            episode_reward += reward
            state = next_state
            epochs += 1
        total_rewards.append(episode_reward)
        total_epochs.append(epochs)
        if i % 100 == 0:
            print("Episode:", i, "Total Reward:", episode_reward)
    print(f"\nResults of Actor-Critic algorithm after {num_episodes} episodes:")
    print("Average total reward per episode: ", np.mean(total_rewards))
    print("Average timesteps per episode: ", np.mean(total_epochs))
    return total_rewards


In [21]:
env = gym.make("Taxi-v3")

state_size = env.observation_space.n

action_size = env.action_space.n

 

# Set hyperparameters

alpha_actor = 0.1  # alearning rate

alpha_critic = 0.2  # critic learning rate

gamma = 0.99  # discount factor

 

# Initialize weights for actor and critic

actor_weights = 1e-4 * np.random.rand(state_size, action_size)

critic_weights = 1e-4 * np.random.rand(state_size)

 

def softmax(z): return np.exp(z)/((np.exp(z)).sum())

 

# Function to compute the policy for the given state using the actor weights

def policy(state, actor_weights):

    return softmax(actor_weights[state])

 

# Function to sample an action from the policy given a state

def sample_action(state, actor_weights):

    probs = policy(state, actor_weights)

    action = np.random.choice(np.arange(len(probs)), p=probs)

    return action

 

# Function to update the actor and critic weights

def update_weights(state, action, next_state, reward, done, actor_weights, critic_weights):

    td_error = reward + gamma * critic_weights[next_state] - critic_weights[state]

    actor_weights[state, action] += alpha_actor * td_error

    critic_weights[state] += alpha_critic * td_error

   

    if done:

        actor_weights[state, action] -= alpha_actor * td_error

        critic_weights[state] -= alpha_critic * td_error

   

    return actor_weights, critic_weights

 
# Training loop

def train_actor_critic(episodes):

    all_rewards = []

    global actor_weights, critic_weights

    for i in range(1, episodes+1):

        state = env.reset()

        total_reward = 0

        done = False

        while not done:

            action = sample_action(state, actor_weights)

            next_state, reward, done, info = env.step(action)

            actor_weights, critic_weights = update_weights(state, action, next_state, reward, done, actor_weights, critic_weights)

            state = next_state

            total_reward += reward

        all_rewards.append(total_reward)

        if i % 100 == 0:
            print(f"Episode: {i}")

    print("Training finished.\n")

    return actor_weights




def evaluate_actor_critic(actor_weights, num_episodes, max_steps):
    total_rewards = []
    for i in range(1, num_episodes+1):
        state = env.reset()
        episode_reward = 0
        done = False
        steps = 0
        while not done and steps < max_steps:
            action = sample_action(state, actor_weights)
            state, reward, done, _ = env.step(action)
            episode_reward += reward
            steps += 1
        total_rewards.append(episode_reward)
    return total_rewards


In [23]:
# Train the Actor-Critic algorithm

num_episodes = 100000  # Number of episodes for training

actor_weights = train_actor_critic(num_episodes)

# Evaluate the trained Actor-Critic

num_eval_episodes = 100  # Number of episodes for evaluation

max_steps = 200  # Maximum number of steps per episode

total_rewards = evaluate_actor_critic(actor_weights, num_eval_episodes, max_steps)

# Print results



print("Average total reward: ", np.mean(total_rewards))



Episode: 100
Episode: 200
Episode: 300
Episode: 400
Episode: 500
Episode: 600
Episode: 700
Episode: 800
Episode: 900
Episode: 1000
Episode: 1100
Episode: 1200
Episode: 1300
Episode: 1400
Episode: 1500
Episode: 1600
Episode: 1700
Episode: 1800
Episode: 1900
Episode: 2000
Episode: 2100
Episode: 2200
Episode: 2300
Episode: 2400
Episode: 2500
Episode: 2600
Episode: 2700
Episode: 2800
Episode: 2900
Episode: 3000
Episode: 3100
Episode: 3200
Episode: 3300
Episode: 3400
Episode: 3500
Episode: 3600
Episode: 3700
Episode: 3800
Episode: 3900
Episode: 4000
Episode: 4100
Episode: 4200
Episode: 4300
Episode: 4400
Episode: 4500
Episode: 4600
Episode: 4700
Episode: 4800
Episode: 4900
Episode: 5000
Episode: 5100
Episode: 5200
Episode: 5300
Episode: 5400
Episode: 5500
Episode: 5600
Episode: 5700
Episode: 5800
Episode: 5900
Episode: 6000
Episode: 6100
Episode: 6200
Episode: 6300
Episode: 6400
Episode: 6500
Episode: 6600
Episode: 6700
Episode: 6800
Episode: 6900
Episode: 7000
Episode: 7100
Episode: 7200
E

Episode: 55600
Episode: 55700
Episode: 55800
Episode: 55900
Episode: 56000
Episode: 56100
Episode: 56200
Episode: 56300
Episode: 56400
Episode: 56500
Episode: 56600
Episode: 56700
Episode: 56800
Episode: 56900
Episode: 57000
Episode: 57100
Episode: 57200
Episode: 57300
Episode: 57400
Episode: 57500
Episode: 57600
Episode: 57700
Episode: 57800
Episode: 57900
Episode: 58000
Episode: 58100
Episode: 58200
Episode: 58300
Episode: 58400
Episode: 58500
Episode: 58600
Episode: 58700
Episode: 58800
Episode: 58900
Episode: 59000
Episode: 59100
Episode: 59200
Episode: 59300
Episode: 59400
Episode: 59500
Episode: 59600
Episode: 59700
Episode: 59800
Episode: 59900
Episode: 60000
Episode: 60100
Episode: 60200
Episode: 60300
Episode: 60400
Episode: 60500
Episode: 60600
Episode: 60700
Episode: 60800
Episode: 60900
Episode: 61000
Episode: 61100
Episode: 61200
Episode: 61300
Episode: 61400
Episode: 61500
Episode: 61600
Episode: 61700
Episode: 61800
Episode: 61900
Episode: 62000
Episode: 62100
Episode: 6