Esta notebook contiene bloques de código útiles para realizar Q-learning en el entorno "Taxi"

In [18]:
import numpy as np
import random
from taxi_env_extended import TaxiEnvExtended

In [19]:
env = TaxiEnvExtended()

Obtener la cantidad de estados y acciones

In [20]:
actions = env.action_space.n
states = env.observation_space.n

Inicialización de la tabla Q

In [21]:
Q = np.zeros((states, actions))
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

Obtención de la acción a partir de la tabla Q

In [22]:
def optimal_policy(state, Q):
    action = np.argmax(Q[state])
    return action

Epsilon-Greedy Policy

In [23]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    explore = np.random.binomial(1, epsilon)
    if explore:
        action = env.action_space.sample()
    # exploit
    else:
        action = np.argmax(Q[state])
        
    return action

Ejemplo de episodio 

In [25]:
import wandb
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

def trainPolicy(Q, env, episodes=1000, alpha=0.1, gamma=0.9, epsilon=0.1):
    wandb.init(project='taxi')
    for episode in range(episodes):
        obs, _ = env.reset()
        done = False
        total_reward = 0
        step_count = 0
        while not done:
            state = obs
            action = epsilon_greedy_policy(state, Q, epsilon)
            obs, reward, done, _, _ = env.step(action)
            Q[state, action] = Q[state, action] + alpha * (reward + gamma * np.max(Q[obs]) - Q[state, action])
            total_reward += reward
            step_count += 1
        wandb.log({'reward': total_reward, 'episode': "trial"})
    print('total_reward', total_reward)
    print('total_steps', step_count)
    return Q



In [29]:
alfa = 0.4
gamma = 0.6
epsilon = 0.6
##Create Wandb artifact for hyperparameters
import wandb

# Define the hyperparameters
alfa = 0.4
gamma = 0.6
epsilon = 0.6

# Create a dictionary with the hyperparameters
hyperparameters = {'alfa': alfa, 'gamma': gamma, 'epsilon': epsilon}

# Create the Wandb artifact
artifact = wandb.Artifact('hyperparameters', type='hyperparameters')
artifact.add_file('hyperparameters.json', json.dumps(hyperparameters))

# Log the artifact
wandb.log_artifact(artifact)

trainPolicy(Q, env, episodes=10000, alpha=alfa, gamma=gamma, epsilon = epsilon)

ValueError: You must pass an instance of wandb.Artifact or a valid file path to log_artifact

In [None]:
# Probar policy
def testPolicy(Q, env):
    obs, _ = env.reset()
    done = False
    while not done:
        env.render()
        state = obs
        action = optimal_policy(state, Q)
        obs, reward, done, _, _ = env.step(action)
    env.render()
    return reward


In [None]:
# Chequeo de la política óptima
# Total reward / cantidad de episodios. Reward promedio de policy.
#Wandb: Sweeps

# Testear la política 100 veces

import wandb
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

rewards = []
for i in range(100):
    reward = testPolicy(Q, env)
    rewards.append(reward)
print(np.mean(rewards))


19.79
