Esta notebook contiene bloques de código para entrenar un agente con Q-learning en el entorno "Taxi" y ajustar sus hiperparámetros mediante sweeps de Weights & Biases.

In [None]:
import numpy as np
import random
import json
from taxi_env_extended import TaxiEnvExtended

In [None]:
# Shared environment instance; the cells below read its space sizes to build the Q-table.
env = TaxiEnvExtended()

Obtener la cantidad de estados y acciones

In [None]:
# Number of discrete states and actions exposed by the environment;
# these two sizes are the dimensions of the Q-table.
states, actions = env.observation_space.n, env.action_space.n

Inicialización de la tabla Q

In [None]:
# Q-table of shape (states, actions), with every entry initialised to zero.
Q = np.zeros((states, actions))
# Bare last expression so the notebook displays the freshly created table.
Q

Obtención de la acción a partir de la tabla Q

In [None]:
def optimal_policy(state, Q):
    """Return the greedy action for ``state``.

    Args:
        state: Row index into ``Q``.
        Q: Q-table indexed as ``Q[state]`` -> sequence of action values.

    Returns:
        The index of the largest value in ``Q[state]``; on ties ``np.argmax``
        picks the lowest index.
    """
    return np.argmax(Q[state])

Epsilon-Greedy Policy

In [None]:
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action is returned
    (explore); otherwise the action with the highest value in ``Q[state]``
    is returned (exploit).

    The random action is drawn over the action entries of ``Q[state]`` with
    ``np.random``, so the function depends only on its arguments (not on a
    module-level environment) and both random draws share NumPy's global RNG.

    Args:
        state: Row index into ``Q``.
        Q: Q-table indexed as ``Q[state]`` -> sequence of action values.
        epsilon: Exploration probability; ``np.random.binomial`` requires it
            to lie in [0, 1].

    Returns:
        An integer action index in ``range(len(Q[state]))``.
    """
    explore = np.random.binomial(1, epsilon)
    if explore:
        # Uniform draw over the valid action indices of this Q-table row.
        return np.random.randint(len(Q[state]))
    # Exploit: greedy action for the current estimates.
    return np.argmax(Q[state])

In [None]:
# Probar policy

def testPolicy(Q, env):
    """Run one greedy episode with the Q-table and return its total reward.

    The environment is rendered before every step and once more after the
    episode ends. The episode ends when ``env.step`` reports either
    termination or truncation.

    Args:
        Q: Q-table passed to ``optimal_policy`` to choose each action.
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step()`` returns a 5-tuple
            ``(obs, reward, terminated, truncated, info)``.

    Returns:
        The sum of the rewards collected during the episode.
    """
    obs, _ = env.reset()
    done = False
    totalReward = 0
    while not done:
        env.render()
        state = obs
        action = optimal_policy(state, Q)
        obs, reward, terminated, truncated, _ = env.step(action)
        # A truncated episode (e.g. a step limit) must end the loop as well;
        # looking only at `terminated` would keep stepping past the limit.
        done = terminated or truncated
        totalReward += reward
    env.render()
    # Only log when a W&B run is active, so the policy can also be evaluated
    # outside a sweep without wandb.log raising.
    if wandb.run is not None:
        wandb.log({'test_reward_by_episode': totalReward, 'episode': "trial"})
    return totalReward


In [None]:
# Chequeo de la política óptima
# Total reward / cantidad de episodios. Reward promedio de policy.
#Wandb: Sweeps

# Testear la política 100 veces

import wandb
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc
def getAverageTestingReward(Q, env):
    """Evaluate the greedy policy over 100 episodes and return the mean reward.

    Each episode is run through ``testPolicy``; its total reward is printed as
    soon as the episode finishes, and the mean over all episodes is printed at
    the end.

    Args:
        Q: Q-table forwarded to ``testPolicy``.
        env: Environment forwarded to ``testPolicy``.

    Returns:
        ``np.mean`` of the 100 per-episode total rewards.
    """
    episode_rewards = []
    for _ in range(100):
        episode_reward = testPolicy(Q, env)
        print(episode_reward)
        episode_rewards.append(episode_reward)
    # Average of the total reward obtained in each episode.
    mean_reward = np.mean(episode_rewards)
    print(mean_reward)
    return mean_reward


In [None]:
import wandb
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

def train_policy():
    """Train a tabular Q-learning agent for one W&B run (one sweep trial).

    Hyperparameters are read from ``run.config``: ``alpha`` (learning rate),
    ``gamma`` (discount), ``epsilon`` (initial exploration probability),
    ``epsilonVariability`` (fraction of epsilon removed on every step) and
    ``episodes`` (number of training episodes).

    Per episode it logs ``train_reward_by_episode``, ``episode`` and
    ``train_avg_reward``. After training it evaluates the greedy policy with
    ``getAverageTestingReward``, logs ``test_avg_reward``, writes
    ``hyperparameters.json`` and uploads it as a W&B artifact.

    Any exception is caught and printed rather than propagated, and
    ``wandb.finish()`` is always called. Returns ``None``.
    """
    try:
        with wandb.init() as run:
            config = run.config
            alpha = config.alpha
            gamma = config.gamma
            # Keep the configured starting value separately: `epsilon` is
            # decayed in place during training, and the artifact must record
            # the hyperparameter that was actually requested.
            initial_epsilon = config.epsilon
            epsilon = initial_epsilon
            epsilon_variability = config.epsilonVariability
            episodes = config.episodes

            env = TaxiEnvExtended()
            Q = np.zeros([env.observation_space.n, env.action_space.n])

            total_rewards = []

            for episode in range(episodes):
                obs, _ = env.reset()
                done = False
                total_reward = 0

                while not done:
                    state = obs
                    # Multiplicative decay applied on every step (not once per
                    # episode): epsilon <- epsilon * (1 - epsilon_variability).
                    epsilon = max(epsilon - epsilon_variability * epsilon, 0)
                    action = epsilon_greedy_policy(state, Q, epsilon)
                    obs, reward, terminated, truncated, _ = env.step(action)
                    # End the episode on truncation too, not only termination.
                    done = terminated or truncated
                    # Q-learning update towards the one-step bootstrapped target.
                    td_target = reward + gamma * np.max(Q[obs])
                    Q[state, action] = Q[state, action] + alpha * (td_target - Q[state, action])
                    total_reward += reward

                total_rewards.append(total_reward)
                wandb.log({'train_reward_by_episode': total_reward, 'episode': episode, 'train_avg_reward': np.mean(total_rewards)})

            test_average_reward = getAverageTestingReward(Q, env)
            wandb.log({'test_avg_reward': test_average_reward})
            # Save hyperparameters and results as an artifact.
            # NOTE(review): the 'test_avg__reward' key has a double underscore,
            # unlike the logged 'test_avg_reward'; kept unchanged here.
            hyperparameters = {
                'alpha': alpha,
                'gamma': gamma,
                'epsilon': initial_epsilon,
                'epsilonVariability': epsilon_variability,
                'episodes': episodes,
                'train_avg_reward': np.mean(total_rewards),
                'test_avg__reward': test_average_reward
            }

            with open('hyperparameters.json', 'w') as f:
                json.dump(hyperparameters, f)

            artifact = wandb.Artifact('hyperparameters', type='hyperparameters')
            artifact.add_file('hyperparameters.json')
            run.log_artifact(artifact)

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        wandb.finish()


In [None]:
import wandb

# Sweep configuration: random search over the Q-learning hyperparameters.
sweep_configuration = {
    "method": "random",
    # The metric name must be a key that train_policy actually logs.
    # 'test_avg_reward' is the average greedy-policy reward logged after
    # training; the previous name was never logged by any run.
    "metric": {"goal": "maximize", "name": "test_avg_reward"},
    "parameters": {
        # Range 9999-10000: the episode count is effectively fixed.
        "episodes": {"max": 10000, "min": 9999},
        "alpha": {"max": 0.99, "min": 0.5},
        "gamma": {"max": 0.8, "min": 0.3},
        "epsilon": {"max": 0.8, "min": 0.1},
        "epsilonVariability": {"max": 0.5, "min": 0.35}
    },
}

sweep_id = wandb.sweep(sweep=sweep_configuration, project="taxi-sweep")
# NOTE(review): no `count` is passed, so the agent keeps requesting runs until
# it is stopped manually - confirm this is intended for a random sweep.
wandb.agent(sweep_id, function=train_policy)

Pruebo un conjunto específico de hiperparámetros luego del sweep:

In [None]:
import wandb
from wandb.sklearn import plot_precision_recall, plot_feature_importances
from wandb.sklearn import plot_class_proportions, plot_learning_curve, plot_roc

def train_finalPolicy(
    alpha=0.8894046109188853,
    gamma=0.5333435999129466,
    epsilon=0.7391381811833376,
    epsilon_variability=0.4889625280062114,
    episodes=10000,
):
    """Train a tabular Q-learning agent on a fresh ``TaxiEnvExtended``.

    Called without arguments it uses the fixed hyperparameter values that were
    previously hard-coded in the body; each of them can now be overridden.

    Args:
        alpha: Learning rate of the Q-learning update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Initial exploration probability.
        epsilon_variability: Fraction of epsilon removed on every step.
        episodes: Number of training episodes.

    Returns:
        The learned Q-table (array of shape ``(n_states, n_actions)``), or
        ``None`` if an exception occurred; the exception is printed rather
        than propagated.
    """
    try:
        env = TaxiEnvExtended()
        Q = np.zeros([env.observation_space.n, env.action_space.n])

        total_rewards = []

        for episode in range(episodes):
            obs, _ = env.reset()
            done = False
            total_reward = 0

            while not done:
                state = obs
                # Multiplicative decay applied on every step (not once per
                # episode): epsilon <- epsilon * (1 - epsilon_variability).
                epsilon = max(epsilon - epsilon_variability * epsilon, 0)
                action = epsilon_greedy_policy(state, Q, epsilon)
                obs, reward, terminated, truncated, _ = env.step(action)
                # End the episode on truncation too, not only termination.
                done = terminated or truncated
                # Q-learning update towards the one-step bootstrapped target.
                td_target = reward + gamma * np.max(Q[obs])
                Q[state, action] = Q[state, action] + alpha * (td_target - Q[state, action])
                total_reward += reward

            total_rewards.append(total_reward)
            print(f"Episode {episode} - Total reward: {total_reward}, avg_total_reward: {np.mean(total_rewards)}")
        return Q

    except Exception as e:
        print(f"An error occurred: {e}")
        # Same result as before (implicit None), now stated explicitly.
        return None




In [None]:
# Train with train_finalPolicy's built-in hyperparameter values and rebind the
# module-level Q to the result (None if training raised and the error was only printed).
Q = train_finalPolicy()