In [None]:
import gym
import matplotlib.pyplot as plt
import random
import gym_minigrid
import numpy as np
import wandb
from gym_minigrid.wrappers import FullyObsWrapper

In [None]:
# Log in to Weights & Biases before the sweep is created and runs are logged.
wandb.login()

In [None]:
import pprint

# Random-search sweep that maximizes the logged 'reward' metric over the
# SARSA step size (learning_rate) and exploration rate (epsilon).
metric = {'name': 'reward', 'goal': 'maximize'}
parameters_dict = {
    'learning_rate': {'values': [0.1, 0.05, 0.01, 0.2, 0.25]},
    'epsilon': {'values': [0.1, 0.2, 0.3, 0.05, 0.01]},
}
sweep_config = {
    'method': 'random',
    'metric': metric,
    'parameters': parameters_dict,
}

# Show the final configuration, then register the sweep and keep its id.
pprint.pprint(sweep_config)
sweep_id = wandb.sweep(sweep_config, project="RL_PA1_GridWorld_SARSA")

In [None]:
def find_agent(image):
    """Locate the first cell of ``image`` whose channel-0 value equals 10.

    Cells are scanned with the first axis outermost and the second axis
    innermost, so the earliest match in that order wins.

    Args:
        image: array indexable as ``image[x, y, 0]`` and exposing ``shape``.

    Returns:
        The ``(x, y)`` index tuple of the matching cell, or ``None`` when no
        cell matches.
    """
    # 10 is presumably MiniGrid's object index for the agent -- TODO confirm.
    for cell in np.ndindex(image.shape[0], image.shape[1]):
        if image[cell + (0,)] == 10:
            return cell
    return None

In [None]:
def epsilon_greedy(q_table, state, epsilon, env):
    """Choose an action with an epsilon-greedy rule.

    Args:
        q_table: array of action values; ``q_table[state]`` is the vector of
            values for ``state``.
        state: index (tuple) selecting the current state's row in ``q_table``.
        epsilon: probability of taking a random action instead of the greedy one.
        env: environment whose ``action_space.sample()`` supplies random actions.

    Returns:
        A sampled action when exploring, otherwise the index of the largest
        value in ``q_table[state]``.
    """
    explore = np.random.random() < epsilon
    if not explore:
        # Exploit: pick the action with the highest current estimate.
        return np.argmax(q_table[state])
    return env.action_space.sample()

In [None]:
def _state_from_obs(obs):
    """Turn an observation dict into the (x, y, direction) Q-table index.

    The position comes from ``find_agent(obs["image"])`` and the heading from
    ``obs["direction"]``.

    Raises:
        ValueError: if no agent cell is found in the image.
    """
    agent_pos = find_agent(obs["image"])
    if agent_pos is None:
        raise ValueError("agent not found in observation image")
    return (agent_pos[0], agent_pos[1], obs['direction'])


def sarsa(config=None):
    """Train a tabular SARSA agent for one Weights & Biases run.

    Creates the "MiniGrid-Dynamic-Obstacles-5x5-v0" environment (wrapped with
    episode statistics and full observability), reads ``learning_rate`` and
    ``epsilon`` from ``wandb.config``, and runs 10000 episodes of on-policy
    SARSA with a discount of 0.99. The Q-table is indexed by
    (x, y, direction, action) and initialised uniformly in [-2, 0).
    The cumulative shaped reward and the episode length are logged to W&B
    once per episode under the keys 'reward', 'length' and 'episode'.

    Args:
        config: optional default configuration forwarded to ``wandb.init``.
            May be left as ``None`` (e.g. when a sweep agent supplies the
            hyperparameters).

    Raises:
        ValueError: if the agent cannot be located in an observation.
    """
    env = gym.make("MiniGrid-Dynamic-Obstacles-5x5-v0")
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = FullyObsWrapper(env)

    # try/finally so the environment is released even if the run fails midway.
    try:
        with wandb.init(config=config):
            config = wandb.config
            learning_rate = config.learning_rate
            epsilon = config.epsilon
            state_space_size = (5, 5, 4)  # (x, y, direction)
            q_table = np.random.uniform(low=-2, high=0, size=(state_space_size + (env.action_space.n,)))
            episodes = 10000
            discount = 0.99

            for ep in range(episodes):
                obs, _ = env.reset()
                step_count = 0
                total_reward = 0

                state = _state_from_obs(obs)
                action = epsilon_greedy(q_table, state, epsilon, env)

                done = False
                while not done:
                    new_obs, _, terminated, truncated, _ = env.step(action)
                    # The episode is over on either flag; stepping past a
                    # truncation without reset() is outside the Gym contract.
                    done = terminated or truncated
                    step_count += 1

                    # NOTE(review): the environment's own reward is discarded and
                    # this shaped reward is paid on every step regardless of the
                    # outcome -- confirm this is intended. If the env truncates
                    # before step 112, the penalty branch below cannot fire.
                    reward = 1 - 0.9 * (step_count / 100)
                    if done and reward <= 0:
                        reward -= 1
                    total_reward += reward

                    new_state = _state_from_obs(new_obs)

                    if terminated:
                        # True terminal state: nothing to bootstrap from.
                        new_action = None
                        future_q = 0
                    else:
                        # Also covers truncation: the state is not terminal, so
                        # the SARSA target still bootstraps from Q(s', a').
                        new_action = epsilon_greedy(q_table, new_state, epsilon, env)
                        future_q = q_table[new_state + (new_action, )]

                    current_q = q_table[state + (action, )]
                    new_q = (1 - learning_rate) * current_q + learning_rate * (reward + discount * future_q)
                    q_table[state + (action, )] = new_q

                    state, action = new_state, new_action

                # One log call per episode keeps 'reward' and 'length' on the
                # same W&B step (each wandb.log call advances the step).
                wandb.log({'reward': total_reward, 'length': step_count, 'episode': ep})
    finally:
        env.close()

In [None]:
# Start a sweep agent for `sweep_id` that calls `sarsa` for each run; count=5 caps it at five runs.
wandb.agent(sweep_id, sarsa, count=5)