In [None]:
import pprint
import random

import gym
import gym_minigrid
import matplotlib.pyplot as plt
import numpy as np
import wandb
from gym_minigrid.wrappers import FullyObsWrapper

In [None]:
# Authenticate this session with Weights & Biases; the later cells create a
# sweep and launch runs against it.
wandb.login()

In [None]:
# Grid-search sweep over the learning rate and the number of training episodes.
# The sweep metric is the "reward" key logged by the training function, and
# the goal is to maximize it.
sweep_config = {
    'method': 'grid',
    'metric': {'name': 'reward', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'values': [0.1, 0.05, 0.01, 0.2, 0.25]},
        'episodes': {'values': [10000, 20000, 5000, 30000, 40000]},
    },
}

# Show the final configuration, then register the sweep and keep its id for the agent.
pprint.pprint(sweep_config)
sweep_id = wandb.sweep(sweep_config, project="RL_PA1_GridWorld_QLearning")

In [None]:
def softmax(x):
    """Return the softmax of ``x``: non-negative weights that sum to 1.

    The largest entry is subtracted before exponentiating so that big inputs
    do not overflow; the shift cancels out in the normalisation.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    normaliser = weights.sum()
    return weights / normaliser

def find_agent(image):
    """Locate the first cell whose channel-0 value equals 10.

    The image is scanned along its first axis, then its second. The value 10
    is presumably MiniGrid's object index for the agent (verify against the
    installed gym_minigrid version).

    Returns:
        An ``(x, y)`` tuple of indices into the first two axes of ``image``,
        or ``None`` when no cell matches.
    """
    agent_code = 10
    n_rows, n_cols = image.shape[0], image.shape[1]
    matches = (
        (row, col)
        for row in range(n_rows)
        for col in range(n_cols)
        if image[row, col, 0] == agent_code
    )
    return next(matches, None)

In [None]:
def qlearning(config=None):
    """Train a tabular Q-learning agent on MiniGrid-Dynamic-Obstacles-5x5-v0.

    Intended as the run function of a W&B sweep agent: the hyperparameters
    ``learning_rate`` and ``episodes`` are read from ``wandb.config``. Actions
    are sampled from a softmax over the Q-values of the current state.

    Once per episode the function logs ``reward`` (sum of environment rewards),
    ``length`` (number of steps taken) and ``episode`` to W&B.

    Args:
        config: Optional hyperparameter defaults forwarded to ``wandb.init``.

    Returns:
        The learned Q-table, a NumPy array of shape ``(5, 5, 4, n_actions)``
        indexed by ``(x, y, direction, action)``.

    Raises:
        RuntimeError: If the agent cannot be located in an observation.
    """
    discount = 0.99
    # (x, y, direction): a 5x5 grid and four headings.
    state_space_size = (5, 5, 4)

    env = gym.make("MiniGrid-Dynamic-Obstacles-5x5-v0")
    env = gym.wrappers.RecordEpisodeStatistics(env)
    env = FullyObsWrapper(env)
    n_actions = env.action_space.n

    try:
        with wandb.init(config=config):
            config = wandb.config
            learning_rate = config.learning_rate
            episodes = config.episodes
            q_table = np.random.uniform(
                low=-2, high=0, size=state_space_size + (n_actions,)
            )

            for ep in range(episodes):
                obs_dict, _ = env.reset()
                state = _state_from_obs(obs_dict)
                step_count = 0
                total_reward = 0.0
                done = False

                while not done:
                    probabilities = softmax(q_table[state])
                    action = np.random.choice(n_actions, p=probabilities)

                    # Use the environment's own reward and termination flags;
                    # without updating `done` the episode loop would never exit.
                    new_obs_dict, reward, terminated, truncated, _ = env.step(action)
                    done = terminated or truncated
                    step_count += 1
                    total_reward += reward

                    new_state = _state_from_obs(new_obs_dict)
                    # Only a true terminal state has zero future value; a
                    # time-limit truncation still bootstraps from the next state.
                    max_future_q = 0.0 if terminated else np.max(q_table[new_state])
                    td_target = reward + discount * max_future_q
                    current_q = q_table[state + (action,)]
                    q_table[state + (action,)] = (
                        (1 - learning_rate) * current_q + learning_rate * td_target
                    )
                    state = new_state

                # One log call per episode keeps all metrics on the same W&B step.
                wandb.log({'reward': total_reward, 'length': step_count, 'episode': ep})
    finally:
        # Release the environment even if training or logging fails.
        env.close()
    return q_table


def _state_from_obs(obs_dict):
    """Convert an observation dict into the Q-table index ``(x, y, direction)``."""
    agent_pos = find_agent(obs_dict["image"])
    if agent_pos is None:
        raise RuntimeError("Agent not found in observation image")
    return (agent_pos[0], agent_pos[1], obs_dict["direction"])

In [None]:
# Launch a sweep agent that invokes `qlearning` for each run, for at most 5 runs.
# NOTE(review): the sweep grid lists 5 learning rates x 5 episode counts (25
# combinations), so count=5 presumably covers only part of it — confirm intended.
wandb.agent(sweep_id, qlearning, count=5)