In [1]:
import warnings; warnings.filterwarnings('ignore')

import itertools
import time
import numpy as np
from pprint import pprint

from collections import defaultdict

import gymnasium as gym
from gridworld.envs import SiblingGridWorldEnv
from gymnasium.envs.registration import register
from gymnasium.wrappers import TimeLimit
from tabulate import tabulate
import tqdm as tqdm

import torch
from torch.utils.tensorboard import SummaryWriter

import random
import matplotlib
import matplotlib.pyplot as plt
SEEDS = (12, 34, 56, 78, 90)

%matplotlib inline

from utils import *
from sibling_gw_agent import SiblingGWAgent

In [2]:
plt.style.use('fivethirtyeight')
params = {
    'figure.figsize': (15, 8),
    'font.size': 24,
    'legend.fontsize': 20,
    'axes.titlesize': 28,
    'axes.labelsize': 24,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20
}
plt.rcParams.update(params)
np.set_printoptions(precision=3, suppress=True)

layout = {
    "Training": {
        "Q_gw": ["Multiline", ["Q_gw/center", "Q_gw/corner"]],
        "Q_bandit": ["Multiline", ["Q_bandit/zero", "Q_bandit/twenty"]],
    },
}

writer = SummaryWriter('runs/sibling_gw')
writer.add_custom_scalars(layout)

In [3]:
def sibling():
    env = SiblingGridWorldEnv(P_gridworld)
    env = TimeLimit(env, max_episode_steps=100)
#     env = RelativePositionenv)
    return env

register(
    id='SiblingGridWorld-v0',
    entry_point=sibling,
    max_episode_steps=100,
)

env = gym.make('SiblingGridWorld-v0')
env = env.unwrapped
# env.render_mode = 'human'
obs, info = env.reset(options={'randomize_world': True})
print(env._true_world_idx)

11


In [4]:
n_episodes = 100_000
agent = SiblingGWAgent(env, gamma=1.0, 
            init_alpha=0.5, min_alpha=0.05, alpha_decay_ratio=0.5, 
            init_epsilon=1.0, min_epsilon=0.1, epsilon_decay_ratio=0.9, 
            n_episodes=n_episodes)
# agent = SiblingGWAgent(env, min_epsilon=0.5, epsilon_decay_ratio=0.9, n_episodes=n_episodes)
# agent.episode = 5000
# agent.epsilons[agent.episode]

In [5]:
for episode in tqdm(range(agent.episode, agent.episode + n_episodes)):
    agent.episode = episode
    state, info = env.reset()
    # if episode % 3_500 == 0:
    #     state, info = env.reset(options={'randomize_world': True})
    done = False

    while not done:
        action = agent.select_action(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        
        # update the agent
        agent.update(state, action, reward, terminated, next_state)

        # update if the environment is done or truncated
        done = terminated or truncated
        state = next_state

    if episode % 100 == 0:
        writer.add_scalar("Q_gw/center",
            np.max(agent.Q_gw[agent.state_multi_to_lin(np.array([2, 2]))]), 
            episode
        )
        writer.add_scalar("Q_gw/corner",
            np.max(agent.Q_gw[agent.state_multi_to_lin(np.array([0, 0]))]), 
            episode
        )
        writer.add_scalar("Q_bandit/zero",
            np.max(agent.Q_bandit[0]), 
            episode
        )
        writer.add_scalar("Q_bandit/twenty",
            np.max(agent.Q_bandit[20]), 
            episode
        )

writer.flush()
writer.close()

100%|██████████| 100000/100000 [02:56<00:00, 566.93it/s]


In [6]:
rng = np.random.default_rng()
env.render_mode = None
# obs, info = env.reset(options={'randomize_world': True})
obs, _ = env.reset()
env._agent_location = np.array([2, 2])
obs = env._agent_location

avg_steps = 100
alpha = 0.1
running_avg = [10]
max_ep_len = 100
for e in tqdm(range(1, 1_000)):
    for i in range(1, max_ep_len):
        action = agent.greedy_action(obs)
        obs, rew, done, trunc, info = env.step(action)
        # print(obs, rew, info)
        # time.sleep(0.05)
        if done or i == max_ep_len - 1:
            # print(f"Finished in {i} steps.")
            avg_steps += 1/e * (i - avg_steps)
            running_avg += [(1-alpha) * running_avg[-1] + alpha * avg_steps]
            break
    obs, _ = env.reset()
    env._agent_location = np.array([2, 2])
    obs = env._agent_location
    # if e % 100 == 0:
    #     print(f"Episode {e} finished, current average estimate is {avg_steps}.")

print(f"Cumulative average estimate: {avg_steps}")

  0%|          | 0/999 [00:00<?, ?it/s]


AttributeError: 'SiblingGWAgent' object has no attribute 'greedy_action'

In [None]:
plt.plot(running_avg)

In [None]:
state = np.array([2, 2, 11])
state_idx = np.ravel_multi_index(state, env.observation_space.nvec, order='F')
print(state_idx, agent.Q[state_idx].max(), np.argmax(agent.Q[state_idx]))

In [None]:
# env.render_mode = 'human'
obs, info = env.reset()
env._agent_location = np.array([2, 2])
obs[:-1] = env._agent_location
# print(np.concatenate([env._agent_location, env._world_belief]))
env._render_frame()
# time.sleep(1.0)
print(env._true_world_idx)

for i in range(1,max_ep_len+1):
    obs_idx = np.ravel_multi_index(obs, env.observation_space.nvec, order='F')
    action_linear = np.argmax(agent.Q[obs_idx])
    action = np.unravel_index(action_linear, env.action_space.nvec, order='F')
    # print(obs_idx, np.concatenate([env._agent_location, env._world_belief]), action_linear, action, agent.Q[obs_idx].max())
    obs, rew, done, trunc, info = env.step(action)
    # print(obs, rew, info)
    # time.sleep(0.05)
    if done or trunc:
        print(f"Finished in {i} steps.")
        break

In [None]:
# env.close()

In [None]:
print(agent.act_lin_to_multi(np.argmax(agent.Q[agent.state_multi_to_lin(np.array([2, 2, 0]))])))
print(agent.act_lin_to_multi(np.argmax(agent.Q[agent.state_multi_to_lin(np.array([2, 2, 7]))])))
print(agent.act_lin_to_multi(np.argmax(agent.Q[agent.state_multi_to_lin(np.array([2, 2, 11]))])))
print(agent.act_lin_to_multi(np.argmax(agent.Q[agent.state_multi_to_lin(np.array([2, 2, 20]))])))

In [None]:
env._true_world_idx

In [None]:
agent.Q[agent.state_multi_to_lin(np.array([2, 2, 0]))]

In [None]:
np.ones((4*24,), dtype=np.float32)*-100