In [1]:
import sys
sys.path.insert(1, '../..')

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from mpl_toolkits.axes_grid1 import make_axes_locatable
import matplotlib.patches as patches
%matplotlib inline

from geodesic_agent import GeodesicAgent
from gridworld import Arena, Bottleneck, LinearChamber
from plot_utils import plot_replay, plot_traj, plot_need_gain
from RL_utils import oned_twod

*Open field*

In [None]:
# --- Open-field geometry -----------------------------------------------------
width, height = 10, 7
num_states = width * height

# --- Start-state distributions -----------------------------------------------
# Two candidates are built; only the one bound to `init_state_dist` is used.
one_start_state = np.zeros(num_states)   # all mass on state 0
one_start_state[0] = 1
all_start_states = np.ones(num_states) / num_states   # uniform over all states

init_state_dist = all_start_states

# --- Environment -------------------------------------------------------------
arena = Arena(width, height, init_state_distribution=init_state_dist)
all_experiences = arena.get_all_transitions()
T = arena.transitions

# --- Goal sets ---------------------------------------------------------------
# Non-start corners (state 0 is where the agent is placed below).
# NOTE(review): corner indices presumably assume row-major state numbering — confirm.
corner_goals = np.array([width - 1, num_states - width, num_states - 1])
all_goals = np.arange(num_states)   # every state is a goal

goals = all_goals

# --- Learning / replay hyperparameters ----------------------------------------
alpha = 1.0
gamma = 0.95
num_replay_steps = 20

# --- Agent -------------------------------------------------------------------
ga = GeodesicAgent(
    arena.num_states,
    arena.num_actions,
    goals,
    T,
    alpha=alpha,
    gamma=gamma,
    s0_dist=init_state_dist,
)
ga.curr_state = 0
# Pre-load the agent with every transition the arena exposes.
ga.remember(all_experiences)

# --- Replay ------------------------------------------------------------------
# `replay` hands back the replayed experiences, a (needs, gains, MEVBs) triple,
# and the per-step backups.
replayed_experiences, stats_for_nerds, backups = ga.replay(
    num_steps=num_replay_steps,
    verbose=True,
    prospective=True,
)
needs, gains, all_MEVBs = stats_for_nerds

In [None]:
# Overview figure: every experience replayed in the open field.
print('First %d replay steps' % num_replay_steps, flush=True)
fig, ax = plt.subplots(figsize=(6, 6))
replayed_as_int = np.array(replayed_experiences).astype(int)
plot_replay(arena, replayed_as_int, ax=ax)
plt.show()

# Extra options forwarded to plot_need_gain.
params = {'min_need': 0, 'max_need': 1}

# Collapse axis 1 of each replay statistic before plotting.
# NOTE(review): axis 1 presumably indexes goals — confirm against GeodesicAgent.replay.
meta_need = np.mean(needs, axis=1)
meta_gain = np.mean(gains, axis=1)
meta_MEVB = np.mean(all_MEVBs, axis=1)

# One need/gain/MEVB figure per replay step, labelled with that step's backups.
for i in range(num_replay_steps):
    print('step %d:' % i, backups[i], flush=True)
    # Average this step's need over its leading axis, weighted by the
    # start-state distribution.
    start_weighted_need = np.average(meta_need[i], weights=init_state_dist, axis=0)
    plot_need_gain(arena, ga.memory, start_weighted_need, meta_gain[i], meta_MEVB[i], params)
    plt.show()

*Bottleneck chamber*

In [None]:
## Bottleneck
# Physics: two rooms of width `room_width` joined by a corridor of width
# `corridor_width`, so the full grid is (2 * room_width + corridor_width) wide.
room_width = 1
corridor_width = 4
width = room_width * 2 + corridor_width
height = 5
num_states = width * height

# Build object (uniform start-state distribution over all grid cells)
init_state_dist = np.ones(num_states) / num_states
bottleneck = Bottleneck(room_width, corridor_width, height, init_state_distribution=init_state_dist)
all_experiences = bottleneck.get_all_transitions()
T = bottleneck.transitions

## Agent parameters
# Two candidate goal sets are kept as named alternatives; rebind `goal_states`
# to switch between them. Previously the corner set was assigned to
# `goal_states` and then silently overwritten on the very next line.
corner_goal_states = np.array([width - 1, (height - 1) * width, height * width - 1])  # Non-start corners
all_goal_states = np.arange(num_states)  # Every state is a goal
goal_states = all_goal_states
alpha = 1.0
gamma = 0.95
num_replay_steps = 20

# Set up agent
ga = GeodesicAgent(bottleneck.num_states, bottleneck.num_actions, goal_states, T, alpha=alpha, gamma=gamma,
                   s0_dist=init_state_dist)
ga.curr_state = 0
ga.remember(all_experiences)  # Pre-load our agent with all possible memories

## Run replay
# `replay` is unpacked into three values (experiences, stats, backups) by the
# other replay cells in this notebook; unpacking only two here raised a
# ValueError and left `backups` holding the previous environment's results.
replayed_experiences, stats_for_nerds, backups = ga.replay(num_steps=num_replay_steps, verbose=True,
                                                           prospective=True)
needs, gains, all_MEVBs = stats_for_nerds

In [None]:
# Plot the replayed experiences
# The header reports the actual number of replay steps instead of a
# hard-coded "10", which did not match num_replay_steps.
print('First %d replay steps' % num_replay_steps, flush=True)
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
plot_replay(bottleneck, np.array(replayed_experiences).astype(int), ax=ax)
plt.show()

# Plot need, gain, MEVB throughout each of those steps.
# Each statistic is averaged over its axis 1 first.
# NOTE(review): axis 1 presumably indexes goals — confirm against GeodesicAgent.replay.
meta_need = np.mean(needs, axis=1)
meta_gain = np.mean(gains, axis=1)
meta_MEVB = np.mean(all_MEVBs, axis=1)

for i in range(num_replay_steps):
    print('step', i, flush=True)
    # Need is additionally averaged over its leading axis, weighted by the
    # start-state distribution.
    plot_need_gain(bottleneck, ga.memory, np.average(meta_need[i, :, :], weights=init_state_dist, axis=0),
                   meta_gain[i, :], meta_MEVB[i, :])
    plt.show()

*Linear chamber*

In [3]:
# NOTE(review): this cell carries execution count In [3] but sits above the
# linear-chamber replay cell (In [2]) that produced the value shown in its saved
# output (state indices run up to 29, consistent with length = 30 there).
# On a fresh top-to-bottom run it would instead display whatever `backups` an
# earlier cell left behind. Consider moving it below the linear-chamber replay
# cell or removing it.
backups

[[(3, 0, 2)],
 [(3, 0, 2), (2, 0, 1)],
 [(3, 0, 2), (2, 0, 1), (1, 0, 0)],
 [(0, 2, 1)],
 [(1, 2, 2)],
 [(4, 0, 3)],
 [(5, 0, 4)],
 [(6, 0, 5)],
 [(7, 0, 6)],
 [(8, 0, 7)],
 [(9, 0, 8)],
 [(10, 0, 9)],
 [(11, 0, 10)],
 [(12, 0, 11)],
 [(13, 0, 12)],
 [(14, 0, 13)],
 [(15, 0, 14)],
 [(16, 0, 15)],
 [(17, 0, 16)],
 [(18, 0, 17)],
 [(19, 0, 18)],
 [(20, 0, 19)],
 [(21, 0, 20)],
 [(22, 0, 21)],
 [(23, 0, 22)],
 [(24, 0, 23)],
 [(25, 0, 24)],
 [(26, 0, 25)],
 [(27, 0, 26)],
 [(28, 0, 27)],
 [(2, 2, 3)],
 [(3, 2, 4)],
 [(4, 2, 5)],
 [(5, 2, 6)],
 [(6, 2, 7)],
 [(7, 2, 8)],
 [(8, 2, 9)],
 [(9, 2, 10)],
 [(29, 0, 28)],
 [(10, 2, 11)],
 [(11, 2, 12)],
 [(12, 2, 13)],
 [(13, 2, 14)],
 [(14, 2, 15)],
 [(15, 2, 16)],
 [(16, 2, 17)],
 [(17, 2, 18)],
 [(18, 2, 19)],
 [(19, 2, 20)],
 [(20, 2, 21)],
 [(21, 2, 22)],
 [(22, 2, 23)],
 [(23, 2, 24)],
 [(24, 2, 25)],
 [(25, 2, 26)],
 [(26, 2, 27)],
 [(27, 2, 28)],
 [(28, 2, 29)],
 [(27, 2, 28)],
 [(26, 2, 27)],
 [(25, 2, 26)],
 [(24, 2, 25)],
 [(23, 2, 24)

In [2]:
## Linear chamber
# --- Track geometry ----------------------------------------------------------
length = 30

# --- Environment (uniform start-state distribution over the track) -----------
init_state_dist = np.full(length, 1.0 / length)
tunnel = LinearChamber(length, init_state_distribution=init_state_dist)
all_experiences = tunnel.get_all_transitions()
T = tunnel.transitions

# --- Goal set ----------------------------------------------------------------
# The single-goal variant (last state only) is assigned first and then
# overridden: the run below treats every state as a goal.
goal_states = np.array([length - 1])
goal_states = np.arange(length)

# --- Learning / replay hyperparameters ----------------------------------------
alpha = 1.0
gamma = 0.95
num_replay_steps = 100

# --- Agent -------------------------------------------------------------------
ga = GeodesicAgent(
    tunnel.num_states,
    tunnel.num_actions,
    goal_states,
    T,
    alpha=alpha,
    gamma=gamma,
    s0_dist=init_state_dist,
)
ga.curr_state = 0
# Pre-load the agent with every transition the chamber exposes.
ga.remember(all_experiences)

# --- Replay ------------------------------------------------------------------
# verbose=True makes this call print per-step diagnostics.
replayed_experiences, stats_for_nerds, backups = ga.replay(
    num_steps=num_replay_steps,
    verbose=True,
    prospective=True,
)
needs, gains, all_MEVBs = stats_for_nerds

0 s_k does not continue replay_seq or creates a loop
3 s_k does not continue replay_seq or creates a loop
4 a_k not best anywhere (1, 2, 2)
	 a_k: 2
	 G[s_k, :, goal]:
	 G[1, :, 0] [1. 0. 0. 0.]
	 G[1, :, 1] [0. 0. 0. 0.]
	 G[1, :, 2] [0. 0. 0. 0.]
	 G[1, :, 3] [0. 0. 0. 0.]
	 G[1, :, 4] [0. 0. 0. 0.]
	 G[1, :, 5] [0. 0. 0. 0.]
	 G[1, :, 6] [0. 0. 0. 0.]
	 G[1, :, 7] [0. 0. 0. 0.]
	 G[1, :, 8] [0. 0. 0. 0.]
	 G[1, :, 9] [0. 0. 0. 0.]
	 G[1, :, 10] [0. 0. 0. 0.]
	 G[1, :, 11] [0. 0. 0. 0.]
	 G[1, :, 12] [0. 0. 0. 0.]
	 G[1, :, 13] [0. 0. 0. 0.]
	 G[1, :, 14] [0. 0. 0. 0.]
	 G[1, :, 15] [0. 0. 0. 0.]
	 G[1, :, 16] [0. 0. 0. 0.]
	 G[1, :, 17] [0. 0. 0. 0.]
	 G[1, :, 18] [0. 0. 0. 0.]
	 G[1, :, 19] [0. 0. 0. 0.]
	 G[1, :, 20] [0. 0. 0. 0.]
	 G[1, :, 21] [0. 0. 0. 0.]
	 G[1, :, 22] [0. 0. 0. 0.]
	 G[1, :, 23] [0. 0. 0. 0.]
	 G[1, :, 24] [0. 0. 0. 0.]
	 G[1, :, 25] [0. 0. 0. 0.]
	 G[1, :, 26] [0. 0. 0. 0.]
	 G[1, :, 27] [0. 0. 0. 0.]
	 G[1, :, 28] [0. 0. 0. 0.]
	 G[1, :, 29] [0. 0. 0. 0.]
5 

38 s_k does not continue replay_seq or creates a loop
39 s_k does not continue replay_seq or creates a loop
40 a_k not best anywhere (11, 2, 12)
	 a_k: 2
	 G[s_k, :, goal]:
	 G[11, :, 0] [0.63 0.   0.   0.  ]
	 G[11, :, 1] [0.032 0.    0.    0.   ]
	 G[11, :, 2] [0.002 0.    0.    0.   ]
	 G[11, :, 3] [0.698 0.    0.    0.   ]
	 G[11, :, 4] [0.735 0.    0.    0.   ]
	 G[11, :, 5] [0.774 0.    0.    0.   ]
	 G[11, :, 6] [0.815 0.    0.    0.   ]
	 G[11, :, 7] [0.857 0.    0.    0.   ]
	 G[11, :, 8] [0.902 0.    0.    0.   ]
	 G[11, :, 9] [0.95 0.   0.   0.  ]
	 G[11, :, 10] [1. 0. 0. 0.]
	 G[11, :, 11] [0. 0. 0. 0.]
	 G[11, :, 12] [0. 0. 0. 0.]
	 G[11, :, 13] [0. 0. 0. 0.]
	 G[11, :, 14] [0. 0. 0. 0.]
	 G[11, :, 15] [0. 0. 0. 0.]
	 G[11, :, 16] [0. 0. 0. 0.]
	 G[11, :, 17] [0. 0. 0. 0.]
	 G[11, :, 18] [0. 0. 0. 0.]
	 G[11, :, 19] [0. 0. 0. 0.]
	 G[11, :, 20] [0. 0. 0. 0.]
	 G[11, :, 21] [0. 0. 0. 0.]
	 G[11, :, 22] [0. 0. 0. 0.]
	 G[11, :, 23] [0. 0. 0. 0.]
	 G[11, :, 24] [0. 0. 0. 0.]


48 a_k not best anywhere (19, 2, 20)
	 a_k: 2
	 G[s_k, :, goal]:
	 G[19, :, 0] [0.418 0.    0.    0.   ]
	 G[19, :, 1] [0.021 0.    0.    0.   ]
	 G[19, :, 2] [0.001 0.    0.    0.   ]
	 G[19, :, 3] [0.463 0.    0.    0.   ]
	 G[19, :, 4] [0.488 0.    0.    0.   ]
	 G[19, :, 5] [0.513 0.    0.    0.   ]
	 G[19, :, 6] [0.54 0.   0.   0.  ]
	 G[19, :, 7] [0.569 0.    0.    0.   ]
	 G[19, :, 8] [0.599 0.    0.    0.   ]
	 G[19, :, 9] [0.63 0.   0.   0.  ]
	 G[19, :, 10] [0.663 0.    0.    0.   ]
	 G[19, :, 11] [0.698 0.    0.    0.   ]
	 G[19, :, 12] [0.735 0.    0.    0.   ]
	 G[19, :, 13] [0.774 0.    0.    0.   ]
	 G[19, :, 14] [0.815 0.    0.    0.   ]
	 G[19, :, 15] [0.857 0.    0.    0.   ]
	 G[19, :, 16] [0.902 0.    0.    0.   ]
	 G[19, :, 17] [0.95 0.   0.   0.  ]
	 G[19, :, 18] [1. 0. 0. 0.]
	 G[19, :, 19] [0. 0. 0. 0.]
	 G[19, :, 20] [0. 0. 0. 0.]
	 G[19, :, 21] [0. 0. 0. 0.]
	 G[19, :, 22] [0. 0. 0. 0.]
	 G[19, :, 23] [0. 0. 0. 0.]
	 G[19, :, 24] [0. 0. 0. 0.]
	 G[19, :, 25] [

56 a_k not best anywhere (27, 2, 28)
	 a_k: 2
	 G[s_k, :, goal]:
	 G[27, :, 0] [0.277 0.    0.    0.   ]
	 G[27, :, 1] [0.014 0.    0.    0.   ]
	 G[27, :, 2] [0.001 0.    0.    0.   ]
	 G[27, :, 3] [0.307 0.    0.    0.   ]
	 G[27, :, 4] [0.324 0.    0.    0.   ]
	 G[27, :, 5] [0.341 0.    0.    0.   ]
	 G[27, :, 6] [0.358 0.    0.    0.   ]
	 G[27, :, 7] [0.377 0.    0.    0.   ]
	 G[27, :, 8] [0.397 0.    0.    0.   ]
	 G[27, :, 9] [0.418 0.    0.    0.   ]
	 G[27, :, 10] [0.44 0.   0.   0.  ]
	 G[27, :, 11] [0.463 0.    0.    0.   ]
	 G[27, :, 12] [0.488 0.    0.    0.   ]
	 G[27, :, 13] [0.513 0.    0.    0.   ]
	 G[27, :, 14] [0.54 0.   0.   0.  ]
	 G[27, :, 15] [0.569 0.    0.    0.   ]
	 G[27, :, 16] [0.599 0.    0.    0.   ]
	 G[27, :, 17] [0.63 0.   0.   0.  ]
	 G[27, :, 18] [0.663 0.    0.    0.   ]
	 G[27, :, 19] [0.698 0.    0.    0.   ]
	 G[27, :, 20] [0.735 0.    0.    0.   ]
	 G[27, :, 21] [0.774 0.    0.    0.   ]
	 G[27, :, 22] [0.815 0.    0.    0.   ]
	 G[27, :, 23]

In [None]:
# Overview figure: every experience replayed in the linear chamber.
print('First %d replay steps' % num_replay_steps, flush=True)
fig, ax = plt.subplots(figsize=(6, 6))
replayed_as_int = np.array(replayed_experiences).astype(int)
plot_replay(tunnel, replayed_as_int, ax=ax)
plt.show()

# Collapse axis 1 of each replay statistic before plotting.
# NOTE(review): axis 1 presumably indexes goals — confirm against GeodesicAgent.replay.
meta_need = np.mean(needs, axis=1)
meta_gain = np.mean(gains, axis=1)
meta_MEVB = np.mean(all_MEVBs, axis=1)

# One need/gain/MEVB figure per replay step, labelled with that step's backups.
for i in range(num_replay_steps):
    print('step %d:' % i, backups[i], flush=True)
    # Average this step's need over its leading axis, weighted by the
    # start-state distribution.
    start_weighted_need = np.average(meta_need[i], weights=init_state_dist, axis=0)
    plot_need_gain(tunnel, ga.memory, start_weighted_need, meta_gain[i], meta_MEVB[i])
    plt.show()

*Analyze simulated data*

In [None]:
# Create vanilla version of GridWorld
# `GridWorld` is not among the names imported at the top of the notebook, so
# this cell raised NameError on a fresh kernel.
# NOTE(review): presumably the class lives in the `gridworld` module next to
# Arena / Bottleneck / LinearChamber — confirm, and move this import into the
# top import cell once verified.
from gridworld import GridWorld

width = 10
height = 7
goal_states = np.array([width - 1, (height - 1) * width, height * width - 1])  # Non-start corners

# GridWorld parameters
stoch = 0  # Grid stochasticity
num_states = width * height
num_actions = 4

# Start-state distribution: all probability mass on state 0
init_state_distribution = np.zeros(num_states)
init_state_distribution[0] = 1

gw = GridWorld(width, height, stoch, init_state_distribution=init_state_distribution)

In [None]:
# Read the saved simulation archive and pull out the three arrays used below.
# NOTE(review): array shapes and axis meanings are not visible here — check the
# script that wrote arena.npz.
d = np.load('arena.npz')
replay_seqs, state_seqs, goal_seqs = (
    d[array_name] for array_name in ('replay_seqs', 'state_seqs', 'goal_seqs')
)

In [None]:
# Three side-by-side panels for entry [0, 0] of the loaded data:
# replay slice 0, the state trajectory, and replay slice 1.
# NOTE(review): the trailing index 0/1 of replay_seqs presumably separates two
# replay phases around the trajectory — confirm against the data-generating script.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(20, 6))
left_ax, middle_ax, right_ax = axes[0], axes[1], axes[2]

replay_slice_0 = replay_seqs[0, 0, :, :, 0].astype(int)
trajectory = state_seqs[0, 0, :].astype(int)
replay_slice_1 = replay_seqs[0, 0, :, :, 1].astype(int)

plot_replay(gw, replay_slice_0, ax=left_ax)
plot_traj(gw, trajectory, ax=middle_ax)
plot_replay(gw, replay_slice_1, ax=right_ax);
In [None]:
# Show the gain values recorded for the first replay step.
# `gains` is whatever the most recently executed replay cell left behind.
first_step_gains = gains[0, :]
print(first_step_gains)