In [1]:
# Use the interactive notebook backend so figures render as live widgets.
%matplotlib notebook
# Automatically reload imported modules before each cell executes, so edits
# to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2


In [2]:
import itertools

import inept
import matplotlib.animation as animation
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch


# Global plotting theme applied to every figure in this notebook
sns.set_theme(
    context='paper',
    style='white',
    palette='Set2',
)

In [3]:
# Resources
# https://openai.com/research/emergent-tool-use
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38


In [4]:
# Reproducibility: seed every random number generator used below with one value
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

### Create Environment

In [5]:
# Params
num_nodes, num_dims = 5, 2

# Data: two randomly generated modalities with one row per node
# (M1 has a single feature column, M2 has two)
M1 = torch.rand(num_nodes, 1)
M2 = torch.rand(num_nodes, 2)

# Environment built from both modalities
# Per-node state layout noted by the author: x, y, vx, vy
env = inept.environments.trajectory(M1, M2, dim=num_dims)

### Train Policy

In [6]:
# Make policy
# Width of one node's feature vector: 2*num_dims state entries plus the
# columns of both modalities (the state is requested with modalities included)
input_dims = 2*num_dims+M1.shape[1]+M2.shape[1]
policy = inept.models.PPO(input_dims, num_dims)

# Parameters
# Step counts are integers so the modulo/equality checks below are exact and
# the values can be used directly for slicing.
max_ep_timesteps = 1_000
max_timesteps = 100_000  # 1_000_000
update_timesteps = 4 * max_ep_timesteps
# NOTE: this exceeds max_timesteps, so with the current settings the std
# decay branch below never fires.
decay_timesteps = 250_000

# Mask selecting, for every node, all *other* nodes (True where i != j).
# It depends only on the node count, so it is built once here rather than
# being refilled element-by-element on every timestep.
others_mask = ~torch.eye(num_nodes, dtype=torch.bool)

# Simulation loop
print('Beginning training')
timestep = 0
episode = 0
states = []  # one recorded state per timestep, across all episodes
while timestep < max_timesteps:
    # Reset environment
    env.reset()

    # Start episode
    ep_timestep = 0
    while ep_timestep < max_ep_timesteps:
        # Get current state (one row per node) and record it for the animation
        state = env.get_state(include_modalities=True)
        states.append(state)

        # Get self features for each node
        self_entity = state

        # Get node features for each state: replicate the state once per node,
        # then drop each node's own row, leaving its num_nodes-1 neighbours
        node_entities = state.unsqueeze(0).expand(num_nodes, *state.shape)
        node_entities = node_entities[others_mask].reshape(num_nodes, num_nodes-1, input_dims)

        # Get actions from policy
        actions = policy.act(self_entity, node_entities)

        # Update velocities
        env.add_velocities(env.delta * actions)

        # Step environment and get reward: positive when a node's distance
        # from the origin decreased during this step
        distances_old = env.get_distance_from_origin()
        env.step()
        distances = env.get_distance_from_origin()
        rewards = distances_old - distances
        # NOTE(review): the flag is recorded, but the episode is not cut short
        # when env.finished() is true - confirm this is intended.
        finished = env.finished() or (ep_timestep == max_ep_timesteps-1)

        # Record rewards: one reward and one terminal flag per node
        for node in range(state.shape[0]):
            policy.memory.rewards.append(rewards[node].item())
            policy.memory.is_terminals.append(finished)

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Decay model std
        if timestep % decay_timesteps == 0:
            policy.decay_action_std()
            print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')

        # Update model
        if timestep % update_timesteps == 0:
            print(f'Updating model with average reward {np.mean(policy.memory.rewards)} on episode {episode} and timestep {timestep}')
            policy.update()

    # Iterate
    episode += 1

# Format states for animation: stack the per-timestep states along a new
# leading axis
states = np.stack(states, axis=0)

Beginning training
Updating model with average reward -0.0005281682980246842 on episode 3 and timestep 4000
Updating model with average reward -0.00038950182534754275 on episode 7 and timestep 8000
Updating model with average reward -0.00036637189164757727 on episode 11 and timestep 12000
Updating model with average reward -0.0004350326017476618 on episode 15 and timestep 16000
Updating model with average reward -0.00031166426334530115 on episode 19 and timestep 20000
Updating model with average reward -0.0002710308206267655 on episode 23 and timestep 24000
Updating model with average reward -0.00021300027668476104 on episode 27 and timestep 28000
Updating model with average reward -9.209200674667955e-05 on episode 31 and timestep 32000
Updating model with average reward 2.9203627118840813e-05 on episode 35 and timestep 36000
Updating model with average reward 3.7803596118465067e-05 on episode 39 and timestep 40000
Updating model with average reward 2.4023803416639565e-07 on episode 43

### Animate Latent Space

In [11]:
from IPython.display import HTML

# Subset: the first tenth of the last max_ep_timesteps recorded states
# (i.e. the start of the final episode)
anim_states = states[-int(max_ep_timesteps):-int(max_ep_timesteps-max_ep_timesteps/10)]

# Create figure
fig, ax = plt.subplots()

# Initial scatter of the first num_dims features of every node
sct = ax.scatter(*anim_states[0, :, :num_dims].T)

# Format: the view is fixed for the whole animation, so it is configured once
# here instead of being re-applied on every frame
ax.set_aspect('equal')
ax.set_xlim([-1.2, 1.2])
ax.set_ylim([-1.2, 1.2])


# Update function
def update(frame):
    """Move the scatter points to the node positions of ``frame``.

    Parameters
    ----------
    frame : int
        Index into the first axis of ``anim_states``.

    Returns
    -------
    tuple
        The modified artists, as an iterable, which is the form
        ``FuncAnimation`` expects from its ``func`` callback.
    """
    sct.set_offsets(anim_states[frame, :, :num_dims])
    return (sct,)


# Run animation
# `interval` is in milliseconds; env.delta is presumably the simulation step
# in seconds (TODO confirm), and `timescale` is the playback speed-up factor
timescale = 3
ani = animation.FuncAnimation(fig=fig, func=update, frames=anim_states.shape[0], interval=(1/timescale)*1000*env.delta)

# Save animation
# Request the Pillow writer explicitly: the default ffmpeg writer is not
# guaranteed to be installed and otherwise triggers a fallback warning
ani.save('sample.gif', writer='pillow')

# Show animation
HTML(ani.to_jshtml())

<IPython.core.display.Javascript object>

MovieWriter ffmpeg unavailable; using Pillow instead.


In [None]:
# TODO
# Record avg reward and timesteps in a plot
# Undo `selfish` argument in model calculation
# Add activations to regular model
# Make reward function on real data
# Write README