In [1]:
# TODO
# More efficient state storage (store modalities only once, etc.)
# Add FCL between raw features and the features appended to vector
# Make distance reward env-size and dataset-agnostic (i.e. spawn nodes in range 0-1 (or at origin), normalize dist per dataset (maybe by average dist))
# Try MMD-MA with all features

# Fix off-center positioning in large environments (kinda solved with post-centering?)
# Try using running average early stopping
# Save checkpoint models
# Try with 100s of nodes
# Try transfer learning with stages like stay at origin -> regular dist -> etc.
# Upload episode playback data to wandb
# Add parallel envs of different sizes, with different data to help generality

In [2]:
%load_ext autoreload
%autoreload 2
%env WANDB_NOTEBOOK_NAME train.ipynb
%env WANDB_SILENT true

env: WANDB_NOTEBOOK_NAME=train.ipynb
env: WANDB_SILENT=true


In [3]:
from collections import defaultdict
import itertools
import os

import inept
import numpy as np
import pandas as pd
import torch
import wandb

# Set params
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATA_FOLDER = os.path.join(os.path.abspath(''), '../data')
MODEL_FOLDER = os.path.join(os.path.abspath(''), 'temp/trained_models')

# Script arguments
# import sys
# arg1 = int(sys.argv[1])

In [4]:
# Original paper (pg 24)
# https://arxiv.org/pdf/1909.07528.pdf

# Original blog
# https://openai.com/research/emergent-tool-use

# Gym
# https://gymnasium.farama.org/

# Slides
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf

# PPO implementation
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38

# Residual SA
# https://github.com/openai/multi-agent-emergence-environments/blob/bafaf1e11e6398624116761f91ae7c93b136f395/ma_policy/layers.py#L89

In [5]:
# Reproducibility
seed = 42
torch.manual_seed(seed)
if DEVICE == 'cuda': torch.cuda.manual_seed(seed)
np.random.seed(seed)

note_kwargs = {'seed': seed}

### Create Environment

In [6]:
# MMD-MA data
num_nodes = 50
data_kwargs = {
    'dataset': 'MMD-MA',
    'num_nodes': num_nodes,
}
M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped1.txt'), delimiter='\t', header=None).to_numpy()
M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped2.txt'), delimiter='\t', header=None).to_numpy()
T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type1.txt'), delimiter='\t', header=None).to_numpy()
T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type2.txt'), delimiter='\t', header=None).to_numpy()
# Subsample
idx = np.random.choice(M1.shape[0], num_nodes, replace=False)
idx_1 = np.random.choice(M1.shape[1], 16, replace=False)
idx_2 = np.random.choice(M2.shape[1], 16, replace=False)
M1 = torch.tensor(M1[idx][:, idx_1], dtype=torch.float32, device=DEVICE)
M2 = torch.tensor(M2[idx][:, idx_2], dtype=torch.float32, device=DEVICE)
T1 = torch.tensor(T1[idx], dtype=torch.long, device=DEVICE)
T2 = torch.tensor(T2[idx], dtype=torch.long, device=DEVICE)
modalities = (M1, M2)

# Random data
# num_nodes = 20
# data_kwargs = {
#     'dataset': 'Random',
#     'num_nodes': num_nodes,
# }
# M1 = torch.rand((num_nodes, 8), device=DEVICE)
# M2 = torch.rand((num_nodes, 16), device=DEVICE)
# modalities = (M1, M2)

# Environment
# x, y, vx, vy
num_dims = 2
env_kwargs = {
    'dim': num_dims,
    'pos_bound': 2,
    'vel_bound': 1,
    'delta': .1,
    'reward_distance': 10,
    # 'reward_origin': 1,
    'penalty_bound': 1,
    'penalty_velocity': 1,
    'penalty_action': 1,
    'reward_distance_type': 'euclidean',
}
env = inept.environments.trajectory(*modalities, **env_kwargs, device=DEVICE)

### Train Policy

In [7]:
# Policy parameters
input_dims = 2*num_dims+sum([m.shape[1] for m in modalities])
update_minibatch = int( 4e4 * (10 / num_nodes) )
update_max_batch = 1 * update_minibatch  # Only run one minibatch, empirically the benefit is minimal compared to time loss
policy_kwargs = {
    'num_features_per_node': input_dims,
    'output_dim': num_dims,
    'action_std_init': .6,
    'action_std_decay': .05,
    'action_std_min': .1,
    'actor_lr': 3e-4,
    'critic_lr': 1e-3,
    'lr_gamma': 1,
    'update_minibatch': update_minibatch,  # Based on no minibatches needed with 10 nodes at 4k update timesteps
    'update_max_batch': update_max_batch,  # Try making larger, e.g. 20x minibatches
    'device': DEVICE,
}
policy = inept.models.PPO(**policy_kwargs)

# Training parameters
max_ep_timesteps = 2e2  # 2e2
max_timesteps = 1e6
update_timesteps = 4e3  # 20 * max_ep_timesteps
train_kwargs = {
    'max_ep_timesteps': max_ep_timesteps,
    'max_timesteps': max_timesteps,
    'update_timesteps': update_timesteps,
}

# Early stopping parameters
es_kwargs = {
    'buffer': 3 * int(update_timesteps / max_ep_timesteps),
    'delta': .01,
}
early_stopping = inept.utilities.EarlyStopping(**es_kwargs)

# Initialize wandb
use_wandb = True
if use_wandb: wandb.init(
    project='INEPT',
    config={
        **{'note/'+k:v for k, v in note_kwargs.items()},
        **{'data/'+k:v for k, v in data_kwargs.items()},
        **{'env/'+k:v for k, v in env_kwargs.items()},
        **{'policy/'+k:v for k, v in policy_kwargs.items()},
        **{'train/'+k:v for k, v in train_kwargs.items()},
        **{'es/'+k:v for k, v in es_kwargs.items()},
    },
)

# Initialize logging vars
torch.cuda.reset_peak_memory_stats()
timer = inept.utilities.time_logger(discard_first_sample=True)
timestep = 0; episode = 1

# CLI
print('Beginning training')
print(f'Subsampling {update_max_batch} states with minibatches of size {update_minibatch} from {int(update_timesteps * num_nodes)} total.')

# Simulation loop
while timestep < max_timesteps:
    # Reset environment
    env.reset()
    # env.reward_scales['reward_origin'] = episode / 1e3
    timer.log('Reset Environment')

    # Start episode
    ep_timestep = 0; ep_reward = 0; ep_itemized_reward = defaultdict(lambda: 0)
    while ep_timestep < max_ep_timesteps:
        with torch.no_grad():
            # Get current state
            state = env.get_state(include_modalities=True)

            # Get self features for each node
            self_entity = state

            # Get node features for each state
            idx = torch.zeros((num_nodes, num_nodes), dtype=torch.bool)
            for i, j in itertools.product(*[range(x) for x in idx.shape]):
                idx[i, j] = i!=j
            node_entities = state.unsqueeze(0).expand(num_nodes, *state.shape)
            node_entities = node_entities[idx].reshape(num_nodes, num_nodes-1, input_dims)
            timer.log('Environment Setup')

            # Get actions from policy
            actions = policy.act(self_entity, node_entities).detach()
            timer.log('Calculate Actions')

            # Step environment and get reward
            rewards, finished, itemized_rewards = env.step(actions, return_rewards=True)
            timer.log('Step Environment')

            # Record rewards
            for key in range(num_nodes):
                policy.memory.rewards.append(rewards[key].item())  # Could just add lists
                policy.memory.is_terminals.append(finished)
            ep_reward = ep_reward + rewards.cpu().mean()
            for k, v in itemized_rewards.items():
                ep_itemized_reward[k] += v.cpu().mean()
            timer.log('Record Rewards')

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Update model
        if timestep % update_timesteps == 0:
            print(f'Updating model with average reward {np.mean(policy.memory.rewards)} on episode {episode} and timestep {timestep}', end='')
            policy.update()
            print(f' ({torch.cuda.max_memory_allocated() / 1024**3:.2f} GB CUDA)')
            torch.cuda.reset_peak_memory_stats()
            timer.log('Update Policy')

        # Escape if finished
        if finished: break

    # Upload stats
    ep_reward = (ep_reward / ep_timestep).item()
    if use_wandb:
        wandb.log({
            **{
            'episode': episode,
            'update': int(timestep / update_timesteps),
            'end_timestep': timestep,
            'average_reward': ep_reward,
            'action_std': policy.action_std,
            },
            **{'rewards/'+k: (v / ep_timestep).item() for k, v in ep_itemized_reward.items()},
        })
    timer.log('Record Stats')

    # Decay model std
    if early_stopping(ep_reward):
        # End if already at minimum
        if policy.action_std <= policy.action_std_min:
            print(f'Ending early on episode {episode} and timestep {timestep}')
            break

        # Decay and reset early stop
        policy.decay_action_std()
        early_stopping.reset()

        # CLI
        print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')
    timer.log('Early Stopping')

    # Iterate
    episode += 1

# CLI Timer
print()
timer.aggregate('sum')

# Save model
wgt_file = os.path.join(MODEL_FOLDER, 'policy.wgt')
torch.save(policy.state_dict(), wgt_file)  # Save just weights
if use_wandb: wandb.save(wgt_file)
mdl_file = os.path.join(MODEL_FOLDER, 'policy.mdl')
torch.save(policy, mdl_file)  # Save whole model
if use_wandb: wandb.save(mdl_file)

# Finish wandb
if use_wandb: wandb.finish()

Beginning training
Subsampling 8000 states with minibatches of size 8000 from 200000 total.


Updating model with average reward -1.3917209325961886 on episode 20 and timestep 4000

 (3.79 GB CUDA)


Updating model with average reward -1.3151680687619234 on episode 40 and timestep 8000

 (3.79 GB CUDA)


Updating model with average reward -1.2382298703477916 on episode 60 and timestep 12000

 (3.79 GB CUDA)


Updating model with average reward -1.152881999494515 on episode 80 and timestep 16000

 (3.79 GB CUDA)


Updating model with average reward -1.0521632170345996 on episode 100 and timestep 20000

 (3.79 GB CUDA)


Updating model with average reward -0.9886831611302821 on episode 120 and timestep 24000

 (3.79 GB CUDA)


Updating model with average reward -0.9196926702908398 on episode 140 and timestep 28000

 (3.79 GB CUDA)


Updating model with average reward -0.8428780927985988 on episode 160 and timestep 32000

 (3.79 GB CUDA)


Updating model with average reward -0.7481078351171023 on episode 180 and timestep 36000

 (3.79 GB CUDA)


Updating model with average reward -0.6709775609658979 on episode 200 and timestep 40000

 (3.79 GB CUDA)


Updating model with average reward -0.5877676838175167 on episode 220 and timestep 44000

 (3.79 GB CUDA)


Updating model with average reward -0.5035580485002283 on episode 240 and timestep 48000

 (3.79 GB CUDA)


Updating model with average reward -0.4576468310356146 on episode 260 and timestep 52000

 (3.79 GB CUDA)


Updating model with average reward -0.4236286136259636 on episode 280 and timestep 56000

 (3.79 GB CUDA)


Updating model with average reward -0.3939922519378917 on episode 300 and timestep 60000

 (3.79 GB CUDA)


Updating model with average reward -0.3795537867309595 on episode 320 and timestep 64000

 (3.79 GB CUDA)


Updating model with average reward -0.3626377503960341 on episode 340 and timestep 68000

 (3.79 GB CUDA)


Updating model with average reward -0.3436570979552246 on episode 360 and timestep 72000

 (3.79 GB CUDA)


Updating model with average reward -0.3415568515166969 on episode 380 and timestep 76000

 (3.79 GB CUDA)


Updating model with average reward -0.32636265260663744 on episode 400 and timestep 80000

 (3.79 GB CUDA)


Updating model with average reward -0.3267465251421464 on episode 420 and timestep 84000

 (3.79 GB CUDA)


Updating model with average reward -0.31719898975027117 on episode 440 and timestep 88000

 (3.79 GB CUDA)


Decaying std to 0.5499999999999999 on episode 441 and timestep 88200


Updating model with average reward -0.25575230960073575 on episode 460 and timestep 92000

 (3.79 GB CUDA)


Updating model with average reward -0.24725818082189965 on episode 480 and timestep 96000

 (3.79 GB CUDA)


Updating model with average reward -0.24865575973629123 on episode 500 and timestep 100000

 (3.79 GB CUDA)


Decaying std to 0.49999999999999994 on episode 512 and timestep 102400


Updating model with average reward -0.2250454308072952 on episode 520 and timestep 104000

 (3.79 GB CUDA)


Updating model with average reward -0.186052550865803 on episode 540 and timestep 108000

 (3.79 GB CUDA)


Updating model with average reward -0.18970456164538352 on episode 560 and timestep 112000

 (3.79 GB CUDA)


Updating model with average reward -0.19333805921990818 on episode 580 and timestep 116000

 (3.79 GB CUDA)


Decaying std to 0.44999999999999996 on episode 581 and timestep 116200


Updating model with average reward -0.13412317971113633 on episode 600 and timestep 120000

 (3.79 GB CUDA)


Updating model with average reward -0.13373263429741492 on episode 620 and timestep 124000

 (3.79 GB CUDA)


Updating model with average reward -0.13411498940484184 on episode 640 and timestep 128000

 (3.79 GB CUDA)


Decaying std to 0.39999999999999997 on episode 643 and timestep 128600


Updating model with average reward -0.09749904959264385 on episode 660 and timestep 132000

 (3.79 GB CUDA)


Updating model with average reward -0.08214718012809062 on episode 680 and timestep 136000

 (3.79 GB CUDA)


Updating model with average reward -0.07973560899763427 on episode 700 and timestep 140000

 (3.79 GB CUDA)


Updating model with average reward -0.08914491416471748 on episode 720 and timestep 144000

 (3.79 GB CUDA)


Updating model with average reward -0.08232390026858957 on episode 740 and timestep 148000

 (3.79 GB CUDA)


Updating model with average reward -0.08274717580577656 on episode 760 and timestep 152000

 (3.79 GB CUDA)


Decaying std to 0.35 on episode 771 and timestep 154200


Updating model with average reward -0.0667399663067986 on episode 780 and timestep 156000

 (3.79 GB CUDA)


Updating model with average reward -0.0406938525407284 on episode 800 and timestep 160000

 (3.79 GB CUDA)


Updating model with average reward -0.03858389412840908 on episode 820 and timestep 164000

 (3.79 GB CUDA)


Updating model with average reward -0.036193509088624884 on episode 840 and timestep 168000

 (3.79 GB CUDA)


Decaying std to 0.3 on episode 845 and timestep 169000


Updating model with average reward -0.013247834154431839 on episode 860 and timestep 172000

 (3.79 GB CUDA)


Updating model with average reward -0.0007127597328264528 on episode 880 and timestep 176000

 (3.79 GB CUDA)


Updating model with average reward -0.0040380415998672835 on episode 900 and timestep 180000

 (3.79 GB CUDA)


Decaying std to 0.25 on episode 914 and timestep 182800


Updating model with average reward 0.005193568981504359 on episode 920 and timestep 184000

 (3.79 GB CUDA)


Updating model with average reward 0.020094097247816245 on episode 940 and timestep 188000

 (3.79 GB CUDA)


Updating model with average reward 0.029007319567891827 on episode 960 and timestep 192000

 (3.79 GB CUDA)


Updating model with average reward 0.02079581021755872 on episode 980 and timestep 196000

 (3.79 GB CUDA)


Decaying std to 0.2 on episode 983 and timestep 196600


Updating model with average reward 0.04703338525587955 on episode 1000 and timestep 200000

 (3.79 GB CUDA)


Updating model with average reward 0.051210108835136416 on episode 1020 and timestep 204000

 (3.79 GB CUDA)


Updating model with average reward 0.051174396016751125 on episode 1040 and timestep 208000

 (3.79 GB CUDA)


Updating model with average reward 0.051247948595122925 on episode 1060 and timestep 212000

 (3.79 GB CUDA)


Decaying std to 0.15000000000000002 on episode 1067 and timestep 213400


Updating model with average reward 0.0584429944379257 on episode 1080 and timestep 216000

 (3.79 GB CUDA)


Updating model with average reward 0.0729569145752331 on episode 1100 and timestep 220000

 (3.79 GB CUDA)


Updating model with average reward 0.06415593391028912 on episode 1120 and timestep 224000

 (3.79 GB CUDA)


Updating model with average reward 0.07593103827403405 on episode 1140 and timestep 228000

 (3.79 GB CUDA)


Updating model with average reward 0.07698732997552582 on episode 1160 and timestep 232000

 (3.79 GB CUDA)


Updating model with average reward 0.06254915316787839 on episode 1180 and timestep 236000

 (3.79 GB CUDA)


Decaying std to 0.10000000000000002 on episode 1195 and timestep 239000


Updating model with average reward 0.07451648730699564 on episode 1200 and timestep 240000

 (3.79 GB CUDA)


Updating model with average reward 0.08041215548045433 on episode 1220 and timestep 244000

 (3.79 GB CUDA)


Updating model with average reward 0.08196720699148116 on episode 1240 and timestep 248000

 (3.79 GB CUDA)


Updating model with average reward 0.08163264575361687 on episode 1260 and timestep 252000

 (3.79 GB CUDA)


Updating model with average reward 0.08303656767645534 on episode 1280 and timestep 256000

 (3.79 GB CUDA)


Decaying std to 0.1 on episode 1286 and timestep 257200


Updating model with average reward 0.07996932511244811 on episode 1300 and timestep 260000

 (3.79 GB CUDA)


Updating model with average reward 0.08729848490672615 on episode 1320 and timestep 264000

 (3.79 GB CUDA)


Updating model with average reward 0.09342098058841584 on episode 1340 and timestep 268000

 (3.79 GB CUDA)


Ending early on episode 1356 and timestep 271200

Reset Environment: 0.16404553500251495
Environment Setup: 2973.672982980039
Calculate Actions: 1695.3234914082823
Step Environment: 195.54342375727356


Record Rewards: 186.03955030623183
Record Stats: 0.44416088401703746
Early Stopping: 0.01824922999367118
Update Policy: 2087.196198100981
Total: 7138.4021022018205
