In [1]:
# TODO
# Add key argument in forward which is required for memories
# Add static prefix for memory based on key
# Reduce redundancy in state storage (now data is duplicated 'nodes' # of times), maybe add indexing function to memorybuffer
# If memory still used too much, implement file-storage

# Add FCL between raw features and the features appended to vector
# Make distance reward env-size and dataset-agnostic (i.e. spawn nodes in range 0-1 (or at origin), normalize dist per dataset (maybe by average dist))
# Check that rewards are normalized after (?) advantage

# Randomize positions within -1 to 1, no matter the environment size
# Fix off-center positioning in large environments (kinda solved with post-centering?)
# Revise distance reward
# Try using running average early stopping
# Save checkpoint models

# Try full MMD-MA data
# Try real data
# Add parallel envs of different sizes, with different data to help generality

In [2]:
%load_ext autoreload
%autoreload 2
%env WANDB_NOTEBOOK_NAME train.ipynb
%env WANDB_SILENT true

env: WANDB_NOTEBOOK_NAME=train.ipynb
env: WANDB_SILENT=true


In [3]:
from collections import defaultdict
import itertools
import os

import inept
import numpy as np
import pandas as pd
import torch
import wandb

# Set params
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DATA_FOLDER = os.path.join(os.path.abspath(''), '../data')
MODEL_FOLDER = os.path.join(os.path.abspath(''), 'temp/trained_models')

# Script arguments
# import sys
# arg1 = int(sys.argv[1])

In [4]:
# Original paper (pg 24)
# https://arxiv.org/pdf/1909.07528.pdf

# Original blog
# https://openai.com/research/emergent-tool-use

# Gym
# https://gymnasium.farama.org/

# Slides
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf

# PPO implementation
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38

# Residual SA
# https://github.com/openai/multi-agent-emergence-environments/blob/bafaf1e11e6398624116761f91ae7c93b136f395/ma_policy/layers.py#L89

In [5]:
# Reproducibility
seed = 42
torch.manual_seed(seed)
if DEVICE == 'cuda': torch.cuda.manual_seed(seed)
np.random.seed(seed)

note_kwargs = {'seed': seed}

### Create Environment

In [6]:
# scGEM data
dataset_name = 'scGEM'
M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/GeneExpression.txt'), delimiter=' ', header=None).to_numpy()
M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/DNAmethylation.txt'), delimiter=' ', header=None).to_numpy()
T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type1.txt'), delimiter=' ', header=None).to_numpy()
T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type2.txt'), delimiter=' ', header=None).to_numpy()
F1 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/gex_names.txt'), dtype='str')
F2 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/dm_names.txt'), dtype='str')

# MMD-MA data
# dataset_name = 'MMD-MA'
# M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped1.txt'), delimiter='\t', header=None).to_numpy()
# M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped2.txt'), delimiter='\t', header=None).to_numpy()
# T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type1.txt'), delimiter='\t', header=None).to_numpy()
# T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type2.txt'), delimiter='\t', header=None).to_numpy()

# Normalize
M1 = np.nan_to_num((M1 - M1.mean(axis=0)) / M1.std(axis=0)) / 5
M2 = np.nan_to_num((M2 - M2.mean(axis=0)) / M2.std(axis=0)) / 5

# Subsample nodes
num_nodes = 50
idx = np.random.choice(M1.shape[0], num_nodes, replace=False)
M1 = torch.tensor(M1[idx], dtype=torch.float32, device=DEVICE)
M2 = torch.tensor(M2[idx], dtype=torch.float32, device=DEVICE)
T1 = torch.tensor(T1[idx], dtype=torch.long, device=DEVICE)
T2 = torch.tensor(T2[idx], dtype=torch.long, device=DEVICE)

# Subsample features
num_features = (16, 16)
idx_1 = np.random.choice(M1.shape[1], num_features[0], replace=False)
idx_2 = np.random.choice(M2.shape[1], num_features[1], replace=False)
M1 = torch.tensor(M1[:, idx_1], dtype=torch.float32, device=DEVICE)
M2 = torch.tensor(M2[:, idx_2], dtype=torch.float32, device=DEVICE)

# Random data
# dataset_name = 'Random'
# num_nodes = 20
# M1 = torch.rand((num_nodes, 8), device=DEVICE)
# M2 = torch.rand((num_nodes, 16), device=DEVICE)

# Record data kwargs
modalities = (M1, M2)
data_kwargs = {
    'dataset': dataset_name,
    'num_nodes': num_nodes,
}

  M1 = np.nan_to_num((M1 - M1.mean(axis=0)) / M1.std(axis=0)) / 5
  M1 = torch.tensor(M1[:, idx_1], dtype=torch.float32, device=DEVICE)
  M2 = torch.tensor(M2[:, idx_2], dtype=torch.float32, device=DEVICE)


In [7]:
# Environment
# x, y, vx, vy
num_dims = 2
env_kwargs = {
    'dim': num_dims,
    'pos_bound': 2,
    'vel_bound': 1,
    'delta': .1,
    'reward_distance': 10,
    # 'reward_origin': 1,
    'penalty_bound': 1,
    'penalty_velocity': 1,
    'penalty_action': 1,
    'reward_distance_type': 'euclidean',
}
env = inept.environments.trajectory(*modalities, **env_kwargs, device=DEVICE)

### Train Policy

In [8]:
# Tracking parameters
# Use `watch -d -n 0.5 nvidia-smi` to watch CUDA memory usage
# Use `top` to watch system memory usage
use_wandb = True

# Policy parameters
input_dims = 2*num_dims+sum([m.shape[1] for m in modalities])
batch_split_factor = 1  # Set to high if large number of features
update_minibatch = int( 4e4 * (10 / num_nodes) / batch_split_factor )
update_max_batch = batch_split_factor * update_minibatch  # Only run one minibatch, empirically the benefit is minimal compared to time loss
policy_kwargs = {
    'num_features_per_node': input_dims,
    'output_dim': num_dims,
    'action_std_init': .6,
    'action_std_decay': .05,
    'action_std_min': .1,
    'actor_lr': 3e-4,
    'critic_lr': 1e-3,
    'lr_gamma': 1,
    'update_minibatch': update_minibatch,  # Based on no minibatches needed with 10 nodes at 4k update timesteps
    'update_max_batch': update_max_batch,  # Try making larger, e.g. 20x minibatches
    'device': DEVICE,
}
policy = inept.models.PPO(**policy_kwargs)

# Training parameters
max_ep_timesteps = 2e2  # 2e2
max_timesteps = 1e6
update_timesteps = 4e3  # 20 * max_ep_timesteps
train_kwargs = {
    'max_ep_timesteps': max_ep_timesteps,
    'max_timesteps': max_timesteps,
    'update_timesteps': update_timesteps,
}

# Early stopping parameters
es_kwargs = {
    'buffer': 3 * int(update_timesteps / max_ep_timesteps),
    'delta': .01,
}
early_stopping = inept.utilities.EarlyStopping(**es_kwargs)

# Initialize wandb
if use_wandb: wandb.init(
    project='INEPT',
    config={
        **{'note/'+k:v for k, v in note_kwargs.items()},
        **{'data/'+k:v for k, v in data_kwargs.items()},
        **{'env/'+k:v for k, v in env_kwargs.items()},
        **{'policy/'+k:v for k, v in policy_kwargs.items()},
        **{'train/'+k:v for k, v in train_kwargs.items()},
        **{'es/'+k:v for k, v in es_kwargs.items()},
    },
)

# Initialize logging vars
torch.cuda.reset_peak_memory_stats()
timer = inept.utilities.time_logger(discard_first_sample=True)
timestep = 0; episode = 1

# CLI
print('Beginning training')
print(f'Subsampling {update_max_batch} states with minibatches of size {update_minibatch} from {int(update_timesteps * num_nodes)} total.')

# Simulation loop
while timestep < max_timesteps:
    # Reset environment
    env.reset()
    # env.reward_scales['reward_origin'] = episode / 1e3
    timer.log('Reset Environment')

    # Start episode
    ep_timestep = 0; ep_reward = 0; ep_itemized_reward = defaultdict(lambda: 0)
    while ep_timestep < max_ep_timesteps:
        with torch.no_grad():
            # Get current state
            state = env.get_state(include_modalities=True)

            # Get self features for each node
            self_entity = state

            # Get node features for each state
            idx = torch.zeros((num_nodes, num_nodes), dtype=torch.bool)
            for i, j in itertools.product(*[range(x) for x in idx.shape]):
                idx[i, j] = i!=j
            node_entities = state.unsqueeze(0).expand(num_nodes, *state.shape)
            node_entities = node_entities[idx].reshape(num_nodes, num_nodes-1, input_dims)
            timer.log('Environment Setup')

            # Get actions from policy
            actions = policy.act(self_entity, node_entities).detach()
            timer.log('Calculate Actions')

            # Step environment and get reward
            rewards, finished, itemized_rewards = env.step(actions, return_rewards=True)
            timer.log('Step Environment')

            # Record rewards
            for key in range(num_nodes):
                policy.memory.rewards.append(rewards[key].item())  # Could just add lists
                policy.memory.is_terminals.append(finished)
            ep_reward = ep_reward + rewards.cpu().mean()
            for k, v in itemized_rewards.items():
                ep_itemized_reward[k] += v.cpu().mean()
            timer.log('Record Rewards')

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Update model
        if timestep % update_timesteps == 0:
            print(f'Updating model with average reward {np.mean(policy.memory.rewards)} on episode {episode} and timestep {timestep}', end='')
            policy.update()
            print(f' ({torch.cuda.max_memory_allocated() / 1024**3:.2f} GB CUDA)')
            torch.cuda.reset_peak_memory_stats()
            timer.log('Update Policy')

        # Escape if finished
        if finished: break

    # Upload stats
    ep_reward = (ep_reward / ep_timestep).item()
    if use_wandb:
        wandb.log({
            **{
            'episode': episode,
            'update': int(timestep / update_timesteps),
            'end_timestep': timestep,
            'average_reward': ep_reward,
            'action_std': policy.action_std,
            },
            **{'rewards/'+k: (v / ep_timestep).item() for k, v in ep_itemized_reward.items()},
        })
    timer.log('Record Stats')

    # Decay model std
    if early_stopping(ep_reward):
        # End if already at minimum
        if policy.action_std <= policy.action_std_min:
            print(f'Ending early on episode {episode} and timestep {timestep}')
            break

        # Decay and reset early stop
        policy.decay_action_std()
        early_stopping.reset()

        # CLI
        print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')
    timer.log('Early Stopping')

    # Iterate
    episode += 1

# CLI Timer
print()
timer.aggregate('sum')

# Save model
wgt_file = os.path.join(MODEL_FOLDER, 'policy.wgt')
torch.save(policy.state_dict(), wgt_file)  # Save just weights
if use_wandb: wandb.save(wgt_file)
mdl_file = os.path.join(MODEL_FOLDER, 'policy.mdl')
torch.save(policy, mdl_file)  # Save whole model
if use_wandb: wandb.save(mdl_file)

# Finish wandb
if use_wandb: wandb.finish()

Beginning training
Subsampling 8000 states with minibatches of size 8000 from 200000 total.


Updating model with average reward -1.4861794186462671 on episode 20 and timestep 4000

 (3.79 GB CUDA)


Updating model with average reward -1.3684505920995385 on episode 40 and timestep 8000

 (3.79 GB CUDA)


Updating model with average reward -1.2659886709387018 on episode 60 and timestep 12000

 (3.79 GB CUDA)


Updating model with average reward -1.155364594862834 on episode 80 and timestep 16000

 (3.79 GB CUDA)


Updating model with average reward -1.05165240073125 on episode 100 and timestep 20000

 (3.79 GB CUDA)


Updating model with average reward -0.9650112656755045 on episode 120 and timestep 24000

 (3.79 GB CUDA)


Updating model with average reward -0.8874917128633661 on episode 140 and timestep 28000

 (3.79 GB CUDA)


Updating model with average reward -0.7813832676538564 on episode 160 and timestep 32000

 (3.79 GB CUDA)


Updating model with average reward -0.7074593216814276 on episode 180 and timestep 36000

 (3.79 GB CUDA)


Updating model with average reward -0.6405917290770551 on episode 200 and timestep 40000

 (3.79 GB CUDA)


Updating model with average reward -0.5893686301062501 on episode 220 and timestep 44000

 (3.79 GB CUDA)


Updating model with average reward -0.5402739837848091 on episode 240 and timestep 48000

 (3.79 GB CUDA)


Updating model with average reward -0.4987224031773764 on episode 260 and timestep 52000

 (3.79 GB CUDA)


Updating model with average reward -0.47500457889845243 on episode 280 and timestep 56000

 (3.79 GB CUDA)


Updating model with average reward -0.4429691364530717 on episode 300 and timestep 60000

 (3.79 GB CUDA)


Updating model with average reward -0.43560437256601814 on episode 320 and timestep 64000

 (3.79 GB CUDA)


Updating model with average reward -0.42094385071213136 on episode 340 and timestep 68000

 (3.79 GB CUDA)


Updating model with average reward -0.4051438178902672 on episode 360 and timestep 72000

 (3.79 GB CUDA)


Updating model with average reward -0.4010373742381498 on episode 380 and timestep 76000

 (3.79 GB CUDA)


Updating model with average reward -0.3873332599363249 on episode 400 and timestep 80000

 (3.79 GB CUDA)


Decaying std to 0.5499999999999999 on episode 419 and timestep 83800


Updating model with average reward -0.376677226895639 on episode 420 and timestep 84000

 (3.79 GB CUDA)


Updating model with average reward -0.307500339882572 on episode 440 and timestep 88000

 (3.79 GB CUDA)


Updating model with average reward -0.30659127881432124 on episode 460 and timestep 92000

 (3.79 GB CUDA)


Updating model with average reward -0.2989338789848215 on episode 480 and timestep 96000

 (3.79 GB CUDA)


Updating model with average reward -0.29318601153314433 on episode 500 and timestep 100000

 (3.79 GB CUDA)


Updating model with average reward -0.2889223947134543 on episode 520 and timestep 104000

 (3.79 GB CUDA)


Updating model with average reward -0.2846068095700376 on episode 540 and timestep 108000

 (3.79 GB CUDA)


Updating model with average reward -0.28631953373505575 on episode 560 and timestep 112000

 (3.79 GB CUDA)


Updating model with average reward -0.28947932896591216 on episode 580 and timestep 116000

 (3.79 GB CUDA)


Updating model with average reward -0.2803184054872896 on episode 600 and timestep 120000

 (3.79 GB CUDA)


Updating model with average reward -0.2836348638105657 on episode 620 and timestep 124000

 (3.79 GB CUDA)


Updating model with average reward -0.28161024295606185 on episode 640 and timestep 128000

 (3.79 GB CUDA)


Decaying std to 0.49999999999999994 on episode 643 and timestep 128600


Updating model with average reward -0.23673063935746075 on episode 660 and timestep 132000

 (3.79 GB CUDA)


Updating model with average reward -0.21871966160509154 on episode 680 and timestep 136000

 (3.79 GB CUDA)


Updating model with average reward -0.21662808454334884 on episode 700 and timestep 140000

 (3.79 GB CUDA)


Updating model with average reward -0.22339909896758348 on episode 720 and timestep 144000

 (3.79 GB CUDA)


Decaying std to 0.44999999999999996 on episode 732 and timestep 146400


Updating model with average reward -0.19668116914405684 on episode 740 and timestep 148000

 (3.79 GB CUDA)


Updating model with average reward -0.16563468324176792 on episode 760 and timestep 152000

 (3.79 GB CUDA)


Updating model with average reward -0.16843152744999598 on episode 780 and timestep 156000

 (3.79 GB CUDA)


Decaying std to 0.39999999999999997 on episode 799 and timestep 159800


Updating model with average reward -0.1625951261600192 on episode 800 and timestep 160000

 (3.79 GB CUDA)


Updating model with average reward -0.11553886100500858 on episode 820 and timestep 164000

 (3.79 GB CUDA)


Updating model with average reward -0.11309662544666593 on episode 840 and timestep 168000

 (3.79 GB CUDA)


Updating model with average reward -0.117256117599561 on episode 860 and timestep 172000

 (3.79 GB CUDA)


Updating model with average reward -0.11456388843789755 on episode 880 and timestep 176000

 (3.79 GB CUDA)


Decaying std to 0.35 on episode 892 and timestep 178400


Updating model with average reward -0.1000505280826243 on episode 900 and timestep 180000

 (3.79 GB CUDA)


Updating model with average reward -0.07362657142354885 on episode 920 and timestep 184000

 (3.79 GB CUDA)


Updating model with average reward -0.08074709075358849 on episode 940 and timestep 188000

 (3.79 GB CUDA)


Decaying std to 0.3 on episode 958 and timestep 191600


Updating model with average reward -0.06934643481986368 on episode 960 and timestep 192000

 (3.79 GB CUDA)


Updating model with average reward -0.04404377976948781 on episode 980 and timestep 196000

 (3.79 GB CUDA)


Updating model with average reward -0.03908491041554131 on episode 1000 and timestep 200000

 (3.79 GB CUDA)


Updating model with average reward -0.03889067292736021 on episode 1020 and timestep 204000

 (3.79 GB CUDA)


Updating model with average reward -0.03780826585992966 on episode 1040 and timestep 208000

 (3.79 GB CUDA)


Decaying std to 0.25 on episode 1044 and timestep 208800


Updating model with average reward -0.013490591471528932 on episode 1060 and timestep 212000

 (3.79 GB CUDA)


Updating model with average reward -0.011732081596754942 on episode 1080 and timestep 216000

 (3.79 GB CUDA)


Updating model with average reward -0.005370778514915455 on episode 1100 and timestep 220000

 (3.79 GB CUDA)


Decaying std to 0.2 on episode 1109 and timestep 221800


Updating model with average reward 0.0018402363342591252 on episode 1120 and timestep 224000

 (3.79 GB CUDA)


Updating model with average reward 0.023794792113746917 on episode 1140 and timestep 228000

 (3.79 GB CUDA)


Updating model with average reward 0.024287385194749822 on episode 1160 and timestep 232000

 (3.79 GB CUDA)


Updating model with average reward 0.01150873805789908 on episode 1180 and timestep 236000

 (3.79 GB CUDA)


Decaying std to 0.15000000000000002 on episode 1195 and timestep 239000


Updating model with average reward 0.024146723235422413 on episode 1200 and timestep 240000

 (3.79 GB CUDA)


Updating model with average reward 0.03511320679680656 on episode 1220 and timestep 244000

 (3.79 GB CUDA)


Updating model with average reward 0.035516927432864304 on episode 1240 and timestep 248000

 (3.79 GB CUDA)


Updating model with average reward 0.035993477735385505 on episode 1260 and timestep 252000

 (3.79 GB CUDA)


Decaying std to 0.10000000000000002 on episode 1279 and timestep 255800


Updating model with average reward 0.03610493618437096 on episode 1280 and timestep 256000

 (3.79 GB CUDA)


Updating model with average reward 0.0471286198416561 on episode 1300 and timestep 260000

 (3.79 GB CUDA)


Updating model with average reward 0.05033959953889232 on episode 1320 and timestep 264000

 (3.79 GB CUDA)


Updating model with average reward 0.05540788958987853 on episode 1340 and timestep 268000

 (3.79 GB CUDA)


Decaying std to 0.1 on episode 1356 and timestep 271200


Updating model with average reward 0.05324814108510014 on episode 1360 and timestep 272000

 (3.79 GB CUDA)


Updating model with average reward 0.052663276845081784 on episode 1380 and timestep 276000

 (3.79 GB CUDA)


Updating model with average reward 0.05660696620069655 on episode 1400 and timestep 280000

 (3.79 GB CUDA)


Updating model with average reward 0.050254890631361106 on episode 1420 and timestep 284000

 (3.79 GB CUDA)


Ending early on episode 1436 and timestep 287200

Reset Environment: 0.17257263987266924
Environment Setup: 3297.6846694385167
Calculate Actions: 1684.7094423648814


Step Environment: 205.5562401945208
Record Rewards: 178.18715313302528
Record Stats: 0.48038218487636186
Early Stopping: 0.018218705896288157
Update Policy: 2098.469653935972
Total: 7465.2783325975615
