In [1]:
## Checks
# Check whether rewards are normalized before or after the advantage is computed

## Improvements
# Fix off-center positioning in large environments
# Revise distance reward - Maybe add cell attraction (all should be close to each other) and repulsion (repulsion based on distance in modality)
# Revise velocity and action penalties to encourage early cell-type separation (e.g. sqrt of vector length or similar)
# Try using running average early stopping
# Add parallel envs of different sizes, with different data to help generality

## QOL
# Save every time early stopping occurs

## Runs
# Try full real data

In [2]:
# Reload imported modules automatically before each cell execution (mode 2 = all modules)
%load_ext autoreload
%autoreload 2
# Environment variables for the wandb logger: the notebook name to report and,
# presumably, suppression of wandb's console output — confirm against the wandb docs
%env WANDB_NOTEBOOK_NAME train.ipynb
%env WANDB_SILENT true

env: WANDB_NOTEBOOK_NAME=train.ipynb
env: WANDB_SILENT=true


In [3]:
from collections import defaultdict
import os

import inept
import numpy as np
import pandas as pd
import torch
import wandb

# Runtime configuration
# Run on the GPU whenever PyTorch reports one, otherwise stay on the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
# Both folders are anchored at the absolute form of the current working directory
DATA_FOLDER, MODEL_FOLDER = (
    os.path.join(os.path.abspath(''), relative_path)
    for relative_path in ('../data', 'temp/trained_models')
)

# Script arguments
# import sys
# arg1 = int(sys.argv[1])

In [4]:
# Original paper (pg 24)
# https://arxiv.org/pdf/1909.07528.pdf

# Original blog
# https://openai.com/research/emergent-tool-use

# Gym
# https://gymnasium.farama.org/

# Slides
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf

# PPO implementation
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38

# Residual self-attention (SA) block
# https://github.com/openai/multi-agent-emergence-environments/blob/bafaf1e11e6398624116761f91ae7c93b136f395/ma_policy/layers.py#L89

In [5]:
# Reproducibility: seed every random number generator the notebook relies on
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
if DEVICE == 'cuda':
    torch.cuda.manual_seed(seed)

# Extra values recorded with the run configuration
note_kwargs = dict(seed=seed)

### Load Data

In [6]:
# Dataset loading
# Every branch below defines the same set of names so that a re-run with a different
# `dataset_name` can never silently pick up stale values from an earlier execution:
#   M1, M2 - one numpy matrix per modality (rows are nodes, columns are features)
#   T1, T2 - per-node labels for each modality
#   F1, F2 - feature names, or None when the dataset provides none
dataset_name = 'BrainChromatin'
if dataset_name == 'BrainChromatin':
    # Only the first 2,000 rows of each file are read; the transpose puts cells on the rows
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_rna_counts.tsv'), delimiter='\t', nrows=2_000).transpose()  # TODO: Raise number of features
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_atac_gene_activities.tsv'), delimiter='\t', nrows=2_000).transpose()  # TODO: Raise number of features
    # Put the rows of M2 in the same cell order as M1 (label-based row selection)
    M2 = M2.loc[M1.index]
    meta = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cell_metadata.txt'), delimiter='\t')
    meta_names = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cluster_names.txt'), delimiter='\t')
    # Keep only the ATAC cluster names, then attach them to each cell through its ATAC cluster id
    meta_names = meta_names[meta_names['Assay'] == 'Multiome ATAC']
    meta = pd.merge(meta, meta_names, left_on='ATAC_cluster', right_on='Cluster.ID', how='left')
    meta.index = meta['Cell.ID']
    # Both modalities share one label array, ordered like the rows of M1
    T1 = T2 = np.array(meta.loc[M1.index, 'Cluster.Name'])
    F1, F2 = M1.columns, M2.columns
    M1, M2 = M1.to_numpy(), M2.to_numpy()

    del meta, meta_names

elif dataset_name == 'scGEM':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/GeneExpression.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/DNAmethylation.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type1.txt'), delimiter=' ', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type2.txt'), delimiter=' ', header=None).to_numpy()
    F1 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/gex_names.txt'), dtype='str')
    F2 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/dm_names.txt'), dtype='str')

# MMD-MA data
elif dataset_name == 'MMD-MA':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped1.txt'), delimiter='\t', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped2.txt'), delimiter='\t', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type1.txt'), delimiter='\t', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type2.txt'), delimiter='\t', header=None).to_numpy()
    # No feature-name files are loaded for this dataset
    F1 = F2 = None

# Random data
elif dataset_name == 'Random':
    num_nodes = 100
    # Converted to numpy so the shared post-processing below receives the same array
    # type as it does for the file-backed datasets
    M1 = torch.rand((num_nodes, 8), device=DEVICE).cpu().numpy()
    M2 = torch.rand((num_nodes, 16), device=DEVICE).cpu().numpy()
    # Random data has no real labels or feature names: use a single placeholder label
    T1 = T2 = np.zeros(num_nodes, dtype=int)
    F1 = F2 = None

else:
    # An explicit exception (rather than `assert False`) still fires under `python -O`
    raise ValueError('No matching dataset found.')

# Parameters
num_nodes = 100  # M1.shape[0]

# Modify data
M1, M2 = inept.utilities.normalize(M1, M2)  # Normalize
# M1, M2 = inept.utilities.pca_features(M1, M2, num_features=(16, 16))  # PCA features
M1, M2, T1, T2 = inept.utilities.subsample_nodes(M1, M2, T1, T2, num_nodes=num_nodes)  # Subsample nodes
# M1, M2 = inept.utilities.subsample_features(M1, M2, num_features=(16, 16))  # Subsample features

# Cast types
M1 = torch.tensor(M1, dtype=torch.float32, device=DEVICE)
M2 = torch.tensor(M2, dtype=torch.float32, device=DEVICE)
modalities = (M1, M2)

### Parameters

In [7]:
# Dataset description (recorded with the run configuration)
data_kwargs = dict(
    dataset=dataset_name,
    num_nodes=num_nodes,
)

# Settings forwarded to the environment constructor
env_kwargs = dict(
    dim=2,  # Spatial dimensions; each node carries 2*dim features (presumably x, y, vx, vy)
    pos_bound=5,
    pos_rand_bound=1,
    vel_bound=1,
    delta=.1,
    # reward_distance=0,
    # reward_origin=0,
    # penalty_bound=0,
    # penalty_velocity=0,
    # penalty_action=0,
    reward_distance_type='euclidean',
)

# Reward weights applied stage by stage; the training cell activates one entry at a time
stages_kwargs = dict(
    env=(
        dict(penalty_bound=1),  # Stage 0
        dict(reward_origin=1),  # Stage 1
        dict(penalty_velocity=1, penalty_action=1),  # Stage 2
        dict(reward_origin=0, reward_distance=1),  # Stage 3
    ),
)

# Training schedule, all expressed in environment timesteps
max_ep_timesteps = 1e3  # Normal: 2e2
max_timesteps = 5e3 * max_ep_timesteps
update_timesteps = 5 * max_ep_timesteps  # Normal: 4e3
train_kwargs = dict(
    max_ep_timesteps=max_ep_timesteps,
    max_timesteps=max_timesteps,
    update_timesteps=update_timesteps,
)

# Policy settings
# The minibatch size shrinks as the total feature count or the node count grows
modal_sizes = [M.shape[1] for M in modalities]
update_minibatch = int( 1e4 * (2000 / sum(modal_sizes)) * (20 / data_kwargs["num_nodes"]) )  # Optimized for 1080 Ti
update_max_batch = update_minibatch  # int( 2e4 )
policy_kwargs = dict(
    # Main arguments
    num_features_per_node=2*env_kwargs['dim'],
    modal_sizes=list(modal_sizes),
    output_dim=env_kwargs['dim'],
    action_std_init=.6,
    action_std_decay=.05,
    action_std_min=.1,
    epochs=80,
    epsilon_clip=.2,
    memory_gamma=.99,
    actor_lr=3e-4,
    critic_lr=1e-3,
    lr_gamma=1,
    update_minibatch=min( update_minibatch, update_max_batch ),  # If too high, the kernel will crash (can also often crash machine)
    update_max_batch=update_max_batch,  # All memories: int(train_kwargs['update_timesteps'] * data_kwargs["num_nodes"])
    device=DEVICE,
    # Layer arguments
    embed_dim=64,
    feature_embed_dim=32,
)

# Early stopping settings
# Number of full-length episodes that make up one training cycle (one policy update)
episodes_per_cycle = int(train_kwargs['update_timesteps'] / train_kwargs['max_ep_timesteps'])
es_kwargs = dict(
    # Global parameters
    method='average',
    buffer=6 * episodes_per_cycle,  # 6 training cycles
    delta=.01,
    decreasing=False,
    # `average` method parameters
    window_size=3 * episodes_per_cycle,  # 3 training cycles
)

### Train Policy

In [8]:
# Tracking parameters
# Use `watch -d -n 0.5 nvidia-smi` to watch CUDA memory usage
# Use `top` to watch system memory usage
use_wandb = True
# CUDA memory statistics can only be queried/reset when actually running on a GPU;
# calling them unconditionally fails on CPU-only machines
track_cuda_memory = DEVICE == 'cuda'

# Initialize classes
env = inept.environments.trajectory(*modalities, **env_kwargs, **stages_kwargs['env'][0], device=DEVICE)  # Set to first stage
policy = inept.models.PPO(**policy_kwargs)
early_stopping = inept.utilities.EarlyStopping(**es_kwargs)

# Make sure the checkpoint folder exists before the first save
os.makedirs(MODEL_FOLDER, exist_ok=True)

# Initialize wandb, flattening every parameter group into a prefixed config entry
if use_wandb:
    wandb.init(
        project='INEPT',
        config={
            **{'note/'+k:v for k, v in note_kwargs.items()},
            **{'data/'+k:v for k, v in data_kwargs.items()},
            **{'env/'+k:v for k, v in env_kwargs.items()},
            **{'stages/'+k:v for k, v in stages_kwargs.items()},
            **{'policy/'+k:v for k, v in policy_kwargs.items()},
            **{'train/'+k:v for k, v in train_kwargs.items()},
            **{'es/'+k:v for k, v in es_kwargs.items()},
        },
    )

# Initialize logging vars
if track_cuda_memory: torch.cuda.reset_peak_memory_stats()
timer = inept.utilities.time_logger(discard_first_sample=True)
timestep = 0; episode = 1; stage = 0

# CLI
print('Beginning training')
print(f'Subsampling {policy_kwargs["update_max_batch"]} states with minibatches of size {policy_kwargs["update_minibatch"]} from {int(train_kwargs["update_timesteps"] * data_kwargs["num_nodes"])} total.')

# Simulation loop: one iteration per episode
while timestep < train_kwargs['max_timesteps']:
    # Reset environment
    env.reset()
    timer.log('Reset Environment')

    # Start episode
    ep_timestep = 0; ep_reward = 0; ep_itemized_reward = defaultdict(lambda: 0)
    while ep_timestep < train_kwargs['max_ep_timesteps']:
        # Acting and stepping need no gradients
        with torch.no_grad():
            # Get current state
            state = env.get_state(include_modalities=True)
            timer.log('Environment Setup')

            # Get actions from policy
            actions = policy.act_macro(state, keys=list(range(num_nodes))).detach()
            timer.log('Calculate Actions')

            # Step environment and get reward
            rewards, finished, itemized_rewards = env.step(actions, return_rewards=True)
            # The last allowed timestep of an episode is always marked as terminal
            finished = finished or (ep_timestep == train_kwargs['max_ep_timesteps']-1)  # Maybe move logic inside env?
            timer.log('Step Environment')

            # Record rewards for policy
            policy.memory.record(
                rewards=rewards.cpu().tolist(),
                is_terminals=finished,
            )

            # Record rewards for logging (running sums, averaged at episode end)
            ep_reward = ep_reward + rewards.cpu().mean()
            for k, v in itemized_rewards.items():
                ep_itemized_reward[k] += v.cpu().mean()
            timer.log('Record Rewards')

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Update model every `update_timesteps` global timesteps
        if timestep % train_kwargs['update_timesteps'] == 0:
            print(f'Updating model with average reward {np.mean(policy.memory.storage["rewards"])} on episode {episode} and timestep {timestep}', end='')
            policy.update()
            if track_cuda_memory:
                # Report the peak reached since the last report, then start a new measurement window
                print(f' ({torch.cuda.max_memory_allocated() / 1024**3:.2f} GB CUDA)')
                torch.cuda.reset_peak_memory_stats()
            else:
                # Terminate the status line left open above
                print()
            timer.log('Update Policy')

        # Escape if finished
        if finished: break

    # Upload stats
    ep_reward = (ep_reward / ep_timestep).item()
    update = int(timestep / train_kwargs['update_timesteps'])
    if use_wandb:
        wandb.log({
            **{
            # Measurements
            'end_timestep': timestep,
            'episode': episode,
            'update': update,
            'stage': stage,
            # Parameters
            'action_std': policy.action_std,
            # Outputs
            'average_reward': ep_reward,
            },
            **{'rewards/'+k: (v / ep_timestep).item() for k, v in ep_itemized_reward.items()},
        })
    timer.log('Record Stats')

    # Checkpoint, then advance the curriculum, when early stopping triggers or the
    # timestep budget is exhausted
    if early_stopping(ep_reward) or timestep >= train_kwargs['max_timesteps']:
        # Save model
        wgt_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.wgt')
        torch.save(policy.state_dict(), wgt_file)  # Save just weights
        if use_wandb: wandb.save(wgt_file)
        mdl_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.mdl')
        torch.save(policy, mdl_file)  # Save whole model
        if use_wandb: wandb.save(mdl_file)

        # End if maximum timesteps reached; stopping here keeps the stage and
        # `action_std` in line with the checkpoint that was just written
        if timestep >= train_kwargs['max_timesteps']:
            print('Maximal timesteps reached')
            break

        # End if at minimum `action_std`
        if policy.action_std <= policy.action_std_min:
            print(f'Ending early on episode {episode} and timestep {timestep}')
            break

        # Activate next stage or decay
        stage += 1
        # CLI
        print(f'Advancing training to stage {stage}')
        if stage < len(stages_kwargs['env']):
            # Activate next stage
            env.set_rewards(stages_kwargs['env'][stage])
        else:
            # All reward stages are active: decay policy randomness instead
            policy.decay_action_std()
            # CLI
            print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')

        # Reset early stopping
        early_stopping.reset()
    timer.log('Early Stopping')

    # Iterate
    episode += 1

# CLI Timer
print()
timer.aggregate('sum')

# Finish wandb
if use_wandb: wandb.finish()

Beginning training
Subsampling 1000 states with minibatches of size 1000 from 500000 total.


Updating model with average reward -1.54066 on episode 5 and timestep 5000

 (3.01 GB CUDA)


Updating model with average reward -1.427368 on episode 10 and timestep 10000

 (3.01 GB CUDA)


Updating model with average reward -1.366008 on episode 15 and timestep 15000

 (3.01 GB CUDA)


Updating model with average reward -1.228196 on episode 20 and timestep 20000

 (3.01 GB CUDA)


Updating model with average reward -1.376924 on episode 25 and timestep 25000

 (3.01 GB CUDA)


Updating model with average reward -1.175704 on episode 30 and timestep 30000

 (3.01 GB CUDA)


Updating model with average reward -1.007472 on episode 35 and timestep 35000

 (3.01 GB CUDA)


Updating model with average reward -0.737512 on episode 40 and timestep 40000

 (3.01 GB CUDA)


Updating model with average reward -1.032336 on episode 45 and timestep 45000

 (3.01 GB CUDA)


Updating model with average reward -0.693968 on episode 50 and timestep 50000

 (3.01 GB CUDA)


Updating model with average reward -0.901836 on episode 55 and timestep 55000

 (3.01 GB CUDA)


Updating model with average reward -0.713328 on episode 60 and timestep 60000

 (3.01 GB CUDA)


Updating model with average reward -0.732932 on episode 65 and timestep 65000

 (3.01 GB CUDA)


Updating model with average reward -0.565408 on episode 70 and timestep 70000

 (3.01 GB CUDA)


Updating model with average reward -0.818816 on episode 75 and timestep 75000

 (3.01 GB CUDA)


Updating model with average reward -0.36382 on episode 80 and timestep 80000

 (3.01 GB CUDA)


Updating model with average reward -0.243916 on episode 85 and timestep 85000

 (3.01 GB CUDA)


Updating model with average reward -0.0531 on episode 90 and timestep 90000

 (3.01 GB CUDA)


Updating model with average reward -0.034008 on episode 95 and timestep 95000

 (3.01 GB CUDA)


Updating model with average reward -0.023912 on episode 100 and timestep 100000

 (3.01 GB CUDA)


Updating model with average reward -0.01802 on episode 105 and timestep 105000

 (3.01 GB CUDA)


Updating model with average reward -0.014884 on episode 110 and timestep 110000

 (3.01 GB CUDA)


Updating model with average reward -0.013796 on episode 115 and timestep 115000

 (3.01 GB CUDA)


Updating model with average reward -0.014464 on episode 120 and timestep 120000

 (3.01 GB CUDA)


Updating model with average reward -0.007404 on episode 125 and timestep 125000

 (3.01 GB CUDA)


Updating model with average reward -0.004596 on episode 130 and timestep 130000

 (3.01 GB CUDA)


Updating model with average reward -0.003456 on episode 135 and timestep 135000

 (3.01 GB CUDA)


Updating model with average reward -0.002328 on episode 140 and timestep 140000

 (3.01 GB CUDA)


Updating model with average reward -0.001536 on episode 145 and timestep 145000

 (3.01 GB CUDA)


Updating model with average reward -0.000656 on episode 150 and timestep 150000

 (3.01 GB CUDA)


Updating model with average reward -0.000728 on episode 155 and timestep 155000

 (3.01 GB CUDA)


Updating model with average reward -8.8e-05 on episode 160 and timestep 160000

 (3.01 GB CUDA)


Updating model with average reward -0.00026 on episode 165 and timestep 165000

 (3.01 GB CUDA)


Advancing training to stage 1


Updating model with average reward -0.11570096064680815 on episode 170 and timestep 170000

 (3.01 GB CUDA)


Updating model with average reward -0.10258315300875902 on episode 175 and timestep 175000

 (3.01 GB CUDA)


Updating model with average reward -0.08739825301133096 on episode 180 and timestep 180000

 (3.01 GB CUDA)


Updating model with average reward -0.05665520005464554 on episode 185 and timestep 185000

 (3.01 GB CUDA)


Updating model with average reward -0.03601942310194671 on episode 190 and timestep 190000

 (3.01 GB CUDA)


Updating model with average reward -0.023815207916654646 on episode 195 and timestep 195000

 (3.01 GB CUDA)


Updating model with average reward -0.003764595066934824 on episode 200 and timestep 200000

 (3.01 GB CUDA)


Updating model with average reward 0.008319394479781389 on episode 205 and timestep 205000

 (3.01 GB CUDA)


Updating model with average reward 0.013820699608340859 on episode 210 and timestep 210000

 (3.01 GB CUDA)


Updating model with average reward 0.01836525334794633 on episode 215 and timestep 215000

 (3.01 GB CUDA)


Updating model with average reward 0.0234605907426998 on episode 220 and timestep 220000

 (3.01 GB CUDA)


Updating model with average reward 0.03139412025847286 on episode 225 and timestep 225000

 (3.01 GB CUDA)


Updating model with average reward 0.03460393983583152 on episode 230 and timestep 230000

 (3.01 GB CUDA)


Updating model with average reward 0.03855696997383609 on episode 235 and timestep 235000

 (3.01 GB CUDA)


Updating model with average reward 0.04020594506212324 on episode 240 and timestep 240000

 (3.01 GB CUDA)


Updating model with average reward 0.043574800506174564 on episode 245 and timestep 245000

 (3.01 GB CUDA)


Updating model with average reward 0.04476795309132338 on episode 250 and timestep 250000

 (3.01 GB CUDA)


Updating model with average reward 0.04254899021672271 on episode 255 and timestep 255000

 (3.01 GB CUDA)


Updating model with average reward 0.053321021058563144 on episode 260 and timestep 260000

 (3.01 GB CUDA)


Updating model with average reward 0.05139799759559892 on episode 265 and timestep 265000

 (3.01 GB CUDA)


Updating model with average reward 0.053360716918645426 on episode 270 and timestep 270000

 (3.01 GB CUDA)


Updating model with average reward 0.05432648567392491 on episode 275 and timestep 275000

 (3.01 GB CUDA)


Updating model with average reward 0.05696174816912972 on episode 280 and timestep 280000

 (3.01 GB CUDA)


Updating model with average reward 0.05374877974059619 on episode 285 and timestep 285000

 (3.01 GB CUDA)


Updating model with average reward 0.05829406799527444 on episode 290 and timestep 290000

 (3.01 GB CUDA)


Updating model with average reward 0.05888016659829393 on episode 295 and timestep 295000

 (3.01 GB CUDA)


Updating model with average reward 0.059957179331095886 on episode 300 and timestep 300000

 (3.01 GB CUDA)


Updating model with average reward 0.057937664813505485 on episode 305 and timestep 305000

 (3.01 GB CUDA)


Advancing training to stage 2


Updating model with average reward -0.05598663475413853 on episode 310 and timestep 310000

 (3.01 GB CUDA)


Updating model with average reward -0.5148614919857871 on episode 315 and timestep 315000

 (3.01 GB CUDA)


Updating model with average reward -0.5081089103398909 on episode 320 and timestep 320000

 (3.01 GB CUDA)


Updating model with average reward -0.501293186148361 on episode 325 and timestep 325000

 (3.01 GB CUDA)


Updating model with average reward -0.5047824757470488 on episode 330 and timestep 330000

 (3.01 GB CUDA)


Updating model with average reward -0.4985051459697989 on episode 335 and timestep 335000

 (3.01 GB CUDA)


Updating model with average reward -0.4902715367288711 on episode 340 and timestep 340000

 (3.01 GB CUDA)


Updating model with average reward -0.4854798003756078 on episode 345 and timestep 345000

 (3.01 GB CUDA)


Updating model with average reward -0.4846533656806806 on episode 350 and timestep 350000

 (3.01 GB CUDA)


Updating model with average reward -0.48340760166663793 on episode 355 and timestep 355000

 (3.01 GB CUDA)


Updating model with average reward -0.4819550950154181 on episode 360 and timestep 360000

 (3.01 GB CUDA)


Updating model with average reward -0.4782591398533841 on episode 365 and timestep 365000

 (3.01 GB CUDA)


Updating model with average reward -0.47691344832403026 on episode 370 and timestep 370000

 (3.01 GB CUDA)


Updating model with average reward -0.47521700166680686 on episode 375 and timestep 375000

 (3.01 GB CUDA)


Updating model with average reward -0.47386891810715337 on episode 380 and timestep 380000

 (3.01 GB CUDA)


Updating model with average reward -0.47606758102232566 on episode 385 and timestep 385000

 (3.01 GB CUDA)


Updating model with average reward -0.4761281356226748 on episode 390 and timestep 390000

 (3.01 GB CUDA)


Updating model with average reward -0.4735426171301268 on episode 395 and timestep 395000

 (3.01 GB CUDA)


Updating model with average reward -0.4732152965267971 on episode 400 and timestep 400000

 (3.01 GB CUDA)


Advancing training to stage 3


Updating model with average reward -0.503548025891373 on episode 405 and timestep 405000

 (3.01 GB CUDA)


Updating model with average reward -0.6253156348157514 on episode 410 and timestep 410000

 (3.01 GB CUDA)


Updating model with average reward -0.6205414692423825 on episode 415 and timestep 415000

 (3.01 GB CUDA)


Updating model with average reward -0.6083087774239926 on episode 420 and timestep 420000

 (3.01 GB CUDA)


Updating model with average reward -0.6049807994570835 on episode 425 and timestep 425000

 (3.01 GB CUDA)


Updating model with average reward -0.5933496502458169 on episode 430 and timestep 430000

 (3.01 GB CUDA)


Updating model with average reward -0.583857180426142 on episode 435 and timestep 435000

 (3.01 GB CUDA)


Updating model with average reward -0.5785446841778348 on episode 440 and timestep 440000

 (3.01 GB CUDA)


Updating model with average reward -0.569338977948314 on episode 445 and timestep 445000

 (3.01 GB CUDA)


Updating model with average reward -0.5654857559293265 on episode 450 and timestep 450000

 (3.01 GB CUDA)


Updating model with average reward -0.5614679762772288 on episode 455 and timestep 455000

 (3.01 GB CUDA)


Updating model with average reward -0.5538476393241525 on episode 460 and timestep 460000

 (3.01 GB CUDA)


Updating model with average reward -0.5486622803654242 on episode 465 and timestep 465000

 (3.01 GB CUDA)


Updating model with average reward -0.5419943129895325 on episode 470 and timestep 470000

 (3.01 GB CUDA)


Updating model with average reward -0.5385906260346641 on episode 475 and timestep 475000

 (3.01 GB CUDA)


Updating model with average reward -0.5296250962687983 on episode 480 and timestep 480000

 (3.01 GB CUDA)


Updating model with average reward -0.5277920683446412 on episode 485 and timestep 485000

 (3.01 GB CUDA)


Updating model with average reward -0.529106513174777 on episode 490 and timestep 490000

 (3.01 GB CUDA)


Updating model with average reward -0.5237140378542526 on episode 495 and timestep 495000

 (3.01 GB CUDA)


Updating model with average reward -0.5200457376363153 on episode 500 and timestep 500000

 (3.01 GB CUDA)


Updating model with average reward -0.5225533022466696 on episode 505 and timestep 505000

 (3.01 GB CUDA)


Updating model with average reward -0.5217887052540626 on episode 510 and timestep 510000

 (3.01 GB CUDA)


Updating model with average reward -0.5158428374735299 on episode 515 and timestep 515000

 (3.01 GB CUDA)


Updating model with average reward -0.5160592146342259 on episode 520 and timestep 520000

 (3.01 GB CUDA)


Updating model with average reward -0.5143750173293409 on episode 525 and timestep 525000

 (3.01 GB CUDA)


Updating model with average reward -0.5126583480325435 on episode 530 and timestep 530000

 (3.01 GB CUDA)


Updating model with average reward -0.5142203527926607 on episode 535 and timestep 535000

 (3.01 GB CUDA)


Advancing training to stage 4


Decaying std to 0.5499999999999999 on episode 537 and timestep 537000


Updating model with average reward -0.4632151129331398 on episode 540 and timestep 540000

 (3.01 GB CUDA)


Updating model with average reward -0.4290758101294317 on episode 545 and timestep 545000

 (3.01 GB CUDA)


Updating model with average reward -0.4293705092679708 on episode 550 and timestep 550000

 (3.01 GB CUDA)


Updating model with average reward -0.429771243119462 on episode 555 and timestep 555000

 (3.01 GB CUDA)


Updating model with average reward -0.4285177794568837 on episode 560 and timestep 560000

 (3.01 GB CUDA)


Updating model with average reward -0.4275622973761333 on episode 565 and timestep 565000

 (3.01 GB CUDA)


Updating model with average reward -0.43005398196770356 on episode 570 and timestep 570000

 (3.01 GB CUDA)


Updating model with average reward -0.4273201193430388 on episode 575 and timestep 575000

 (3.01 GB CUDA)


Updating model with average reward -0.4262102257307524 on episode 580 and timestep 580000

 (3.01 GB CUDA)


Advancing training to stage 5


Decaying std to 0.49999999999999994 on episode 581 and timestep 581000


Updating model with average reward -0.3645127527178382 on episode 585 and timestep 585000

 (3.01 GB CUDA)


Updating model with average reward -0.34981887868900435 on episode 590 and timestep 590000

 (3.01 GB CUDA)


Updating model with average reward -0.3522195074372339 on episode 595 and timestep 595000

 (3.01 GB CUDA)


Updating model with average reward -0.35300941866503427 on episode 600 and timestep 600000

 (3.01 GB CUDA)


Updating model with average reward -0.3477827548752659 on episode 605 and timestep 605000

 (3.01 GB CUDA)


Updating model with average reward -0.3517837985286681 on episode 610 and timestep 610000

 (3.01 GB CUDA)


Updating model with average reward -0.3509270669821132 on episode 615 and timestep 615000

 (3.01 GB CUDA)


Updating model with average reward -0.3506481035254446 on episode 620 and timestep 620000

 (3.01 GB CUDA)


Updating model with average reward -0.3513816625865364 on episode 625 and timestep 625000

 (3.01 GB CUDA)


Advancing training to stage 6
Decaying std to 0.44999999999999996 on episode 625 and timestep 625000


Updating model with average reward -0.2831705946614091 on episode 630 and timestep 630000

 (3.01 GB CUDA)


Updating model with average reward -0.28310268454522686 on episode 635 and timestep 635000

 (3.01 GB CUDA)


Updating model with average reward -0.28419906556366875 on episode 640 and timestep 640000

 (3.01 GB CUDA)


Updating model with average reward -0.28245321121667244 on episode 645 and timestep 645000

 (3.01 GB CUDA)


Updating model with average reward -0.28430728142733364 on episode 650 and timestep 650000

 (3.01 GB CUDA)


Updating model with average reward -0.2828289310499787 on episode 655 and timestep 655000

 (3.01 GB CUDA)


Updating model with average reward -0.2818364120498789 on episode 660 and timestep 660000

 (3.01 GB CUDA)


Updating model with average reward -0.2835964505551198 on episode 665 and timestep 665000

 (3.01 GB CUDA)


Advancing training to stage 7
Decaying std to 0.39999999999999997 on episode 669 and timestep 669000


Updating model with average reward -0.27133745546980426 on episode 670 and timestep 670000

 (3.01 GB CUDA)


Updating model with average reward -0.22375874592233524 on episode 675 and timestep 675000

 (3.01 GB CUDA)


Updating model with average reward -0.22257423246608019 on episode 680 and timestep 680000

 (3.01 GB CUDA)


Updating model with average reward -0.22204693396558084 on episode 685 and timestep 685000

 (3.01 GB CUDA)


Updating model with average reward -0.22166390583817186 on episode 690 and timestep 690000

 (3.01 GB CUDA)


Updating model with average reward -0.22257334948153473 on episode 695 and timestep 695000

 (3.01 GB CUDA)


Updating model with average reward -0.22311602327622318 on episode 700 and timestep 700000

 (3.01 GB CUDA)


Updating model with average reward -0.22317602742725187 on episode 705 and timestep 705000

 (3.01 GB CUDA)


Updating model with average reward -0.22382956658322412 on episode 710 and timestep 710000

 (3.01 GB CUDA)


Advancing training to stage 8


Decaying std to 0.35 on episode 713 and timestep 713000


Updating model with average reward -0.20076919110929267 on episode 715 and timestep 715000

 (3.01 GB CUDA)


Updating model with average reward -0.1714653199682168 on episode 720 and timestep 720000

 (3.01 GB CUDA)


Updating model with average reward -0.17182464384631682 on episode 725 and timestep 725000

 (3.01 GB CUDA)


Updating model with average reward -0.16908968203825597 on episode 730 and timestep 730000

 (3.01 GB CUDA)


Updating model with average reward -0.16875157179720937 on episode 735 and timestep 735000

 (3.01 GB CUDA)


Updating model with average reward -0.16962338136892324 on episode 740 and timestep 740000

 (3.01 GB CUDA)


Updating model with average reward -0.16989796843554028 on episode 745 and timestep 745000

 (3.01 GB CUDA)


Updating model with average reward -0.17102557481500044 on episode 750 and timestep 750000

 (3.01 GB CUDA)


Updating model with average reward -0.17089648234004295 on episode 755 and timestep 755000

 (3.01 GB CUDA)


Advancing training to stage 9


Decaying std to 0.3 on episode 757 and timestep 757000


Updating model with average reward -0.14261231198215926 on episode 760 and timestep 760000

 (3.01 GB CUDA)


Updating model with average reward -0.12479208914985174 on episode 765 and timestep 765000

 (3.01 GB CUDA)


Updating model with average reward -0.12471232143890823 on episode 770 and timestep 770000

 (3.01 GB CUDA)


Updating model with average reward -0.12355054188129595 on episode 775 and timestep 775000

 (3.01 GB CUDA)


Updating model with average reward -0.12524160984049326 on episode 780 and timestep 780000

 (3.01 GB CUDA)


Updating model with average reward -0.12299275131304103 on episode 785 and timestep 785000

 (3.01 GB CUDA)


Updating model with average reward -0.12407219673364236 on episode 790 and timestep 790000

 (3.01 GB CUDA)


Updating model with average reward -0.12449626236590879 on episode 795 and timestep 795000

 (3.01 GB CUDA)


Updating model with average reward -0.12573313721128648 on episode 800 and timestep 800000

 (3.01 GB CUDA)


Advancing training to stage 10


Decaying std to 0.25 on episode 801 and timestep 801000


Updating model with average reward -0.09380606601747993 on episode 805 and timestep 805000

 (3.01 GB CUDA)


Updating model with average reward -0.08783048389222854 on episode 810 and timestep 810000

 (3.01 GB CUDA)


Updating model with average reward -0.08811354956748689 on episode 815 and timestep 815000

 (3.01 GB CUDA)


Updating model with average reward -0.08922434137103319 on episode 820 and timestep 820000

 (3.01 GB CUDA)


Updating model with average reward -0.0875540706172887 on episode 825 and timestep 825000

 (3.01 GB CUDA)


Updating model with average reward -0.08894683096827878 on episode 830 and timestep 830000

 (3.01 GB CUDA)


Updating model with average reward -0.08689919884208604 on episode 835 and timestep 835000

 (3.01 GB CUDA)


Updating model with average reward -0.08693671220168857 on episode 840 and timestep 840000

 (3.01 GB CUDA)


Updating model with average reward -0.08905558798947258 on episode 845 and timestep 845000

 (3.01 GB CUDA)
Advancing training to stage 11
Decaying std to 0.2 on episode 845 and timestep 845000


Updating model with average reward -0.05465780347325913 on episode 850 and timestep 850000

 (3.01 GB CUDA)


Updating model with average reward -0.05578872013186259 on episode 855 and timestep 855000

 (3.01 GB CUDA)


Updating model with average reward -0.05590849679404192 on episode 860 and timestep 860000

 (3.01 GB CUDA)


Updating model with average reward -0.054960764928805396 on episode 865 and timestep 865000

 (3.01 GB CUDA)


Updating model with average reward -0.05389241654601143 on episode 870 and timestep 870000

 (3.01 GB CUDA)


Updating model with average reward -0.05433718180977933 on episode 875 and timestep 875000

 (3.01 GB CUDA)


Updating model with average reward -0.0553662420144005 on episode 880 and timestep 880000

 (3.01 GB CUDA)


Updating model with average reward -0.05537830573898862 on episode 885 and timestep 885000

 (3.01 GB CUDA)


Advancing training to stage 12


Decaying std to 0.15000000000000002 on episode 889 and timestep 889000


Updating model with average reward -0.05167800684563273 on episode 890 and timestep 890000

 (3.01 GB CUDA)


Updating model with average reward -0.028622414808721602 on episode 895 and timestep 895000

 (3.01 GB CUDA)


Updating model with average reward -0.03043119059550322 on episode 900 and timestep 900000

 (3.01 GB CUDA)


Updating model with average reward -0.030569271234324714 on episode 905 and timestep 905000

 (3.01 GB CUDA)


Updating model with average reward -0.031160431341165164 on episode 910 and timestep 910000

 (3.01 GB CUDA)


Updating model with average reward -0.03010945486133019 on episode 915 and timestep 915000

 (3.01 GB CUDA)


Updating model with average reward -0.03210623789521736 on episode 920 and timestep 920000

 (3.01 GB CUDA)


Updating model with average reward -0.03171856929222687 on episode 925 and timestep 925000

 (3.01 GB CUDA)


Updating model with average reward -0.030875544422589954 on episode 930 and timestep 930000

 (3.01 GB CUDA)


Advancing training to stage 13


Decaying std to 0.10000000000000002 on episode 933 and timestep 933000


Updating model with average reward -0.02412624672257092 on episode 935 and timestep 935000

 (3.01 GB CUDA)


Updating model with average reward -0.011813218912250293 on episode 940 and timestep 940000

 (3.01 GB CUDA)


Updating model with average reward -0.015295426602479793 on episode 945 and timestep 945000

 (3.01 GB CUDA)


Updating model with average reward -0.013085396294426845 on episode 950 and timestep 950000

 (3.01 GB CUDA)


Updating model with average reward -0.014581249723119737 on episode 955 and timestep 955000

 (3.01 GB CUDA)


Updating model with average reward -0.01417408926799234 on episode 960 and timestep 960000

 (3.01 GB CUDA)


Updating model with average reward -0.013266835332582487 on episode 965 and timestep 965000

 (3.01 GB CUDA)


Updating model with average reward -0.014185800197246997 on episode 970 and timestep 970000

 (3.01 GB CUDA)


Updating model with average reward -0.014599908882804083 on episode 975 and timestep 975000

 (3.01 GB CUDA)


Advancing training to stage 14


Decaying std to 0.1 on episode 977 and timestep 977000


Updating model with average reward -0.013887968614676482 on episode 980 and timestep 980000

 (3.01 GB CUDA)


Updating model with average reward -0.014977739237843584 on episode 985 and timestep 985000

 (3.01 GB CUDA)


Updating model with average reward -0.014434134868327678 on episode 990 and timestep 990000

 (3.01 GB CUDA)


Updating model with average reward -0.013095085220360117 on episode 995 and timestep 995000

 (3.01 GB CUDA)


Updating model with average reward -0.015330101614758984 on episode 1000 and timestep 1000000

 (3.01 GB CUDA)


Updating model with average reward -0.013439631169379252 on episode 1005 and timestep 1005000

 (3.01 GB CUDA)


Updating model with average reward -0.015392415418416705 on episode 1010 and timestep 1010000

 (3.01 GB CUDA)


Updating model with average reward -0.013930599324360563 on episode 1015 and timestep 1015000

 (3.01 GB CUDA)


Updating model with average reward -0.014090815707665577 on episode 1020 and timestep 1020000

 (3.01 GB CUDA)


Ending early on episode 1021 and timestep 1021000



Reset Environment: 3.6189038030119036


Environment Setup: 34.459714092356705


Calculate Actions: 10679.664114404415


Step Environment: 858.6425333003565


Record Rewards: 228.97438001811037
Record Stats: 1.9380471019876495
Early Stopping: 254.60155569203897
Update Policy: 14367.916906778002
Total: 26429.81615519028
