In [1]:
## To Check
# Check the order of reward normalization relative to the advantage calculation (after?)

## High Priority Training Changes
# Make backward (MAX_NODES, MAX_BATCH) batching work
# Add multithreading to forward and distributed to backward
# Add compatibility for env being on CPU, check for timing changes

## Backburner Priority Training Changes
# Add compatibility for cells with missing modalities (add mask to distance reward)
# Try imitation learning to better learn CT trajectories
# Add parallel envs of different sizes, with different data to help generality
# Fix off-center positioning in large environments
# Revise distance reward - Maybe add cell attraction (all should be close to each other) and repulsion (repulsion based on distance in modality)
# Revise velocity and action penalties to encourage early cell-type separation (i.e. sqrt of vec length or similar)

## Bookkeeping and QOL
# Save every time early stopping occurs
# Hook up sweeps API for wandb

In [2]:
# Original paper (pg 24)
# https://arxiv.org/pdf/1909.07528.pdf

# Original blog
# https://openai.com/research/emergent-tool-use

# Gym
# https://gymnasium.farama.org/

# Slides
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf

# PPO implementation
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38

# Residual self-attention (SA) block
# https://github.com/openai/multi-agent-emergence-environments/blob/bafaf1e11e6398624116761f91ae7c93b136f395/ma_policy/layers.py#L89

In [3]:
# Reload imported modules automatically so edits to local packages are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# Weights & Biases settings, exported as environment variables for this kernel
%env WANDB_NOTEBOOK_NAME train.ipynb
%env WANDB_SILENT true

env: WANDB_NOTEBOOK_NAME=train.ipynb
env: WANDB_SILENT=true


In [4]:
from collections import defaultdict
import os

import inept
import numpy as np
import pandas as pd
import torch
import wandb

# Set params
# Use the GPU whenever torch reports CUDA as available, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Both folders are anchored at `os.path.abspath('')`, i.e. the current working directory
DATA_FOLDER, MODEL_FOLDER = (
    os.path.join(os.path.abspath(''), relative)
    for relative in ('../data', 'temp/trained_models')
)

# Script arguments
# import sys
# arg1 = int(sys.argv[1])

In [5]:
# Reproducibility
# One seed is shared by every random number generator used in this notebook
seed = 42

# Kept in its own dict so it can be reported alongside the other run settings
note_kwargs = {'seed': seed}

# Seed numpy first, then torch (CPU generator, plus CUDA when training on GPU)
np.random.seed(seed)
torch.manual_seed(seed)
if DEVICE == 'cuda':
    torch.cuda.manual_seed(seed)

### Load Data

In [6]:
# Dataset loading
# Every branch must leave behind the modality matrices (`M1`, `M2`, ...) and the
# per-node type labels (`T1`, `T2`, ...) that the preprocessing step consumes.
# Feature names (`F1`, `F2`) are only available for some datasets.
dataset_name = 'BrainChromatin'

if dataset_name == 'scNMT':
    dataset_dir = os.path.join(DATA_FOLDER, 'UnionCom/scNMT')
    M1 = pd.read_csv(os.path.join(dataset_dir, 'Paccessibility_300.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(dataset_dir, 'Pmethylation_300.txt'), delimiter=' ', header=None).to_numpy()
    M3 = pd.read_csv(os.path.join(dataset_dir, 'RNA_300.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(dataset_dir, 'type1.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T2 = pd.read_csv(os.path.join(dataset_dir, 'type2.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T3 = pd.read_csv(os.path.join(dataset_dir, 'type3.txt'), delimiter=' ', header=None).to_numpy().flatten()

elif dataset_name == 'BrainChromatin':
    nrows = None  # 2_000
    # Files are read as features x cells and transposed to cells x features
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_rna_counts.tsv'), delimiter='\t', nrows=nrows).transpose()  # 4.6 Gb in memory
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_atac_gene_activities.tsv'), delimiter='\t', nrows=nrows).transpose()  # 2.6 Gb in memory
    # Align ATAC rows to the RNA cell order; row selection avoids two extra transposes of a multi-Gb frame
    M2 = M2.loc[M1.index]
    meta = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cell_metadata.txt'), delimiter='\t')
    meta_names = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cluster_names.txt'), delimiter='\t')
    meta_names = meta_names[meta_names['Assay'] == 'Multiome ATAC']
    meta = pd.merge(meta, meta_names, left_on='ATAC_cluster', right_on='Cluster.ID', how='left')
    meta.index = meta['Cell.ID']
    # Both modalities come from the same cells, so they share one label array (in RNA cell order)
    T1 = T2 = np.array(meta.loc[M1.index, 'Cluster.Name'])
    F1, F2 = M1.columns, M2.columns
    M1, M2 = M1.to_numpy(), M2.to_numpy()

    # Free the metadata frames, only the label array is needed from here on
    del meta, meta_names

elif dataset_name == 'scGEM':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/GeneExpression.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/DNAmethylation.txt'), delimiter=' ', header=None).to_numpy()
    # NOTE(review): unlike the scNMT branch these labels are not flattened - confirm downstream code accepts 2D label arrays
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type1.txt'), delimiter=' ', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type2.txt'), delimiter=' ', header=None).to_numpy()
    F1 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/gex_names.txt'), dtype='str')
    F2 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/dm_names.txt'), dtype='str')

# MMD-MA data
elif dataset_name == 'MMD-MA':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped1.txt'), delimiter='\t', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped2.txt'), delimiter='\t', header=None).to_numpy()
    # NOTE(review): labels are not flattened here either - see the scGEM branch
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type1.txt'), delimiter='\t', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type2.txt'), delimiter='\t', header=None).to_numpy()

# Random data
elif dataset_name == 'Random':
    num_nodes = 100
    # NOTE(review): these are torch tensors on DEVICE while every other branch yields numpy arrays -
    # confirm the `inept.utilities` preprocessing helpers accept them
    M1 = torch.rand((num_nodes, 8), device=DEVICE)
    M2 = torch.rand((num_nodes, 16), device=DEVICE)
    # Random data has no annotation; use one placeholder label per node so `types` can still be built
    T1 = T2 = np.zeros(num_nodes, dtype=int)

else:
    # Raise instead of assert so the check survives `python -O`
    raise ValueError(f'No matching dataset found for {dataset_name!r}.')

# Parameters
num_nodes = 100  # M1.shape[0]
modalities = [M1, M2]  # [[M1, M2, M3][1]]  # TODO: Make more flexible
types = [T1, T2]  # [[T1, T2, T3][1]]

# Modify data
# modalities = inept.utilities.normalize(*modalities, keep_array=True)  # Normalize
# PCA features (2 min for 8k x 35+k)
modalities = inept.utilities.pca_features(
    *modalities,
    num_features=(512, 512),
    keep_array=True,
)
# Subsample nodes; modalities and types are passed together and split back by position below
subsample = inept.utilities.subsample_nodes(
    *modalities,
    *types,
    num_nodes=num_nodes,
    keep_array=True,
)
modalities, types = (
    subsample[:len(modalities)],
    subsample[len(modalities):],
)
# modalities = inept.utilities.subsample_features(*modalities, num_features=(16, 16), keep_array=True)  # Subsample features

# Cast types
modalities = list(map(
    lambda Mx: torch.tensor(Mx, dtype=torch.float32, device=DEVICE),
    modalities,
))

### Parameters

In [7]:
# Data parameters
# Which dataset was loaded and how many nodes were kept after subsampling
data_kwargs = dict(
    dataset=dataset_name,
    num_nodes=num_nodes,
)

# Environment parameters
# `dim`: 2 = (x, y, vx, vy), 3 = (x, y, z, vx, vy, vz)
env_kwargs = dict(
    dim=32,
    pos_bound=10,
    pos_rand_bound=1,
    vel_bound=1,
    delta=.1,
    # reward_distance=0,
    # reward_origin=0,
    # penalty_bound=0,
    # penalty_velocity=0,
    # penalty_action=0,
    reward_distance_type='euclidean',
)

# Environment weight stages
# One dict of reward/penalty weights per stage, in the order the stages are activated
stages_kwargs = dict(
    env=(
        dict(penalty_bound=1),  # Stage 0
        dict(reward_origin=1),  # Stage 1
        dict(penalty_velocity=1, penalty_action=1),  # Stage 2
        dict(reward_origin=0, reward_distance=1),  # Stage 3
    ),
)

# Training parameters
max_ep_timesteps = 1e3  # Timesteps per episode
update_timesteps = 5 * max_ep_timesteps  # Timesteps between policy updates
max_timesteps = 1e3 * update_timesteps  # Total timestep budget

# Intended caps, bounded by the number of nodes in the data
# MAX_BATCH: value should be similar to update_minibatch, if a bit larger
# MAX_NODES: larger means smaller minibatches but a fuller picture for each agent
MAX_BATCH, MAX_NODES = (
    min(cap, data_kwargs['num_nodes'])
    for cap in (500, 50)
)
# TODO: Currently values other than `None` do not work with update
MAX_BATCH, MAX_NODES = None, None

train_kwargs = dict(
    max_ep_timesteps=max_ep_timesteps,
    max_timesteps=max_timesteps,
    update_timesteps=update_timesteps,
    max_batch=MAX_BATCH,  # Max number of nodes to calculate actions for at a time
    max_nodes=MAX_NODES,  # Max number of nodes to use as neighbors in action calculation
)

# Policy parameters
# num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
# GPU_MEMORY = 6; CPU_MEMORY = 16  # Optimized for 6Gb VRAM and 16Gb RAM
# MAX_GPU_RUN_SAMPLES = int( .8 * (GPU_MEMORY / 6) * 1e4 * (2000 / sum(M.shape[1] for M in modalities)) * (20 / num_train_nodes) )
# GPU_STORE_SAMPLES = int( 2 * MAX_GPU_RUN_SAMPLES )  # 3
# MAX_CPU_SAMPLES = int( (CPU_MEMORY / GPU_MEMORY) * MAX_GPU_RUN_SAMPLES )
# IDEAL_BATCH_SIZE = int( max_ep_timesteps )

# Update batching, from the largest unit to the smallest
update_maxbatch = None  # `MAX_CPU_SAMPLES`, `None` takes slightly longer but is more reliable
update_batch = 20_000  # Same or larger size as `update_maxbatch` skips GPU cast step inside epoch loop
update_minibatch = 2_000

policy_kwargs = dict(
    # Main arguments
    num_features_per_node=2*env_kwargs['dim'],
    modal_sizes=[modality.shape[1] for modality in modalities],
    output_dim=env_kwargs['dim'],
    action_std_init=.6,
    action_std_decay=.05,
    action_std_min=.1,
    epochs=80,
    epsilon_clip=.2,
    memory_gamma=.95,
    memory_prune=100,
    actor_lr=3e-4,
    critic_lr=1e-3,
    lr_gamma=1,
    update_maxbatch=update_maxbatch,  # Batch to load into RAM
    update_batch=update_batch,  # Batch to load into VRAM
    update_minibatch=update_minibatch,  # Batch to compute
    update_load_level='minibatch',
    update_cast_level='minibatch',
    device=DEVICE,
    # Layer arguments
    embed_dim=64,
    feature_embed_dim=32,
    rs_nset=1e5,  # Inversely proportional to influence of individual reward on moving statistics
)

# Early stopping parameters
# `buffer` and `window_size` are expressed in episodes: a multiple of the number
# of full-length episodes that fit between two policy updates (one training cycle)
es_kwargs = dict(
    # Global parameters
    method='average',
    buffer=6 * int(train_kwargs['update_timesteps'] / train_kwargs['max_ep_timesteps']),  # 6 training cycles
    delta=.01,
    decreasing=False,
    # `average` method parameters
    window_size=3 * int(train_kwargs['update_timesteps'] / train_kwargs['max_ep_timesteps']),  # 3 training cycles
)

### Train Policy

In [8]:
# Train the PPO policy on the trajectory environment.
# Episodes are simulated back to back; the policy is updated every
# `update_timesteps` timesteps. Whenever early stopping fires, a checkpoint is
# saved and training either activates the next reward stage or, once all stages
# are active, decays the action std. Training ends at `max_timesteps` or when
# early stopping fires with the action std already at its minimum.

# Tracking parameters
# Use `watch -d -n 0.5 nvidia-smi` to watch CUDA memory usage
# Use `top` to watch system memory usage
# Run script and put following above function to profile
#    from memory_profiler import profile
#    @profile
# Use cProfiler to profile timing:
#    python -m cProfile -s time -o profile.prof train.py
#    snakeviz profile.prof
use_wandb = True

# Initialize classes
env = inept.environments.trajectory(*modalities, **env_kwargs, **stages_kwargs['env'][0], device=DEVICE)  # Set to first stage
policy = inept.models.PPO(**policy_kwargs).train()
early_stopping = inept.utilities.EarlyStopping(**es_kwargs)

# Create the checkpoint folder up front so the first save cannot fail after a long stage
os.makedirs(MODEL_FOLDER, exist_ok=True)

# Initialize wandb
if use_wandb: wandb.init(
    project='INEPT',
    config={
        **{'note/'+k:v for k, v in note_kwargs.items()},
        **{'data/'+k:v for k, v in data_kwargs.items()},
        **{'env/'+k:v for k, v in env_kwargs.items()},
        **{'stages/'+k:v for k, v in stages_kwargs.items()},
        **{'policy/'+k:v for k, v in policy_kwargs.items()},
        **{'train/'+k:v for k, v in train_kwargs.items()},
        **{'es/'+k:v for k, v in es_kwargs.items()},
    },
)

# Initialize logging vars
# CUDA memory statistics only exist on GPU; skip them when running on CPU
if DEVICE == 'cuda': torch.cuda.reset_peak_memory_stats()
timer = inept.utilities.time_logger(discard_first_sample=True)
timestep = 0; episode = 1; stage = 0

# CLI
print('Beginning training')
num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
print(
    f'Training using {num_train_nodes} nodes out of a'
    f' total {data_kwargs["num_nodes"]} with forward batches of'
    f' size {train_kwargs["max_batch"]}.'
)
update_maxbatch_print = (
    policy_kwargs["update_maxbatch"]
    if policy_kwargs["update_maxbatch"] is not None else 
    'all'
)
print(
    f'Training on {update_maxbatch_print} states'
    f' with batches of size {policy_kwargs["update_batch"]}'
    f' and minibatches of size {policy_kwargs["update_minibatch"]}'
    f' from {int(train_kwargs["update_timesteps"] * data_kwargs["num_nodes"])} total.')

# Simulation loop
while timestep < train_kwargs['max_timesteps']:
    # Reset environment
    env.reset()
    timer.log('Reset Environment')

    # Start episode
    ep_timestep = 0; ep_reward = 0; ep_itemized_reward = defaultdict(lambda: 0)
    while ep_timestep < train_kwargs['max_ep_timesteps']:
        # Rollouts need no gradients
        with torch.no_grad():
            # Get current state
            state = env.get_state(include_modalities=True)
            timer.log('Environment Setup')

            # Get actions from policy
            actions = policy.act_macro(
                state,
                keys=list(range(num_nodes)),
                max_batch=train_kwargs['max_batch'],
                max_nodes=train_kwargs['max_nodes'],
            ).detach()
            timer.log('Calculate Actions')

            # Step environment and get reward
            rewards, finished, itemized_rewards = env.step(actions, return_rewards=True)
            finished = finished or (ep_timestep == train_kwargs['max_ep_timesteps']-1)  # Maybe move logic inside env?
            timer.log('Step Environment')

            # Copy rewards off the device once and reuse the copy below
            rewards_cpu = rewards.cpu()

            # Record rewards for policy
            policy.memory.record(
                rewards=rewards_cpu.tolist(),
                is_terminals=finished,
            )

            # Record rewards for logging
            ep_reward = ep_reward + rewards_cpu.mean()
            for k, v in itemized_rewards.items():
                ep_itemized_reward[k] += v.cpu().mean()
            timer.log('Record Rewards')

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Update model
        if timestep % train_kwargs['update_timesteps'] == 0:
            # assert False
            print(f'Updating model with average reward {np.mean(policy.memory.storage["rewards"])} on episode {episode} and timestep {timestep}', end='')
            policy.update()
            if DEVICE == 'cuda':
                print(f' ({torch.cuda.max_memory_allocated() / 1024**3:.2f} GB CUDA)')
                torch.cuda.reset_peak_memory_stats()
            else:
                # Terminate the status line opened above with `end=''`
                print()
            timer.log('Update Policy')

        # Escape if finished
        if finished: break

    # Upload stats
    ep_reward = (ep_reward / ep_timestep).item()
    update = int(timestep / train_kwargs['update_timesteps'])
    if use_wandb:
        wandb.log({
            **{
            # Measurements
            'end_timestep': timestep,
            'episode': episode,
            'update': update,
            'stage': stage,
            # Parameters
            'action_std': policy.action_std,
            # Outputs
            'average_reward': ep_reward,
            },
            **{'rewards/'+k: (v / ep_timestep).item() for k, v in ep_itemized_reward.items()},
        })
    timer.log('Record Stats')

    # Decay model std
    if early_stopping(ep_reward) or timestep >= train_kwargs['max_timesteps']:
        # Save model
        wgt_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.wgt')
        torch.save(policy.state_dict(), wgt_file)  # Save just weights
        if use_wandb: wandb.save(wgt_file)
        mdl_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.mdl')
        torch.save(policy, mdl_file)  # Save whole model
        if use_wandb: wandb.save(mdl_file)

        # End if maximum timesteps reached
        if timestep >= train_kwargs['max_timesteps']:
            print('Maximal timesteps reached')
            # Stop here rather than advancing a stage that will never be trained
            break

        # End if at minimum `action_std`
        if policy.action_std <= policy.action_std_min:
            print(f'Ending early on episode {episode} and timestep {timestep}')
            break

        # Activate next stage or decay
        stage += 1
        # CLI
        print(f'Advancing training to stage {stage}')
        if stage < len(stages_kwargs['env']):
            # Activate next stage
            env.set_rewards(stages_kwargs['env'][stage])
        else:
            # Decay policy randomness
            policy.decay_action_std()
            # CLI
            print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')

        # Reset early stopping
        early_stopping.reset()
    timer.log('Early Stopping')

    # Iterate
    episode += 1

# CLI Timer
print()
timer.aggregate('sum')

# Finish wandb
if use_wandb: wandb.finish()

Beginning training
Training using 100 nodes out of a total 100 with forward batches of size None.
Training on all states with batches of size 10000 and minibatches of size 2000 from 500000 total.


Updating model with average reward -1.800024 on episode 5 and timestep 5000

 (3.67 GB CUDA)


Updating model with average reward -1.800004 on episode 10 and timestep 10000

 (3.67 GB CUDA)


Updating model with average reward -1.800112 on episode 15 and timestep 15000

 (3.67 GB CUDA)


Updating model with average reward -1.799216 on episode 20 and timestep 20000

 (3.67 GB CUDA)


Updating model with average reward -1.799748 on episode 25 and timestep 25000

 (3.67 GB CUDA)


Updating model with average reward -1.799584 on episode 30 and timestep 30000

 (3.67 GB CUDA)


Updating model with average reward -1.799712 on episode 35 and timestep 35000

 (3.67 GB CUDA)


Updating model with average reward -1.800836 on episode 40 and timestep 40000

 (3.67 GB CUDA)


Advancing training to stage 1


Updating model with average reward -2.676311474910736 on episode 45 and timestep 45000

 (3.67 GB CUDA)


Updating model with average reward -6.3549468975048065 on episode 50 and timestep 50000

 (3.67 GB CUDA)


Updating model with average reward -6.305410761581421 on episode 55 and timestep 55000

 (3.67 GB CUDA)


Updating model with average reward -6.260936995761871 on episode 60 and timestep 60000

 (3.67 GB CUDA)


Updating model with average reward -6.333338120319366 on episode 65 and timestep 65000

 (3.67 GB CUDA)


Updating model with average reward -6.413348591796875 on episode 70 and timestep 70000

 (3.67 GB CUDA)


Updating model with average reward -6.52896226279068 on episode 75 and timestep 75000

 (3.67 GB CUDA)


Updating model with average reward -6.231223566240311 on episode 80 and timestep 80000

 (3.67 GB CUDA)


Updating model with average reward -6.418488053922653 on episode 85 and timestep 85000

 (3.67 GB CUDA)


Updating model with average reward -6.216107120752334 on episode 90 and timestep 90000

 (3.67 GB CUDA)


Updating model with average reward -6.315790684564591 on episode 95 and timestep 95000

 (3.67 GB CUDA)


Updating model with average reward -6.417264604843139 on episode 100 and timestep 100000

 (3.67 GB CUDA)


Updating model with average reward -6.323246401220322 on episode 105 and timestep 105000

 (3.67 GB CUDA)


Updating model with average reward -6.11804074077034 on episode 110 and timestep 110000

 (3.67 GB CUDA)


Updating model with average reward -6.4080422531051635 on episode 115 and timestep 115000

 (3.67 GB CUDA)


Advancing training to stage 2


Updating model with average reward -7.148020480508566 on episode 120 and timestep 120000

 (3.67 GB CUDA)


Updating model with average reward -10.374354410906761 on episode 125 and timestep 125000

 (3.67 GB CUDA)


Updating model with average reward -10.391144395839483 on episode 130 and timestep 130000

 (3.67 GB CUDA)


Updating model with average reward -10.209409741629601 on episode 135 and timestep 135000

 (3.67 GB CUDA)


Updating model with average reward -10.436348011278868 on episode 140 and timestep 140000

 (3.67 GB CUDA)


Updating model with average reward -10.500904341767729 on episode 145 and timestep 145000

 (3.67 GB CUDA)


Updating model with average reward -10.547080142258435 on episode 150 and timestep 150000

 (3.67 GB CUDA)


Updating model with average reward -10.572569179540753 on episode 155 and timestep 155000

 (3.67 GB CUDA)


Updating model with average reward -10.580670114060968 on episode 160 and timestep 160000

 (3.67 GB CUDA)


Updating model with average reward -10.463592183890075 on episode 165 and timestep 165000

 (3.67 GB CUDA)


Advancing training to stage 3


Updating model with average reward -50.08240344140598 on episode 170 and timestep 170000

 (3.67 GB CUDA)


Updating model with average reward -58.26449364275655 on episode 175 and timestep 175000

 (3.67 GB CUDA)


Updating model with average reward -33.53594266360206 on episode 180 and timestep 180000

 (3.67 GB CUDA)


Updating model with average reward -32.52329852458009 on episode 185 and timestep 185000

 (3.67 GB CUDA)


Updating model with average reward -31.23070326891291 on episode 190 and timestep 190000

 (3.67 GB CUDA)


Updating model with average reward -33.515664417480735 on episode 195 and timestep 195000

 (3.67 GB CUDA)


Updating model with average reward -29.422503361664504 on episode 200 and timestep 200000

 (3.67 GB CUDA)


Updating model with average reward -27.767677638444244 on episode 205 and timestep 205000

 (3.67 GB CUDA)


Updating model with average reward -30.011057150416345 on episode 210 and timestep 210000

 (3.67 GB CUDA)


Updating model with average reward -21.936122018612565 on episode 215 and timestep 215000

 (3.67 GB CUDA)


Updating model with average reward -29.85009681520307 on episode 220 and timestep 220000

 (3.67 GB CUDA)


Updating model with average reward -20.062386016202808 on episode 225 and timestep 225000

 (3.67 GB CUDA)


Updating model with average reward -20.592649228514134 on episode 230 and timestep 230000

 (3.67 GB CUDA)


Updating model with average reward -18.474810764520168 on episode 235 and timestep 235000

 (3.67 GB CUDA)


Updating model with average reward -20.845314389563264 on episode 240 and timestep 240000

 (3.67 GB CUDA)


Updating model with average reward -19.059382275172144 on episode 245 and timestep 245000

 (3.67 GB CUDA)


Updating model with average reward -13.58796420180276 on episode 250 and timestep 250000

 (3.67 GB CUDA)


Updating model with average reward -13.180723704117566 on episode 255 and timestep 255000

 (3.67 GB CUDA)


Updating model with average reward -13.995568706623763 on episode 260 and timestep 260000

 (3.67 GB CUDA)


Updating model with average reward -17.90799708092487 on episode 265 and timestep 265000

 (3.67 GB CUDA)


Updating model with average reward -16.376268078676073 on episode 270 and timestep 270000

 (3.67 GB CUDA)


Updating model with average reward -12.687315772045553 on episode 275 and timestep 275000

 (3.67 GB CUDA)


Updating model with average reward -11.547802521054535 on episode 280 and timestep 280000

 (3.67 GB CUDA)


Updating model with average reward -12.072401931439341 on episode 285 and timestep 285000

 (3.67 GB CUDA)


Updating model with average reward -14.819860802851737 on episode 290 and timestep 290000

 (3.67 GB CUDA)


Updating model with average reward -11.749675345232339 on episode 295 and timestep 295000

 (3.67 GB CUDA)


Updating model with average reward -9.79568938666439 on episode 300 and timestep 300000

 (3.67 GB CUDA)


Updating model with average reward -9.9030946503703 on episode 305 and timestep 305000

 (3.67 GB CUDA)


Updating model with average reward -9.92414509019202 on episode 310 and timestep 310000

 (3.67 GB CUDA)


Updating model with average reward -12.76570985312289 on episode 315 and timestep 315000

 (3.67 GB CUDA)


Updating model with average reward -11.308062202868491 on episode 320 and timestep 320000

 (3.67 GB CUDA)


Updating model with average reward -11.175849430798293 on episode 325 and timestep 325000

 (3.67 GB CUDA)


Updating model with average reward -10.305306543716371 on episode 330 and timestep 330000

 (3.67 GB CUDA)


Updating model with average reward -9.927152455291182 on episode 335 and timestep 335000

 (3.67 GB CUDA)


Advancing training to stage 4
Decaying std to 0.5499999999999999 on episode 339 and timestep 339000


Updating model with average reward -9.360852718606859 on episode 340 and timestep 340000

 (3.67 GB CUDA)


Updating model with average reward -8.447633202944338 on episode 345 and timestep 345000

 (3.67 GB CUDA)


Updating model with average reward -7.29450100122583 on episode 350 and timestep 350000

 (3.67 GB CUDA)


Updating model with average reward -8.942664359427662 on episode 355 and timestep 355000

 (3.67 GB CUDA)


Updating model with average reward -7.578808115392774 on episode 360 and timestep 360000

 (3.67 GB CUDA)


Updating model with average reward -7.268813029195785 on episode 365 and timestep 365000

 (3.67 GB CUDA)


Updating model with average reward -6.461095805532098 on episode 370 and timestep 370000

 (3.67 GB CUDA)


Updating model with average reward -6.711778058506906 on episode 375 and timestep 375000

 (3.67 GB CUDA)


Updating model with average reward -6.880952091035932 on episode 380 and timestep 380000

 (3.67 GB CUDA)


Updating model with average reward -7.0040109753763975 on episode 385 and timestep 385000

 (3.67 GB CUDA)


Updating model with average reward -6.558632085186511 on episode 390 and timestep 390000

 (3.67 GB CUDA)


Updating model with average reward -6.755753246305257 on episode 395 and timestep 395000

 (3.67 GB CUDA)


Updating model with average reward -6.493977467722416 on episode 400 and timestep 400000

 (3.67 GB CUDA)


Updating model with average reward -6.073897515322119 on episode 405 and timestep 405000

 (3.67 GB CUDA)


Updating model with average reward -5.820680090283066 on episode 410 and timestep 410000

 (3.67 GB CUDA)


Updating model with average reward -6.119150952339798 on episode 415 and timestep 415000

 (3.67 GB CUDA)


Updating model with average reward -6.42652599001807 on episode 420 and timestep 420000

 (3.67 GB CUDA)


Updating model with average reward -6.034876780151189 on episode 425 and timestep 425000

 (3.67 GB CUDA)


Updating model with average reward -6.191381552601785 on episode 430 and timestep 430000

 (3.67 GB CUDA)


Updating model with average reward -6.531318666184187 on episode 435 and timestep 435000

 (3.67 GB CUDA)


Updating model with average reward -6.414717392209768 on episode 440 and timestep 440000

 (3.67 GB CUDA)


Advancing training to stage 5
Decaying std to 0.49999999999999994 on episode 442 and timestep 442000


Updating model with average reward -5.8300892483769955 on episode 445 and timestep 445000

 (3.67 GB CUDA)


Updating model with average reward -5.826687543949545 on episode 450 and timestep 450000

 (3.67 GB CUDA)


Updating model with average reward -5.634033281151563 on episode 455 and timestep 455000

 (3.67 GB CUDA)


Updating model with average reward -5.75657749750328 on episode 460 and timestep 460000

 (3.67 GB CUDA)


Updating model with average reward -5.295044120734572 on episode 465 and timestep 465000

 (3.67 GB CUDA)


Updating model with average reward -5.348238852814793 on episode 470 and timestep 470000

 (3.67 GB CUDA)


Updating model with average reward -5.5741618602187035 on episode 475 and timestep 475000

 (3.67 GB CUDA)


Updating model with average reward -5.69117669159472 on episode 480 and timestep 480000

 (3.67 GB CUDA)


Updating model with average reward -5.532862801258862 on episode 485 and timestep 485000

 (3.67 GB CUDA)


Updating model with average reward -5.6538531659346525 on episode 490 and timestep 490000

 (3.67 GB CUDA)


Updating model with average reward -5.6293987939004895 on episode 495 and timestep 495000

 (3.67 GB CUDA)


Updating model with average reward -5.531491131496936 on episode 500 and timestep 500000

 (3.67 GB CUDA)


Advancing training to stage 6
Decaying std to 0.44999999999999996 on episode 503 and timestep 503000


Updating model with average reward -5.626327781612724 on episode 505 and timestep 505000

 (3.67 GB CUDA)


Updating model with average reward -5.346392116420031 on episode 510 and timestep 510000

 (3.67 GB CUDA)


Updating model with average reward -5.219602584786236 on episode 515 and timestep 515000

 (3.67 GB CUDA)


Updating model with average reward -4.982250698162347 on episode 520 and timestep 520000

 (3.67 GB CUDA)


Updating model with average reward -5.08079572493726 on episode 525 and timestep 525000

 (3.67 GB CUDA)


Updating model with average reward -5.558940859103352 on episode 530 and timestep 530000

 (3.67 GB CUDA)


Updating model with average reward -5.273170975546568 on episode 535 and timestep 535000

 (3.67 GB CUDA)


Updating model with average reward -5.720060014345854 on episode 540 and timestep 540000

 (3.67 GB CUDA)


Updating model with average reward -5.639846338152826 on episode 545 and timestep 545000

 (3.67 GB CUDA)


Updating model with average reward -5.5606891976382435 on episode 550 and timestep 550000

 (3.67 GB CUDA)


Updating model with average reward -5.2283441484479605 on episode 555 and timestep 555000

 (3.67 GB CUDA)


Advancing training to stage 7
Decaying std to 0.39999999999999997 on episode 555 and timestep 555000


Updating model with average reward -4.898948622247249 on episode 560 and timestep 560000

 (3.67 GB CUDA)


Updating model with average reward -4.861080644927785 on episode 565 and timestep 565000

 (3.67 GB CUDA)


Updating model with average reward -5.026364600531399 on episode 570 and timestep 570000

 (3.67 GB CUDA)


Updating model with average reward -5.070782459609598 on episode 575 and timestep 575000

 (3.67 GB CUDA)


Updating model with average reward -5.721440441523105 on episode 580 and timestep 580000

 (3.67 GB CUDA)


Updating model with average reward -5.349496622751444 on episode 585 and timestep 585000

 (3.67 GB CUDA)


Updating model with average reward -5.320223808238059 on episode 590 and timestep 590000

 (3.67 GB CUDA)


Updating model with average reward -5.150207035041541 on episode 595 and timestep 595000

 (3.67 GB CUDA)


Advancing training to stage 8
Decaying std to 0.35 on episode 599 and timestep 599000


Updating model with average reward -5.9282430136383475 on episode 600 and timestep 600000

 (3.67 GB CUDA)


Updating model with average reward -4.866178307381928 on episode 605 and timestep 605000

 (3.67 GB CUDA)


Updating model with average reward -4.999477811502963 on episode 610 and timestep 610000

 (3.67 GB CUDA)


Updating model with average reward -5.1720639723840955 on episode 615 and timestep 615000

 (3.67 GB CUDA)


Updating model with average reward -5.152495955325425 on episode 620 and timestep 620000

 (3.67 GB CUDA)


Updating model with average reward -5.067962712999583 on episode 625 and timestep 625000

 (3.67 GB CUDA)


Updating model with average reward -4.9549036686592105 on episode 630 and timestep 630000

 (3.67 GB CUDA)


Updating model with average reward -5.272858495254368 on episode 635 and timestep 635000

 (3.67 GB CUDA)


Updating model with average reward -5.261471824602008 on episode 640 and timestep 640000

 (3.67 GB CUDA)


Advancing training to stage 9
Decaying std to 0.3 on episode 643 and timestep 643000


Updating model with average reward -5.2144295411174895 on episode 645 and timestep 645000

 (3.67 GB CUDA)


Updating model with average reward -5.075896342716306 on episode 650 and timestep 650000

 (3.67 GB CUDA)


Updating model with average reward -5.171390603909552 on episode 655 and timestep 655000

 (3.67 GB CUDA)


Updating model with average reward -5.105078336213335 on episode 660 and timestep 660000

 (3.67 GB CUDA)


Updating model with average reward -5.137223014934361 on episode 665 and timestep 665000

 (3.67 GB CUDA)


Updating model with average reward -5.433623175272316 on episode 670 and timestep 670000

 (3.67 GB CUDA)


Updating model with average reward -5.42245827413088 on episode 675 and timestep 675000

 (3.67 GB CUDA)


Updating model with average reward -5.232022426744789 on episode 680 and timestep 680000

 (3.67 GB CUDA)


Updating model with average reward -5.3393192234061955 on episode 685 and timestep 685000

 (3.67 GB CUDA)


Advancing training to stage 10
Decaying std to 0.25 on episode 687 and timestep 687000


Updating model with average reward -4.912054155461937 on episode 690 and timestep 690000

 (3.67 GB CUDA)


Updating model with average reward -5.013681518726051 on episode 695 and timestep 695000

 (3.67 GB CUDA)


Updating model with average reward -5.037415096737683 on episode 700 and timestep 700000

 (3.67 GB CUDA)


Updating model with average reward -4.944619764448255 on episode 705 and timestep 705000

 (3.67 GB CUDA)


Updating model with average reward -4.9280908157036905 on episode 710 and timestep 710000

 (3.67 GB CUDA)


Updating model with average reward -4.933095241488025 on episode 715 and timestep 715000

 (3.67 GB CUDA)


Updating model with average reward -5.129155404489845 on episode 720 and timestep 720000

 (3.67 GB CUDA)


Updating model with average reward -5.132435890456423 on episode 725 and timestep 725000

 (3.67 GB CUDA)


Updating model with average reward -4.96773436438635 on episode 730 and timestep 730000

 (3.67 GB CUDA)


Advancing training to stage 11
Decaying std to 0.2 on episode 731 and timestep 731000


Updating model with average reward -4.765994446723997 on episode 735 and timestep 735000

 (3.67 GB CUDA)


Updating model with average reward -4.6666572987308355 on episode 740 and timestep 740000

 (3.67 GB CUDA)


Updating model with average reward -4.622848608471945 on episode 745 and timestep 745000

 (3.67 GB CUDA)


Updating model with average reward -4.603241486681566 on episode 750 and timestep 750000

 (3.67 GB CUDA)


Updating model with average reward -4.766039656627267 on episode 755 and timestep 755000

 (3.67 GB CUDA)


Updating model with average reward -4.539947919972912 on episode 760 and timestep 760000

 (3.67 GB CUDA)


Updating model with average reward -4.6050959618624 on episode 765 and timestep 765000

 (3.67 GB CUDA)


Updating model with average reward -4.789775239986644 on episode 770 and timestep 770000

 (3.67 GB CUDA)


Updating model with average reward -4.722907508806259 on episode 775 and timestep 775000

 (3.67 GB CUDA)


Updating model with average reward -4.826851990148425 on episode 780 and timestep 780000

 (3.67 GB CUDA)


Advancing training to stage 12
Decaying std to 0.15000000000000002 on episode 780 and timestep 780000


Updating model with average reward -4.65355317163992 on episode 785 and timestep 785000

 (3.67 GB CUDA)


Updating model with average reward -4.723691855323524 on episode 790 and timestep 790000

 (3.67 GB CUDA)


Updating model with average reward -4.707459194495558 on episode 795 and timestep 795000

 (3.67 GB CUDA)


Updating model with average reward -4.552456571493119 on episode 800 and timestep 800000

 (3.67 GB CUDA)


Updating model with average reward -4.603940501017422 on episode 805 and timestep 805000

 (3.67 GB CUDA)


Updating model with average reward -4.656086358260602 on episode 810 and timestep 810000

 (3.67 GB CUDA)


Updating model with average reward -4.775076739680887 on episode 815 and timestep 815000

 (3.67 GB CUDA)


Updating model with average reward -4.712988044647634 on episode 820 and timestep 820000

 (3.67 GB CUDA)


Updating model with average reward -5.070580358421698 on episode 825 and timestep 825000

 (3.67 GB CUDA)


Updating model with average reward -4.92652421847488 on episode 830 and timestep 830000

 (3.67 GB CUDA)


Updating model with average reward -5.15334684782423 on episode 835 and timestep 835000

 (3.67 GB CUDA)


Advancing training to stage 13
Decaying std to 0.10000000000000002 on episode 839 and timestep 839000


Updating model with average reward -5.000636575686842 on episode 840 and timestep 840000

 (3.67 GB CUDA)


Updating model with average reward -19.340399501733884 on episode 845 and timestep 845000

 (3.67 GB CUDA)


Updating model with average reward -17.15519160639693 on episode 850 and timestep 850000

 (3.67 GB CUDA)


Updating model with average reward -19.354606456641434 on episode 855 and timestep 855000

 (3.67 GB CUDA)


Updating model with average reward -19.348682592916088 on episode 860 and timestep 860000

 (3.67 GB CUDA)


Updating model with average reward -17.498152673149182 on episode 865 and timestep 865000

 (3.67 GB CUDA)


Updating model with average reward -15.495219475130527 on episode 870 and timestep 870000

 (3.67 GB CUDA)


Updating model with average reward -17.805314016589865 on episode 875 and timestep 875000

 (3.67 GB CUDA)


Updating model with average reward -17.709819439172758 on episode 880 and timestep 880000

 (3.67 GB CUDA)


Updating model with average reward -18.46850851956731 on episode 885 and timestep 885000

 (3.67 GB CUDA)


Updating model with average reward -19.323767757686465 on episode 890 and timestep 890000

 (3.67 GB CUDA)


Updating model with average reward -18.480065500159622 on episode 895 and timestep 895000

 (3.67 GB CUDA)


Updating model with average reward -17.89725962223965 on episode 900 and timestep 900000

 (3.67 GB CUDA)


Advancing training to stage 14
Decaying std to 0.1 on episode 904 and timestep 904000


Updating model with average reward -17.490009062251254 on episode 905 and timestep 905000

 (3.67 GB CUDA)


Updating model with average reward -16.435601037615864 on episode 910 and timestep 910000

 (3.67 GB CUDA)


Updating model with average reward -16.17841741047415 on episode 915 and timestep 915000

 (3.67 GB CUDA)


Updating model with average reward -17.15707692344004 on episode 920 and timestep 920000

 (3.67 GB CUDA)


Updating model with average reward -16.96007179894446 on episode 925 and timestep 925000

 (3.67 GB CUDA)


Updating model with average reward -19.045560387228893 on episode 930 and timestep 930000

 (3.67 GB CUDA)


Updating model with average reward -17.40018062399711 on episode 935 and timestep 935000

 (3.67 GB CUDA)


Updating model with average reward -17.014887360817223 on episode 940 and timestep 940000

 (3.67 GB CUDA)


Updating model with average reward -16.681561104256048 on episode 945 and timestep 945000

 (3.67 GB CUDA)


Updating model with average reward -16.603194720432402 on episode 950 and timestep 950000

 (3.67 GB CUDA)


Updating model with average reward -16.270883522680997 on episode 955 and timestep 955000

 (3.67 GB CUDA)


Updating model with average reward -17.756735644641147 on episode 960 and timestep 960000

 (3.67 GB CUDA)


Updating model with average reward -16.417283957173584 on episode 965 and timestep 965000

 (3.67 GB CUDA)


Updating model with average reward -15.600872053378806 on episode 970 and timestep 970000

 (3.67 GB CUDA)


Updating model with average reward -16.773233760142208 on episode 975 and timestep 975000

 (3.67 GB CUDA)


Updating model with average reward -18.50884063754642 on episode 980 and timestep 980000

 (3.67 GB CUDA)


Updating model with average reward -17.164772185512096 on episode 985 and timestep 985000

 (3.67 GB CUDA)


Updating model with average reward -17.695555583229005 on episode 990 and timestep 990000

 (3.67 GB CUDA)


Updating model with average reward -17.244346723665103 on episode 995 and timestep 995000

 (3.67 GB CUDA)


Updating model with average reward -16.940330606439964 on episode 1000 and timestep 1000000

 (3.67 GB CUDA)


Updating model with average reward -18.302135254275694 on episode 1005 and timestep 1005000

 (3.67 GB CUDA)


Ending early on episode 1007 and timestep 1007000

Reset Environment: 0.1439296305179596


Environment Setup: 33.42545697744936


Calculate Actions: 7302.262522215024


Step Environment: 833.4506894359365


Record Rewards: 263.2506745159626
Record Stats: 0.44872702192515135
Early Stopping: 16.331091308034956
Update Policy: 54646.259424337186
Total: 63095.572515442036
