In [1]:
## To Check
# Check that rewards are normalized after (?) advantage

## High Priority Training Changes
# Make backward (MAX_NODES, MAX_BATCH) batching work
# Add multithreading to forward and distributed to backward
# Add compatibility for env being on CPU, check for timing changes

## Backburner Priority Training Changes
# Add compatibility for cells with missing modalities (add mask to distance reward)
# Try imitation learning to better learn CT trajectories
# Add parallel envs of different sizes, with different data to help generality
# Fix off-center positioning in large environments
# Revise distance reward - Maybe add cell attraction (all should be close to each other) and repulsion (repulsion based on distance in modality)
# Revise velocity and action penalties to encourage early cell-type separation (i.e. sqrt of vec length or similar)

## Bookkeeping and QOL
# Save every time early stopping occurs
# Hook up sweeps API for wandb

In [2]:
# Original paper (pg 24)
# https://arxiv.org/pdf/1909.07528.pdf

# Original blog
# https://openai.com/research/emergent-tool-use

# Gym
# https://gymnasium.farama.org/

# Slides
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf

# PPO implementation
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38

# Residual SA
# https://github.com/openai/multi-agent-emergence-environments/blob/bafaf1e11e6398624116761f91ae7c93b136f395/ma_policy/layers.py#L89

In [3]:
# Reload imported modules automatically so edits to project code are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# Environment variables read by wandb (notebook name for the run, suppressed console output)
%env WANDB_NOTEBOOK_NAME train.ipynb
%env WANDB_SILENT true

env: WANDB_NOTEBOOK_NAME=train.ipynb
env: WANDB_SILENT=true


In [4]:
from collections import defaultdict
import os

import inept
import numpy as np
import pandas as pd
import torch
import wandb

# Set params
# Compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
# Data and model folders, both resolved against the current working directory
DATA_FOLDER, MODEL_FOLDER = (
    os.path.join(os.path.abspath(''), relative_path)
    for relative_path in ('../data', 'temp/trained_models')
)

# Script arguments
# import sys
# arg1 = int(sys.argv[1])

In [5]:
# Reproducibility: one seed shared by every random number generator used in this notebook
seed = 42
note_kwargs = {'seed': seed}  # Kept in a dict so the seed can be reported with the other run parameters

np.random.seed(seed)
torch.manual_seed(seed)
if DEVICE == 'cuda':
    torch.cuda.manual_seed(seed)

### Load Data

In [6]:
# Dataset loading
# Every branch defines the modality matrices `M1`, `M2` and the per-node type
# labels `T1`, `T2`, which the preprocessing below reads.
dataset_name = 'BrainChromatin'

if dataset_name == 'scNMT':
    dataset_dir = os.path.join(DATA_FOLDER, 'UnionCom/scNMT')
    M1 = pd.read_csv(os.path.join(dataset_dir, 'Paccessibility_300.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(dataset_dir, 'Pmethylation_300.txt'), delimiter=' ', header=None).to_numpy()
    M3 = pd.read_csv(os.path.join(dataset_dir, 'RNA_300.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(dataset_dir, 'type1.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T2 = pd.read_csv(os.path.join(dataset_dir, 'type2.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T3 = pd.read_csv(os.path.join(dataset_dir, 'type3.txt'), delimiter=' ', header=None).to_numpy().flatten()

elif dataset_name == 'BrainChromatin':
    nrows = None  # 2_000
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_rna_counts.tsv'), delimiter='\t', nrows=nrows).transpose()  # 4.6 Gb in memory
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_atac_gene_activities.tsv'), delimiter='\t', nrows=nrows).transpose()  # 2.6 Gb in memory
    # Put the rows of `M2` in the same order as `M1`; selecting rows with `.loc`
    # avoids transposing the full matrix twice
    M2 = M2.loc[M1.index]
    meta = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cell_metadata.txt'), delimiter='\t')
    meta_names = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cluster_names.txt'), delimiter='\t')
    meta_names = meta_names[meta_names['Assay'] == 'Multiome ATAC']
    meta = pd.merge(meta, meta_names, left_on='ATAC_cluster', right_on='Cluster.ID', how='left')
    meta.index = meta['Cell.ID']
    # Cluster names ordered like the rows of `M1`; both modalities share the same label array
    T1 = T2 = np.array(meta.loc[M1.index, 'Cluster.Name'])
    F1, F2 = M1.columns, M2.columns
    M1, M2 = M1.to_numpy(), M2.to_numpy()

    # Free the metadata frames, they are not needed past this point
    del meta, meta_names

elif dataset_name == 'scGEM':
    # NOTE(review): unlike the scNMT branch, the type arrays here are not flattened - confirm this is intended
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/GeneExpression.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/DNAmethylation.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type1.txt'), delimiter=' ', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type2.txt'), delimiter=' ', header=None).to_numpy()
    F1 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/gex_names.txt'), dtype='str')
    F2 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/dm_names.txt'), dtype='str')

# MMD-MA data
elif dataset_name == 'MMD-MA':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped1.txt'), delimiter='\t', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped2.txt'), delimiter='\t', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type1.txt'), delimiter='\t', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type2.txt'), delimiter='\t', header=None).to_numpy()

# Random data
elif dataset_name == 'Random':
    num_nodes = 100
    M1 = torch.rand((num_nodes, 8), device=DEVICE)
    M2 = torch.rand((num_nodes, 16), device=DEVICE)
    # Random data has no cell types; a single placeholder label per node keeps
    # `T1`/`T2` defined for the code below (previously a NameError)
    T1 = T2 = np.zeros(num_nodes, dtype=int)
    # NOTE(review): the preprocessing helpers are assumed to accept torch tensors - confirm

# Raise instead of `assert False` so the check survives `python -O`
else: raise ValueError('No matching dataset found.')

# Parameters
num_nodes = 100  # Passed to `subsample_nodes` below; alternative: M1.shape[0]
modalities = [M1, M2]  # [[M1, M2, M3][1]]  # TODO: Make more flexible
types = [T1, T2]  # [[T1, T2, T3][1]]

# Modify data
# Normalize
modalities = inept.utilities.normalize(*modalities, keep_array=True)
# PCA features (2 min for 8k x 35+k)
modalities = inept.utilities.pca_features(
    *modalities,
    num_features=(512, 512),
    keep_array=True,
)
# Subsample nodes; modalities and types go through the same call
subsample = inept.utilities.subsample_nodes(
    *modalities,
    *types,
    num_nodes=num_nodes,
    keep_array=True,
)
# Split the result back apart: the first `len(modalities)` entries are modalities, the rest are types
types = subsample[len(modalities):]
modalities = subsample[:len(modalities)]
# modalities = inept.utilities.subsample_features(*modalities, num_features=(16, 16), keep_array=True)  # Subsample features

# Cast types
modalities = [
    torch.tensor(modality, dtype=torch.float32, device=DEVICE)
    for modality in modalities
]

### Parameters

In [7]:
# Data parameters: which dataset was loaded and how many nodes are used
data_kwargs = dict(dataset=dataset_name, num_nodes=num_nodes)

# Environment parameters
env_kwargs = dict(
    dim=3,  # 2 = (x, y, vx, vy), 3 = (x, y, z, vx, vy, vz)
    pos_bound=10,
    pos_rand_bound=1,
    vel_bound=1,
    delta=.1,
    # reward_distance=0,
    # reward_origin=0,
    # penalty_bound=0,
    # penalty_velocity=0,
    # penalty_action=0,
    reward_distance_type='euclidean',
)

# Environment weight stages: entry `i` holds the weights applied when stage `i` starts
stages_kwargs = dict(
    env=(
        dict(penalty_bound=1),  # Stage 0
        dict(reward_origin=1),  # Stage 1
        dict(penalty_velocity=1, penalty_action=1),  # Stage 2
        dict(reward_origin=0, reward_distance=1),  # Stage 3
    ),
)

# Training parameters
max_ep_timesteps = 1e3  # Upper bound on timesteps in a single episode
update_timesteps = max_ep_timesteps * 5  # The policy is updated every this many timesteps
max_timesteps = update_timesteps * 1e3  # Total timestep budget for training
MAX_BATCH = min( data_kwargs['num_nodes'], 500 )  # NOTE: value should be similar to update_minibatch, if a bit larger
MAX_NODES = min( data_kwargs['num_nodes'], 50 )  # Larger means smaller minibatches but a fuller picture for each agent
MAX_BATCH = MAX_NODES = None  # TODO: Currently values other than `None` do not work with update
train_kwargs = dict(
    max_ep_timesteps=max_ep_timesteps,
    max_timesteps=max_timesteps,
    update_timesteps=update_timesteps,
    max_batch=MAX_BATCH,  # Max number of nodes to calculate actions for at a time
    max_nodes=MAX_NODES,  # Max number of nodes to use as neighbors in action calculation
)

# Policy parameters
# Disabled memory-based sizing heuristics, kept for reference:
# num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
# GPU_MEMORY = 6; CPU_MEMORY = 16  # Optimized for 6Gb VRAM and 16Gb RAM
# MAX_GPU_RUN_SAMPLES = int( .8 * (GPU_MEMORY / 6) * 1e4 * (2000 / sum(M.shape[1] for M in modalities)) * (20 / num_train_nodes) )
# GPU_STORE_SAMPLES = int( 2 * MAX_GPU_RUN_SAMPLES )  # 3
# MAX_CPU_SAMPLES = int( (CPU_MEMORY / GPU_MEMORY) * MAX_GPU_RUN_SAMPLES )
# IDEAL_BATCH_SIZE = int( max_ep_timesteps )
update_maxbatch = None  # `MAX_CPU_SAMPLES`, `None` takes slightly longer but is more reliable
update_batch = 1_000  # Same or larger size as `update_maxbatch` skips GPU cast step inside epoch loop
update_minibatch = 1_000
policy_kwargs = dict(
    # Main arguments
    num_features_per_node=2 * env_kwargs['dim'],
    modal_sizes=[modality.shape[1] for modality in modalities],
    output_dim=env_kwargs['dim'],
    action_std_init=.6,
    action_std_decay=.05,
    action_std_min=.1,
    epochs=80,
    epsilon_clip=.2,
    memory_gamma=.95,
    memory_prune=100,
    actor_lr=3e-4,
    critic_lr=1e-3,
    lr_gamma=1,
    update_maxbatch=update_maxbatch,  # Batch to load into RAM
    update_batch=update_batch,  # Batch to load into VRAM
    update_minibatch=update_minibatch,  # Batch to compute
    device=DEVICE,
    # Layer arguments
    embed_dim=64,
    feature_embed_dim=32,
    rs_nset=1e5,  # Inversely proportional to influence of individual reward on moving statistics
)

# Early stopping parameters
# Length of one training cycle: update interval divided by the maximum episode length
episodes_per_cycle = int(train_kwargs['update_timesteps'] / train_kwargs['max_ep_timesteps'])
es_kwargs = dict(
    # Global parameters
    method='average',
    buffer=6 * episodes_per_cycle,  # 6 training cycles
    delta=.01,
    decreasing=False,
    # `average` method parameters
    window_size=3 * episodes_per_cycle,  # 3 training cycles
)

### Train Policy

In [8]:
# Tracking parameters
# Use `watch -d -n 0.5 nvidia-smi` to watch CUDA memory usage
# Use `top` to watch system memory usage
# Run script and put following above function to profile
#    from memory_profiler import profile
#    @profile
# Use cProfiler to profile timing:
#    python -m cProfile -s time -o profile.prof train.py
#    snakeviz profile.prof
use_wandb = True

# Initialize classes
env = inept.environments.trajectory(*modalities, **env_kwargs, **stages_kwargs['env'][0], device=DEVICE)  # Set to first stage
policy = inept.models.PPO(**policy_kwargs).train()
early_stopping = inept.utilities.EarlyStopping(**es_kwargs)

# Initialize wandb
# Every parameter group is logged under its own `<group>/` prefix
if use_wandb: wandb.init(
    project='INEPT',
    config={
        **{'note/'+k:v for k, v in note_kwargs.items()},
        **{'data/'+k:v for k, v in data_kwargs.items()},
        **{'env/'+k:v for k, v in env_kwargs.items()},
        **{'stages/'+k:v for k, v in stages_kwargs.items()},
        **{'policy/'+k:v for k, v in policy_kwargs.items()},
        **{'train/'+k:v for k, v in train_kwargs.items()},
        **{'es/'+k:v for k, v in es_kwargs.items()},
    },
)

# Initialize logging vars
# Peak-memory statistics only exist on CUDA; `DEVICE` falls back to 'cpu', where
# this call would raise, so it is only made when training on the GPU
if DEVICE == 'cuda': torch.cuda.reset_peak_memory_stats()
timer = inept.utilities.time_logger(discard_first_sample=True)
# `timestep` counts environment steps over the whole run, `episode` starts at 1,
# `stage` indexes `stages_kwargs['env']`
timestep = 0; episode = 1; stage = 0

# CLI
print('Beginning training')
# Number of nodes used per action calculation: all of them unless `max_nodes` caps it
num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
print(
    f'Training using {num_train_nodes} nodes out of a'
    f' total {data_kwargs["num_nodes"]} with batches of'
    f' size {train_kwargs["max_batch"]}.'
)
# `None` means no cap on the states loaded per update, reported as 'all'
update_maxbatch_print = (
    policy_kwargs["update_maxbatch"]
    if policy_kwargs["update_maxbatch"] is not None else 
    'all'
)
print(
    f'Training on {update_maxbatch_print} states'
    f' with batches of size {policy_kwargs["update_batch"]}'
    f' and minibatches of size {policy_kwargs["update_minibatch"]}'
    f' from {int(train_kwargs["update_timesteps"] * data_kwargs["num_nodes"])} total.')

# Simulation loop
# Each outer iteration runs one episode. The policy is updated every
# `update_timesteps` steps. Whenever early stopping triggers, the model is saved
# and training either moves to the next reward stage or, once all stages are
# active, decays the action std. Training ends when the timestep budget is spent
# or the action std has reached its minimum.
while timestep < train_kwargs['max_timesteps']:
    # Reset environment
    env.reset()
    timer.log('Reset Environment')

    # Start episode
    ep_timestep = 0; ep_reward = 0; ep_itemized_reward = defaultdict(lambda: 0)
    while ep_timestep < train_kwargs['max_ep_timesteps']:
        with torch.no_grad():
            # Get current state
            state = env.get_state(include_modalities=True)
            timer.log('Environment Setup')

            # Get actions from policy
            actions = policy.act_macro(
                state,
                keys=list(range(num_nodes)),
                max_batch=train_kwargs['max_batch'],
                max_nodes=train_kwargs['max_nodes'],
            ).detach()
            timer.log('Calculate Actions')

            # Step environment and get reward
            rewards, finished, itemized_rewards = env.step(actions, return_rewards=True)
            # The last allowed step of an episode is always marked terminal
            finished = finished or (ep_timestep == train_kwargs['max_ep_timesteps']-1)  # Maybe move logic inside env?
            timer.log('Step Environment')

            # Record rewards for policy
            policy.memory.record(
                rewards=rewards.cpu().tolist(),
                is_terminals=finished,
            )

            # Record rewards for logging
            ep_reward = ep_reward + rewards.cpu().mean()
            for k, v in itemized_rewards.items():
                ep_itemized_reward[k] += v.cpu().mean()
            timer.log('Record Rewards')

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Update model
        if timestep % train_kwargs['update_timesteps'] == 0:
            print(f'Updating model with average reward {np.mean(policy.memory.storage["rewards"])} on episode {episode} and timestep {timestep}', end='')
            policy.update()
            if DEVICE == 'cuda':
                # Report and reset peak CUDA memory; these statistics only exist on the GPU
                print(f' ({torch.cuda.max_memory_allocated() / 1024**3:.2f} GB CUDA)')
                torch.cuda.reset_peak_memory_stats()
            else:
                # Terminate the progress line started above
                print()
            timer.log('Update Policy')

        # Escape if finished
        if finished: break

    # Upload stats
    # Per-episode averages: accumulated means divided by the episode length
    ep_reward = (ep_reward / ep_timestep).item()
    update = int(timestep / train_kwargs['update_timesteps'])
    if use_wandb:
        wandb.log({
            **{
            # Measurements
            'end_timestep': timestep,
            'episode': episode,
            'update': update,
            'stage': stage,
            # Parameters
            'action_std': policy.action_std,
            # Outputs
            'average_reward': ep_reward,
            },
            **{'rewards/'+k: (v / ep_timestep).item() for k, v in ep_itemized_reward.items()},
        })
    timer.log('Record Stats')

    # Decay model std
    if early_stopping(ep_reward) or timestep >= train_kwargs['max_timesteps']:
        # Save model
        # Create the output folder first, `torch.save` does not create missing directories
        os.makedirs(MODEL_FOLDER, exist_ok=True)
        wgt_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.wgt')
        torch.save(policy.state_dict(), wgt_file)  # Save just weights
        if use_wandb: wandb.save(wgt_file)
        mdl_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.mdl')
        torch.save(policy, mdl_file)  # Save whole model
        if use_wandb: wandb.save(mdl_file)

        # End if maximum timesteps reached
        # Stop here so no further stage change or std decay is applied after the budget is spent
        if timestep >= train_kwargs['max_timesteps']:
            print('Maximal timesteps reached')
            break

        # End if at minimum `action_std`
        if policy.action_std <= policy.action_std_min:
            print(f'Ending early on episode {episode} and timestep {timestep}')
            break

        # Activate next stage or decay
        stage += 1
        # CLI
        print(f'Advancing training to stage {stage}')
        if stage < len(stages_kwargs['env']):
            # Activate next stage
            env.set_rewards(stages_kwargs['env'][stage])
        else:
            # Decay policy randomness
            policy.decay_action_std()
            # CLI
            print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')

        # Reset early stopping
        early_stopping.reset()
    timer.log('Early Stopping')

    # Iterate
    episode += 1

# CLI Timer
print()
timer.aggregate('sum')

# Finish wandb
if use_wandb: wandb.finish()

Beginning training
Training using 100 nodes out of a total 100 with batches of size None.
Training on all states with batches of size 1000 and minibatches of size 1000 from 500000 total.


Updating model with average reward -1.628328 on episode 5 and timestep 5000

 (1.78 GB CUDA)


Updating model with average reward -1.582052 on episode 10 and timestep 10000

 (1.78 GB CUDA)


Updating model with average reward -1.529328 on episode 15 and timestep 15000

 (1.78 GB CUDA)


Updating model with average reward -1.417128 on episode 20 and timestep 20000

 (1.78 GB CUDA)


Updating model with average reward -1.292684 on episode 25 and timestep 25000

 (1.78 GB CUDA)


Updating model with average reward -1.149484 on episode 30 and timestep 30000

 (1.78 GB CUDA)


Updating model with average reward -0.91812 on episode 35 and timestep 35000

 (1.78 GB CUDA)


Updating model with average reward -0.733028 on episode 40 and timestep 40000

 (1.78 GB CUDA)


Updating model with average reward -0.49772 on episode 45 and timestep 45000

 (1.78 GB CUDA)


Updating model with average reward -0.33898 on episode 50 and timestep 50000

 (1.78 GB CUDA)


Updating model with average reward -0.30232 on episode 55 and timestep 55000

 (1.78 GB CUDA)


Updating model with average reward -0.11044 on episode 60 and timestep 60000

 (1.78 GB CUDA)


Updating model with average reward -0.060396 on episode 65 and timestep 65000

 (1.78 GB CUDA)


Updating model with average reward -0.034452 on episode 70 and timestep 70000

 (1.78 GB CUDA)


Updating model with average reward -0.019256 on episode 75 and timestep 75000

 (1.78 GB CUDA)


Updating model with average reward -0.010124 on episode 80 and timestep 80000

 (1.78 GB CUDA)


Updating model with average reward -0.005292 on episode 85 and timestep 85000

 (1.78 GB CUDA)


Updating model with average reward -0.002724 on episode 90 and timestep 90000

 (1.78 GB CUDA)


Updating model with average reward -0.001368 on episode 95 and timestep 95000

 (1.78 GB CUDA)


Updating model with average reward -0.000428 on episode 100 and timestep 100000

 (1.78 GB CUDA)


Updating model with average reward -0.000228 on episode 105 and timestep 105000

 (1.78 GB CUDA)


Updating model with average reward -8.4e-05 on episode 110 and timestep 110000

 (1.78 GB CUDA)


Updating model with average reward -0.000112 on episode 115 and timestep 115000

 (1.78 GB CUDA)


Updating model with average reward -8.4e-05 on episode 120 and timestep 120000

 (1.78 GB CUDA)


Updating model with average reward -4.4e-05 on episode 125 and timestep 125000

 (1.78 GB CUDA)


Updating model with average reward -0.00016 on episode 130 and timestep 130000

 (1.78 GB CUDA)


Updating model with average reward -7.6e-05 on episode 135 and timestep 135000

 (1.78 GB CUDA)


Advancing training to stage 1


Updating model with average reward -0.16940993674612045 on episode 140 and timestep 140000

 (1.78 GB CUDA)


Updating model with average reward -0.41095750052535535 on episode 145 and timestep 145000

 (1.78 GB CUDA)


Updating model with average reward -0.2590831870700121 on episode 150 and timestep 150000

 (1.78 GB CUDA)


Updating model with average reward -0.312312955925107 on episode 155 and timestep 155000

 (1.78 GB CUDA)


Updating model with average reward -0.2318356557803154 on episode 160 and timestep 160000

 (1.78 GB CUDA)


Updating model with average reward -0.2498631697883606 on episode 165 and timestep 165000

 (1.78 GB CUDA)


Updating model with average reward -0.1992505472150445 on episode 170 and timestep 170000

 (1.78 GB CUDA)


Updating model with average reward -0.18384151592946052 on episode 175 and timestep 175000

 (1.78 GB CUDA)


Updating model with average reward -0.1655840367846489 on episode 180 and timestep 180000

 (1.78 GB CUDA)


Updating model with average reward -0.13129399597007035 on episode 185 and timestep 185000

 (1.78 GB CUDA)


Updating model with average reward -0.11043081059271097 on episode 190 and timestep 190000

 (1.78 GB CUDA)


Updating model with average reward -0.07633565436720847 on episode 195 and timestep 195000

 (1.78 GB CUDA)


Updating model with average reward -0.053784101577848194 on episode 200 and timestep 200000

 (1.78 GB CUDA)


Updating model with average reward -0.03164093438613415 on episode 205 and timestep 205000

 (1.78 GB CUDA)


Updating model with average reward -0.011188829142987728 on episode 210 and timestep 210000

 (1.78 GB CUDA)


Updating model with average reward 0.0033162311027646065 on episode 215 and timestep 215000

 (1.78 GB CUDA)


Updating model with average reward 0.018673271002590655 on episode 220 and timestep 220000

 (1.78 GB CUDA)


Updating model with average reward 0.03304797007238865 on episode 225 and timestep 225000

 (1.78 GB CUDA)


Updating model with average reward 0.04317513546942174 on episode 230 and timestep 230000

 (1.78 GB CUDA)


Updating model with average reward 0.052260325238868595 on episode 235 and timestep 235000

 (1.78 GB CUDA)


Updating model with average reward 0.056349286316730085 on episode 240 and timestep 240000

 (1.78 GB CUDA)


Updating model with average reward 0.06160808885485679 on episode 245 and timestep 245000

 (1.78 GB CUDA)


Updating model with average reward 0.06820620232840627 on episode 250 and timestep 250000

 (1.78 GB CUDA)


Updating model with average reward 0.06804348649614304 on episode 255 and timestep 255000

 (1.78 GB CUDA)


Updating model with average reward 0.0763447607617043 on episode 260 and timestep 260000

 (1.78 GB CUDA)


Updating model with average reward 0.07598517948756367 on episode 265 and timestep 265000

 (1.78 GB CUDA)


Updating model with average reward 0.07912229116443917 on episode 270 and timestep 270000

 (1.78 GB CUDA)


Updating model with average reward 0.08023764696866274 on episode 275 and timestep 275000

 (1.78 GB CUDA)


Updating model with average reward 0.0826754156888239 on episode 280 and timestep 280000

 (1.78 GB CUDA)


Updating model with average reward 0.08253336940520256 on episode 285 and timestep 285000

 (1.78 GB CUDA)


Updating model with average reward 0.08481904890909418 on episode 290 and timestep 290000

 (1.78 GB CUDA)


Updating model with average reward 0.08520448137632013 on episode 295 and timestep 295000

 (1.78 GB CUDA)


Updating model with average reward 0.08517485166059062 on episode 300 and timestep 300000

 (1.78 GB CUDA)


Updating model with average reward 0.08494544782315568 on episode 305 and timestep 305000

 (1.78 GB CUDA)


Updating model with average reward 0.0866685386195667 on episode 310 and timestep 310000

 (1.78 GB CUDA)


Advancing training to stage 2


Updating model with average reward -0.3653190218814686 on episode 315 and timestep 315000

 (1.78 GB CUDA)


Updating model with average reward -0.4770071059022313 on episode 320 and timestep 320000

 (1.78 GB CUDA)


Updating model with average reward -0.47331967925671115 on episode 325 and timestep 325000

 (1.78 GB CUDA)


Updating model with average reward -0.47394220382260066 on episode 330 and timestep 330000

 (1.78 GB CUDA)


Updating model with average reward -0.4730225387638577 on episode 335 and timestep 335000

 (1.78 GB CUDA)


Updating model with average reward -0.4722387291175984 on episode 340 and timestep 340000

 (1.78 GB CUDA)


Updating model with average reward -0.4703854104325054 on episode 345 and timestep 345000

 (1.78 GB CUDA)


Updating model with average reward -0.471459539022882 on episode 350 and timestep 350000

 (1.78 GB CUDA)


Updating model with average reward -0.471194335018144 on episode 355 and timestep 355000

 (1.78 GB CUDA)
Advancing training to stage 3


Updating model with average reward -1.299683226116269 on episode 360 and timestep 360000

 (1.78 GB CUDA)


Updating model with average reward -0.5680322962053307 on episode 365 and timestep 365000

 (1.78 GB CUDA)


Updating model with average reward -0.5139076099011628 on episode 370 and timestep 370000

 (1.78 GB CUDA)


Updating model with average reward -0.5075765353756155 on episode 375 and timestep 375000

 (1.78 GB CUDA)


Updating model with average reward -0.4484348809912764 on episode 380 and timestep 380000

 (1.78 GB CUDA)


Updating model with average reward -0.3827097403355767 on episode 385 and timestep 385000

 (1.78 GB CUDA)


Updating model with average reward -0.3370393578486592 on episode 390 and timestep 390000

 (1.78 GB CUDA)


Updating model with average reward -0.32772432220769954 on episode 395 and timestep 395000

 (1.78 GB CUDA)


Updating model with average reward -0.2967028703212156 on episode 400 and timestep 400000

 (1.78 GB CUDA)


Updating model with average reward -0.24947023698645643 on episode 405 and timestep 405000

 (1.78 GB CUDA)


Updating model with average reward -0.1994781714325836 on episode 410 and timestep 410000

 (1.78 GB CUDA)


Updating model with average reward -0.14800640097591466 on episode 415 and timestep 415000

 (1.78 GB CUDA)


Updating model with average reward -0.13787411688510143 on episode 420 and timestep 420000

 (1.78 GB CUDA)


Updating model with average reward -0.09374345185113325 on episode 425 and timestep 425000

 (1.78 GB CUDA)


Updating model with average reward -0.04803333299847902 on episode 430 and timestep 430000

 (1.78 GB CUDA)


Updating model with average reward -0.032362135744002644 on episode 435 and timestep 435000

 (1.78 GB CUDA)


Updating model with average reward -0.023105272550488356 on episode 440 and timestep 440000

 (1.78 GB CUDA)


Updating model with average reward 0.04762794347149297 on episode 445 and timestep 445000

 (1.78 GB CUDA)


Updating model with average reward 0.05844217849151604 on episode 450 and timestep 450000

 (1.78 GB CUDA)


Updating model with average reward 0.06603414581818308 on episode 455 and timestep 455000

 (1.78 GB CUDA)


Updating model with average reward 0.11808489905386604 on episode 460 and timestep 460000

 (1.78 GB CUDA)


Updating model with average reward 0.12912624972030531 on episode 465 and timestep 465000

 (1.78 GB CUDA)


Updating model with average reward 0.13043849811348227 on episode 470 and timestep 470000

 (1.78 GB CUDA)


Updating model with average reward 0.1672647311105253 on episode 475 and timestep 475000

 (1.78 GB CUDA)


Updating model with average reward 0.19367085450267046 on episode 480 and timestep 480000

 (1.78 GB CUDA)


Updating model with average reward 0.1881259905019277 on episode 485 and timestep 485000

 (1.78 GB CUDA)


Updating model with average reward 0.1947924060878791 on episode 490 and timestep 490000

 (1.78 GB CUDA)


Updating model with average reward 0.20684540094582038 on episode 495 and timestep 495000

 (1.78 GB CUDA)


Updating model with average reward 0.22346745608749147 on episode 500 and timestep 500000

 (1.78 GB CUDA)


Updating model with average reward 0.19806308902835357 on episode 505 and timestep 505000

 (1.78 GB CUDA)


Updating model with average reward 0.20080364366901898 on episode 510 and timestep 510000

 (1.78 GB CUDA)


Updating model with average reward 0.19661888697832333 on episode 515 and timestep 515000

 (1.78 GB CUDA)


Updating model with average reward 0.21361184079920872 on episode 520 and timestep 520000

 (1.78 GB CUDA)


Updating model with average reward 0.21562315064503929 on episode 525 and timestep 525000

 (1.78 GB CUDA)


Advancing training to stage 4


Decaying std to 0.5499999999999999 on episode 528 and timestep 528000


Updating model with average reward 0.2454511097630111 on episode 530 and timestep 530000

 (1.78 GB CUDA)


Updating model with average reward 0.3061815464648595 on episode 535 and timestep 535000

 (1.78 GB CUDA)


Updating model with average reward 0.2899715855352908 on episode 540 and timestep 540000

 (1.78 GB CUDA)


Updating model with average reward 0.30569954364449065 on episode 545 and timestep 545000

 (1.78 GB CUDA)


Updating model with average reward 0.2982566854379233 on episode 550 and timestep 550000

 (1.78 GB CUDA)


Updating model with average reward 0.29053613907881987 on episode 555 and timestep 555000

 (1.78 GB CUDA)


Updating model with average reward 0.2943973569070157 on episode 560 and timestep 560000

 (1.78 GB CUDA)


Updating model with average reward 0.28724453098205105 on episode 565 and timestep 565000

 (1.78 GB CUDA)


Updating model with average reward 0.2948987080765087 on episode 570 and timestep 570000

 (1.78 GB CUDA)


Advancing training to stage 5
Decaying std to 0.49999999999999994 on episode 572 and timestep 572000


Updating model with average reward 0.35198217569258067 on episode 575 and timestep 575000

 (1.78 GB CUDA)


Updating model with average reward 0.37514165842107194 on episode 580 and timestep 580000

 (1.78 GB CUDA)


Updating model with average reward 0.38899495591727923 on episode 585 and timestep 585000

 (1.78 GB CUDA)


Updating model with average reward 0.3743087070654924 on episode 590 and timestep 590000

 (1.78 GB CUDA)


Updating model with average reward 0.3863318539178674 on episode 595 and timestep 595000

 (1.78 GB CUDA)


Updating model with average reward 0.3771306260813553 on episode 600 and timestep 600000

 (1.78 GB CUDA)


Updating model with average reward 0.3734683440833525 on episode 605 and timestep 605000

 (1.78 GB CUDA)


Updating model with average reward 0.3896595107449158 on episode 610 and timestep 610000

 (1.78 GB CUDA)


Updating model with average reward 0.3825341304759434 on episode 615 and timestep 615000

 (1.78 GB CUDA)


Advancing training to stage 6
Decaying std to 0.44999999999999996 on episode 616 and timestep 616000


Updating model with average reward 0.46224707940739973 on episode 620 and timestep 620000

 (1.78 GB CUDA)


Updating model with average reward 0.4591957792883769 on episode 625 and timestep 625000

 (1.78 GB CUDA)


Updating model with average reward 0.4557032674907015 on episode 630 and timestep 630000

 (1.78 GB CUDA)


Updating model with average reward 0.46395098317492356 on episode 635 and timestep 635000

 (1.78 GB CUDA)


Updating model with average reward 0.47557141306392264 on episode 640 and timestep 640000

 (1.78 GB CUDA)


Updating model with average reward 0.4759730772499234 on episode 645 and timestep 645000

 (1.78 GB CUDA)


Updating model with average reward 0.45840531057144424 on episode 650 and timestep 650000

 (1.78 GB CUDA)


Updating model with average reward 0.46758666709210583 on episode 655 and timestep 655000

 (1.78 GB CUDA)


Updating model with average reward 0.4520337439971869 on episode 660 and timestep 660000

 (1.78 GB CUDA)
Advancing training to stage 7
Decaying std to 0.39999999999999997 on episode 660 and timestep 660000


Updating model with average reward 0.5352307841894927 on episode 665 and timestep 665000

 (1.78 GB CUDA)


Updating model with average reward 0.5179017692939644 on episode 670 and timestep 670000

 (1.78 GB CUDA)


Updating model with average reward 0.5371026041941315 on episode 675 and timestep 675000

 (1.78 GB CUDA)


Updating model with average reward 0.5228270994694345 on episode 680 and timestep 680000

 (1.78 GB CUDA)


Updating model with average reward 0.5051144382281172 on episode 685 and timestep 685000

 (1.78 GB CUDA)


Updating model with average reward 0.5048078011430938 on episode 690 and timestep 690000

 (1.78 GB CUDA)


Updating model with average reward 0.5116758854682105 on episode 695 and timestep 695000

 (1.78 GB CUDA)


Updating model with average reward 0.5024872924605245 on episode 700 and timestep 700000

 (1.78 GB CUDA)


Advancing training to stage 8
Decaying std to 0.35 on episode 704 and timestep 704000


Updating model with average reward 0.5293736073471129 on episode 705 and timestep 705000

 (1.78 GB CUDA)


Updating model with average reward 0.5807991177044846 on episode 710 and timestep 710000

 (1.78 GB CUDA)


Updating model with average reward 0.5764656192879274 on episode 715 and timestep 715000

 (1.78 GB CUDA)


Updating model with average reward 0.588779617337224 on episode 720 and timestep 720000

 (1.78 GB CUDA)


Updating model with average reward 0.5650760935120145 on episode 725 and timestep 725000

 (1.78 GB CUDA)


Updating model with average reward 0.5629145238172 on episode 730 and timestep 730000

 (1.78 GB CUDA)


Updating model with average reward 0.5670021790510678 on episode 735 and timestep 735000

 (1.78 GB CUDA)


Updating model with average reward 0.5544157514718359 on episode 740 and timestep 740000

 (1.78 GB CUDA)


Updating model with average reward 0.5612347035601921 on episode 745 and timestep 745000

 (1.78 GB CUDA)


Advancing training to stage 9
Decaying std to 0.3 on episode 748 and timestep 748000


Updating model with average reward 0.5956847291760681 on episode 750 and timestep 750000

 (1.78 GB CUDA)


Updating model with average reward 0.6102379035202177 on episode 755 and timestep 755000

 (1.78 GB CUDA)


Updating model with average reward 0.6196299229383619 on episode 760 and timestep 760000

 (1.78 GB CUDA)


Updating model with average reward 0.6207724080395551 on episode 765 and timestep 765000

 (1.78 GB CUDA)


Updating model with average reward 0.6317002098950237 on episode 770 and timestep 770000

 (1.78 GB CUDA)


Updating model with average reward 0.5973605452756894 on episode 775 and timestep 775000

 (1.78 GB CUDA)


Updating model with average reward 0.58955795896798 on episode 780 and timestep 780000

 (1.78 GB CUDA)


Updating model with average reward 0.6039823822488791 on episode 785 and timestep 785000

 (1.78 GB CUDA)


Updating model with average reward 0.6032752970648619 on episode 790 and timestep 790000

 (1.78 GB CUDA)


Advancing training to stage 10
Decaying std to 0.25 on episode 792 and timestep 792000


Updating model with average reward 0.62139662973116 on episode 795 and timestep 795000

 (1.78 GB CUDA)


Updating model with average reward 0.6510490414266277 on episode 800 and timestep 800000

 (1.78 GB CUDA)


Updating model with average reward 0.6254453840357421 on episode 805 and timestep 805000

 (1.78 GB CUDA)


Updating model with average reward 0.6386376960925205 on episode 810 and timestep 810000

 (1.78 GB CUDA)


Updating model with average reward 0.6412490310037972 on episode 815 and timestep 815000

 (1.78 GB CUDA)


Updating model with average reward 0.6483119433055591 on episode 820 and timestep 820000

 (1.78 GB CUDA)


Updating model with average reward 0.6523674593453527 on episode 825 and timestep 825000

 (1.78 GB CUDA)


Updating model with average reward 0.6511276088256586 on episode 830 and timestep 830000

 (1.78 GB CUDA)


Updating model with average reward 0.6574850552192992 on episode 835 and timestep 835000

 (1.78 GB CUDA)


Updating model with average reward 0.6466772333594728 on episode 840 and timestep 840000

 (1.78 GB CUDA)


Updating model with average reward 0.6467573737245782 on episode 845 and timestep 845000

 (1.78 GB CUDA)


Updating model with average reward 0.6301054302253045 on episode 850 and timestep 850000

 (1.78 GB CUDA)


Updating model with average reward 0.6509990255295757 on episode 855 and timestep 855000

 (1.78 GB CUDA)


Advancing training to stage 11
Decaying std to 0.2 on episode 856 and timestep 856000


Updating model with average reward 0.6781483584376765 on episode 860 and timestep 860000

 (1.78 GB CUDA)


Updating model with average reward 0.7043407223488694 on episode 865 and timestep 865000

 (1.78 GB CUDA)


Updating model with average reward 0.7020659649352248 on episode 870 and timestep 870000

 (1.78 GB CUDA)


Updating model with average reward 0.681916799398681 on episode 875 and timestep 875000

 (1.78 GB CUDA)


Updating model with average reward 0.6894913828074882 on episode 880 and timestep 880000

 (1.78 GB CUDA)


Updating model with average reward 0.6960582325595158 on episode 885 and timestep 885000

 (1.78 GB CUDA)


Updating model with average reward 0.6844498662780931 on episode 890 and timestep 890000

 (1.78 GB CUDA)


Updating model with average reward 0.6757972732030348 on episode 895 and timestep 895000

 (1.78 GB CUDA)


Updating model with average reward 0.6845402161952678 on episode 900 and timestep 900000

 (1.78 GB CUDA)
Advancing training to stage 12
Decaying std to 0.15000000000000002 on episode 900 and timestep 900000


Updating model with average reward 0.7077017750506117 on episode 905 and timestep 905000

 (1.78 GB CUDA)


Updating model with average reward 0.7174001695776113 on episode 910 and timestep 910000

 (1.78 GB CUDA)


Updating model with average reward 0.7041718007444413 on episode 915 and timestep 915000

 (1.78 GB CUDA)


Updating model with average reward 0.70876446050693 on episode 920 and timestep 920000

 (1.78 GB CUDA)


Updating model with average reward 0.6947343093391303 on episode 925 and timestep 925000

 (1.78 GB CUDA)


Updating model with average reward 0.7090227597950126 on episode 930 and timestep 930000

 (1.78 GB CUDA)


Updating model with average reward 0.7047307900710735 on episode 935 and timestep 935000

 (1.78 GB CUDA)


Updating model with average reward 0.7075085897100833 on episode 940 and timestep 940000

 (1.78 GB CUDA)


Advancing training to stage 13
Decaying std to 0.10000000000000002 on episode 944 and timestep 944000


Updating model with average reward 0.7060110018125663 on episode 945 and timestep 945000

 (1.78 GB CUDA)


Updating model with average reward 0.7317688301873835 on episode 950 and timestep 950000

 (1.78 GB CUDA)


Updating model with average reward 0.695767634826594 on episode 955 and timestep 955000

 (1.78 GB CUDA)


Updating model with average reward 0.7013819039074867 on episode 960 and timestep 960000

 (1.78 GB CUDA)


Updating model with average reward 0.6644950821779609 on episode 965 and timestep 965000

 (1.78 GB CUDA)


Updating model with average reward 0.6844303296852119 on episode 970 and timestep 970000

 (1.78 GB CUDA)


Updating model with average reward 0.6959488233490291 on episode 975 and timestep 975000

 (1.78 GB CUDA)


Updating model with average reward 0.6851107452278097 on episode 980 and timestep 980000

 (1.78 GB CUDA)


Updating model with average reward 0.6888228552367698 on episode 985 and timestep 985000

 (1.78 GB CUDA)


Advancing training to stage 14
Decaying std to 0.1 on episode 988 and timestep 988000


Updating model with average reward 0.6586845181778833 on episode 990 and timestep 990000

 (1.78 GB CUDA)


Updating model with average reward 0.6640390258374903 on episode 995 and timestep 995000

 (1.78 GB CUDA)


Updating model with average reward 0.6370852961622988 on episode 1000 and timestep 1000000

 (1.78 GB CUDA)


Updating model with average reward 0.6483384864419739 on episode 1005 and timestep 1005000

 (1.78 GB CUDA)


Updating model with average reward 0.6478385610406414 on episode 1010 and timestep 1010000

 (1.78 GB CUDA)


Updating model with average reward 0.6580406951539941 on episode 1015 and timestep 1015000

 (1.78 GB CUDA)


Updating model with average reward 0.6704249516909712 on episode 1020 and timestep 1020000

 (1.78 GB CUDA)


Updating model with average reward 0.6517173623917445 on episode 1025 and timestep 1025000

 (1.78 GB CUDA)


Updating model with average reward 0.6688983391801431 on episode 1030 and timestep 1030000

 (1.78 GB CUDA)


Updating model with average reward 0.6711186536739485 on episode 1035 and timestep 1035000

 (1.78 GB CUDA)


Updating model with average reward 0.6667853517973851 on episode 1040 and timestep 1040000

 (1.78 GB CUDA)


Updating model with average reward 0.6522457268521009 on episode 1045 and timestep 1045000

 (1.78 GB CUDA)


Updating model with average reward 0.6744468702247777 on episode 1050 and timestep 1050000

 (1.78 GB CUDA)


Updating model with average reward 0.6733387265438805 on episode 1055 and timestep 1055000

 (1.78 GB CUDA)


Updating model with average reward 0.6531642218788647 on episode 1060 and timestep 1060000

 (1.78 GB CUDA)


Updating model with average reward 0.6828798902645751 on episode 1065 and timestep 1065000

 (1.78 GB CUDA)


Ending early on episode 1068 and timestep 1068000

Reset Environment: 0.1569526009261608


Environment Setup: 35.3107045609504


Calculate Actions: 8220.172086023726


Step Environment: 902.692119977437


Record Rewards: 280.14890175592154
Record Stats: 0.3551984829828143
Early Stopping: 7.390081713907421
Update Policy: 7613.554197344929
Total: 17059.78024246078
