In [1]:
## To Check
# Check that rewards are normalized after (?) advantage

## High Priority Training Changes
# Make backward (MAX_NODES, MAX_BATCH) batching work
# Add multithreading to forward and distributed to backward
# Add compatibility for env being on CPU, check for timing changes

## Backburner Priority Training Changes
# Add compatibility for cells with missing modalities (add mask to distance reward)
# Try imitation learning to better learn CT trajectories
# Add parallel envs of different sizes, with different data to help generality
# Fix off-center positioning in large environments
# Revise distance reward - Maybe add cell attraction (all should be close to each other) and repulsion (repulsion based on distance in modality)
# Revise velocity and action penalties to encourage early cell-type separation (e.g. sqrt of the vector length or similar)

## Bookkeeping and QOL
# Save every time early stopping occurs
# Hook up sweeps API for wandb

In [2]:
# Original paper (pg 24)
# https://arxiv.org/pdf/1909.07528.pdf

# Original blog
# https://openai.com/research/emergent-tool-use

# Gym
# https://gymnasium.farama.org/

# Slides
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf

# PPO implementation
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38

# Residual SA
# https://github.com/openai/multi-agent-emergence-environments/blob/bafaf1e11e6398624116761f91ae7c93b136f395/ma_policy/layers.py#L89

In [3]:
# Reload edited modules automatically so changes to local packages are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2
# Environment variables for wandb — presumably the notebook name used for run tracking and a flag silencing console output; confirm against the wandb docs
%env WANDB_NOTEBOOK_NAME train.ipynb
%env WANDB_SILENT true

env: WANDB_NOTEBOOK_NAME=train.ipynb
env: WANDB_SILENT=true


In [4]:
from collections import defaultdict
import os

import inept
import numpy as np
import pandas as pd
import torch
import wandb

# Set params
# Folders are resolved against the directory the notebook is run from
_BASE_DIR = os.path.abspath('')
DEVICE = 'cpu' if not torch.cuda.is_available() else 'cuda'
DATA_FOLDER = os.path.join(_BASE_DIR, '../data')
MODEL_FOLDER = os.path.join(_BASE_DIR, 'temp/trained_models')

# Script arguments
# import sys
# arg1 = int(sys.argv[1])

In [5]:
# Reproducibility: seed the NumPy and PyTorch random number generators
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
if DEVICE == 'cuda':
    torch.cuda.manual_seed(seed)

# Free-form run notes; the seed is stored here alongside the other parameter groups
note_kwargs = dict(seed=seed)

### Load Data

In [6]:
# Dataset loading
# Every branch must define the modality matrices `M1`/`M2` (one row per node)
# and the per-node labels `T1`/`T2`, all of which are consumed further down.
dataset_name = 'BrainChromatin'

if dataset_name == 'scNMT':
    dataset_dir = os.path.join(DATA_FOLDER, 'UnionCom/scNMT')
    M1 = pd.read_csv(os.path.join(dataset_dir, 'Paccessibility_300.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(dataset_dir, 'Pmethylation_300.txt'), delimiter=' ', header=None).to_numpy()
    M3 = pd.read_csv(os.path.join(dataset_dir, 'RNA_300.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(dataset_dir, 'type1.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T2 = pd.read_csv(os.path.join(dataset_dir, 'type2.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T3 = pd.read_csv(os.path.join(dataset_dir, 'type3.txt'), delimiter=' ', header=None).to_numpy().flatten()

elif dataset_name == 'BrainChromatin':
    nrows = None  # 2_000
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_rna_counts.tsv'), delimiter='\t', nrows=nrows).transpose()  # 4.6 Gb in memory
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_atac_gene_activities.tsv'), delimiter='\t', nrows=nrows).transpose()  # 2.6 Gb in memory
    # Put the rows of `M2` in the same order as `M1`; a label-based row lookup
    # avoids transposing a multi-Gb frame twice
    M2 = M2.loc[M1.index]
    meta = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cell_metadata.txt'), delimiter='\t')
    meta_names = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cluster_names.txt'), delimiter='\t')
    meta_names = meta_names[meta_names['Assay'] == 'Multiome ATAC']
    # Attach the readable cluster name to every cell through its ATAC cluster id
    meta = pd.merge(meta, meta_names, left_on='ATAC_cluster', right_on='Cluster.ID', how='left')
    meta.index = meta['Cell.ID']
    # Both modalities share one label array, ordered like the rows of `M1`
    T1 = T2 = np.array(meta.loc[M1.index, 'Cluster.Name'])
    F1, F2 = M1.columns, M2.columns
    M1, M2 = M1.to_numpy(), M2.to_numpy()

    del meta, meta_names

elif dataset_name == 'scGEM':
    # NOTE(review): labels are left 2-D here, whereas the scNMT branch flattens them — confirm which shape `subsample_nodes` expects
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/GeneExpression.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/DNAmethylation.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type1.txt'), delimiter=' ', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type2.txt'), delimiter=' ', header=None).to_numpy()
    F1 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/gex_names.txt'), dtype='str')
    F2 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/dm_names.txt'), dtype='str')

# MMD-MA data
elif dataset_name == 'MMD-MA':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped1.txt'), delimiter='\t', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped2.txt'), delimiter='\t', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type1.txt'), delimiter='\t', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type2.txt'), delimiter='\t', header=None).to_numpy()

# Random data
elif dataset_name == 'Random':
    num_nodes = 100
    M1 = torch.rand((num_nodes, 8), device=DEVICE)
    M2 = torch.rand((num_nodes, 16), device=DEVICE)
    # Random data has no cell types; placeholder labels keep `types` below
    # defined (previously this branch ended in a NameError on `T1`)
    T1 = T2 = np.zeros(num_nodes, dtype=int)
    # NOTE(review): these modalities are torch tensors while every other branch
    # yields NumPy arrays — confirm the `inept.utilities` calls below accept both

else:
    # An explicit exception, unlike `assert`, is not stripped when running with `-O`
    raise ValueError(f'No matching dataset found for {dataset_name!r}.')

# Parameters
num_nodes = 100  # M1.shape[0]
modalities = [M1, M2]  # [[M1, M2, M3][1]]  # TODO: Make more flexible
types = [T1, T2]  # [[T1, T2, T3][1]]

# Modify data
modalities = inept.utilities.normalize(*modalities, keep_array=True)  # Normalize
modalities = inept.utilities.pca_features(*modalities, num_features=(512, 512), keep_array=True)  # PCA features (2 min for 8k x 35+k)
subsample = inept.utilities.subsample_nodes(*modalities, *types, num_nodes=num_nodes, keep_array=True)  # Subsample nodes
# `subsample` holds the modalities first and the label arrays after them
modalities, types = subsample[:len(modalities)], subsample[len(modalities):]
# modalities = inept.utilities.subsample_features(*modalities, num_features=(16, 16), keep_array=True)  # Subsample features

# Cast types
modalities = [torch.tensor(Mx, dtype=torch.float32, device=DEVICE) for Mx in modalities]

### Parameters

In [7]:
# Data parameters: which dataset is used and how many nodes are kept from it
data_kwargs = dict(dataset=dataset_name, num_nodes=num_nodes)

# Environment parameters
env_kwargs = dict(
    dim=16,  # 2 = (x, y, vx, vy), 3 = (x, y, z, vx, vy, vz)
    pos_bound=10,
    pos_rand_bound=1,
    vel_bound=1,
    delta=.1,
    # Reward and penalty weights are not fixed here; see the stages below
    # reward_distance=0,
    # reward_origin=0,
    # penalty_bound=0,
    # penalty_velocity=0,
    # penalty_action=0,
    reward_distance_type='euclidean',
)

# Environment weight stages, one dict of weight overrides per stage
stages_kwargs = dict(
    env=(
        dict(penalty_bound=1),  # Stage 0
        dict(reward_origin=1),  # Stage 1
        dict(penalty_velocity=1, penalty_action=1),  # Stage 2
        dict(reward_origin=0, reward_distance=1),  # Stage 3
    ),
)

# Training parameters
# Step counts are integers rather than floats such as `1e3`: they are compared
# against integer counters in the training loop, and this matches the
# `int(1e3)` batch sizes used for the policy parameters.
max_ep_timesteps = 1_000  # Maximum number of steps in one episode
update_timesteps = 5 * max_ep_timesteps  # Steps between policy updates
max_timesteps = 1_000 * update_timesteps  # Total step budget for the run
MAX_BATCH = min( 500, data_kwargs['num_nodes'] )  # NOTE: value should be similar to update_minibatch, if a bit larger
MAX_NODES = min( 50, data_kwargs['num_nodes'] )  # Larger means smaller minibatches but a fuller picture for each agent
# The two limits above are deliberately overridden until the update step supports them
MAX_BATCH = MAX_NODES = None  # TODO: Currently values other than `None` do not work with update
train_kwargs = {
    'max_ep_timesteps': max_ep_timesteps,
    'max_timesteps': max_timesteps,
    'update_timesteps': update_timesteps,
    'max_batch': MAX_BATCH,  # Max number of nodes to calculate actions for at a time
    'max_nodes': MAX_NODES,  # Max number of nodes to use as neighbors in action calculation
}

# Policy parameters
# Disabled memory heuristics, kept for reference:
# num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
# GPU_MEMORY = 6; CPU_MEMORY = 16  # Optimized for 6Gb VRAM and 16Gb RAM
# MAX_GPU_RUN_SAMPLES = int( .8 * (GPU_MEMORY / 6) * 1e4 * (2000 / sum(M.shape[1] for M in modalities)) * (20 / num_train_nodes) )
# GPU_STORE_SAMPLES = int( 2 * MAX_GPU_RUN_SAMPLES )  # 3
# MAX_CPU_SAMPLES = int( (CPU_MEMORY / GPU_MEMORY) * MAX_GPU_RUN_SAMPLES )
# IDEAL_BATCH_SIZE = int( max_ep_timesteps )
update_maxbatch = None  # `MAX_CPU_SAMPLES`, `None` takes slightly longer but is more reliable
update_batch = int(1e3)  # Same or larger size as `update_maxbatch` skips GPU cast step inside epoch loop
update_minibatch = int(1e3)
policy_kwargs = dict(
    # Main arguments
    num_features_per_node=2*env_kwargs['dim'],
    modal_sizes=[modality.shape[1] for modality in modalities],
    output_dim=env_kwargs['dim'],
    action_std_init=.6,
    action_std_decay=.05,
    action_std_min=.3,  # .1
    epochs=80,
    epsilon_clip=.2,
    memory_gamma=.95,
    memory_prune=100,
    actor_lr=3e-4,
    critic_lr=1e-3,
    lr_gamma=1,
    update_maxbatch=update_maxbatch,  # Batch to load into RAM
    update_batch=update_batch,  # Batch to load into VRAM
    update_minibatch=update_minibatch,  # Batch to compute
    device=DEVICE,
    # Layer arguments
    embed_dim=64,
    feature_embed_dim=32,
    rs_nset=1e5,  # Inversely proportional to influence of individual reward on moving statistics
)

# Early stopping parameters
# Number of full-length episodes that fit between two policy updates (one training cycle)
_episodes_per_cycle = int(train_kwargs['update_timesteps'] / train_kwargs['max_ep_timesteps'])
es_kwargs = dict(
    # Global parameters
    method='average',
    buffer=6 * _episodes_per_cycle,  # 6 training cycles
    delta=.01,
    decreasing=False,
    # `average` method parameters
    window_size=3 * _episodes_per_cycle,  # 3 training cycles
)

### Train Policy

In [8]:
# Tracking parameters
# Use `watch -d -n 0.5 nvidia-smi` to watch CUDA memory usage
# Use `top` to watch system memory usage
# Run script and put following above function to profile
#    from memory_profiler import profile
#    @profile
# Use cProfiler to profile timing:
#    python -m cProfile -s time -o profile.prof train.py
#    snakeviz profile.prof
use_wandb = True

# Initialize classes
env = inept.environments.trajectory(*modalities, **env_kwargs, **stages_kwargs['env'][0], device=DEVICE)  # Set to first stage
policy = inept.models.PPO(**policy_kwargs).train()
early_stopping = inept.utilities.EarlyStopping(**es_kwargs)

# Initialize wandb, logging each parameter group under its own prefix
if use_wandb:
    wandb.init(
        project='INEPT',
        config={
            **{'note/'+k:v for k, v in note_kwargs.items()},
            **{'data/'+k:v for k, v in data_kwargs.items()},
            **{'env/'+k:v for k, v in env_kwargs.items()},
            **{'stages/'+k:v for k, v in stages_kwargs.items()},
            **{'policy/'+k:v for k, v in policy_kwargs.items()},
            **{'train/'+k:v for k, v in train_kwargs.items()},
            **{'es/'+k:v for k, v in es_kwargs.items()},
        },
    )

# Initialize logging vars
# Peak-memory counters only exist with CUDA; resetting them unconditionally
# raises on a CPU-only machine and would defeat the CPU fallback of `DEVICE`
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()
timer = inept.utilities.time_logger(discard_first_sample=True)
timestep = 0; episode = 1; stage = 0

# CLI
print('Beginning training')
# Nodes visible to each agent: all of them unless `max_nodes` caps the count
num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
print(
    f'Training using {num_train_nodes} nodes out of a'
    f' total {data_kwargs["num_nodes"]} with batches of'
    f' size {train_kwargs["max_batch"]}.'
)
# `None` means no cap on the number of stored states used per update
update_maxbatch_print = (
    policy_kwargs["update_maxbatch"]
    if policy_kwargs["update_maxbatch"] is not None else 
    'all'
)
print(
    f'Training on {update_maxbatch_print} states'
    f' with batches of size {policy_kwargs["update_batch"]}'
    f' and minibatches of size {policy_kwargs["update_minibatch"]}'
    f' from {int(train_kwargs["update_timesteps"] * data_kwargs["num_nodes"])} total.')

# Simulation loop
# Runs episodes until the timestep budget is spent or the policy's action std
# has reached its minimum. The policy is updated every `update_timesteps`
# steps; whenever `early_stopping` triggers, a checkpoint is saved and training
# moves to the next reward stage or, once all stages are active, decays the std.
while timestep < train_kwargs['max_timesteps']:
    # Reset environment
    env.reset()
    timer.log('Reset Environment')

    # Start episode
    ep_timestep = 0; ep_reward = 0; ep_itemized_reward = defaultdict(lambda: 0)
    while ep_timestep < train_kwargs['max_ep_timesteps']:
        # Rollouts need no gradients; the policy is only optimized in `policy.update()`
        with torch.no_grad():
            # Get current state
            state = env.get_state(include_modalities=True)
            timer.log('Environment Setup')

            # Get actions from policy
            actions = policy.act_macro(
                state,
                keys=list(range(num_nodes)),
                max_batch=train_kwargs['max_batch'],
                max_nodes=train_kwargs['max_nodes'],
            ).detach()
            timer.log('Calculate Actions')

            # Step environment and get reward
            rewards, finished, itemized_rewards = env.step(actions, return_rewards=True)
            # The last permitted step of an episode is always marked as terminal
            finished = finished or (ep_timestep == train_kwargs['max_ep_timesteps']-1)  # Maybe move logic inside env?
            timer.log('Step Environment')

            # Record rewards for policy
            policy.memory.record(
                rewards=rewards.cpu().tolist(),
                is_terminals=finished,
            )

            # Record rewards for logging
            ep_reward = ep_reward + rewards.cpu().mean()
            for k, v in itemized_rewards.items():
                ep_itemized_reward[k] += v.cpu().mean()
            timer.log('Record Rewards')

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Update model
        if timestep % train_kwargs['update_timesteps'] == 0:
            print(f'Updating model with average reward {np.mean(policy.memory.storage["rewards"])} on episode {episode} and timestep {timestep}', end='')
            policy.update()
            # Peak-memory statistics only exist with CUDA; resetting them on a
            # CPU-only machine raises
            if torch.cuda.is_available():
                print(f' ({torch.cuda.max_memory_allocated() / 1024**3:.2f} GB CUDA)')
                torch.cuda.reset_peak_memory_stats()
            else:
                print()  # Terminate the progress line started above
            timer.log('Update Policy')

        # Escape if finished
        if finished: break

    # Upload stats
    ep_reward = (ep_reward / ep_timestep).item()  # Mean reward per step of this episode
    update = int(timestep / train_kwargs['update_timesteps'])  # Policy updates performed so far
    if use_wandb:
        wandb.log({
            **{
            # Measurements
            'end_timestep': timestep,
            'episode': episode,
            'update': update,
            'stage': stage,
            # Parameters
            'action_std': policy.action_std,
            # Outputs
            'average_reward': ep_reward,
            },
            **{'rewards/'+k: (v / ep_timestep).item() for k, v in ep_itemized_reward.items()},
        })
    timer.log('Record Stats')

    # Checkpoint and advance training when early stopping triggers or the budget is spent
    # (`early_stopping` is called first so it sees every episode's reward)
    if early_stopping(ep_reward) or timestep >= train_kwargs['max_timesteps']:
        # Save model
        os.makedirs(MODEL_FOLDER, exist_ok=True)  # Saving fails if the folder is missing
        wgt_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.wgt')
        torch.save(policy.state_dict(), wgt_file)  # Save just weights
        if use_wandb: wandb.save(wgt_file)
        mdl_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.mdl')
        torch.save(policy, mdl_file)  # Save whole model
        if use_wandb: wandb.save(mdl_file)

        # End if maximum timesteps reached
        if timestep >= train_kwargs['max_timesteps']:
            print('Maximal timesteps reached')
            # Stop here so the policy is not advanced or decayed after its final save
            break

        # End if at minimum `action_std`
        if policy.action_std <= policy.action_std_min:
            print(f'Ending early on episode {episode} and timestep {timestep}')
            break

        # Activate next stage or decay
        stage += 1
        # CLI
        print(f'Advancing training to stage {stage}')
        if stage < len(stages_kwargs['env']):
            # Activate next stage
            env.set_rewards(stages_kwargs['env'][stage])
        else:
            # Decay policy randomness
            policy.decay_action_std()
            # CLI
            print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')

        # Reset early stopping
        early_stopping.reset()
    timer.log('Early Stopping')

    # Iterate
    episode += 1

# CLI Timer
print()
timer.aggregate('sum')

# Finish wandb
if use_wandb: wandb.finish()

Beginning training
Training using 100 nodes out of a total 100 with batches of size None.
Training on all states with batches of size 1000 and minibatches of size 1000 from 500000 total.


Updating model with average reward -1.799784 on episode 5 and timestep 5000

 (1.84 GB CUDA)


Updating model with average reward -1.799884 on episode 10 and timestep 10000

 (1.84 GB CUDA)


Updating model with average reward -1.79962 on episode 15 and timestep 15000

 (1.84 GB CUDA)


Updating model with average reward -1.799592 on episode 20 and timestep 20000

 (1.84 GB CUDA)


Updating model with average reward -1.800364 on episode 25 and timestep 25000

 (1.84 GB CUDA)


Updating model with average reward -1.799824 on episode 30 and timestep 30000

 (1.84 GB CUDA)


Updating model with average reward -1.800448 on episode 35 and timestep 35000

 (1.84 GB CUDA)


Updating model with average reward -1.800808 on episode 40 and timestep 40000

 (1.84 GB CUDA)


Advancing training to stage 1


Updating model with average reward -2.776248787691116 on episode 45 and timestep 45000

 (1.84 GB CUDA)


Updating model with average reward -6.884418346988678 on episode 50 and timestep 50000

 (1.84 GB CUDA)


Updating model with average reward -6.8529901787719725 on episode 55 and timestep 55000

 (1.84 GB CUDA)


Updating model with average reward -6.834051157831192 on episode 60 and timestep 60000

 (1.84 GB CUDA)


Updating model with average reward -6.85424992301178 on episode 65 and timestep 65000

 (1.84 GB CUDA)


Updating model with average reward -6.81192519704628 on episode 70 and timestep 70000

 (1.84 GB CUDA)


Updating model with average reward -6.827983090917587 on episode 75 and timestep 75000

 (1.84 GB CUDA)


Updating model with average reward -6.784271094841003 on episode 80 and timestep 80000

 (1.84 GB CUDA)


Updating model with average reward -6.752155093124389 on episode 85 and timestep 85000

 (1.84 GB CUDA)


Updating model with average reward -6.792695621879577 on episode 90 and timestep 90000

 (1.84 GB CUDA)


Updating model with average reward -6.818986689817429 on episode 95 and timestep 95000

 (1.84 GB CUDA)


Updating model with average reward -6.902526187099457 on episode 100 and timestep 100000

 (1.84 GB CUDA)


Updating model with average reward -6.873219559595108 on episode 105 and timestep 105000

 (1.84 GB CUDA)


Updating model with average reward -6.867688735774994 on episode 110 and timestep 110000

 (1.84 GB CUDA)


Updating model with average reward -6.833577895097733 on episode 115 and timestep 115000

 (1.84 GB CUDA)


Advancing training to stage 2


Updating model with average reward -7.393195750871718 on episode 120 and timestep 120000

 (1.84 GB CUDA)


Updating model with average reward -9.558482573588103 on episode 125 and timestep 125000

 (1.84 GB CUDA)


Updating model with average reward -9.647641744598776 on episode 130 and timestep 130000

 (1.84 GB CUDA)


Updating model with average reward -9.611228645627678 on episode 135 and timestep 135000

 (1.84 GB CUDA)


Updating model with average reward -9.643206862432898 on episode 140 and timestep 140000

 (1.84 GB CUDA)


Updating model with average reward -9.574857114470989 on episode 145 and timestep 145000

 (1.84 GB CUDA)


Updating model with average reward -9.712794361851842 on episode 150 and timestep 150000

 (1.84 GB CUDA)


Updating model with average reward -9.71495167450732 on episode 155 and timestep 155000

 (1.84 GB CUDA)


Updating model with average reward -9.66722729694587 on episode 160 and timestep 160000

 (1.84 GB CUDA)


Advancing training to stage 3


Updating model with average reward -32.60445423313662 on episode 165 and timestep 165000

 (1.84 GB CUDA)


Updating model with average reward -78.66042102457335 on episode 170 and timestep 170000

 (1.84 GB CUDA)


Updating model with average reward -99.94819492009971 on episode 175 and timestep 175000

 (1.84 GB CUDA)


Updating model with average reward -116.24918414047903 on episode 180 and timestep 180000

 (1.84 GB CUDA)


Updating model with average reward -121.2322001483891 on episode 185 and timestep 185000

 (1.84 GB CUDA)


Updating model with average reward -105.37549084166798 on episode 190 and timestep 190000

 (1.84 GB CUDA)


Updating model with average reward -121.70409619530284 on episode 195 and timestep 195000

 (1.84 GB CUDA)


Updating model with average reward -126.63770343954104 on episode 200 and timestep 200000

 (1.84 GB CUDA)


Updating model with average reward -98.04618187507188 on episode 205 and timestep 205000

 (1.84 GB CUDA)


Advancing training to stage 4


Decaying std to 0.5499999999999999 on episode 207 and timestep 207000


Updating model with average reward -87.40525031784219 on episode 210 and timestep 210000

 (1.84 GB CUDA)


Updating model with average reward -67.32637202479572 on episode 215 and timestep 215000

 (1.84 GB CUDA)


Updating model with average reward -65.33762345031953 on episode 220 and timestep 220000

 (1.84 GB CUDA)


Updating model with average reward -49.60963911421084 on episode 225 and timestep 225000

 (1.84 GB CUDA)


Updating model with average reward -51.092656703962504 on episode 230 and timestep 230000

 (1.84 GB CUDA)


Updating model with average reward -51.0932673756643 on episode 235 and timestep 235000

 (1.84 GB CUDA)


Updating model with average reward -50.54438122324136 on episode 240 and timestep 240000

 (1.84 GB CUDA)


Updating model with average reward -37.87332980942005 on episode 245 and timestep 245000

 (1.84 GB CUDA)


Updating model with average reward -51.15872349173971 on episode 250 and timestep 250000

 (1.84 GB CUDA)


Updating model with average reward -51.81262960460174 on episode 255 and timestep 255000

 (1.84 GB CUDA)


Updating model with average reward -43.366038636549774 on episode 260 and timestep 260000

 (1.84 GB CUDA)


Updating model with average reward -51.97554910318351 on episode 265 and timestep 265000

 (1.84 GB CUDA)


Updating model with average reward -37.790457100164325 on episode 270 and timestep 270000

 (1.84 GB CUDA)


Updating model with average reward -31.119845344733715 on episode 275 and timestep 275000

 (1.84 GB CUDA)


Updating model with average reward -30.634702656709074 on episode 280 and timestep 280000

 (1.84 GB CUDA)


Updating model with average reward -28.30142200693935 on episode 285 and timestep 285000

 (1.84 GB CUDA)


Updating model with average reward -23.485484711388974 on episode 290 and timestep 290000

 (1.84 GB CUDA)


Updating model with average reward -20.563424723508774 on episode 295 and timestep 295000

 (1.84 GB CUDA)


Updating model with average reward -16.99750821725562 on episode 300 and timestep 300000

 (1.84 GB CUDA)


Updating model with average reward -17.153563096957534 on episode 305 and timestep 305000

 (1.84 GB CUDA)


Updating model with average reward -15.654784315916151 on episode 310 and timestep 310000

 (1.84 GB CUDA)


Updating model with average reward -17.39854250883046 on episode 315 and timestep 315000

 (1.84 GB CUDA)


Updating model with average reward -22.7224079766981 on episode 320 and timestep 320000

 (1.84 GB CUDA)


Updating model with average reward -15.486493563870757 on episode 325 and timestep 325000

 (1.84 GB CUDA)


Updating model with average reward -15.767392972273916 on episode 330 and timestep 330000

 (1.84 GB CUDA)


Updating model with average reward -16.445460737044513 on episode 335 and timestep 335000

 (1.84 GB CUDA)


Updating model with average reward -16.065669628213524 on episode 340 and timestep 340000

 (1.84 GB CUDA)


Updating model with average reward -18.39757518992743 on episode 345 and timestep 345000

 (1.84 GB CUDA)


Updating model with average reward -15.335296819274276 on episode 350 and timestep 350000

 (1.84 GB CUDA)


Updating model with average reward -18.777852646656484 on episode 355 and timestep 355000

 (1.84 GB CUDA)


Updating model with average reward -15.899976679019987 on episode 360 and timestep 360000

 (1.84 GB CUDA)


Updating model with average reward -17.836049972095193 on episode 365 and timestep 365000

 (1.84 GB CUDA)


Updating model with average reward -20.53052566667676 on episode 370 and timestep 370000

 (1.84 GB CUDA)


Advancing training to stage 5
Decaying std to 0.49999999999999994 on episode 371 and timestep 371000


Updating model with average reward -29.07488057235685 on episode 375 and timestep 375000

 (1.84 GB CUDA)


Updating model with average reward -42.46807474957165 on episode 380 and timestep 380000

 (1.84 GB CUDA)


Updating model with average reward -35.74519739513254 on episode 385 and timestep 385000

 (1.84 GB CUDA)


Updating model with average reward -27.24644309745866 on episode 390 and timestep 390000

 (1.84 GB CUDA)


Updating model with average reward -27.42211533705148 on episode 395 and timestep 395000

 (1.84 GB CUDA)


Updating model with average reward -21.012215574471444 on episode 400 and timestep 400000

 (1.84 GB CUDA)


Updating model with average reward -33.173414230924365 on episode 405 and timestep 405000

 (1.84 GB CUDA)


Updating model with average reward -13.230051661337168 on episode 410 and timestep 410000

 (1.84 GB CUDA)


Updating model with average reward -14.360528806894958 on episode 415 and timestep 415000

 (1.84 GB CUDA)


Updating model with average reward -14.163790727821649 on episode 420 and timestep 420000

 (1.84 GB CUDA)


Updating model with average reward -17.042552826623886 on episode 425 and timestep 425000

 (1.84 GB CUDA)


Updating model with average reward -12.543445029318004 on episode 430 and timestep 430000

 (1.84 GB CUDA)


Updating model with average reward -11.5042566763874 on episode 435 and timestep 435000

 (1.84 GB CUDA)


Updating model with average reward -13.364013537219375 on episode 440 and timestep 440000

 (1.84 GB CUDA)


Updating model with average reward -8.787366990727515 on episode 445 and timestep 445000

 (1.84 GB CUDA)


Updating model with average reward -12.212988546191633 on episode 450 and timestep 450000

 (1.84 GB CUDA)


Updating model with average reward -12.854487040140093 on episode 455 and timestep 455000

 (1.84 GB CUDA)


Updating model with average reward -13.002863217254191 on episode 460 and timestep 460000

 (1.84 GB CUDA)


Updating model with average reward -13.249766091472925 on episode 465 and timestep 465000

 (1.84 GB CUDA)


Updating model with average reward -12.647579032315164 on episode 470 and timestep 470000

 (1.84 GB CUDA)


Updating model with average reward -12.452738695024758 on episode 475 and timestep 475000

 (1.84 GB CUDA)


Advancing training to stage 6
Decaying std to 0.44999999999999996 on episode 476 and timestep 476000


Updating model with average reward -12.020394026265562 on episode 480 and timestep 480000

 (1.84 GB CUDA)


Updating model with average reward -17.846755481451034 on episode 485 and timestep 485000

 (1.84 GB CUDA)


Updating model with average reward -14.530612812152713 on episode 490 and timestep 490000

 (1.84 GB CUDA)


Updating model with average reward -16.853917465153692 on episode 495 and timestep 495000

 (1.84 GB CUDA)


Updating model with average reward -14.03370317368883 on episode 500 and timestep 500000

 (1.84 GB CUDA)


Updating model with average reward -10.415919594438135 on episode 505 and timestep 505000

 (1.84 GB CUDA)


Updating model with average reward -11.111618050969064 on episode 510 and timestep 510000

 (1.84 GB CUDA)


Updating model with average reward -10.86419264424953 on episode 515 and timestep 515000

 (1.84 GB CUDA)


Updating model with average reward -7.7310030492261355 on episode 520 and timestep 520000

 (1.84 GB CUDA)


Updating model with average reward -5.84333940972659 on episode 525 and timestep 525000

 (1.84 GB CUDA)


Updating model with average reward -5.6742688058885635 on episode 530 and timestep 530000

 (1.84 GB CUDA)


Updating model with average reward -5.938145953436404 on episode 535 and timestep 535000

 (1.84 GB CUDA)


Updating model with average reward -6.059726064467817 on episode 540 and timestep 540000

 (1.84 GB CUDA)


Updating model with average reward -5.390146471552759 on episode 545 and timestep 545000

 (1.84 GB CUDA)


Updating model with average reward -5.150482372455776 on episode 550 and timestep 550000

 (1.84 GB CUDA)


Updating model with average reward -5.138583801909968 on episode 555 and timestep 555000

 (1.84 GB CUDA)


Updating model with average reward -5.152726130607814 on episode 560 and timestep 560000

 (1.84 GB CUDA)


Updating model with average reward -5.109961903067499 on episode 565 and timestep 565000

 (1.84 GB CUDA)


Updating model with average reward -5.42837537838769 on episode 570 and timestep 570000

 (1.84 GB CUDA)


Updating model with average reward -5.20334746860227 on episode 575 and timestep 575000

 (1.84 GB CUDA)


Updating model with average reward -5.351800757605136 on episode 580 and timestep 580000

 (1.84 GB CUDA)


Updating model with average reward -5.218373548769742 on episode 585 and timestep 585000

 (1.84 GB CUDA)


Advancing training to stage 7
Decaying std to 0.39999999999999997 on episode 589 and timestep 589000


Updating model with average reward -5.333351573022037 on episode 590 and timestep 590000

 (1.84 GB CUDA)


Updating model with average reward -6.778255011552587 on episode 595 and timestep 595000

 (1.84 GB CUDA)


Updating model with average reward -6.30175199424313 on episode 600 and timestep 600000

 (1.84 GB CUDA)


Updating model with average reward -7.685170701672867 on episode 605 and timestep 605000

 (1.84 GB CUDA)


Updating model with average reward -7.483269036755353 on episode 610 and timestep 610000

 (1.84 GB CUDA)


Updating model with average reward -10.038381953746036 on episode 615 and timestep 615000

 (1.84 GB CUDA)


Updating model with average reward -12.538861920259178 on episode 620 and timestep 620000

 (1.84 GB CUDA)


Updating model with average reward -13.704133693677575 on episode 625 and timestep 625000

 (1.84 GB CUDA)


Updating model with average reward -13.829688348166645 on episode 630 and timestep 630000

 (1.84 GB CUDA)


Advancing training to stage 8
Decaying std to 0.35 on episode 633 and timestep 633000


Updating model with average reward -19.58728083446881 on episode 635 and timestep 635000

 (1.84 GB CUDA)


Updating model with average reward -25.787675192118883 on episode 640 and timestep 640000

 (1.84 GB CUDA)


Updating model with average reward -20.75011000195354 on episode 645 and timestep 645000

 (1.84 GB CUDA)


Updating model with average reward -27.48413649053067 on episode 650 and timestep 650000

 (1.84 GB CUDA)


Updating model with average reward -29.601040708065987 on episode 655 and timestep 655000

 (1.84 GB CUDA)


Updating model with average reward -25.64412389472054 on episode 660 and timestep 660000

 (1.84 GB CUDA)


Updating model with average reward -22.00385346833095 on episode 665 and timestep 665000

 (1.84 GB CUDA)


Updating model with average reward -20.24297930950263 on episode 670 and timestep 670000

 (1.84 GB CUDA)


Updating model with average reward -16.755163912676096 on episode 675 and timestep 675000

 (1.84 GB CUDA)


Updating model with average reward -17.03976842506145 on episode 680 and timestep 680000

 (1.84 GB CUDA)


Updating model with average reward -14.952116297067343 on episode 685 and timestep 685000

 (1.84 GB CUDA)


Updating model with average reward -16.953737919374166 on episode 690 and timestep 690000

 (1.84 GB CUDA)


Updating model with average reward -15.914461164268822 on episode 695 and timestep 695000

 (1.84 GB CUDA)


Updating model with average reward -14.582468785744965 on episode 700 and timestep 700000

 (1.84 GB CUDA)


Updating model with average reward -15.546346342666805 on episode 705 and timestep 705000

 (1.84 GB CUDA)


Updating model with average reward -13.183835221581012 on episode 710 and timestep 710000

 (1.84 GB CUDA)


Updating model with average reward -13.19383310176602 on episode 715 and timestep 715000

 (1.84 GB CUDA)


Updating model with average reward -12.6697707370812 on episode 720 and timestep 720000

 (1.84 GB CUDA)


Updating model with average reward -12.625116043154419 on episode 725 and timestep 725000

 (1.84 GB CUDA)


Updating model with average reward -10.787695040001958 on episode 730 and timestep 730000

 (1.84 GB CUDA)


Updating model with average reward -11.30267589387536 on episode 735 and timestep 735000

 (1.84 GB CUDA)


Updating model with average reward -11.832542544986442 on episode 740 and timestep 740000

 (1.84 GB CUDA)


Updating model with average reward -12.251899414376677 on episode 745 and timestep 745000

 (1.84 GB CUDA)


Updating model with average reward -8.08748215738979 on episode 750 and timestep 750000

 (1.84 GB CUDA)


Updating model with average reward -11.719394545398593 on episode 755 and timestep 755000

 (1.84 GB CUDA)


Updating model with average reward -11.70419696477431 on episode 760 and timestep 760000

 (1.84 GB CUDA)


Updating model with average reward -11.807432777734697 on episode 765 and timestep 765000

 (1.84 GB CUDA)


Updating model with average reward -12.043643930234433 on episode 770 and timestep 770000

 (1.84 GB CUDA)


Updating model with average reward -11.838740572102248 on episode 775 and timestep 775000

 (1.84 GB CUDA)


Updating model with average reward -10.593112510543556 on episode 780 and timestep 780000

 (1.84 GB CUDA)


Updating model with average reward -10.509028247152418 on episode 785 and timestep 785000

 (1.84 GB CUDA)


Updating model with average reward -8.184810003987879 on episode 790 and timestep 790000

 (1.84 GB CUDA)


Updating model with average reward -7.840547834730238 on episode 795 and timestep 795000

 (1.84 GB CUDA)


Updating model with average reward -6.48766321342507 on episode 800 and timestep 800000

 (1.84 GB CUDA)


Updating model with average reward -5.759580464462012 on episode 805 and timestep 805000

 (1.84 GB CUDA)


Updating model with average reward -6.833442268452823 on episode 810 and timestep 810000

 (1.84 GB CUDA)


Updating model with average reward -7.419431463851303 on episode 815 and timestep 815000

 (1.84 GB CUDA)


Updating model with average reward -6.353537752172277 on episode 820 and timestep 820000

 (1.84 GB CUDA)


Updating model with average reward -6.29478213696006 on episode 825 and timestep 825000

 (1.84 GB CUDA)


Updating model with average reward -8.350443088573456 on episode 830 and timestep 830000

 (1.84 GB CUDA)


Updating model with average reward -7.886364068330556 on episode 835 and timestep 835000

 (1.84 GB CUDA)


Updating model with average reward -7.521026623786629 on episode 840 and timestep 840000

 (1.84 GB CUDA)
Advancing training to stage 9
Decaying std to 0.3 on episode 840 and timestep 840000


Updating model with average reward -7.370886019222707 on episode 845 and timestep 845000

 (1.84 GB CUDA)


Updating model with average reward -8.712226154088974 on episode 850 and timestep 850000

 (1.84 GB CUDA)


Updating model with average reward -9.175240665392637 on episode 855 and timestep 855000

 (1.84 GB CUDA)


Updating model with average reward -10.137140256930202 on episode 860 and timestep 860000

 (1.84 GB CUDA)


Updating model with average reward -10.221659603412599 on episode 865 and timestep 865000

 (1.84 GB CUDA)


Updating model with average reward -11.730524170288206 on episode 870 and timestep 870000

 (1.84 GB CUDA)


Updating model with average reward -11.561935556562096 on episode 875 and timestep 875000

 (1.84 GB CUDA)


Updating model with average reward -11.854204524522007 on episode 880 and timestep 880000

 (1.84 GB CUDA)


Ending early on episode 884 and timestep 884000

Reset Environment: 0.13335915002971888


Environment Setup: 29.903530121780932


Calculate Actions: 6436.811601095833


Step Environment: 745.1742371665314


Record Rewards: 238.9386536339298
Record Stats: 0.36428650841116905
Early Stopping: 10.651592061854899
Update Policy: 6361.596251037903
Total: 13823.573510776274
