In [1]:
## To Check
# Check whether rewards are normalized before or after the advantage is computed

## High Priority Training Changes
# Make backward (MAX_NODES, MAX_BATCH) batching work
# Add multithreading to the forward pass and distributed computation to the backward pass
# Add compatibility for env being on CPU, check for timing changes

## Backburner Priority Training Changes
# Add compatibility for cells with missing modalities (add mask to distance reward)
# Try imitation learning to better learn CT trajectories
# Add parallel envs of different sizes, with different data to help generality
# Fix off-center positioning in large environments
# Revise distance reward - Maybe add cell attraction (all should be close to each other) and repulsion (repulsion based on distance in modality)
# Revise velocity and action penalties to encourage early cell-type separation (i.e. sqrt of vec length or similar)

## Bookkeeping and QOL
# Save every time early stopping occurs
# Hook up sweeps API for wandb

In [2]:
# Original paper (pg 24)
# https://arxiv.org/pdf/1909.07528.pdf

# Original blog
# https://openai.com/research/emergent-tool-use

# Gym
# https://gymnasium.farama.org/

# Slides
# https://glouppe.github.io/info8004-advanced-machine-learning/pdf/pleroy-hide-and-seek.pdf

# PPO implementation
# https://github.com/nikhilbarhate99/PPO-PyTorch/blob/master/PPO.py#L38

# Residual SA
# https://github.com/openai/multi-agent-emergence-environments/blob/bafaf1e11e6398624116761f91ae7c93b136f395/ma_policy/layers.py#L89

In [3]:
# Notebook setup magics (IPython only; these lines are not plain Python).
# `autoreload 2` re-imports edited modules before each cell runs, so changes
# to local packages are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Environment variables for wandb, set before the library is imported below.
# NOTE(review): presumably these name the notebook for run tracking and
# silence wandb console output - confirm against the wandb documentation.
%env WANDB_NOTEBOOK_NAME train.ipynb
%env WANDB_SILENT true

env: WANDB_NOTEBOOK_NAME=train.ipynb
env: WANDB_SILENT=true


In [4]:
from collections import defaultdict
import os

import inept
import numpy as np
import pandas as pd
import torch
import wandb

# Set params
# Prefer the GPU whenever torch can see one, otherwise fall back to the CPU
DEVICE = 'cpu'
if torch.cuda.is_available():
    DEVICE = 'cuda'

# Data and checkpoint folders, both anchored at the current working directory
DATA_FOLDER, MODEL_FOLDER = (
    os.path.join(os.path.abspath(''), relative_path)
    for relative_path in ('../data', 'temp/trained_models')
)

# Script arguments
# import sys
# arg1 = int(sys.argv[1])

In [5]:
# Reproducibility
# Seed numpy and torch (plus the CUDA generator when training on GPU) with
# the same value so repeated runs draw the same random numbers
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
if DEVICE == 'cuda':
    torch.cuda.manual_seed(seed)

# Recorded alongside the run configuration
note_kwargs = dict(seed=seed)

### Load Data

In [6]:
# Dataset loading
# Every branch must define the modality matrices (`M1`, `M2`, ...) with one row
# per cell, and matching per-cell type labels (`T1`, `T2`, ...), all as numpy
# arrays.  The selected modalities are then normalized, reduced with PCA,
# subsampled to `num_nodes` cells and cast to float32 tensors on `DEVICE`.
dataset_name = 'BrainChromatin'

if dataset_name == 'scNMT':
    dataset_dir = os.path.join(DATA_FOLDER, 'UnionCom/scNMT')
    M1 = pd.read_csv(os.path.join(dataset_dir, 'Paccessibility_300.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(dataset_dir, 'Pmethylation_300.txt'), delimiter=' ', header=None).to_numpy()
    M3 = pd.read_csv(os.path.join(dataset_dir, 'RNA_300.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(dataset_dir, 'type1.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T2 = pd.read_csv(os.path.join(dataset_dir, 'type2.txt'), delimiter=' ', header=None).to_numpy().flatten()
    T3 = pd.read_csv(os.path.join(dataset_dir, 'type3.txt'), delimiter=' ', header=None).to_numpy().flatten()

elif dataset_name == 'BrainChromatin':
    nrows = None  # 2_000
    # Files are read and then transposed so that rows are indexed by cell
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_rna_counts.tsv'), delimiter='\t', nrows=nrows).transpose()  # 4.6 Gb in memory
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_atac_gene_activities.tsv'), delimiter='\t', nrows=nrows).transpose()  # 2.6 Gb in memory
    # Put ATAC rows in the same cell order as RNA.  Label-based row selection
    # avoids transposing the multi-Gb frame twice just to index by column.
    M2 = M2.loc[M1.index]
    meta = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cell_metadata.txt'), delimiter='\t')
    meta_names = pd.read_csv(os.path.join(DATA_FOLDER, 'brainchromatin/multiome_cluster_names.txt'), delimiter='\t')
    meta_names = meta_names[meta_names['Assay'] == 'Multiome ATAC']
    # Attach human-readable cluster names to each cell's ATAC cluster id
    meta = pd.merge(meta, meta_names, left_on='ATAC_cluster', right_on='Cluster.ID', how='left')
    meta.index = meta['Cell.ID']
    # Both modalities come from the same cells, so they share one label vector
    T1 = T2 = np.array(meta.loc[M1.index, 'Cluster.Name'])
    F1, F2 = M1.columns, M2.columns
    M1, M2 = M1.to_numpy(), M2.to_numpy()

    del meta, meta_names

elif dataset_name == 'scGEM':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/GeneExpression.txt'), delimiter=' ', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/DNAmethylation.txt'), delimiter=' ', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type1.txt'), delimiter=' ', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/type2.txt'), delimiter=' ', header=None).to_numpy()
    F1 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/gex_names.txt'), dtype='str')
    F2 = np.loadtxt(os.path.join(DATA_FOLDER, 'UnionCom/scGEM/dm_names.txt'), dtype='str')

# MMD-MA data
elif dataset_name == 'MMD-MA':
    M1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped1.txt'), delimiter='\t', header=None).to_numpy()
    M2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_mapped2.txt'), delimiter='\t', header=None).to_numpy()
    T1 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type1.txt'), delimiter='\t', header=None).to_numpy()
    T2 = pd.read_csv(os.path.join(DATA_FOLDER, 'UnionCom/MMD/s1_type2.txt'), delimiter='\t', header=None).to_numpy()

# Random data
elif dataset_name == 'Random':
    num_nodes = 100
    # Drawn with torch (same generator as before) but handed on as numpy
    # arrays, matching what every other branch produces
    M1 = torch.rand((num_nodes, 8), device=DEVICE).cpu().numpy()
    M2 = torch.rand((num_nodes, 16), device=DEVICE).cpu().numpy()
    # Random data has no cell types; use one placeholder label per node so
    # the `types` list below can still be built
    T1 = T2 = np.zeros(num_nodes, dtype=int)

# Raise explicitly: an `assert` would be stripped when Python runs with `-O`
else: raise ValueError('No matching dataset found.')

# Parameters
num_nodes = 100  # M1.shape[0]
modalities = [M1, M2]  # [[M1, M2, M3][1]]  # TODO: Make more flexible
types = [T1, T2]  # [[T1, T2, T3][1]]

# Modify data
modalities = inept.utilities.normalize(*modalities, keep_array=True)  # Normalize
modalities = inept.utilities.pca_features(*modalities, num_features=(512, 512), keep_array=True)  # PCA features (2 min for 8k x 35+k)
subsample = inept.utilities.subsample_nodes(*modalities, *types, num_nodes=num_nodes, keep_array=True)  # Subsample nodes
# `subsample_nodes` was given the modalities followed by the types; split its
# result back into the two groups by position
modalities, types = subsample[:len(modalities)], subsample[len(modalities):]
# modalities = inept.utilities.subsample_features(*modalities, num_features=(16, 16), keep_array=True)  # Subsample features

# Cast types
modalities = [torch.tensor(Mx, dtype=torch.float32, device=DEVICE) for Mx in modalities]

### Parameters

In [7]:
# Data parameters
data_kwargs = dict(dataset=dataset_name, num_nodes=num_nodes)

# Environment parameters
env_kwargs = dict(
    dim=32,  # 2 = (x, y, vx, vy), 3 = (x, y, z, vx, vy, vz)
    pos_bound=10,
    pos_rand_bound=1,
    vel_bound=1,
    delta=.1,
    # reward_distance=0,
    # reward_origin=0,
    # penalty_bound=0,
    # penalty_velocity=0,
    # penalty_action=0,
    reward_distance_type='euclidean',
)

# Environment weight stages
# One dict of reward/penalty weights per stage; the training loop starts the
# environment with entry 0 and applies the following entries one at a time
stages_kwargs = dict(
    env=(
        dict(penalty_bound=1),  # Stage 0
        dict(reward_origin=1),  # Stage 1
        dict(penalty_velocity=1, penalty_action=1),  # Stage 2
        dict(reward_origin=0, reward_distance=1),  # Stage 3
    ),
)

# Training parameters
max_ep_timesteps = 1e3
update_timesteps = 5 * max_ep_timesteps
max_timesteps = 1e3 * update_timesteps
MAX_BATCH = min(500, data_kwargs['num_nodes'])  # NOTE: value should be similar to update_minibatch, if a bit larger
MAX_NODES = min(50, data_kwargs['num_nodes'])  # Larger means smaller minibatches but a fuller picture for each agent
# TODO: Currently values other than `None` do not work with update
MAX_BATCH = None
MAX_NODES = None
train_kwargs = dict(
    max_ep_timesteps=max_ep_timesteps,
    max_timesteps=max_timesteps,
    update_timesteps=update_timesteps,
    max_batch=MAX_BATCH,  # Max number of nodes to calculate actions for at a time
    max_nodes=MAX_NODES,  # Max number of nodes to use as neighbors in action calculation
)

# Policy parameters
# num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
# GPU_MEMORY = 6; CPU_MEMORY = 16  # Optimized for 6Gb VRAM and 16Gb RAM
# MAX_GPU_RUN_SAMPLES = int( .8 * (GPU_MEMORY / 6) * 1e4 * (2000 / sum(M.shape[1] for M in modalities)) * (20 / num_train_nodes) )
# GPU_STORE_SAMPLES = int( 2 * MAX_GPU_RUN_SAMPLES )  # 3
# MAX_CPU_SAMPLES = int( (CPU_MEMORY / GPU_MEMORY) * MAX_GPU_RUN_SAMPLES )
# IDEAL_BATCH_SIZE = int( max_ep_timesteps )
update_maxbatch = None  # `MAX_CPU_SAMPLES`, `None` takes slightly longer but is more reliable
update_batch = 1_000  # Same or larger size as `update_maxbatch` skips GPU cast step inside epoch loop
update_minibatch = 1_000
policy_kwargs = dict(
    # Main arguments
    num_features_per_node=2 * env_kwargs['dim'],
    modal_sizes=[modality.shape[1] for modality in modalities],
    output_dim=env_kwargs['dim'],
    action_std_init=.6,
    action_std_decay=.05,
    action_std_min=.3,  # .1
    epochs=80,
    epsilon_clip=.2,
    memory_gamma=.95,
    memory_prune=100,
    actor_lr=3e-4,
    critic_lr=1e-3,
    lr_gamma=1,
    update_maxbatch=update_maxbatch,  # Batch to load into RAM
    update_batch=update_batch,  # Batch to load into VRAM
    update_minibatch=update_minibatch,  # Batch to compute
    device=DEVICE,
    # Layer arguments
    embed_dim=64,
    feature_embed_dim=32,
    rs_nset=1e5,  # Inversely proportional to influence of individual reward on moving statistics
)

# Early stopping parameters
# Number of full-length episodes between two policy updates ("training cycle")
episodes_per_update = int(train_kwargs['update_timesteps'] / train_kwargs['max_ep_timesteps'])
es_kwargs = dict(
    # Global parameters
    method='average',
    buffer=6 * episodes_per_update,  # 6 training cycles
    delta=.01,
    decreasing=False,
    # `average` method parameters
    window_size=3 * episodes_per_update,  # 3 training cycles
)

### Train Policy

In [8]:
# Tracking parameters
# Use `watch -d -n 0.5 nvidia-smi` to watch CUDA memory usage
# Use `top` to watch system memory usage
# Run script and put following above function to profile
#    from memory_profiler import profile
#    @profile
# Use cProfiler to profile timing:
#    python -m cProfile -s time -o profile.prof train.py
#    snakeviz profile.prof
use_wandb = True

# Peak-memory statistics are only queried/reset when training on a GPU; the
# unguarded `torch.cuda` calls fail on a CPU-only setup
track_cuda_memory = DEVICE == 'cuda'

# Checkpoints are written to `MODEL_FOLDER`; create it up front so a long run
# does not fail at its first save
os.makedirs(MODEL_FOLDER, exist_ok=True)

# Initialize classes
env = inept.environments.trajectory(*modalities, **env_kwargs, **stages_kwargs['env'][0], device=DEVICE)  # Set to first stage
policy = inept.models.PPO(**policy_kwargs).train()
early_stopping = inept.utilities.EarlyStopping(**es_kwargs)

# Initialize wandb
# Every kwargs group is flattened into the run config under its own prefix
if use_wandb:
    wandb.init(
        project='INEPT',
        config={
            **{'note/'+k:v for k, v in note_kwargs.items()},
            **{'data/'+k:v for k, v in data_kwargs.items()},
            **{'env/'+k:v for k, v in env_kwargs.items()},
            **{'stages/'+k:v for k, v in stages_kwargs.items()},
            **{'policy/'+k:v for k, v in policy_kwargs.items()},
            **{'train/'+k:v for k, v in train_kwargs.items()},
            **{'es/'+k:v for k, v in es_kwargs.items()},
        },
    )

# Initialize logging vars
if track_cuda_memory: torch.cuda.reset_peak_memory_stats()
timer = inept.utilities.time_logger(discard_first_sample=True)
timestep = 0; episode = 1; stage = 0

# CLI
print('Beginning training')
num_train_nodes = data_kwargs['num_nodes'] if train_kwargs['max_nodes'] is None else min(data_kwargs['num_nodes'], train_kwargs['max_nodes'])
print(
    f'Training using {num_train_nodes} nodes out of a'
    f' total {data_kwargs["num_nodes"]} with batches of'
    f' size {train_kwargs["max_batch"]}.'
)
update_maxbatch_print = (
    policy_kwargs["update_maxbatch"]
    if policy_kwargs["update_maxbatch"] is not None else
    'all'
)
print(
    f'Training on {update_maxbatch_print} states'
    f' with batches of size {policy_kwargs["update_batch"]}'
    f' and minibatches of size {policy_kwargs["update_minibatch"]}'
    f' from {int(train_kwargs["update_timesteps"] * data_kwargs["num_nodes"])} total.')

# Simulation loop
# Outer loop: one iteration per episode, until the global timestep budget is
# used up or training ends early at the minimum `action_std`
while timestep < train_kwargs['max_timesteps']:
    # Reset environment
    env.reset()
    timer.log('Reset Environment')

    # Start episode
    # Per-episode accumulators for the mean reward and its itemized parts
    ep_timestep = 0; ep_reward = 0; ep_itemized_reward = defaultdict(int)
    while ep_timestep < train_kwargs['max_ep_timesteps']:
        # Rollouts need no gradients; `policy.update()` below runs outside this context
        with torch.no_grad():
            # Get current state
            state = env.get_state(include_modalities=True)
            timer.log('Environment Setup')

            # Get actions from policy
            actions = policy.act_macro(
                state,
                keys=list(range(num_nodes)),
                max_batch=train_kwargs['max_batch'],
                max_nodes=train_kwargs['max_nodes'],
            ).detach()
            timer.log('Calculate Actions')

            # Step environment and get reward
            rewards, finished, itemized_rewards = env.step(actions, return_rewards=True)
            # The last allowed step of an episode is treated as terminal as well
            finished = finished or (ep_timestep == train_kwargs['max_ep_timesteps']-1)  # Maybe move logic inside env?
            timer.log('Step Environment')

            # Record rewards for policy
            policy.memory.record(
                rewards=rewards.cpu().tolist(),
                is_terminals=finished,
            )

            # Record rewards for logging
            ep_reward = ep_reward + rewards.cpu().mean()
            for k, v in itemized_rewards.items():
                ep_itemized_reward[k] += v.cpu().mean()
            timer.log('Record Rewards')

        # Iterate
        timestep += 1
        ep_timestep += 1

        # Update model
        # Triggered on the global timestep count, independent of episode boundaries
        if timestep % train_kwargs['update_timesteps'] == 0:
            # The status line is left open (`end=''`) and completed after the update
            print(f'Updating model with average reward {np.mean(policy.memory.storage["rewards"])} on episode {episode} and timestep {timestep}', end='')
            policy.update()
            if track_cuda_memory:
                print(f' ({torch.cuda.max_memory_allocated() / 1024**3:.2f} GB CUDA)')
                torch.cuda.reset_peak_memory_stats()
            else:
                print()  # No CUDA statistics on CPU; just terminate the line
            timer.log('Update Policy')

        # Escape if finished
        if finished: break

    # Upload stats
    # Convert the summed per-step means into a per-step average for the episode
    ep_reward = (ep_reward / ep_timestep).item()
    update = int(timestep / train_kwargs['update_timesteps'])
    if use_wandb:
        wandb.log({
            **{
            # Measurements
            'end_timestep': timestep,
            'episode': episode,
            'update': update,
            'stage': stage,
            # Parameters
            'action_std': policy.action_std,
            # Outputs
            'average_reward': ep_reward,
            },
            **{'rewards/'+k: (v / ep_timestep).item() for k, v in ep_itemized_reward.items()},
        })
    timer.log('Record Stats')

    # Checkpoint, then end training, advance the reward stage or decay the
    # action std.  Runs when the early-stopping criterion fires on this
    # episode's reward or when the timestep budget is exhausted.
    if early_stopping(ep_reward) or timestep >= train_kwargs['max_timesteps']:
        # Save model
        wgt_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.wgt')
        torch.save(policy.state_dict(), wgt_file)  # Save just weights
        if use_wandb: wandb.save(wgt_file)
        mdl_file = os.path.join(MODEL_FOLDER, f'policy_{stage:02}.mdl')
        torch.save(policy, mdl_file)  # Save whole model
        if use_wandb: wandb.save(mdl_file)

        # End if maximum timesteps reached
        # Break here so the final checkpoint is not followed by a spurious
        # stage advance or std decay that would never be trained on
        if timestep >= train_kwargs['max_timesteps']:
            print('Maximal timesteps reached')
            break

        # End if at minimum `action_std`
        if policy.action_std <= policy.action_std_min:
            print(f'Ending early on episode {episode} and timestep {timestep}')
            break

        # Activate next stage or decay
        stage += 1
        # CLI
        print(f'Advancing training to stage {stage}')
        if stage < len(stages_kwargs['env']):
            # Activate next stage
            env.set_rewards(stages_kwargs['env'][stage])
        else:
            # Decay policy randomness
            policy.decay_action_std()
            # CLI
            print(f'Decaying std to {policy.action_std} on episode {episode} and timestep {timestep}')

        # Reset early stopping
        early_stopping.reset()
    timer.log('Early Stopping')

    # Iterate
    episode += 1

# CLI Timer
print()
timer.aggregate('sum')

# Finish wandb
if use_wandb: wandb.finish()

Beginning training
Training using 100 nodes out of a total 100 with batches of size None.
Training on all states with batches of size 1000 and minibatches of size 1000 from 500000 total.


Updating model with average reward -1.76972 on episode 5 and timestep 5000

 (1.81 GB CUDA)


Updating model with average reward -1.76868 on episode 10 and timestep 10000

 (1.81 GB CUDA)


Updating model with average reward -1.767284 on episode 15 and timestep 15000

 (1.81 GB CUDA)


Updating model with average reward -1.767596 on episode 20 and timestep 20000

 (1.81 GB CUDA)


Updating model with average reward -1.770804 on episode 25 and timestep 25000

 (1.81 GB CUDA)


Updating model with average reward -1.76954 on episode 30 and timestep 30000

 (1.81 GB CUDA)


Updating model with average reward -1.767636 on episode 35 and timestep 35000

 (1.81 GB CUDA)


Updating model with average reward -1.761024 on episode 40 and timestep 40000

 (1.81 GB CUDA)


Advancing training to stage 1


Updating model with average reward -2.4431372834939955 on episode 45 and timestep 45000

 (1.81 GB CUDA)


Updating model with average reward -5.150358582660675 on episode 50 and timestep 50000

 (1.81 GB CUDA)


Updating model with average reward -5.22037965877533 on episode 55 and timestep 55000

 (1.81 GB CUDA)


Updating model with average reward -5.178675681824684 on episode 60 and timestep 60000

 (1.81 GB CUDA)


Updating model with average reward -5.174409704520225 on episode 65 and timestep 65000

 (1.81 GB CUDA)


Updating model with average reward -5.158429193963051 on episode 70 and timestep 70000

 (1.81 GB CUDA)


Updating model with average reward -5.095199993280411 on episode 75 and timestep 75000

 (1.81 GB CUDA)


Updating model with average reward -5.099091115325928 on episode 80 and timestep 80000

 (1.81 GB CUDA)


Updating model with average reward -5.040901105064392 on episode 85 and timestep 85000

 (1.81 GB CUDA)


Updating model with average reward -4.899419071359635 on episode 90 and timestep 90000

 (1.81 GB CUDA)


Updating model with average reward -4.785427339237213 on episode 95 and timestep 95000

 (1.81 GB CUDA)


Updating model with average reward -4.801900182310105 on episode 100 and timestep 100000

 (1.81 GB CUDA)


Updating model with average reward -4.803137774448395 on episode 105 and timestep 105000

 (1.81 GB CUDA)


Updating model with average reward -4.742333192028999 on episode 110 and timestep 110000

 (1.81 GB CUDA)


Updating model with average reward -4.709117849959373 on episode 115 and timestep 115000

 (1.81 GB CUDA)


Updating model with average reward -4.648782464631081 on episode 120 and timestep 120000

 (1.81 GB CUDA)


Updating model with average reward -4.6006415150108335 on episode 125 and timestep 125000

 (1.81 GB CUDA)


Updating model with average reward -4.486319894997597 on episode 130 and timestep 130000

 (1.81 GB CUDA)


Updating model with average reward -4.38367129315567 on episode 135 and timestep 135000

 (1.81 GB CUDA)


Updating model with average reward -4.313249837913514 on episode 140 and timestep 140000

 (1.81 GB CUDA)


Updating model with average reward -4.375335452681542 on episode 145 and timestep 145000

 (1.81 GB CUDA)


Updating model with average reward -4.420805288287163 on episode 150 and timestep 150000

 (1.81 GB CUDA)


Updating model with average reward -4.5000155507507325 on episode 155 and timestep 155000

 (1.81 GB CUDA)


Updating model with average reward -4.560484150821686 on episode 160 and timestep 160000

 (1.81 GB CUDA)


Updating model with average reward -4.431016390799522 on episode 165 and timestep 165000

 (1.81 GB CUDA)


Updating model with average reward -4.286881664196015 on episode 170 and timestep 170000

 (1.81 GB CUDA)


Updating model with average reward -4.131248676699639 on episode 175 and timestep 175000

 (1.81 GB CUDA)


Updating model with average reward -3.9260034138679503 on episode 180 and timestep 180000

 (1.81 GB CUDA)


Updating model with average reward -3.7378350486688614 on episode 185 and timestep 185000

 (1.81 GB CUDA)


Updating model with average reward -3.675259731898308 on episode 190 and timestep 190000

 (1.81 GB CUDA)


Updating model with average reward -3.5641274983882902 on episode 195 and timestep 195000

 (1.81 GB CUDA)


Updating model with average reward -3.563867688373566 on episode 200 and timestep 200000

 (1.81 GB CUDA)


Updating model with average reward -3.3869495948114396 on episode 205 and timestep 205000

 (1.81 GB CUDA)


Updating model with average reward -3.204677615692139 on episode 210 and timestep 210000

 (1.81 GB CUDA)


Updating model with average reward -3.044422238823891 on episode 215 and timestep 215000

 (1.81 GB CUDA)


Updating model with average reward -3.017860737736702 on episode 220 and timestep 220000

 (1.81 GB CUDA)


Updating model with average reward -2.8830821204738615 on episode 225 and timestep 225000

 (1.81 GB CUDA)


Updating model with average reward -2.702534335971832 on episode 230 and timestep 230000

 (1.81 GB CUDA)


Updating model with average reward -2.702737649468422 on episode 235 and timestep 235000

 (1.81 GB CUDA)


Updating model with average reward -2.6895892842540743 on episode 240 and timestep 240000

 (1.81 GB CUDA)


Updating model with average reward -2.703547066637993 on episode 245 and timestep 245000

 (1.81 GB CUDA)


Updating model with average reward -2.4567613436050415 on episode 250 and timestep 250000

 (1.81 GB CUDA)


Updating model with average reward -2.5230433129673004 on episode 255 and timestep 255000

 (1.81 GB CUDA)


Updating model with average reward -2.732683615620613 on episode 260 and timestep 260000

 (1.81 GB CUDA)


Updating model with average reward -2.77439728888607 on episode 265 and timestep 265000

 (1.81 GB CUDA)


Updating model with average reward -2.980037393208504 on episode 270 and timestep 270000

 (1.81 GB CUDA)


Updating model with average reward -2.8509595416555404 on episode 275 and timestep 275000

 (1.81 GB CUDA)


Updating model with average reward -2.6628367605781555 on episode 280 and timestep 280000

 (1.81 GB CUDA)


Advancing training to stage 2


Updating model with average reward -3.768447155842036 on episode 285 and timestep 285000

 (1.81 GB CUDA)


Updating model with average reward -9.158133687633068 on episode 290 and timestep 290000

 (1.81 GB CUDA)


Updating model with average reward -8.939847342014478 on episode 295 and timestep 295000

 (1.81 GB CUDA)


Updating model with average reward -8.781898157962992 on episode 300 and timestep 300000

 (1.81 GB CUDA)


Updating model with average reward -8.54945520001401 on episode 305 and timestep 305000

 (1.81 GB CUDA)


Updating model with average reward -8.348422226926967 on episode 310 and timestep 310000

 (1.81 GB CUDA)


Updating model with average reward -8.270895623290345 on episode 315 and timestep 315000

 (1.81 GB CUDA)


Updating model with average reward -8.270217949811057 on episode 320 and timestep 320000

 (1.81 GB CUDA)


Updating model with average reward -8.404898813183204 on episode 325 and timestep 325000

 (1.81 GB CUDA)


Updating model with average reward -8.782882794228717 on episode 330 and timestep 330000

 (1.81 GB CUDA)


Updating model with average reward -9.009695087511316 on episode 335 and timestep 335000

 (1.81 GB CUDA)


Updating model with average reward -8.990984476755052 on episode 340 and timestep 340000

 (1.81 GB CUDA)


Updating model with average reward -8.83658692575036 on episode 345 and timestep 345000

 (1.81 GB CUDA)


Updating model with average reward -8.811801310954525 on episode 350 and timestep 350000

 (1.81 GB CUDA)
Advancing training to stage 3


Updating model with average reward -25.619220543592572 on episode 355 and timestep 355000

 (1.81 GB CUDA)


Updating model with average reward -26.337432921039344 on episode 360 and timestep 360000

 (1.81 GB CUDA)


Updating model with average reward -24.873488533540904 on episode 365 and timestep 365000

 (1.81 GB CUDA)


Updating model with average reward -21.485412589577525 on episode 370 and timestep 370000

 (1.81 GB CUDA)


Updating model with average reward -20.18803994704053 on episode 375 and timestep 375000

 (1.81 GB CUDA)


Updating model with average reward -18.018847217302383 on episode 380 and timestep 380000

 (1.81 GB CUDA)


Updating model with average reward -16.942693016330928 on episode 385 and timestep 385000

 (1.81 GB CUDA)


Updating model with average reward -15.869268755369097 on episode 390 and timestep 390000

 (1.81 GB CUDA)


Updating model with average reward -15.61686473682034 on episode 395 and timestep 395000

 (1.81 GB CUDA)


Updating model with average reward -14.635548849535853 on episode 400 and timestep 400000

 (1.81 GB CUDA)


Updating model with average reward -13.86498855302608 on episode 405 and timestep 405000

 (1.81 GB CUDA)


Updating model with average reward -12.231202721424237 on episode 410 and timestep 410000

 (1.81 GB CUDA)


Updating model with average reward -11.24128080898583 on episode 415 and timestep 415000

 (1.81 GB CUDA)


Updating model with average reward -11.161594948764353 on episode 420 and timestep 420000

 (1.81 GB CUDA)


Updating model with average reward -10.932026160179392 on episode 425 and timestep 425000

 (1.81 GB CUDA)


Updating model with average reward -10.747592022878498 on episode 430 and timestep 430000

 (1.81 GB CUDA)


Updating model with average reward -10.156570855002702 on episode 435 and timestep 435000

 (1.81 GB CUDA)


Updating model with average reward -10.032548719214827 on episode 440 and timestep 440000

 (1.81 GB CUDA)


Updating model with average reward -9.70410887329285 on episode 445 and timestep 445000

 (1.81 GB CUDA)


Updating model with average reward -9.71428098711802 on episode 450 and timestep 450000

 (1.81 GB CUDA)


Updating model with average reward -9.558072767733618 on episode 455 and timestep 455000

 (1.81 GB CUDA)


Updating model with average reward -9.288522725422979 on episode 460 and timestep 460000

 (1.81 GB CUDA)


Updating model with average reward -9.325356439290852 on episode 465 and timestep 465000

 (1.81 GB CUDA)


Updating model with average reward -9.197955506757305 on episode 470 and timestep 470000

 (1.81 GB CUDA)


Updating model with average reward -9.292178949480235 on episode 475 and timestep 475000

 (1.81 GB CUDA)


Updating model with average reward -8.846236029228152 on episode 480 and timestep 480000

 (1.81 GB CUDA)


Updating model with average reward -8.478872188887895 on episode 485 and timestep 485000

 (1.81 GB CUDA)


Updating model with average reward -8.46416358209753 on episode 490 and timestep 490000

 (1.81 GB CUDA)


Updating model with average reward -8.360574474501014 on episode 495 and timestep 495000

 (1.81 GB CUDA)


Updating model with average reward -8.187626521309435 on episode 500 and timestep 500000

 (1.81 GB CUDA)


Updating model with average reward -7.801966007236719 on episode 505 and timestep 505000

 (1.81 GB CUDA)


Updating model with average reward -7.629791135190934 on episode 510 and timestep 510000

 (1.81 GB CUDA)


Updating model with average reward -7.573148320644125 on episode 515 and timestep 515000

 (1.81 GB CUDA)


Updating model with average reward -7.418398756962955 on episode 520 and timestep 520000

 (1.81 GB CUDA)


Updating model with average reward -7.546788020700902 on episode 525 and timestep 525000

 (1.81 GB CUDA)


Updating model with average reward -7.327877332996458 on episode 530 and timestep 530000

 (1.81 GB CUDA)


Updating model with average reward -7.4489096690110115 on episode 535 and timestep 535000

 (1.81 GB CUDA)


Updating model with average reward -7.1561164325272735 on episode 540 and timestep 540000

 (1.81 GB CUDA)


Updating model with average reward -6.985171889607474 on episode 545 and timestep 545000

 (1.81 GB CUDA)


Updating model with average reward -6.755367001061782 on episode 550 and timestep 550000

 (1.81 GB CUDA)


Updating model with average reward -6.690079194410696 on episode 555 and timestep 555000

 (1.81 GB CUDA)


Updating model with average reward -6.498531914248824 on episode 560 and timestep 560000

 (1.81 GB CUDA)


Updating model with average reward -6.379817281716183 on episode 565 and timestep 565000

 (1.81 GB CUDA)


Updating model with average reward -6.347995465930462 on episode 570 and timestep 570000

 (1.81 GB CUDA)


Updating model with average reward -6.248638497124076 on episode 575 and timestep 575000

 (1.81 GB CUDA)


Updating model with average reward -6.121763787551701 on episode 580 and timestep 580000

 (1.81 GB CUDA)


Updating model with average reward -6.150534900045842 on episode 585 and timestep 585000

 (1.81 GB CUDA)


Updating model with average reward -6.109722024214118 on episode 590 and timestep 590000

 (1.81 GB CUDA)


Updating model with average reward -6.055951465284869 on episode 595 and timestep 595000

 (1.81 GB CUDA)


Updating model with average reward -6.015313597943887 on episode 600 and timestep 600000

 (1.81 GB CUDA)


Updating model with average reward -6.021991432378844 on episode 605 and timestep 605000

 (1.81 GB CUDA)


Updating model with average reward -5.998513194571257 on episode 610 and timestep 610000

 (1.81 GB CUDA)


Updating model with average reward -5.937068458719566 on episode 615 and timestep 615000

 (1.81 GB CUDA)


Updating model with average reward -6.027970104200736 on episode 620 and timestep 620000

 (1.81 GB CUDA)


Updating model with average reward -5.885186056242123 on episode 625 and timestep 625000

 (1.81 GB CUDA)


Updating model with average reward -5.894890726028234 on episode 630 and timestep 630000

 (1.81 GB CUDA)


Updating model with average reward -5.839905601200834 on episode 635 and timestep 635000

 (1.81 GB CUDA)


Updating model with average reward -5.815822807002157 on episode 640 and timestep 640000

 (1.81 GB CUDA)


Updating model with average reward -5.82433473435694 on episode 645 and timestep 645000

 (1.81 GB CUDA)


Updating model with average reward -5.858246704496697 on episode 650 and timestep 650000

 (1.81 GB CUDA)


Updating model with average reward -5.8434623042209894 on episode 655 and timestep 655000

 (1.81 GB CUDA)


Updating model with average reward -5.912718244079336 on episode 660 and timestep 660000

 (1.81 GB CUDA)


Updating model with average reward -5.81451218303676 on episode 665 and timestep 665000

 (1.81 GB CUDA)


Updating model with average reward -5.83293299982275 on episode 670 and timestep 670000

 (1.81 GB CUDA)


Updating model with average reward -5.740551555781588 on episode 675 and timestep 675000

 (1.81 GB CUDA)


Updating model with average reward -5.5940144651530535 on episode 680 and timestep 680000

 (1.81 GB CUDA)


Updating model with average reward -5.436743384497717 on episode 685 and timestep 685000

 (1.81 GB CUDA)


Updating model with average reward -5.2682852411962005 on episode 690 and timestep 690000

 (1.81 GB CUDA)


Updating model with average reward -5.245745052311644 on episode 695 and timestep 695000

 (1.81 GB CUDA)


Updating model with average reward -5.305336429720178 on episode 700 and timestep 700000

 (1.81 GB CUDA)


Updating model with average reward -5.184778140163362 on episode 705 and timestep 705000

 (1.81 GB CUDA)


Updating model with average reward -5.065645605673001 on episode 710 and timestep 710000

 (1.81 GB CUDA)


Updating model with average reward -5.0418129081109315 on episode 715 and timestep 715000

 (1.81 GB CUDA)


Updating model with average reward -4.911776344461262 on episode 720 and timestep 720000

 (1.81 GB CUDA)


Updating model with average reward -4.791738417405307 on episode 725 and timestep 725000

 (1.81 GB CUDA)


Updating model with average reward -4.622030040626809 on episode 730 and timestep 730000

 (1.81 GB CUDA)


Updating model with average reward -4.397344840708986 on episode 735 and timestep 735000

 (1.81 GB CUDA)


Updating model with average reward -4.336976519739851 on episode 740 and timestep 740000

 (1.81 GB CUDA)


Updating model with average reward -4.225397724000961 on episode 745 and timestep 745000

 (1.81 GB CUDA)


Updating model with average reward -4.2277043946090345 on episode 750 and timestep 750000

 (1.81 GB CUDA)


Updating model with average reward -4.075741080918685 on episode 755 and timestep 755000

 (1.81 GB CUDA)


Updating model with average reward -3.9615312079446614 on episode 760 and timestep 760000

 (1.81 GB CUDA)


Updating model with average reward -3.9706332820732744 on episode 765 and timestep 765000

 (1.81 GB CUDA)


Updating model with average reward -3.8578693515601157 on episode 770 and timestep 770000

 (1.81 GB CUDA)


Updating model with average reward -3.9080739200318755 on episode 775 and timestep 775000

 (1.81 GB CUDA)


Updating model with average reward -3.796719055732861 on episode 780 and timestep 780000

 (1.81 GB CUDA)


Updating model with average reward -3.7121953514299544 on episode 785 and timestep 785000

 (1.81 GB CUDA)


Updating model with average reward -3.6541287043349 on episode 790 and timestep 790000

 (1.81 GB CUDA)


Updating model with average reward -3.6337283631887285 on episode 795 and timestep 795000

 (1.81 GB CUDA)


Updating model with average reward -3.5628262599420846 on episode 800 and timestep 800000

 (1.81 GB CUDA)


Updating model with average reward -3.548137740189582 on episode 805 and timestep 805000

 (1.81 GB CUDA)


Updating model with average reward -3.516812096701354 on episode 810 and timestep 810000

 (1.81 GB CUDA)


Updating model with average reward -3.4826105055125804 on episode 815 and timestep 815000

 (1.81 GB CUDA)


Updating model with average reward -3.4267073004790545 on episode 820 and timestep 820000

 (1.81 GB CUDA)


Updating model with average reward -3.3881964621133207 on episode 825 and timestep 825000

 (1.81 GB CUDA)


Updating model with average reward -3.3562561181787105 on episode 830 and timestep 830000

 (1.81 GB CUDA)


Updating model with average reward -3.328642033213139 on episode 835 and timestep 835000

 (1.81 GB CUDA)


Updating model with average reward -3.2924462415670006 on episode 840 and timestep 840000

 (1.81 GB CUDA)


Updating model with average reward -3.2802094557349233 on episode 845 and timestep 845000

 (1.81 GB CUDA)


Updating model with average reward -3.2439574870005847 on episode 850 and timestep 850000

 (1.81 GB CUDA)


Updating model with average reward -3.2207313420725763 on episode 855 and timestep 855000

 (1.81 GB CUDA)


Updating model with average reward -3.2129239054817855 on episode 860 and timestep 860000

 (1.81 GB CUDA)


Updating model with average reward -3.2110074577243477 on episode 865 and timestep 865000

 (1.81 GB CUDA)


Updating model with average reward -3.2116005299370887 on episode 870 and timestep 870000

 (1.81 GB CUDA)


Updating model with average reward -3.196375015575737 on episode 875 and timestep 875000

 (1.81 GB CUDA)


Updating model with average reward -3.2051606497854888 on episode 880 and timestep 880000

 (1.81 GB CUDA)


Updating model with average reward -3.193183948107943 on episode 885 and timestep 885000

 (1.81 GB CUDA)


Updating model with average reward -3.1675517600808143 on episode 890 and timestep 890000

 (1.81 GB CUDA)


Updating model with average reward -3.1464090798748883 on episode 895 and timestep 895000

 (1.81 GB CUDA)


Updating model with average reward -3.137149278338626 on episode 900 and timestep 900000

 (1.81 GB CUDA)


Updating model with average reward -3.1346006026678084 on episode 905 and timestep 905000

 (1.81 GB CUDA)


Updating model with average reward -3.115123199907005 on episode 910 and timestep 910000

 (1.81 GB CUDA)


Updating model with average reward -3.120644033807352 on episode 915 and timestep 915000

 (1.81 GB CUDA)


Updating model with average reward -3.1090119459898173 on episode 920 and timestep 920000

 (1.81 GB CUDA)


Updating model with average reward -3.1149432015964837 on episode 925 and timestep 925000

 (1.81 GB CUDA)


Updating model with average reward -3.1101760830863117 on episode 930 and timestep 930000

 (1.81 GB CUDA)


Updating model with average reward -3.105602011617258 on episode 935 and timestep 935000

 (1.81 GB CUDA)


Updating model with average reward -3.0962033948744683 on episode 940 and timestep 940000

 (1.81 GB CUDA)


Updating model with average reward -3.088176246038422 on episode 945 and timestep 945000

 (1.81 GB CUDA)


Updating model with average reward -3.08108169516705 on episode 950 and timestep 950000

 (1.81 GB CUDA)


Updating model with average reward -3.05982871178028 on episode 955 and timestep 955000

 (1.81 GB CUDA)


Updating model with average reward -3.056361473531455 on episode 960 and timestep 960000

 (1.81 GB CUDA)


Updating model with average reward -3.0478904657449575 on episode 965 and timestep 965000

 (1.81 GB CUDA)


Updating model with average reward -3.0350538149439097 on episode 970 and timestep 970000

 (1.81 GB CUDA)


Updating model with average reward -3.0398697145787774 on episode 975 and timestep 975000

 (1.81 GB CUDA)


Updating model with average reward -3.0251730132381023 on episode 980 and timestep 980000

 (1.81 GB CUDA)


Updating model with average reward -3.0199652675252406 on episode 985 and timestep 985000

 (1.81 GB CUDA)


Updating model with average reward -3.0056181499403865 on episode 990 and timestep 990000

 (1.81 GB CUDA)


Updating model with average reward -3.012559822516784 on episode 995 and timestep 995000

 (1.81 GB CUDA)


Updating model with average reward -3.00849589865458 on episode 1000 and timestep 1000000

 (1.81 GB CUDA)


Updating model with average reward -2.994779666840926 on episode 1005 and timestep 1005000

 (1.81 GB CUDA)


Updating model with average reward -2.988242291181609 on episode 1010 and timestep 1010000

 (1.81 GB CUDA)


Updating model with average reward -2.970340549138114 on episode 1015 and timestep 1015000

 (1.81 GB CUDA)


Updating model with average reward -2.97201374612315 on episode 1020 and timestep 1020000

 (1.81 GB CUDA)


Updating model with average reward -2.9642674088595955 on episode 1025 and timestep 1025000

 (1.81 GB CUDA)


Updating model with average reward -2.972123230958149 on episode 1030 and timestep 1030000

 (1.81 GB CUDA)


Updating model with average reward -2.9575400421964377 on episode 1035 and timestep 1035000

 (1.81 GB CUDA)


Updating model with average reward -2.9605220004434734 on episode 1040 and timestep 1040000

 (1.81 GB CUDA)


Updating model with average reward -2.9512009718681127 on episode 1045 and timestep 1045000

 (1.81 GB CUDA)


Updating model with average reward -2.9494771164064852 on episode 1050 and timestep 1050000

 (1.81 GB CUDA)


Updating model with average reward -2.9313811539388746 on episode 1055 and timestep 1055000

 (1.81 GB CUDA)


Updating model with average reward -2.9269123560654373 on episode 1060 and timestep 1060000

 (1.81 GB CUDA)


Updating model with average reward -2.9371511374212353 on episode 1065 and timestep 1065000

 (1.81 GB CUDA)


Updating model with average reward -2.9369615535828473 on episode 1070 and timestep 1070000

 (1.81 GB CUDA)


Updating model with average reward -2.948083761536971 on episode 1075 and timestep 1075000

 (1.81 GB CUDA)


Updating model with average reward -2.9218244066331533 on episode 1080 and timestep 1080000

 (1.81 GB CUDA)


Updating model with average reward -2.9330627744358333 on episode 1085 and timestep 1085000

 (1.81 GB CUDA)


Updating model with average reward -2.9144522734508365 on episode 1090 and timestep 1090000

 (1.81 GB CUDA)


Updating model with average reward -2.910293200655341 on episode 1095 and timestep 1095000

 (1.81 GB CUDA)


Updating model with average reward -2.9153026361364125 on episode 1100 and timestep 1100000

 (1.81 GB CUDA)


Updating model with average reward -2.9136308174154015 on episode 1105 and timestep 1105000

 (1.81 GB CUDA)


Updating model with average reward -2.9038371841805577 on episode 1110 and timestep 1110000

 (1.81 GB CUDA)


Updating model with average reward -2.900569365838334 on episode 1115 and timestep 1115000

 (1.81 GB CUDA)


Updating model with average reward -2.890039485819057 on episode 1120 and timestep 1120000

 (1.81 GB CUDA)


Updating model with average reward -2.898982989787787 on episode 1125 and timestep 1125000

 (1.81 GB CUDA)


Updating model with average reward -2.9018899434908034 on episode 1130 and timestep 1130000

 (1.81 GB CUDA)


Updating model with average reward -2.9012393527471425 on episode 1135 and timestep 1135000

 (1.81 GB CUDA)


Updating model with average reward -2.9012343093269912 on episode 1140 and timestep 1140000

 (1.81 GB CUDA)


Updating model with average reward -2.8982776759024413 on episode 1145 and timestep 1145000

 (1.81 GB CUDA)


Updating model with average reward -2.896320237906575 on episode 1150 and timestep 1150000

 (1.81 GB CUDA)


Advancing training to stage 4


Decaying std to 0.5499999999999999 on episode 1154 and timestep 1154000


Updating model with average reward -2.896801035315961 on episode 1155 and timestep 1155000

 (1.81 GB CUDA)


Updating model with average reward -2.8088695102246106 on episode 1160 and timestep 1160000

 (1.81 GB CUDA)


Updating model with average reward -2.812866799986407 on episode 1165 and timestep 1165000

 (1.81 GB CUDA)


Updating model with average reward -2.830874840032339 on episode 1170 and timestep 1170000

 (1.81 GB CUDA)


Updating model with average reward -2.8346905208123623 on episode 1175 and timestep 1175000

 (1.81 GB CUDA)


Updating model with average reward -2.8428612550399603 on episode 1180 and timestep 1180000

 (1.81 GB CUDA)


Updating model with average reward -2.8346414123530237 on episode 1185 and timestep 1185000

 (1.81 GB CUDA)


Updating model with average reward -2.831227574265018 on episode 1190 and timestep 1190000

 (1.81 GB CUDA)


Updating model with average reward -2.840940926487729 on episode 1195 and timestep 1195000

 (1.81 GB CUDA)


Advancing training to stage 5
Decaying std to 0.49999999999999994 on episode 1198 and timestep 1198000


Updating model with average reward -2.8043833560282887 on episode 1200 and timestep 1200000

 (1.81 GB CUDA)


Updating model with average reward -2.735362788974121 on episode 1205 and timestep 1205000

 (1.81 GB CUDA)


Updating model with average reward -2.741642118261531 on episode 1210 and timestep 1210000

 (1.81 GB CUDA)


Updating model with average reward -2.7284869084376098 on episode 1215 and timestep 1215000

 (1.81 GB CUDA)


Updating model with average reward -2.7311689628435225 on episode 1220 and timestep 1220000

 (1.81 GB CUDA)


Updating model with average reward -2.722818738432556 on episode 1225 and timestep 1225000

 (1.81 GB CUDA)


Updating model with average reward -2.721499996913403 on episode 1230 and timestep 1230000

 (1.81 GB CUDA)


Updating model with average reward -2.7333979144759923 on episode 1235 and timestep 1235000

 (1.81 GB CUDA)


Updating model with average reward -2.7138109300310314 on episode 1240 and timestep 1240000

 (1.81 GB CUDA)


Updating model with average reward -2.7216004156187474 on episode 1245 and timestep 1245000

 (1.81 GB CUDA)


Updating model with average reward -2.715601808569163 on episode 1250 and timestep 1250000

 (1.81 GB CUDA)


Updating model with average reward -2.7165141356135756 on episode 1255 and timestep 1255000

 (1.81 GB CUDA)


Updating model with average reward -2.720205105404794 on episode 1260 and timestep 1260000

 (1.81 GB CUDA)


Updating model with average reward -2.721398887809396 on episode 1265 and timestep 1265000

 (1.81 GB CUDA)


Advancing training to stage 6
Decaying std to 0.44999999999999996 on episode 1269 and timestep 1269000


Updating model with average reward -2.7264532844371647 on episode 1270 and timestep 1270000

 (1.81 GB CUDA)


Updating model with average reward -2.631589435748652 on episode 1275 and timestep 1275000

 (1.81 GB CUDA)


Updating model with average reward -2.6405459302927405 on episode 1280 and timestep 1280000

 (1.81 GB CUDA)


Updating model with average reward -2.626284649820447 on episode 1285 and timestep 1285000

 (1.81 GB CUDA)


Updating model with average reward -2.632181003381327 on episode 1290 and timestep 1290000

 (1.81 GB CUDA)


Updating model with average reward -2.6236291851745546 on episode 1295 and timestep 1295000

 (1.81 GB CUDA)


Updating model with average reward -2.6265454625260385 on episode 1300 and timestep 1300000

 (1.81 GB CUDA)


Updating model with average reward -2.6240847421907483 on episode 1305 and timestep 1305000

 (1.81 GB CUDA)


Updating model with average reward -2.620340050229043 on episode 1310 and timestep 1310000

 (1.81 GB CUDA)


Updating model with average reward -2.629368321761802 on episode 1315 and timestep 1315000

 (1.81 GB CUDA)


Updating model with average reward -2.6172997216920257 on episode 1320 and timestep 1320000

 (1.81 GB CUDA)


Updating model with average reward -2.612608774857521 on episode 1325 and timestep 1325000

 (1.81 GB CUDA)


Updating model with average reward -2.6172141964767577 on episode 1330 and timestep 1330000

 (1.81 GB CUDA)


Updating model with average reward -2.612281802263677 on episode 1335 and timestep 1335000

 (1.81 GB CUDA)


Updating model with average reward -2.605712342636943 on episode 1340 and timestep 1340000

 (1.81 GB CUDA)


Updating model with average reward -2.596250061066717 on episode 1345 and timestep 1345000

 (1.81 GB CUDA)


Updating model with average reward -2.595478794281006 on episode 1350 and timestep 1350000

 (1.81 GB CUDA)


Updating model with average reward -2.6102770676389784 on episode 1355 and timestep 1355000

 (1.81 GB CUDA)


Updating model with average reward -2.6003235647554845 on episode 1360 and timestep 1360000

 (1.81 GB CUDA)


Updating model with average reward -2.5940494722602665 on episode 1365 and timestep 1365000

 (1.81 GB CUDA)


Updating model with average reward -2.6026882017246336 on episode 1370 and timestep 1370000

 (1.81 GB CUDA)


Updating model with average reward -2.6134141632648706 on episode 1375 and timestep 1375000

 (1.81 GB CUDA)
Advancing training to stage 7
Decaying std to 0.39999999999999997 on episode 1375 and timestep 1375000


Updating model with average reward -2.5427491025415363 on episode 1380 and timestep 1380000

 (1.81 GB CUDA)


Updating model with average reward -2.5486786686240586 on episode 1385 and timestep 1385000

 (1.81 GB CUDA)


Updating model with average reward -2.5505634205714314 on episode 1390 and timestep 1390000

 (1.81 GB CUDA)


Updating model with average reward -2.5462818740829825 on episode 1395 and timestep 1395000

 (1.81 GB CUDA)


Updating model with average reward -2.5361484013543056 on episode 1400 and timestep 1400000

 (1.81 GB CUDA)


Updating model with average reward -2.54924702833727 on episode 1405 and timestep 1405000

 (1.81 GB CUDA)


Updating model with average reward -2.5557763772052375 on episode 1410 and timestep 1410000

 (1.81 GB CUDA)


Updating model with average reward -2.5615313505869657 on episode 1415 and timestep 1415000

 (1.81 GB CUDA)


Advancing training to stage 8
Decaying std to 0.35 on episode 1419 and timestep 1419000


Updating model with average reward -2.5450111402316837 on episode 1420 and timestep 1420000

 (1.81 GB CUDA)


Updating model with average reward -2.493452742987439 on episode 1425 and timestep 1425000

 (1.81 GB CUDA)


Updating model with average reward -2.490986298845045 on episode 1430 and timestep 1430000

 (1.81 GB CUDA)


Updating model with average reward -2.4951211323555706 on episode 1435 and timestep 1435000

 (1.81 GB CUDA)


Updating model with average reward -2.4994015560631455 on episode 1440 and timestep 1440000

 (1.81 GB CUDA)


Updating model with average reward -2.4937297196498065 on episode 1445 and timestep 1445000

 (1.81 GB CUDA)


Updating model with average reward -2.4924668962431773 on episode 1450 and timestep 1450000

 (1.81 GB CUDA)


Updating model with average reward -2.492115678951129 on episode 1455 and timestep 1455000

 (1.81 GB CUDA)


Updating model with average reward -2.5064686043691413 on episode 1460 and timestep 1460000

 (1.81 GB CUDA)


Advancing training to stage 9
Decaying std to 0.3 on episode 1463 and timestep 1463000


Updating model with average reward -2.4803038324044273 on episode 1465 and timestep 1465000

 (1.81 GB CUDA)


Updating model with average reward -2.444334612938352 on episode 1470 and timestep 1470000

 (1.81 GB CUDA)


Updating model with average reward -2.4341167074570507 on episode 1475 and timestep 1475000

 (1.81 GB CUDA)


Updating model with average reward -2.436723080003027 on episode 1480 and timestep 1480000

 (1.81 GB CUDA)


Updating model with average reward -2.447055008222066 on episode 1485 and timestep 1485000

 (1.81 GB CUDA)


Updating model with average reward -2.4464021215931995 on episode 1490 and timestep 1490000

 (1.81 GB CUDA)


Updating model with average reward -2.437300584771305 on episode 1495 and timestep 1495000

 (1.81 GB CUDA)


Updating model with average reward -2.4470436681092083 on episode 1500 and timestep 1500000

 (1.81 GB CUDA)


Updating model with average reward -2.4379538315712885 on episode 1505 and timestep 1505000

 (1.81 GB CUDA)


Advancing training to stage 10
Decaying std to 0.25 on episode 1507 and timestep 1507000


Updating model with average reward -2.4126180814311504 on episode 1510 and timestep 1510000

 (1.81 GB CUDA)


Updating model with average reward -2.388116255625561 on episode 1515 and timestep 1515000

 (1.81 GB CUDA)


Updating model with average reward -2.393191769104358 on episode 1520 and timestep 1520000

 (1.81 GB CUDA)


Updating model with average reward -2.3925641980361045 on episode 1525 and timestep 1525000

 (1.81 GB CUDA)


Updating model with average reward -2.390320081065781 on episode 1530 and timestep 1530000

 (1.81 GB CUDA)


Updating model with average reward -2.4040770135387404 on episode 1535 and timestep 1535000

 (1.81 GB CUDA)


Updating model with average reward -2.3962420680195615 on episode 1540 and timestep 1540000

 (1.81 GB CUDA)


Updating model with average reward -2.395839773979783 on episode 1545 and timestep 1545000

 (1.81 GB CUDA)


Updating model with average reward -2.3975352683462052 on episode 1550 and timestep 1550000

 (1.81 GB CUDA)


Advancing training to stage 11
Decaying std to 0.2 on episode 1551 and timestep 1551000


Updating model with average reward -2.3654170307795965 on episode 1555 and timestep 1555000

 (1.81 GB CUDA)


Updating model with average reward -2.3906946749162303 on episode 1560 and timestep 1560000

 (1.81 GB CUDA)


Updating model with average reward -2.375152248135038 on episode 1565 and timestep 1565000

 (1.81 GB CUDA)


Updating model with average reward -2.4598402607928067 on episode 1570 and timestep 1570000

 (1.81 GB CUDA)


Updating model with average reward -6.6983037834279795 on episode 1575 and timestep 1575000

 (1.81 GB CUDA)


Updating model with average reward -7.598490321177028 on episode 1580 and timestep 1580000

 (1.81 GB CUDA)


Updating model with average reward -7.581374009608395 on episode 1585 and timestep 1585000

 (1.81 GB CUDA)


Updating model with average reward -7.4051439494846685 on episode 1590 and timestep 1590000

 (1.81 GB CUDA)


Updating model with average reward -7.043131083230875 on episode 1595 and timestep 1595000

 (1.81 GB CUDA)
Advancing training to stage 12
Decaying std to 0.15000000000000002 on episode 1595 and timestep 1595000


Updating model with average reward -6.624081139506996 on episode 1600 and timestep 1600000

 (1.81 GB CUDA)


Updating model with average reward -7.891078732459307 on episode 1605 and timestep 1605000

 (1.81 GB CUDA)


Updating model with average reward -7.911660637037188 on episode 1610 and timestep 1610000

 (1.81 GB CUDA)


Updating model with average reward -7.764879035510034 on episode 1615 and timestep 1615000

 (1.81 GB CUDA)


Updating model with average reward -7.6870941067698 on episode 1620 and timestep 1620000

 (1.81 GB CUDA)


Updating model with average reward -7.765254784391209 on episode 1625 and timestep 1625000

 (1.81 GB CUDA)


Updating model with average reward -7.476992452012226 on episode 1630 and timestep 1630000

 (1.81 GB CUDA)


Updating model with average reward -7.431018617785916 on episode 1635 and timestep 1635000

 (1.81 GB CUDA)


Updating model with average reward -7.365839178644926 on episode 1640 and timestep 1640000

 (1.81 GB CUDA)


Updating model with average reward -7.263192810144216 on episode 1645 and timestep 1645000

 (1.81 GB CUDA)


Updating model with average reward -6.662728660322249 on episode 1650 and timestep 1650000

 (1.81 GB CUDA)


Updating model with average reward -7.556413830827415 on episode 1655 and timestep 1655000

 (1.81 GB CUDA)


Updating model with average reward -7.912866453171096 on episode 1660 and timestep 1660000

 (1.81 GB CUDA)


Updating model with average reward -5.2006520112711785 on episode 1665 and timestep 1665000

 (1.81 GB CUDA)


Updating model with average reward -6.508255772632845 on episode 1670 and timestep 1670000

 (1.81 GB CUDA)


Updating model with average reward -6.5082986654557216 on episode 1675 and timestep 1675000

 (1.81 GB CUDA)


Updating model with average reward -6.537495514213569 on episode 1680 and timestep 1680000

 (1.81 GB CUDA)


Updating model with average reward -6.5685284883848505 on episode 1685 and timestep 1685000

 (1.81 GB CUDA)


Updating model with average reward -6.482892741497696 on episode 1690 and timestep 1690000

 (1.81 GB CUDA)


Updating model with average reward -6.523663360810198 on episode 1695 and timestep 1695000

 (1.81 GB CUDA)


Updating model with average reward -6.435285828970269 on episode 1700 and timestep 1700000

 (1.81 GB CUDA)


Updating model with average reward -6.351955664672211 on episode 1705 and timestep 1705000

 (1.81 GB CUDA)
Advancing training to stage 13
Decaying std to 0.10000000000000002 on episode 1705 and timestep 1705000


Updating model with average reward -6.255804534407943 on episode 1710 and timestep 1710000

 (1.81 GB CUDA)


Updating model with average reward -6.2558469403433055 on episode 1715 and timestep 1715000

 (1.81 GB CUDA)


Updating model with average reward -6.075239631297797 on episode 1720 and timestep 1720000

 (1.81 GB CUDA)


Updating model with average reward -6.99654992468138 on episode 1725 and timestep 1725000

 (1.81 GB CUDA)


Updating model with average reward -5.601663137746602 on episode 1730 and timestep 1730000

 (1.81 GB CUDA)


Updating model with average reward -8.782488526778877 on episode 1735 and timestep 1735000

 (1.81 GB CUDA)


Updating model with average reward -8.353149227939703 on episode 1740 and timestep 1740000

 (1.81 GB CUDA)


Updating model with average reward -8.134333640015557 on episode 1745 and timestep 1745000

 (1.81 GB CUDA)


Advancing training to stage 14
Decaying std to 0.1 on episode 1749 and timestep 1749000


Updating model with average reward -7.239651355395265 on episode 1750 and timestep 1750000

 (1.81 GB CUDA)


Updating model with average reward -8.005773841472395 on episode 1755 and timestep 1755000

 (1.81 GB CUDA)


Updating model with average reward -8.632366702548131 on episode 1760 and timestep 1760000

 (1.81 GB CUDA)


Updating model with average reward -8.943974478903174 on episode 1765 and timestep 1765000

 (1.81 GB CUDA)


Updating model with average reward -6.070357347361192 on episode 1770 and timestep 1770000

 (1.81 GB CUDA)


Updating model with average reward -7.015586546355843 on episode 1775 and timestep 1775000

 (1.81 GB CUDA)


Updating model with average reward -6.8272376136785 on episode 1780 and timestep 1780000

 (1.81 GB CUDA)


Updating model with average reward -5.7472614669923185 on episode 1785 and timestep 1785000

 (1.81 GB CUDA)


Updating model with average reward -5.065583892153338 on episode 1790 and timestep 1790000

 (1.81 GB CUDA)


Updating model with average reward -9.186078527706117 on episode 1795 and timestep 1795000

 (1.81 GB CUDA)


Updating model with average reward -9.10006606189169 on episode 1800 and timestep 1800000

 (1.81 GB CUDA)


Updating model with average reward -9.244993759051964 on episode 1805 and timestep 1805000

 (1.81 GB CUDA)


Updating model with average reward -9.160053815606773 on episode 1810 and timestep 1810000

 (1.81 GB CUDA)


Updating model with average reward -8.969165362629697 on episode 1815 and timestep 1815000

 (1.81 GB CUDA)


Updating model with average reward -9.444704014195711 on episode 1820 and timestep 1820000

 (1.81 GB CUDA)
Ending early on episode 1820 and timestep 1820000

Reset Environment: 0.24676854722201824


Environment Setup: 60.814437836408615


Calculate Actions: 13143.250296032988


Step Environment: 1558.0300123943016


Record Rewards: 490.78125261329114
Record Stats: 0.6916137486696243
Early Stopping: 12.388156717643142
Update Policy: 13113.322501863353
Total: 28379.525039753877
