In [1]:
from stock_env import StockEnv
import pandas as pd
import numpy as np
from replay_buffer import Buffer
from ddpg import ActorCritic
import tensorflow as tf

%load_ext autoreload
%autoreload 2

In [2]:
# Settings that are unpacked into StockEnv(**env_args) in the next cell.
# The recorded state shape (5, 50, 3) lines up with the 5 asset codes,
# window_len of 50 and the 3 features listed here.
env_args = dict(
    asset_codes=['AAPL', 'V', 'BABA', 'ADBE', 'SNE'],
    features=["close", "high", "low"],
    start_date="2017-5-1",
    end_date="2017-6-23",
    window_len=50,
    data_path="AmericaStock.csv",
)

In [3]:
# Build the trading environment, passing every entry of env_args as a keyword argument.
stock_env = StockEnv(**env_args)

In [4]:
# Sanity check: report how many entries the environment holds in its `states` attribute.
print(f'total number of states: {len(stock_env.states)}')

total number of states: 4


In [5]:
# Per-sample state shape: the shape of the first stored state minus its leading axis.
state_dim = tuple(stock_env.states[0].shape[1:])
# The action vector has one more entry than there are asset codes.
action_dim = (1 + len(env_args['asset_codes']),)
# Label text (including its spelling) is kept unchanged so it matches the recorded output.
print(
    f'state dimention {state_dim}',
    f'action dimention {action_dim}',
    sep='\n',
)

state dimention (5, 50, 3)
action dimention (6,)


In [19]:
# Build the DDPG actor-critic agent for the state/action shapes computed above.
# The recorded output shows the constructor printing the actor network summary.
actor_critic = ActorCritic(state_dim, action_dim)

Actor Network Summary: 
Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 5, 50, 3)]   0                                            
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 5, 50, 32)    128         input_7[0][0]                    
__________________________________________________________________________________________________
batch_normalization_2 (BatchNor (None, 5, 50, 32)    128         conv2d_2[0][0]                   
__________________________________________________________________________________________________
add_2 (Add)                     (None, 5, 50, 32)    0           batch_normalization_2[0][0]      
                                                                 bat

In [20]:
# Reset the environment (its return value is discarded here) and take the
# first entry of stock_env.states as the starting state for a manual step.
stock_env.reset()
state0 = stock_env.states[0]

In [21]:
# Run the main actor network on the starting state and show the action it proposes.
# The recorded output is a float32 tensor of shape (1, 6).
action0 = actor_critic.main_actor(state0)
print(action0)

tf.Tensor([[0.10840821 0.21311744 0.43196115 0.17436635 0.03880751 0.03333935]], shape=(1, 6), dtype=float32)


In [22]:
# Remove the length-1 axes from the actor output (recorded shape (1, 6)) before
# handing the action to the environment. Re-running this cell is harmless:
# squeezing an already squeezed array leaves it unchanged.
action0 = np.squeeze(action0)

In [23]:
# Advance the environment by one step with action0. step() is unpacked as
# (reward, done flag, next state), in that order.
reward, done, state1 = stock_env.step(action0)

In [24]:
# Keyword arguments for the replay Buffer: the state and action shapes computed earlier.
buffer_args = dict(state_dim=state_dim, action_dim=action_dim)

In [25]:
# Create the replay buffer, passing every entry of buffer_args as a keyword argument.
stock_buffer = Buffer(**buffer_args)

In [26]:
# Store the single transition collected above:
# (state, action, reward, next state, done flag).
stock_buffer.insert(state0, action0, reward, state1, done)

In [27]:
# Draw a training batch from the buffer using sample_batch's default arguments.
# Only one transition has been inserted at this point.
train_batch = stock_buffer.sample_batch()

In [28]:
# Separate Adam optimizers for the two networks; the critic gets the larger step size.
# These are only used for the manual one-step training check below.
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=0.002)

In [29]:
# Smoke test: run a single training step on the sampled batch with the two
# optimizers created above. The recorded output shows the call printing the
# batch's action, reward and next states.
actor_critic.one_step_train(train_batch, actor_optimizer, critic_optimizer)

Action: [[0.10840821 0.21311744 0.43196115 0.17436635 0.03880751 0.03333935]]
Rewards:  [[-0.00768568]]
Nest States: [[[[1.0084085  1.0123736  1.0038283 ]
   [1.0053322  1.0082718  0.9862592 ]
   [1.0017091  1.0058792  0.996787  ]
   [1.018321   1.0184578  1.0032814 ]
   [1.018321   1.018321   1.018321  ]
   [1.018321   1.018321   1.018321  ]
   [1.0460076  1.0507246  1.0187995 ]
   [1.0527072  1.0587914  1.0490156 ]
   [1.0477167  1.0523653  1.0398551 ]
   [1.0524337  1.053254   1.0412223 ]
   [1.0671315  1.0693191  1.0573558 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0643971  1.0708915  1.0599536 ]
   [1.0628247  1.066858   1.0576975 ]
   [1.0271398  1.0566721  1.0234482 ]
   [1.0427946  1.0482635  1.0331556 ]
   [1.0463495  1.0526388  1.04341   ]
   [1.0463495  1.0463495  1.0463495 ]
   [1.0463495  1.0463495  1.0463495 ]
   [1.0527072  1.0567405  1.0453241 ]
   [1.0514083  1.0589281  1.0480585 ]
   [1.0482635  1.0539377  1.0436833 ]
   [1.051

In [30]:
# Keyword arguments that are unpacked into actor_critic.train(**train_args).
# Learning rates are written in scientific notation; the values are unchanged.
train_args = dict(
    env=stock_env,
    num_eps=20,            # presumably the number of training episodes
    actor_lr=1e-6,
    critic_lr=1e-5,
    train_every_step=1,
    batch_size=1,
    verbose=True,          # the recorded run prints every step and sampled batch
)

In [31]:
# Run the full training loop with the settings in train_args and keep its return value.
# NOTE(review): in the recorded output the actions and rewards become nan within the
# first few episodes and stay nan for the rest of the run, so this result is not usable.
# NOTE(review): many of the printed training batches are all zeros (action, reward and
# next states). This suggests the buffer may be sampling slots that were never
# filled - TODO confirm in replay_buffer / ddpg.
rewards = actor_critic.train(**train_args)

step 0, take action [8.8976836e-01 1.2514008e-04 7.4452352e-08 7.3865473e-02 3.5850782e-02
 3.9013856e-04], receive reward -0.0007161348947299601
Action: [[8.8976836e-01 1.2514008e-04 7.4452352e-08 7.3865473e-02 3.5850782e-02
  3.9013856e-04]]
Rewards:  [[-0.00071613]]
Nest States: [[[[1.0084085  1.0123736  1.0038283 ]
   [1.0053322  1.0082718  0.9862592 ]
   [1.0017091  1.0058792  0.996787  ]
   [1.018321   1.0184578  1.0032814 ]
   [1.018321   1.018321   1.018321  ]
   [1.018321   1.018321   1.018321  ]
   [1.0460076  1.0507246  1.0187995 ]
   [1.0527072  1.0587914  1.0490156 ]
   [1.0477167  1.0523653  1.0398551 ]
   [1.0524337  1.053254   1.0412223 ]
   [1.0671315  1.0693191  1.0573558 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0643971  1.0708915  1.0599536 ]
   [1.0628247  1.066858   1.0576975 ]
   [1.0271398  1.0566721  1.0234482 ]
   [1.0427946  1.0482635  1.0331556 ]
   [1.0463495  1.0526388  1.04341   ]
   [1.0463495  1.0463495  1.04634

Episode 0, reward: 0.0015774746643199876
step 0, take action [8.8629305e-01 1.2864888e-04 7.6204159e-08 7.6107293e-02 3.7074074e-02
 3.9672095e-04], receive reward -0.000737593891099804
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 

step 0, take action [8.8345242e-01 1.3148558e-04 7.7558923e-08 7.7894948e-02 3.8119018e-02
 4.0203900e-04], receive reward -0.0007545431915364316
Action: [[8.8345242e-01 1.3148558e-04 7.7558923e-08 7.7894948e-02 3.8119018e-02
  4.0203900e-04]]
Rewards:  [[-0.00075454]]
Nest States: [[[[1.0084085  1.0123736  1.0038283 ]
   [1.0053322  1.0082718  0.9862592 ]
   [1.0017091  1.0058792  0.996787  ]
   [1.018321   1.0184578  1.0032814 ]
   [1.018321   1.018321   1.018321  ]
   [1.018321   1.018321   1.018321  ]
   [1.0460076  1.0507246  1.0187995 ]
   [1.0527072  1.0587914  1.0490156 ]
   [1.0477167  1.0523653  1.0398551 ]
   [1.0524337  1.053254   1.0412223 ]
   [1.0671315  1.0693191  1.0573558 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0643971  1.0708915  1.0599536 ]
   [1.0628247  1.066858   1.0576975 ]
   [1.0271398  1.0566721  1.0234482 ]
   [1.0427946  1.0482635  1.0331556 ]
   [1.0463495  1.0526388  1.04341   ]
   [1.0463495  1.0463495  1.04634

step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[nan nan nan nan nan nan]]
Rewards:  [[nan]]
Nest States: [[[[1.0084085  1.0123736  1.0038283 ]
   [1.0053322  1.0082718  0.9862592 ]
   [1.0017091  1.0058792  0.996787  ]
   [1.018321   1.0184578  1.0032814 ]
   [1.018321   1.018321   1.018321  ]
   [1.018321   1.018321   1.018321  ]
   [1.0460076  1.0507246  1.0187995 ]
   [1.0527072  1.0587914  1.0490156 ]
   [1.0477167  1.0523653  1.0398551 ]
   [1.0524337  1.053254   1.0412223 ]
   [1.0671315  1.0693191  1.0573558 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0643971  1.0708915  1.0599536 ]
   [1.0628247  1.066858   1.0576975 ]
   [1.0271398  1.0566721  1.0234482 ]
   [1.0427946  1.0482635  1.0331556 ]
   [1.0463495  1.0526388  1.04341   ]
   [1.0463495  1.0463495  1.0463495 ]
   [1.0463495  1.0463495  1.0463495 ]
   [1.0527072  1.0567405  1.0453241 ]
   [1.0514083  1.0589281  1.0480585 ]
   [1.0482635  1.0539377  1.043

Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]


step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [

Episode 5, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0.

Episode 6, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[nan nan nan nan nan nan]]
Rewards:  [[nan]]
Nest States: [[[[1.0084085  1.0123736  1.0038283 ]
   [1.0053322  1.0082718  0.9862592 ]
   [1.0017091  1.0058792  0.996787  ]
   [1.018321   1.0184578  1.0032814 ]
   [1.018321   1.018321   1.018321  ]
   [1.018321   1.018321   1.018321  ]
   [1.0460076  1.0507246  1.0187995 ]
   [1.0527072  1.0587914  1.0490156 ]
   [1.0477167  1.0523653  1.0398551 ]
   [1.0524337  1.053254   1.0412223 ]
   [1.0671315  1.0693191  1.0573558 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0671315  1.0671315  1.0671315 ]
   [1.0643971  1.0708915  1.0599536 ]
   [1.0628247  1.066858   1.0576975 ]
   [1.0271398  1.0566721  1.0234482 ]
   [1.0427946  1.0482635  1.0331556 ]
   [1.0463495  1.0526388  1.04341   ]
   [1.0463495  1.0463495  1.0463495 ]
   [1.0463495  1.0463495  1.0463495 ]
   [1.0527072  1.0567405  1.0453241 ]
   [1.0514083  1.0589281  1.0480585 ]
   [1.04

Episode 7, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0.

Episode 8, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0.

Episode 9, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0.

step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [

step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [

Episode 12, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0

Episode 13, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0

Episode 14, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0

step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [

step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [

step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [

Episode 18, reward: nan
step 0, take action [nan nan nan nan nan nan], receive reward nan
Action: [[0. 0. 0. 0. 0. 0.]]
Rewards:  [[0.]]
Nest States: [[[[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]]

  [[0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0. 0.]
   [0. 0