In [1]:
from stock_env import StockEnv
import pandas as pd
import numpy as np
from replay_buffer import Buffer
from ddpg import ActorCritic
import tensorflow as tf

%load_ext autoreload
%autoreload 2

In [2]:
# Environment configuration; the notebook unpacks this dict as keyword
# arguments when it constructs StockEnv.
env_args = dict(
    asset_codes=["AAPL", "V", "BABA", "ADBE", "SNE"],  # tickers to trade
    features=["close", "high", "low"],                # per-asset price fields
    start_date="2017-5-1",
    end_date="2017-6-23",
    window_len=50,
    data_path="AmericaStock.csv",
)

In [3]:
# Build the trading environment from the configuration dict; each entry of
# env_args is passed to StockEnv as a keyword argument.
stock_env = StockEnv(**env_args)

In [4]:
# Report how many states the environment exposes through `stock_env.states`.
print(f'total number of states: {len(stock_env.states)}')

total number of states: 4


In [5]:
# Shape of one state with its leading axis removed; the recorded output of
# this cell is (5, 50, 3).
# NOTE(review): presumably (assets, window_len, features) — confirm in stock_env.
state_dim = tuple(stock_env.states[0].shape)[1:]

# One entry per asset code plus one extra slot.
# NOTE(review): the extra slot is presumably a cash position — confirm.
action_dim = (1 + len(env_args['asset_codes']),)

print(f'state dimention {state_dim}')
print(f'action dimention {action_dim}')

state dimention (5, 50, 3)
action dimention (6,)


In [27]:
# Seed NumPy and TensorFlow before the networks are built so that weight
# initialisation (and any later NumPy/TensorFlow random sampling) is
# repeatable across Restart & Run All.
SEED = 0
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Construct the actor/critic pair from the ddpg module for the state and
# action shapes computed earlier in the notebook. The recorded output shows
# that construction prints the network summaries.
actor_critic = ActorCritic(state_dim, action_dim)

Actor Network Summary: 
Model: "model_32"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_49 (InputLayer)           [(None, 5, 50, 3)]   0                                            
__________________________________________________________________________________________________
conv2d_96 (Conv2D)              (None, 5, 50, 32)    128         input_49[0][0]                   
__________________________________________________________________________________________________
batch_normalization_28 (BatchNo (None, 5, 50, 32)    128         conv2d_96[0][0]                  
__________________________________________________________________________________________________
conv2d_97 (Conv2D)              (None, 5, 50, 32)    1056        batch_normalization_28[0][0]     
___________________________________________________________________

In [7]:
# Reset the environment before interacting with it, then take the first entry
# of its state list as the starting observation.
# NOTE(review): assumes reset() leaves `states[0]` as the initial state — confirm in stock_env.
stock_env.reset()
state0 = stock_env.states[0]

In [8]:
# Call `actor_critic.main_actor` on the initial state to obtain a first action.
# The recorded output is a (1, 6) float32 tensor whose entries sum to ~1.
action0 = actor_critic.main_actor(state0)
print(action0)

tf.Tensor([[0.0293806  0.08401833 0.12408818 0.48286393 0.11546466 0.16418421]], shape=(1, 6), dtype=float32)


In [9]:
# np.squeeze removes length-1 axes, so the actor's (1, 6) output becomes a
# flat vector of 6 values before it is handed to the environment.
action0 = np.squeeze(action0)

In [10]:
# Apply the action to the environment; step() returns a 3-tuple that is
# unpacked here in the order (reward, done, next state).
reward, done, state1 = stock_env.step(action0)

In [11]:
# Keyword arguments for the replay buffer: the state and action shapes
# computed earlier in the notebook.
buffer_args = dict(state_dim=state_dim, action_dim=action_dim)

In [12]:
# Create the replay buffer, passing state_dim and action_dim as keyword arguments.
stock_buffer = Buffer(**buffer_args)

In [13]:
# Store the single transition collected so far, in the argument order
# (state, action, reward, next state, done).
stock_buffer.insert(state0, action0, reward, state1, done)

In [14]:
# Draw a batch from the buffer using sample_batch()'s default arguments.
# In top-to-bottom order only one transition has been inserted at this point.
train_batch = stock_buffer.sample_batch()

In [15]:
# Separate Adam optimizers for the manual single-step update below; the
# critic's learning rate is twice the actor's.
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=2e-4)

In [16]:
# Smoke-test a single training update on the sampled batch with the two optimizers.
# NOTE(review): presumably updates both the actor and the critic — confirm in ddpg.py.
actor_critic.one_step_train(train_batch, actor_optimizer, critic_optimizer)

In [28]:
# Settings for the full training run; the notebook unpacks this dict as
# keyword arguments to actor_critic.train.
train_args = dict(
    env=stock_env,
    num_eps=20,
    actor_lr=1e-5,
    critic_lr=1e-4,
    train_every_step=1,
    batch_size=1,
    verbose=True,
)

In [29]:
# Run the full training loop with the settings in train_args and keep its
# return value as `rewards`.
# NOTE(review): the recorded output for this cell shows the critic loss jumping
# from ~8.9e-05 to ~2.7e+08 after one step, the action saturating on a single
# entry, and every value becoming NaN by the final episode — the run diverges
# and should be investigated before the results are used.
rewards = actor_critic.train(**train_args)

step 0, take action [0.17249607 0.3047047  0.06427959 0.09446795 0.19902344 0.16502827], receive reward -0.007135758145050558
critic loss 8.90993105713278e-05
step 1, take action [1.7502789e-05 9.9994421e-01 4.5521746e-12 1.8818406e-13 3.8226473e-05
 4.4497203e-15], receive reward 0.0034253098730827776
critic loss 273533536.0
step 2, take action [1.4952620e-08 1.0000000e+00 2.3712484e-19 8.7527593e-22 5.1884477e-08
 1.0560687e-24], receive reward -0.0041538824461249515
critic loss 324928864.0
step 3, take action [5.7302770e-11 1.0000000e+00 4.4215693e-25 2.4579679e-28 2.8900235e-10
 2.8536918e-32], receive reward 0.0019613902260496247
critic loss 239450352.0
Episode 0, reward: -0.005902940492043107
step 0, take action [5.6347001e-13 1.0000000e+00 7.2920583e-30 8.4971454e-34 3.8231020e-12
 1.4278344e-38], receive reward -0.011656065788529818
critic loss 12588799.0
step 1, take action [1.2380566e-15 1.0000000e+00 1.2558009e-34 0.0000000e+00 8.1920537e-15
 9.8090893e-45], receive reward 0

step 1, take action [nan nan nan nan nan nan], receive reward nan
critic loss nan
step 2, take action [nan nan nan nan nan nan], receive reward nan
critic loss nan
step 3, take action [nan nan nan nan nan nan], receive reward nan
critic loss nan
Episode 19, reward: nan
