In [1]:
###########################################
## A2C stock trader, based on rl_trader.py
###########################################

# Requires Spark configuration:
#
#  spark.rpc.message.maxSize 2000
#  spark.databricks.workspace.matplotlibInline.enabled true
#  spark.sql.execution.arrow.enabled true
#  spark.databricks.delta.preview.enabled true
#  spark.driver.maxResultSize 0
#
# Requires environment variables:
#  ARROW_PRE_0_15_IPC_FORMAT=1

In [2]:
%matplotlib inline
import os
from glob import glob,iglob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam

from datetime import datetime
import itertools
import argparse
import re
import pickle

from sklearn.preprocessing import StandardScaler

In [3]:
# Root folder for everything this notebook writes, and the folder holding the input CSVs.
basedir = '/dbfs/dbfs/tradingA2C'
datadir = '/dbfs/FileStore/tables'
# Create the base directory together with any missing parent directories.
# exist_ok=True makes this a no-op when the directory is already there, so the
# cell can be re-run safely and does not race with a concurrent creator.
os.makedirs(basedir, exist_ok=True)

In [4]:
# Show every CSV in the data folder whose name contains "_SW".
sw_pattern = os.path.join(datadir,"*_SW*.csv")
for s in iglob(sw_pattern):
  print(s)

In [5]:
# Build one frame of closing prices: a column per ticker, indexed by the CSV's Date column.
# Currently configured for 3 stocks.
stocks = ['ABBN','NESN', 'UBSG']
df_all = pd.DataFrame()
for s in stocks:
  matches = glob(os.path.join(datadir,f"{s}*.csv"))
  assert matches, f"Stock '{s}' file was not found!"
  # use the first matching file; keep the Date column while also using it as the index
  df = pd.read_csv(matches[0]).set_index('Date', drop=False)
  df_all[s] = df['Close']

# fill gaps by interpolation in both directions, then drop any rows still incomplete
df_all = df_all.interpolate(limit_direction='both').dropna()


In [6]:
# Plot only the rows whose index label sorts after '2019-01-01'.
is_recent = df_all.index > '2019-01-01'
df_all[is_recent].plot()

In [7]:
# Standardise the full price history and plot the scaled series.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_all)
plt.plot(df_scaled)

In [8]:
# Subset used for train/test: rows whose index label sorts after '2018-01-01'.
after_2018 = df_all.index > '2018-01-01'
df_use = df_all.loc[after_2018]
# re-fit the existing scaler on the subset and plot the standardised prices
plt.plot(scaler.fit_transform(df_use))

In [9]:
def get_data(frame=None):
  """Return the price history as a 2-D array.

  Rows are time steps and columns are stocks, in the column order of the
  source frame.

  Args:
    frame: optional DataFrame of prices. When omitted, the notebook-level
      `df_use` frame is used, which preserves the original zero-argument
      behaviour.

  Returns:
    The frame's `.values` array (T x n_stocks).
  """
  source = df_use if frame is None else frame
  return source.values

In [10]:
def maybe_make_dir(directory):
  """Create `directory` (and any missing parents) if it does not exist yet.

  Uses exist_ok=True instead of a separate existence check, so there is no
  window in which another process can create the directory between the check
  and the creation.
  """
  os.makedirs(directory, exist_ok=True)

In [11]:
def get_scaler(env):
  """Fit a scikit-learn StandardScaler on states visited by a random policy.

  The environment is reset first so the rollout always starts from the first
  time step, regardless of what was done with `env` before the call. Random
  actions are then taken until the episode ends (at most `env.n_step` steps)
  and the observed states are used to fit the scaler.

  Note: this rollout could also be used to pre-populate a replay buffer.

  Args:
    env: environment exposing `reset()`, `step(action)`, `n_step` and
      `action_space`.

  Returns:
    The fitted StandardScaler.
  """
  # start from a known position; stepping an already-exhausted env would index
  # past the end of its price history
  env.reset()

  states = []
  for _ in range(env.n_step):
    action = np.random.choice(env.action_space)
    state, reward, done, info = env.step(action)
    states.append(state)
    if done:
      break

  scaler = StandardScaler()
  scaler.fit(states)
  return scaler

In [12]:
def play_one_episode(agent, env, is_train):
  """Run one full episode and return the portfolio value at its end.

  `is_train` is a mode string: when it equals 'train', every transition is
  stored in the agent's replay memory and a replay step is run. States are
  passed through the notebook-level `scaler`, which turns each one into a
  1xD array; the replay batch size comes from the notebook-level `batch_size`.
  """
  training = (is_train == 'train')
  state = scaler.transform([env.reset()])

  while True:
    action = agent.act(state)
    raw_next_state, reward, done, info = env.step(action)
    next_state = scaler.transform([raw_next_state])
    if training:
      agent.update_replay_memory(state, action, reward, next_state, done)
      agent.replay(batch_size)
    if done:
      # the env reports the current portfolio value in its info dict
      return info['cur_val']
    state = next_state

In [13]:
class ReplayBuffer:
  """Fixed-size ring buffer of (state, action, reward, next_state, done) transitions.

  Once `size` transitions have been stored, new ones overwrite the oldest.
  """
  def __init__(self, obs_dim, act_dim, size):
    """Allocate storage for `size` transitions with `obs_dim`-element states.

    `act_dim` is accepted for interface compatibility but is not used:
    actions are stored as single integer ids.
    """
    self.obs1_buf = np.zeros([size, obs_dim], dtype=np.float32)
    self.obs2_buf = np.zeros([size, obs_dim], dtype=np.float32)
    # int64 rather than uint8: action ids range over 3**n_stock values in this
    # notebook's environment, which exceeds 255 from six stocks upwards
    self.acts_buf = np.zeros(size, dtype=np.int64)
    self.rews_buf = np.zeros(size, dtype=np.float32)
    self.done_buf = np.zeros(size, dtype=np.uint8)
    # ptr: next write position; size: number of valid entries; max_size: capacity
    self.ptr, self.size, self.max_size = 0, 0, size

  def store(self, obs, act, rew, next_obs, done):
    """Write one transition at the current position and advance the ring pointer."""
    self.obs1_buf[self.ptr] = obs
    self.obs2_buf[self.ptr] = next_obs
    self.acts_buf[self.ptr] = act
    self.rews_buf[self.ptr] = rew
    self.done_buf[self.ptr] = done
    self.ptr = (self.ptr+1) % self.max_size
    self.size = min(self.size+1, self.max_size)

  def sample_batch(self, batch_size=32):
    """Return `batch_size` transitions drawn uniformly with replacement.

    The result is a dict with keys 's', 's2', 'a', 'r' and 'd'. The buffer
    must hold at least one transition; callers are expected to check `size`.
    """
    idxs = np.random.randint(0, self.size, size=batch_size)
    return dict(s=self.obs1_buf[idxs],
                s2=self.obs2_buf[idxs],
                a=self.acts_buf[idxs],
                r=self.rews_buf[idxs],
                d=self.done_buf[idxs])

In [14]:
def mlp_actor(input_dim, n_action, n_hidden_layers=1, hidden_dim=32):
  """Build and compile the actor network: an MLP with a softmax output.

  Args:
    input_dim: number of input features.
    n_action: number of output units (one per action).
    n_hidden_layers: number of ReLU hidden layers.
    hidden_dim: units in each hidden layer.

  Returns:
    The Keras model, compiled with categorical cross-entropy loss and the
    Adam optimizer.
  """

  # input layer
  i = Input(shape=(input_dim,))
  x = i

  # hidden layers
  for _ in range(n_hidden_layers):
    x = Dense(hidden_dim, activation='relu')(x)

  # final layer
  x = Dense(n_action, activation = 'softmax')(x)

  # make the model
  model = Model(i, x)

  model.compile(loss='categorical_crossentropy', optimizer='adam')
  # summary() prints the table itself and returns None, so wrapping it in
  # print() would only add a stray "None" line
  model.summary()
  return model


In [15]:
def mlp_critic(input_dim, n_action, n_hidden_layers=1, hidden_dim=32):
  """Build and compile the critic network: an MLP with a linear output.

  Args:
    input_dim: number of input features.
    n_action: number of output units (one value per action).
    n_hidden_layers: number of ReLU hidden layers.
    hidden_dim: units in each hidden layer.

  Returns:
    The Keras model, compiled with mean-squared-error loss and the Adam
    optimizer.
  """

  # input layer
  i = Input(shape=(input_dim,))
  x = i

  # hidden layers
  for _ in range(n_hidden_layers):
    x = Dense(hidden_dim, activation='relu')(x)

  # final layer
  x = Dense(n_action, activation = 'linear')(x)

  # make the model
  model = Model(i, x)

  model.compile(loss='mse', optimizer='adam')
  # summary() prints the table itself and returns None, so wrapping it in
  # print() would only add a stray "None" line
  model.summary()
  return model

In [16]:
class DQNAgent(object):
  """Epsilon-greedy agent with an actor and a critic network, trained from replay memory.

  Actions are chosen from the critic's per-action values (see `act`); the
  actor network is trained in `replay` but is not consulted when acting.
  """
  def __init__(self, state_size, action_size):
    """Create the replay memory, the exploration schedule and both networks."""
    self.state_size = state_size
    self.action_size = action_size
    self.memory = ReplayBuffer(state_size, action_size, size=500)
    self.gamma = 0.95  # discount rate
    self.epsilon = 1.0  # exploration rate
    self.epsilon_min = 0.01
    self.epsilon_decay = 0.995
    self.critic_model = mlp_critic(state_size, action_size)
    self.actor_model = mlp_actor(state_size, action_size)


  def update_replay_memory(self, state, action, reward, next_state, done):
    """Store one transition in the replay memory."""
    self.memory.store(state, action, reward, next_state, done)


  def act(self, state):
    """Pick an action: random with probability epsilon, otherwise the critic's argmax.

    `state` is passed straight to `predict`, and only the first row of the
    prediction is used.
    """
    if np.random.rand() <= self.epsilon:
      return np.random.choice(self.action_size)
    values = self.critic_model.predict(state)
    return np.argmax(values[0])  # returns action


  def replay(self, batch_size=32):
    """Run one training step for both networks on a sampled minibatch.

    Does nothing until the memory holds at least `batch_size` transitions.
    For each sampled transition the one-step target is
      r                      if the transition ended the episode
      r + gamma * V(s')[a]   otherwise
    The critic is regressed towards that target for the action taken (its
    other outputs keep their current predictions as targets, so they
    contribute no error). The actor is trained on the advantage
    `target - V(s)[a]` for the action taken, zero elsewhere. Epsilon is
    decayed after every training step until it reaches `epsilon_min`.
    """
    # first check if replay buffer contains enough data
    if self.memory.size < batch_size:
      return

    # sample a batch of data from the replay memory
    minibatch = self.memory.sample_batch(batch_size)
    states = minibatch['s']
    actions = minibatch['a']
    rewards = minibatch['r']
    next_states = minibatch['s2']
    done = minibatch['d']
    advantages = np.zeros((batch_size, self.action_size))

    values = self.critic_model.predict(states)
    next_values = self.critic_model.predict(next_states)

    for i in range(batch_size):
      action = actions[i]
      if done[i]:
        # nothing follows a terminal transition, so the target is the reward alone
        target = rewards[i]
      else:
        # bootstrap from the critic's estimate for the next state
        target = rewards[i] + self.gamma * next_values[i][action]
      # the advantage must be computed before values[i][action] is overwritten
      advantages[i][action] = target - values[i][action]
      values[i][action] = target

    # Run one training step
    self.actor_model.train_on_batch(states, advantages)
    self.critic_model.train_on_batch(states, values)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay


  def load(self, name):
    """Load both networks' weights.

    The files read are `actor_<basename>` and `critic_<basename>` in the
    directory of `name`.
    """
    dirname = os.path.dirname(name)
    filename = os.path.basename(name)

    self.actor_model.load_weights(os.path.join(dirname,'actor_' + filename))
    self.critic_model.load_weights(os.path.join(dirname,'critic_' + filename))


  def save(self, name):
    """Save both networks' weights.

    The files written are `actor_<basename>` and `critic_<basename>` in the
    directory of `name`.
    """
    dirname = os.path.dirname(name)
    filename = os.path.basename(name)

    self.actor_model.save_weights(os.path.join(dirname,'actor_' + filename))
    self.critic_model.save_weights(os.path.join(dirname,'critic_' + filename))

In [17]:
class MultiStockEnv:
  """
  Multi-stock trading environment driven by an array of historical prices.

  Observation: vector of size n_stock * 2 + 1, laid out as
    - number of shares owned of each stock
    - current price of each stock
    - cash in hand (can be used to purchase more stocks)
  Action: an integer in [0, 3**n_stock) that selects one decision per stock
    - 0 = sell
    - 1 = hold
    - 2 = buy
  With three stocks that is a 7-element observation and 27 actions.
  """
  def __init__(self, data, initial_investment=20000):
    # price history: one row per time step, one column per stock
    self.stock_price_history = data
    self.n_step, self.n_stock = self.stock_price_history.shape

    self.initial_investment = initial_investment

    # integer action ids, and the per-stock decision vector each id stands for:
    # [0,0,0], [0,0,1], [0,0,2], [0,1,0], [0,1,1], ...
    self.action_space = np.arange(3**self.n_stock)
    self.action_list = [list(combo) for combo in itertools.product((0, 1, 2), repeat=self.n_stock)]

    # size of the observation vector
    self.state_dim = 2 * self.n_stock + 1

    # per-episode state, filled in by reset()
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    self.reset()


  def reset(self):
    """Go back to the first time step with no holdings and the initial cash; return the observation."""
    self.cur_step = 0
    self.cash_in_hand = self.initial_investment
    self.stock_owned = np.zeros(self.n_stock)
    self.stock_price = self.stock_price_history[self.cur_step]
    return self._get_obs()


  def step(self, action):
    """Advance one time step, trade at the new prices, and return (obs, reward, done, info)."""
    assert action in self.action_space

    # portfolio value at the old prices, before anything changes
    value_before = self._get_val()

    # move to the next day's prices, then trade at those prices
    self.cur_step += 1
    self.stock_price = self.stock_price_history[self.cur_step]
    self._trade(action)

    value_after = self._get_val()

    # Gym-style return: the reward is the change in portfolio value, the episode
    # is done on the last row of data, and info carries the current value
    return (
      self._get_obs(),
      value_after - value_before,
      self.cur_step == self.n_step - 1,
      {'cur_val': value_after},
    )


  def _get_obs(self):
    """Assemble the observation: holdings, then prices, then cash."""
    n = self.n_stock
    obs = np.empty(self.state_dim)
    obs[-1] = self.cash_in_hand
    obs[n:2*n] = self.stock_price
    obs[:n] = self.stock_owned
    return obs


  def _get_val(self):
    """Portfolio value: holdings valued at current prices, plus cash."""
    return np.dot(self.stock_owned, self.stock_price) + self.cash_in_hand


  def _trade(self, action):
    """Apply the decision vector for `action`: all sells first, then buys."""
    # e.g. [2,1,0] means: buy first stock, hold second stock, sell third stock
    decisions = self.action_list[action]
    to_sell = [k for k, choice in enumerate(decisions) if choice == 0]
    to_buy = [k for k, choice in enumerate(decisions) if choice == 2]

    # NOTE: to simplify the problem, selling a stock sells ALL shares of it
    for k in to_sell:
      self.cash_in_hand += self.stock_price[k] * self.stock_owned[k]
      self.stock_owned[k] = 0

    if to_buy:
      # NOTE: buying cycles through the chosen stocks one share at a time.
      # The first pass in which any chosen stock cannot be afforded is the
      # last one, although that pass is still completed for the other stocks.
      keep_buying = True
      while keep_buying:
        for k in to_buy:
          if self.cash_in_hand > self.stock_price[k]:
            self.stock_owned[k] += 1 # buy one share
            self.cash_in_hand -= self.stock_price[k]
          else:
            keep_buying = False

In [18]:
# Run configuration.
initial_investment = 20000
num_episodes = 200
batch_size = 32

# 'train' or 'test'; set here directly instead of via a command-line argument.
mode = 'train'

# Output locations for saved model weights and per-episode portfolio values.
models_folder = os.path.join(basedir,'rl_trader_models')
rewards_folder = os.path.join(basedir,'rl_trader_rewards')

In [19]:
# make sure both output folders exist
for folder in (models_folder, rewards_folder):
  maybe_make_dir(folder)

# first half of the time series is for training, the remainder for testing
data = get_data()
n_timesteps, n_stocks = data.shape
n_train = n_timesteps // 2
train_data, test_data = data[:n_train], data[n_train:]

# training environment, the agent sized to it, and a state scaler fitted on it
env = MultiStockEnv(train_data, initial_investment)
state_size, action_size = env.state_dim, len(env.action_space)
agent = DQNAgent(state_size, action_size)
scaler = get_scaler(env)

# final portfolio value of each episode
portfolio_value = []

In [20]:
# Run num_episodes episodes, recording the portfolio value at the end of each one.
for e in range(num_episodes):
  started = datetime.now()
  val = play_one_episode(agent, env, mode)
  dt = datetime.now() - started
  print(f"episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
  portfolio_value.append(val)

if mode == 'train':
  # persist the network weights and the fitted state scaler
  agent.save(f'{models_folder}/dqn.h5')
  with open(f'{models_folder}/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

# per-episode end values, stored under the current mode's name
np.save(f'{rewards_folder}/{mode}.npy', portfolio_value)

In [21]:
# Switch the run mode from training to testing.
mode = 'test'

In [22]:
# final portfolio value of each evaluation episode
portfolio_pred = []
if mode == 'test':
  # restore the scaler saved during training
  # (pickle files must come from a trusted source: loading one can run arbitrary code)
  with open(f'{models_folder}/scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

  # load trained weights
  agent.load(f'{models_folder}/dqn.h5')

  # keep a little exploration so that episodes are not all identical;
  # with epsilon = 0 the run would be deterministic
  agent.epsilon = 0.01

  # evaluate on the held-out half of the data
  env = MultiStockEnv(test_data, initial_investment)

# Run num_episodes episodes, recording the portfolio value at the end of each one.
for e in range(num_episodes):
  started = datetime.now()
  val = play_one_episode(agent, env, mode)
  dt = datetime.now() - started
  print(f"episode: {e + 1}/{num_episodes}, episode end value: {val:.2f}, duration: {dt}")
  portfolio_pred.append(val)

In [23]:
# Compare end-of-episode portfolio values for the training and test runs.
fig, ax = plt.subplots()
for series, tag in ((portfolio_value, 'train'), (portfolio_pred, 'test')):
  ax.plot(series, label=tag)
ax.legend(loc='upper left')
plt.show()