<a href="https://colab.research.google.com/github/VVasanth/RL_StockTrader-TFAgents/blob/main/TFAgents_StockTrader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install "gym>=0.21.0"
!pip install tf-agents

!sudo apt-get update
!sudo apt-get install -y xvfb ffmpeg freeglut3-dev
!pip install 'imageio==2.4.0'
!pip install pyvirtualdisplay
!pip install tf-agents[reverb]
!pip install pyglet

Collecting gym>=0.21.0
  Downloading gym-0.23.1.tar.gz (626 kB)
[K     |████████████████████████████████| 626 kB 4.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting gym-notices>=0.0.4
  Downloading gym_notices-0.0.6-py3-none-any.whl (2.7 kB)
Building wheels for collected packages: gym
  Building wheel for gym (PEP 517) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.23.1-py3-none-any.whl size=701375 sha256=8c39386103edb5627803fe6b78e26daa56e6c5039f5f3af62be2db9caee3e0c1
  Stored in directory: /root/.cache/pip/wheels/e3/33/04/6723848e46f0f1ebe794bb329b7c761c3329a0d7ffade99da7
Successfully built gym
Installing collected packages: gym-notices, gym
  Attempting uninstall: gym
    Found existing installation: gym 0.17.3
    Uninstalling gym-0.17.3:
      Successfully uninstalled gym-0.17.3
Successfully installed gym-0.23.1 gym-notices-0.0.6
C

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import itertools

from tf_agents.environments import py_environment
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts
import numpy as np

class StockTraderEnv(py_environment.PyEnvironment):

    def __init__(self, data, initInvestment=20000):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=26, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(7,), dtype=np.float64, name='observation')
        self.stock_price_history = data
        self.init_investment = initInvestment
        self.state = 0
        self.episode_ended = False
        self.stock_owned = None
        self.stock_price = None
        self.cash_in_hand = None
        self.n_stock = 3
        self.n_step = self.stock_price_history.shape[0]
        self.action_list = list(map(list, itertools.product([0, 1, 2], repeat=self.n_stock)))
#[[0, 0, 0], [0, 0, 1], [0, 0, 2], [0, 1, 0], [0, 1, 1], [0, 1, 2], [0, 2, 0], [0, 2, 1], [0, 2, 2], [1, 0, 0], [1, 0, 1], [1, 0, 2], [1, 1, 0],
#[1, 1, 1], [1, 1, 2], [1, 2, 0], [1, 2, 1], [1, 2, 2], [2, 0, 0], [2, 0, 1], [2, 0, 2], [2, 1, 0], [2, 1, 1], [2, 1, 2], [2, 2, 0], [2, 2, 1], [2, 2, 2]]
        self.action_space = np.arange(3 ** self.n_stock)
        ###[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26]##np.ndarray
        self.scaler = None

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def set_scaler(self, scaler):
        self.scaler = scaler

    def get_obs(self):
        obs = np.empty(7)
        obs[:self.n_stock] = self.stock_owned
        obs[self.n_stock:2 * self.n_stock] = self.stock_price
        obs[-1] = self.cash_in_hand
        if self.scaler != None:
            obs = self.scaler.transform([obs]).flatten()
        return obs

    def get_val(self):
        return self.stock_owned.dot(self.stock_price) + self.cash_in_hand

    def _reset(self):
        self.state = 0
        self.episode_ended = False
        self.stock_owned = np.zeros(self.n_stock)
        self.stock_price = self.stock_price_history[self.state]
        self.cash_in_hand = self.init_investment
        return ts.restart(self.get_obs())

    def _step(self, action):

        if self.episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self._reset()

        # get current value before performing the action
        prev_val = self.get_val()

        self.stock_price = self.stock_price_history[self.state]

        # update price, i.e. go to the next day
        self.state += 1

        # perform the trade
        self.trade(action)

        # get the new value after taking the action
        cur_val = self.get_val()

        # reward is the increase in porfolio value
        reward = cur_val - prev_val

        # done if we have run out of data
        done = self.state == self.n_step - 1

        # store the current value of the portfolio here
        info = {'cur_val': self.state}

        # conform to the Gym API
        # return self._get_obs(), reward, done, info
        if (done != True):
            return ts.transition(self.get_obs(), reward=reward, discount=1.0)
        else:
            self.episode_ended = True
            return ts.termination(self.get_obs(), reward)

    def trade(self, action):
        # index the action we want to perform
        # 0 = sell
        # 1 = hold
        # 2 = buy
        # e.g. [2,1,0] means:
        # buy first stock
        # hold second stock
        # sell third stock
        action_vec = self.action_list[action]

        # determine which stocks to buy or sell
        sell_index = []  # stores index of stocks we want to sell
        buy_index = []  # stores index of stocks we want to buy
        for i, a in enumerate(action_vec):
            if a == 0:
                sell_index.append(i)
            elif a == 2:
                buy_index.append(i)

        # sell any stocks we want to sell
        # then buy any stocks we want to buy
        if sell_index:
            # NOTE: to simplify the problem, when we sell, we will sell ALL shares of that stock
            for i in sell_index:
                self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
                self.stock_owned[i] = 0
        if buy_index:
            # NOTE: when buying, we will loop through each stock we want to buy,
            #       and buy one share at a time until we run out of cash
            can_buy = True
            while can_buy:
                for i in buy_index:
                    if self.cash_in_hand > self.stock_price[i]:
                        self.stock_owned[i] += 1  # buy one share
                        self.cash_in_hand -= self.stock_price[i]
                    else:
                        can_buy = False

In [29]:
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd

from tf_agents.trajectories import trajectory

def get_data():
    # returns a T x 3 list of stock prices
    # each row is a different stock
    # 0 = AAPL
    # 1 = MSI
    # 2 = SBUX
    df = pd.read_csv('/content/drive/My Drive/ReinforcementLearning/stockprices.csv')
    return df.values


def get_scaler(env):
    # return scikit-learn scaler object to scale the states
    # Note: you could also populate the replay buffer here

    states = []
    for _ in range(env.n_step):
        action = np.random.choice(env.action_space)
        timestep = env._step(action)
        states.append(timestep[3])

    scaler = StandardScaler()
    scaler.fit(states)
    return scaler


def collect_step(environment, policy):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    traj = trajectory.from_transition(time_step, action_step, next_time_step)
    print(traj)
    # Add trajectory to the replay buffer
    #replay_buffer.add_batch(traj)


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
data = get_data()
data

array([[ 67.8542,  60.3   ,  28.185 ],
       [ 68.5614,  60.9   ,  28.07  ],
       [ 66.8428,  60.83  ,  28.13  ],
       ...,
       [156.49  , 101.06  ,  54.69  ],
       [163.03  , 102.76  ,  55.61  ],
       [159.54  , 102.63  ,  54.46  ]])

In [7]:
scaler_env = StockTraderEnv(data)
scaler_env._reset()
scaler = get_scaler(scaler_env)
scaler_env.set_scaler(scaler)

In [8]:
train_qt_env = StockTraderEnv(data)
train_qt_env._reset()
train_qt_env.set_scaler(scaler)
train_qt_env._reset()
from tf_agents.environments import utils
utils.validate_py_environment(train_qt_env, episodes=5)

In [9]:
eval_qt_env = StockTraderEnv(data)
eval_qt_env._reset()
eval_qt_env.set_scaler(scaler)

In [10]:
train_qt_env._reset()

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([-0.65187811, -0.64139681, -0.66663252, -1.34735989, -0.94341068,
       -1.93681045,  1.81764853]),
 'reward': array(0., dtype=float32),
 'step_type': array(0, dtype=int32)})

In [11]:
o1 = eval_qt_env._reset()
total_reward = 0.0
while True:
    #action = eval_qt_env.action_space.sample()
    action = 2
    ts_1 = eval_qt_env._step(action)
    print(ts_1)
    break

TimeStep(
{'discount': array(1., dtype=float32),
 'observation': array([-0.65187811, -0.64139681,  2.49511586, -1.34735989, -0.94341068,
       -1.93681045, -0.45608382]),
 'reward': array(-6.91216e-11, dtype=float32),
 'step_type': array(1, dtype=int32)})


In [12]:

train_qt_env.time_step_spec()

TimeStep(
{'discount': BoundedArraySpec(shape=(), dtype=dtype('float32'), name='discount', minimum=0.0, maximum=1.0),
 'observation': BoundedArraySpec(shape=(7,), dtype=dtype('float64'), name='observation', minimum=-1.7976931348623157e+308, maximum=1.7976931348623157e+308),
 'reward': ArraySpec(shape=(), dtype=dtype('float32'), name='reward'),
 'step_type': ArraySpec(shape=(), dtype=dtype('int32'), name='step_type')})

In [13]:
from tf_agents.environments import tf_py_environment

#train_py_env = suite_gym.wrap_env(train_qt_env)
train_env = tf_py_environment.TFPyEnvironment(train_qt_env)

#eval_py_env = suite_gym.wrap_env(eval_qt_env)
eval_env = tf_py_environment.TFPyEnvironment(eval_qt_env)

In [14]:
train_env.action_spec()

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(26, dtype=int32))

In [15]:
train_env.time_step_spec()

TimeStep(
{'discount': BoundedTensorSpec(shape=(), dtype=tf.float32, name='discount', minimum=array(0., dtype=float32), maximum=array(1., dtype=float32)),
 'observation': BoundedTensorSpec(shape=(7,), dtype=tf.float64, name='observation', minimum=array(-1.79769313e+308), maximum=array(1.79769313e+308)),
 'reward': TensorSpec(shape=(), dtype=tf.float32, name='reward'),
 'step_type': TensorSpec(shape=(), dtype=tf.int32, name='step_type')})

In [16]:
from tf_agents.policies import random_tf_policy
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())

In [17]:
time_step = train_env._reset()

In [18]:
random_policy.action_spec

BoundedTensorSpec(shape=(), dtype=tf.int32, name='action', minimum=array(0, dtype=int32), maximum=array(26, dtype=int32))

In [19]:
action_step = random_policy.action(time_step)

In [20]:
action_step

PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([22], dtype=int32)>, state=(), info=())

In [21]:
learning_rate = 1e-3 
num_eval_episodes = 10
replay_buffer_max_length = 100000

In [22]:
import tensorflow as tf
from tf_agents.specs import tensor_spec
from tf_agents.utils import common
from tf_agents.networks import sequential

fc_layer_params = (100, 50)
action_tensor_spec = tensor_spec.from_spec(train_env.action_spec())
num_actions = action_tensor_spec.maximum - action_tensor_spec.minimum + 1

# Define a helper function to create Dense layers configured with the right
# activation and kernel initializer.
def dense_layer(num_units):
  return tf.keras.layers.Dense(
      num_units,
      activation=tf.keras.activations.relu,
      kernel_initializer=tf.keras.initializers.VarianceScaling(
          scale=2.0, mode='fan_in', distribution='truncated_normal'))

# QNetwork consists of a sequence of Dense layers followed by a dense layer
# with `num_actions` units to generate one q_value per available action as
# its output.
dense_layers = [dense_layer(num_units) for num_units in fc_layer_params]
q_values_layer = tf.keras.layers.Dense(
    num_actions,
    activation=None,
    kernel_initializer=tf.keras.initializers.RandomUniform(
        minval=-0.03, maxval=0.03),
    bias_initializer=tf.keras.initializers.Constant(-0.2))
q_net = sequential.Sequential(dense_layers + [q_values_layer])

In [23]:
from tf_agents.agents.dqn import dqn_agent

optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

train_step_counter = tf.Variable(0)

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=optimizer,
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=train_step_counter)

agent.initialize()

In [24]:
eval_policy = agent.policy
collect_policy = agent.collect_policy

In [25]:
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())

In [26]:
time_step = train_env._reset()

In [27]:
random_policy.action(time_step)

PolicyStep(action=<tf.Tensor: shape=(1,), dtype=int32, numpy=array([11], dtype=int32)>, state=(), info=())

In [28]:
def compute_avg_return(environment, policy, num_episodes=5):

  total_return = 0.0
  for _ in range(num_episodes):

    time_step = environment._reset()
    episode_return = 0.0

    while not time_step.is_last():
      action_step = policy.action(time_step)
      time_step = environment._step(action_step.action)
      episode_return += time_step.reward
    total_return += episode_return

  avg_return = total_return / num_episodes
  return avg_return.numpy()[0]

In [30]:
compute_avg_return(eval_env, random_policy, num_episodes=1)

7861.42

In [31]:
from tf_agents.replay_buffers import TFUniformReplayBuffer
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver

In [32]:
replay_buffer = TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=100_000)

In [33]:
train_env._reset()

init_driver = DynamicStepDriver(
    train_env,
    random_policy,
    observers=[replay_buffer.add_batch],
    num_steps=2_500)
final_time_step, final_policy_state = init_driver.run()

In [34]:
replay_buffer.gather_all()

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=True)` instead.


Trajectory(
{'action': <tf.Tensor: shape=(1, 2501), dtype=int32, numpy=array([[23,  4, 18, ..., 10, 24,  8]], dtype=int32)>,
 'discount': <tf.Tensor: shape=(1, 2501), dtype=float32, numpy=array([[1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>,
 'next_step_type': <tf.Tensor: shape=(1, 2501), dtype=int32, numpy=array([[1, 1, 1, ..., 1, 1, 1]], dtype=int32)>,
 'observation': <tf.Tensor: shape=(1, 2501, 7), dtype=float64, numpy=
array([[[-0.65187811, -0.64139681, -0.66663252, ..., -0.94341068,
         -1.93681045,  1.81764853],
        [ 1.47538518, -0.64139681,  0.26093259, ..., -0.94341068,
         -1.93681045, -0.45528605],
        [-0.65187811, -0.64139681,  0.26093259, ..., -0.88720107,
         -1.94786619,  1.16733864],
        ...,
        [-0.65187811, -0.64139681,  1.24647052, ...,  2.03757235,
          1.04295324, -0.45276851],
        [-0.65187811, -0.64139681,  1.24647052, ...,  2.06192985,
          1.10448086, -0.45276851],
        [ 0.32993572,  0.03156486, -0.66663252, 

In [35]:
trajectories, buffer_info = replay_buffer.get_next(sample_batch_size=2, num_steps=3)

Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


In [36]:
dataset = replay_buffer.as_dataset(
    num_parallel_calls=3,
    sample_batch_size=64,
    num_steps=2).prefetch(3)

In [37]:
dataset

<PrefetchDataset element_spec=(Trajectory(
{'action': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None),
 'discount': TensorSpec(shape=(64, 2), dtype=tf.float32, name=None),
 'next_step_type': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None),
 'observation': TensorSpec(shape=(64, 2, 7), dtype=tf.float64, name=None),
 'policy_info': (),
 'reward': TensorSpec(shape=(64, 2), dtype=tf.float32, name=None),
 'step_type': TensorSpec(shape=(64, 2), dtype=tf.int32, name=None)}), BufferInfo(ids=TensorSpec(shape=(64, 2), dtype=tf.int64, name=None), probabilities=TensorSpec(shape=(64,), dtype=tf.float32, name=None)))>

In [38]:
iterator = iter(dataset)

In [39]:
num_iterations = 100_000   # less intelligence, more persistance; 24x7 player
save_interval = 50_000
eval_interval = 5_000
log_interval = 5_000

In [40]:
# Create a driver to collect experience.
collect_driver = DynamicStepDriver(
    train_env,
    agent.collect_policy,
    observers=[replay_buffer.add_batch],
    num_steps=4) # collect 4 steps for each training iteration

In [41]:
# (Optional) Optimize by wrapping some of the code in a graph using TF function.
collect_driver.run = common.function(collect_driver.run)
agent.train = common.function(agent.train)

# Reset the train step.
agent.train_step_counter.assign(0)

# Evaluate the agent's policy once before training.
avg_return = compute_avg_return(train_env, agent.policy, num_episodes=1)
returns = np.array([avg_return])

# Reset the environment.
time_step = None
policy_state = agent.collect_policy.get_initial_state(train_env.batch_size)

In [42]:
returns

array([11388.7], dtype=float32)

In [43]:
num_iterations = 100_000
while True:
    # Collect a few steps using collect_policy and save to the replay buffer.
    time_step, policy_state = collect_driver.run(time_step, policy_state)

    # Sample a batch of data from the buffer and update the agent's network.
    experience, unused_info = next(iterator)
    train_loss = agent.train(experience).loss

    step = agent.train_step_counter.numpy()
    print(f'\r step {step}', end='')

    if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))

    if step % eval_interval == 0:
        avg_return = compute_avg_return(train_env, agent.policy, num_episodes=1)
        print('step = {0}: Average Return = {1}'.format(step, avg_return))
        returns = np.append(returns, avg_return)

    # if step % save_interval == 0:
    #     save_checkpoint_to_local()

    if step > num_iterations:
        break

Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))
 step 5000step = 5000: loss = 1529528320.0
step = 5000: Average Return = 14054.251953125
 step 10000step = 10000: loss = 4072367104.0
step = 10000: Average Return = 27981.68359375
 step 15000step = 15000: loss = 9688755200.0
step = 15000: Average Return = 27981.68359375
 step 20000step = 20000: loss = 27174162432.0
step = 20000: Average Return = 27981.68359375
 step 25000step = 25000: loss = 14264497152.0
step = 25000: Average Return = 0.0
 step 30000step = 30000: loss = 36312207360.0
step = 30000: Average Return = 19444.35546875
 step 35000step = 35000: loss = 48216670208.0
step = 35000: Average Return = 19444.35546875
 step 40000step = 40000: loss = 80179060736.0
step = 40000: Average Return = 21100.3671875
 step 45000step = 45000: loss = 70079340544.0
s

In [44]:
returns

array([11388.7  , 14054.252, 27981.684, 27981.684, 27981.684,     0.   ,
       19444.355, 19444.355, 21100.367, 19444.355, 14054.252, 14054.252,
           0.   , 27981.684, 27981.684,     0.   ,     0.   ,     0.   ,
           0.   ,     0.   , 21471.178], dtype=float32)