In [1]:
# Define environment

In [5]:
import random
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import pandas as pd

class TradingSPYEnv(gym.Env):
    """
    SPY (S&P500) trading environment driven by a price-history CSV.

    The CSV must provide at least a 'Date' and a 'Close' column.

    Observation: a flat float32 vector made of the trailing window (at most
    ``max(sma_len)`` rows, ending at the current step) of these columns, in order:
      - State: the position taken at each step (see Action).
      - accumulated_profit: running sum of the per-step rewards.
      - daily_return: percentage change of the close price.
      - breakout_<n>: 1.0 when Close >= its n-day simple moving average, else 0.0.
    Windows that reach back before the first row with a valid moving average
    are shorter, so the very first observations of the data set are shorter too.

    Action: short (0), neutral (1), and long (2)
      - the whole portfolio value is always committed to the chosen position
      - long scales the portfolio by close[t+1] / close[t]
      - short scales the portfolio by close[t] / close[t+1]
      - neutral leaves the portfolio value unchanged
      - no transaction costs are applied

    Reward: change in portfolio value between the current and the next step.
    """

    def __init__(self, train_data_path='historySPY.csv', sma_len=None, init_invest=10000, learning_rate=0.0002, gamma=0.98,
                 normalize_price=True, mode='train', train_test_split=0.9):
        """
        Load the price history and build the feature table.

        Parameters:
          - train_data_path: CSV file with 'Date' and 'Close' columns.
          - sma_len: iterable of moving-average window lengths; defaults to [5].
          - init_invest: portfolio value assigned to the step an episode starts at.
          - learning_rate, gamma: accepted for backward compatibility; the
            environment itself never reads them.
          - normalize_price: when True, reset() rescales 'Close' so that the
            episode's first close price equals 1.
          - mode: 'train' runs from row max(sma_len) up to the split point,
            'test' runs from the split point up to the number of feature rows.
          - train_test_split: fraction of the feature rows used for training.

        Raises:
          - ValueError: if mode is neither 'train' nor 'test'.
        """
        # A None sentinel replaces the former mutable default; any iterable is accepted.
        sma_len = [5] if sma_len is None else list(sma_len)

        history = pd.read_csv(train_data_path, index_col=False, parse_dates=['Date'])
        # Clean the history *before* deriving features from it, so both tables
        # share the same 0..N-1 row labels even when the CSV contains NaN rows.
        history = history.dropna(axis=0).reset_index(drop=True)
        self.stock_price_history = history

        self.max_sma_len = max(sma_len)
        self.current_step = self.max_sma_len  # minimum number of steps
        self.iteration = 0  # number of steps taken in the current episode
        self.init_invest = init_invest
        self.accumulated_profit = 0.0
        self.normalize_price = normalize_price

        n_rows = history.shape[0]
        self.features = pd.DataFrame({
            'Date': history['Date'],
            'State': np.zeros(n_rows, dtype=int),
            'accumulated_profit': np.zeros(n_rows, dtype=float),
            'portfolio_value': np.zeros(n_rows, dtype=float),
            'Close': history['Close'],
        })
        self._set_sma(sma_len)
        self._set_breakout_sma(sma_len)
        self._set_daily_return()

        train_test_split_index = int(self.features.shape[0] * train_test_split)
        if mode == 'train':
            self.end_step = train_test_split_index
        elif mode == 'test':
            self.current_step = train_test_split_index
            self.end_step = self.features.shape[0]
        else:
            # Previously an unknown mode left end_step undefined and failed later.
            raise ValueError("mode must be 'train' or 'test', got %r" % (mode,))

        # Set up data and features
        self.reset(current_step=self.current_step)

        # action space
        # 0: short, 1: neutral, 2: long
        self.action_space = spaces.Discrete(3)

        # observation space
        # NOTE(review): this declaration is kept as-is for compatibility, but it
        # does not describe what _get_observation() returns (a flattened window
        # over several columns whose values can be negative) -- confirm before
        # relying on it for sampling or scaling.
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(self.features.columns.shape[0] - 1,), dtype=np.float16)

    def _set_sma(self, sma_len):
        """Add a 'Close_<n>' simple-moving-average column for every n in sma_len.

        The column is added to both the price history and the feature table;
        feature rows without a complete average are then dropped (row labels
        are kept, so the feature index no longer starts at 0).
        """
        close = self.stock_price_history['Close']
        for window in sma_len:
            col_name = 'Close_' + str(window)
            rolling_mean = close.rolling(window).mean()
            self.stock_price_history[col_name] = rolling_mean
            self.features[col_name] = rolling_mean
        # Explicit copy so that later in-place writes target a frame we own.
        self.features = self.features.dropna(axis=0).copy()

    def _set_daily_return(self):
        """Add the close-to-close percentage change as 'daily_return'."""
        self.features['daily_return'] = self.stock_price_history['Close'].pct_change()

    def _set_breakout_sma(self, sma_len):
        """Replace each 'Close_<n>' feature by a boolean 'breakout_<n>' flag.

        The flag is True where Close is at or above its n-day moving average.
        """
        features = self.features
        for window in sma_len:
            sma_name = 'Close_' + str(window)
            features['breakout_' + str(window)] = features['Close'] >= features[sma_name]
            features.drop(columns=[sma_name], inplace=True)

    def _get_observation(self):
        """Return the flattened float32 feature window ending at current_step."""
        first = self.current_step - self.max_sma_len + 1
        # Label-based slice: both ends are inclusive.
        window = self.features.loc[first:self.current_step]
        columns = ['State', 'accumulated_profit', 'daily_return']
        # breakout signals
        columns.extend(col for col in window.columns if 'breakout_' in col)
        return np.array([window[col].to_numpy('float32') for col in columns]).flatten()

    def reset(self, current_step=None):
        """
        Start a new episode and return its first observation.

        Parameters:
          - current_step: row label to start from; when None a random start
            row is drawn with the ``random`` module.

        Raises:
          - KeyError: if the start row is not a row of the feature table.
        """
        self.iteration = 0
        self.accumulated_profit = 0.0
        self.features['State'] = 1  # State:1 means market neutral
        self.features['portfolio_value'] = 0.0
        self.features['accumulated_profit'] = 0.0

        if current_step is not None:
            self.current_step = current_step
        else:
            # Keep the random start strictly before end_step: starting on
            # end_step itself would make the episode impossible to terminate.
            last_start = min(int(self.features.shape[0] * 0.9), self.end_step - 1)
            self.current_step = random.randint(self.max_sma_len, last_start)

        # Writing through .loc with an unknown label would silently append a
        # row, so reject such a start explicitly.
        if self.current_step not in self.features.index:
            raise KeyError(self.current_step)

        # Single .loc[row, column] writes replace the former chained
        # assignments, which pandas does not guarantee to reach the frame.
        self.features.loc[self.current_step, 'portfolio_value'] = self.init_invest

        if self.normalize_price:
            price = self.stock_price_history.loc[self.current_step, 'Close']
            rows = slice(self.current_step, self.end_step)
            for col in self.features.columns:
                # Substring match: any feature column whose name contains 'Close'.
                if 'Close' in col:
                    self.features.loc[rows, col] = self.stock_price_history.loc[rows, col] / price

        return self._get_observation()

    def step(self, action):
        """
        Apply the action over one time step.

        Parameters:
          - action: 0 (short), 1 (neutral) or 2 (long).

        Returns a tuple (observation, reward, done, info):
          - observation: next observation, or None when the episode is done.
          - reward: change in portfolio value caused by this step.
          - done: True once the next step reaches end_step.
          - info: empty dict while running; on the final step it holds
            'profit_iteration' (accumulated profit per step taken),
            'iterations' (steps taken this episode) and 'long_return'
            (close at the end divided by close at the episode start).

        Raises:
          - TypeError: if action is not 0, 1 or 2.
          - KeyError: if the next step is not a row of the feature table.
        """
        features = self.features
        now = self.current_step
        next_step = now + 1

        # Read everything before writing: a missing next row raises KeyError
        # here instead of being silently appended by a later .loc write.
        close_now = features.at[now, 'Close']
        close_next = features.at[next_step, 'Close']
        value_now = features.at[now, 'portfolio_value']

        # compute portfolio value at next step
        if action == 0:  # shorting
            value_next = value_now * close_now / close_next
        elif action == 1:  # market-neutral position (100% cash)
            value_next = value_now
        elif action == 2:  # longing
            value_next = value_now * close_next / close_now
        else:
            raise TypeError("Action is out of the space")

        # reward after taking action: difference in portfolio value
        r_t = value_next - value_now

        features.loc[now, 'State'] = action
        features.loc[next_step, 'State'] = action
        features.loc[next_step, 'portfolio_value'] = value_next
        features.loc[next_step, 'accumulated_profit'] = features.at[now, 'accumulated_profit'] + r_t

        # Count this step (also the final one) and mirror the running profit
        # on the instance, so the episode summary below is meaningful.
        self.iteration += 1
        self.accumulated_profit = float(features.at[next_step, 'accumulated_profit'])

        if next_step >= self.end_step:
            # At the end, we have nothing to do
            episode_start = next_step - self.iteration
            long_return = features.at[next_step, 'Close'] / features.at[episode_start, 'Close']
            info = {'profit_iteration': self.accumulated_profit / self.iteration,
                    'iterations': self.iteration,
                    'long_return': long_return}
            return None, r_t, True, info

        self.current_step = next_step
        s_prime = self._get_observation()  # state at t+1

        # An empty dict (rather than None) keeps info usable as a mapping.
        return s_prime, r_t, False, {}


In [9]:
# Build the environment with all defaults; the constructor reads
# 'historySPY.csv' from the working directory and calls reset() itself.
env = TradingSPYEnv()
# Scratch call whose value is shown as the cell output; presumably a quick
# check of random.randint (both bounds are inclusive).
random.randint(0,5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


4

In [10]:
# Collect the per-step rewards of a short manual roll-out.
ret=[]
# Disabled variant: walk the whole episode up to env.end_step.
#for i in range(env.current_step,env.end_step):
# The range bounds are evaluated once, so this runs exactly 10 times even
# though env.current_step advances inside the loop; `i` itself is unused.
for i in range(env.current_step,env.current_step+10):
#    print(env.features.portfolio_value.loc[env.current_step])
    # Action 2 = long. The `done` flag and `info` are discarded here, so the
    # loop does not notice the end of an episode.
    s_prime,r_t,_,_ = env.step(2)
    print(s_prime, r_t)
    ret.append(r_t)
    

[ 1.0000000e+00  2.0000000e+00  2.0000000e+00  0.0000000e+00
  0.0000000e+00 -8.1662956e+01 -6.9870064e-03 -2.3453895e-03
 -8.1662955e-03  1.0000000e+00  0.0000000e+00  0.0000000e+00] -81.66295471417834
[ 1.0000000e+00  2.0000000e+00  2.0000000e+00  2.0000000e+00
  0.0000000e+00  0.0000000e+00 -8.1662956e+01 -3.8356842e+01
 -6.9870064e-03 -2.3453895e-03 -8.1662955e-03  4.3662675e-03
  1.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00] 43.30611234842945
[ 1.0000000e+00  2.0000000e+00  2.0000000e+00  2.0000000e+00
  2.0000000e+00  0.0000000e+00  0.0000000e+00 -8.1662956e+01
 -3.8356842e+01 -1.3610492e+02 -6.9870064e-03 -2.3453895e-03
 -8.1662955e-03  4.3662675e-03 -9.8124454e-03  1.0000000e+00
  0.0000000e+00  0.0000000e+00  0.0000000e+00  0.0000000e+00] -97.74808215788289
[ 2.0000000e+00  2.0000000e+00  2.0000000e+00  2.0000000e+00
  2.0000000e+00  0.0000000e+00 -8.1662956e+01 -3.8356842e+01
 -1.3610492e+02 -1.9797079e+02 -2.3453895e-03 -8.1662955e-03
  4.3662675e-03 -9.8124454

In [14]:
s_prime

array([ 2.0000000e+00,  2.0000000e+00,  2.0000000e+00,  2.0000000e+00,
        2.0000000e+00, -3.1799060e+02, -2.4498886e+02, -2.8953229e+02,
       -3.5634744e+02, -2.8210840e+02, -2.0282960e-02,  7.5399359e-03,
       -4.5662099e-03, -6.8807341e-03,  7.6982295e-03,  0.0000000e+00,
        0.0000000e+00,  0.0000000e+00,  0.0000000e+00,  1.0000000e+00],
      dtype=float32)

In [48]:
# Display the raw OHLC prices for the rows from (current_step - max_sma_len)
# through current_step; .loc label slices include both end points.
env.stock_price_history.loc[env.current_step - env.max_sma_len:env.current_step][['Open','High','Low','Close']]

Unnamed: 0,Open,High,Low,Close
35,76.25,76.73,75.09,75.12
36,75.15,76.25,74.82,76.05
37,76.12,77.55,76.09,77.3
38,77.41,77.71,76.69,77.14
39,77.65,78.28,76.75,77.44
40,77.56,78.2,77.01,77.09


In [9]:
env.current_step

10

In [10]:
env.features.loc[env.current_step - env.max_sma_len:env.current_step]

Unnamed: 0,Date,State,accumulated_profit,portfolio_value,Close,Close_5
5,2002-01-08,2,0.0,10000.0,1.0,1.001188
6,2002-01-09,2,-81.662955,9918.337045,0.991834,1.001262
7,2002-01-10,2,-38.356842,9961.643158,0.996164,0.999951
8,2002-01-11,2,-136.104925,9863.895075,0.98639,0.995348
9,2002-01-14,2,-197.970799,9802.029201,0.980203,0.990918
10,2002-01-15,2,-117.545162,9882.454838,0.988245,0.988567


In [11]:
env.features.loc[env.current_step-5:env.current_step]

Unnamed: 0,Date,State,accumulated_profit,portfolio_value,Close,Close_5
5,2002-01-08,2,0.0,10000.0,1.0,1.001188
6,2002-01-09,2,-81.662955,9918.337045,0.991834,1.001262
7,2002-01-10,2,-38.356842,9961.643158,0.996164,0.999951
8,2002-01-11,2,-136.104925,9863.895075,0.98639,0.995348
9,2002-01-14,2,-197.970799,9802.029201,0.980203,0.990918
10,2002-01-15,2,-117.545162,9882.454838,0.988245,0.988567


In [12]:
9802.029201 - -197.970799

10000.0

In [13]:
# NOTE(review): `tmp` is not defined in any visible cell; presumably an
# environment instance created in an earlier session -- verify before re-running.
# Start a new episode at a randomly chosen step and show where it begins.
s = tmp.reset()
print(tmp.current_step)
print(s)
# Take one step with action 2 and inspect the returned transition.
s_prime, r_t, done, info = tmp.step(2)
print(s_prime)
print(r_t)
print(done)

2025
[[91.93 92.52 91.74 92.48]
 [92.89 92.93 92.21 92.61]
 [91.99 92.19 91.39 91.74]
 [91.98 92.78 91.51 92.52]
 [92.41 92.94 92.36 92.77]
 [92.61 92.69 91.37 91.73]]
step  2025  daily profit 124.27777172135757
[[92.89 92.93 92.21 92.61]
 [91.99 92.19 91.39 91.74]
 [91.98 92.78 91.51 92.52]
 [92.41 92.94 92.36 92.77]
 [92.61 92.69 91.37 91.73]
 [91.71 92.93 91.69 92.87]]
124.27777172135757
False


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [14]:
s = tmp.reset()

In [15]:
tmp.step(2)

step  1621  daily profit -147.28240212846868


(array([[106.6 , 107.73, 106.42, 106.85],
        [107.28, 109.07, 107.08, 108.99],
        [108.04, 108.23, 105.46, 105.51],
        [105.95, 106.45, 104.83, 105.77],
        [105.03, 106.14, 104.78, 105.24],
        [105.26, 105.49, 103.68, 103.69]]),
 -147.28240212846868,
 False,
 None)

In [16]:
tmp.accumulated_profit

0.0

In [17]:
# Substring test on plain strings: shows that a `'Close' in col` filter also
# matches derived column names such as 'Close_5', not only 'Close' itself.
'Close' in 'Close_5'

True

In [18]:
"""
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import itertools


class TradingEnv(gym.Env):
  A 3-stock (MSFT, IBM, QCOM) trading environment.

  State: [# of stock owned, current stock prices, cash in hand]
    - array of length n_stock * 2 + 1
    - price is discretized (to integer) to reduce state space
    - use close price for each stock
    - cash in hand is evaluated at each step based on action performed

  Action: sell (0), hold (1), and buy (2)
    - when selling, sell all the shares
    - when buying, buy as many as cash in hand allows
    - if buying multiple stock, equally distribute cash in hand and then utilize the balance

def __init__(self, train_data, init_invest=20000):
    # data
    self.stock_price_history = np.around(train_data) # round up to integer to reduce state space
    self.n_stock, self.n_step = self.stock_price_history.shape

    # instance attributes
    self.init_invest = init_invest
    self.cur_step = None
    self.stock_owned = None
    self.stock_price = None
    self.cash_in_hand = None

    # action space
    self.action_space = spaces.Discrete(3**self.n_stock)

    # observation space: give estimates in order to sample and build scaler
    stock_max_price = self.stock_price_history.max(axis=1)
    stock_range = [[0, init_invest * 2 // mx] for mx in stock_max_price]
    price_range = [[0, mx] for mx in stock_max_price]
    cash_in_hand_range = [[0, init_invest * 2]]
    self.observation_space = spaces.MultiDiscrete(stock_range + price_range + cash_in_hand_range)

    # seed and start
    self._seed()
    self._reset()


  def _seed(self, seed=None):
    self.np_random, seed = seeding.np_random(seed)
    return [seed]


  def _reset(self):
    self.cur_step = 0
    self.stock_owned = [0] * self.n_stock
    self.stock_price = self.stock_price_history[:, self.cur_step]
    self.cash_in_hand = self.init_invest
    return self._get_obs()


  def _step(self, action):
    assert self.action_space.contains(action)
    prev_val = self._get_val()
    self.cur_step += 1
    self.stock_price = self.stock_price_history[:, self.cur_step] # update price
    self._trade(action)
    cur_val = self._get_val()
    reward = cur_val - prev_val
    done = self.cur_step == self.n_step - 1
    info = {'cur_val': cur_val}
    return self._get_obs(), reward, done, info


  def _get_obs(self):
    obs = []
    obs.extend(self.stock_owned)
    obs.extend(list(self.stock_price))
    obs.append(self.cash_in_hand)
    return obs


  def _get_val(self):
    return np.sum(self.stock_owned * self.stock_price) + self.cash_in_hand


  def _trade(self, action):
    # all combo to sell(0), hold(1), or buy(2) stocks
    action_combo = map(list, itertools.product([0, 1, 2], repeat=self.n_stock))
    action_vec = action_combo[action]

    # one pass to get sell/buy index
    sell_index = []
    buy_index = []
    for i, a in enumerate(action_vec):
      if a == 0:
        sell_index.append(i)
      elif a == 2:
        buy_index.append(i)

    # two passes: sell first, then buy; might be naive in real-world settings
    if sell_index:
      for i in sell_index:
        self.cash_in_hand += self.stock_price[i] * self.stock_owned[i]
        self.stock_owned[i] = 0
    if buy_index:
      can_buy = True
      while can_buy:
        for i in buy_index:
          if self.cash_in_hand > self.stock_price[i]:
            self.stock_owned[i] += 1 # buy one share
            self.cash_in_hand -= self.stock_price[i]
          else:
            can_buy = False

"""

"\nimport gym\nfrom gym import spaces\nfrom gym.utils import seeding\nimport numpy as np\nimport itertools\n\n\nclass TradingEnv(gym.Env):\n  A 3-stock (MSFT, IBM, QCOM) trading environment.\n\n  State: [# of stock owned, current stock prices, cash in hand]\n    - array of length n_stock * 2 + 1\n    - price is discretized (to integer) to reduce state space\n    - use close price for each stock\n    - cash in hand is evaluated at each step based on action performed\n\n  Action: sell (0), hold (1), and buy (2)\n    - when selling, sell all the shares\n    - when buying, buy as many as cash in hand allows\n    - if buying multiple stock, equally distribute cash in hand and then utilize the balance\n\ndef __init__(self, train_data, init_invest=20000):\n    # data\n    self.stock_price_history = np.around(train_data) # round up to integer to reduce state space\n    self.n_stock, self.n_step = self.stock_price_history.shape\n\n    # instance attributes\n    self.init_invest = init_invest\n 