In [1]:
import os
import time
import numpy as np
#import matplotlib.pyplot as plt
from collections import deque
import gym
import pandas as pd
from pathlib import Path
import sys
import random
#from joblib import dump,load
import datetime
import tensorflow as tf

In [79]:
class PortfolioBuffer():
    def __init__(self,assets_names_list,assets_data_list,window):
        #self.names = {0:'CASH'}
        self.names = {}
        for index,value in enumerate(assets_data_list):
            self.names[index] = value
        self.shape = assets_data_list[0].shape
        for i in assets_data_list:
            if self.shape != i.shape:
                raise Exception('Data must be of the same size')
        if len(assets_data_list) != len(assets_names_list):
            raise Exception('The length of assets_names_list is different than the amount of assets in assets_data_list')
        self.data = assets_data_list
        self.shape = self.data.shape
        self.pointer = window
        self.window = window
        self.batch_cache = None
        self.length = self.shape[1]
    
    def get_batch(self):
        if self.batch_cache is None:
            batch = np.zeros(shape=(self.shape[0],self.window,self.shape[2]))
            for index,data in enumerate(self.data):
                batch[index] = data[self.pointer-self.window:self.pointer]/data[self.pointer-1][0]
            self.batch_cache = batch
        return self.batch_cache
    
    def get_next_batch(self):
        self.pointer += 1
        self.batch_cache = None
        return self.get_batch()
    
    def get_current_price(self,index):
        return self.data[index][self.pointer-1][0]
    
    def reset(self,position=None):
        if not position:
            self.pointer = random.randrange(self.window,self.length-self.window)
        else:
            self.pointer = self.window
        self.batch_cache = None
    
    
        

In [80]:
class PortfolioEnvironment(gym.Env):
    def __init__(self,assets_names_list,assets_data_list,fee,initial_capital=100000,look_back_window=50,max_steps=200):
        super(PortfolioEnvironment,self).__init__()
        '''
        assets_names_list: list with the ticker of each security
        assets_data_list: list of pandas dataframes with the data of each security, must have the same length as assets_names_list and the first column of each dataframe must have the price of the asset
        fee: porcentage of operating fee, with decimal, ie 0.1 is equal to 10% fee
        initial_capital: amount of cash at the beginning
        look_back_window: amount of periods to look back while executing a step
        steps: maximum number of possible steps
        '''
        self.buffer = PortfolioBuffer(assets_names_list,np.array(list(map(lambda x: x.to_numpy(),assets_data_list))),look_back_window)
        self.fee = fee
        self.f = self.buffer.shape[2]
        self.n = look_back_window
        self.m = self.buffer.shape[0]
        self.max_steps = max_steps
        self.current_step = 0
        self.initial_capital = initial_capital
        
        self.action_space = gym.spaces.Box(low=0.0,high=1.0,shape=(self.m+1,),dtype=np.float16)
        #self.observation_space = gym.spaces.Box(low=0,high=1,shape=(self.f,self.n,self.m),dtype=np.float16)
        self.observation_space = gym.spaces.Dict({"data": gym.spaces.Box(low=0,high=1,shape=(self.m,self.n,self.f),dtype=np.float16), 
                                              "weights": gym.spaces.Box(low=0.0,high=1.0,shape=(self.m+1,),dtype=np.float16)})
        
        #self.weights = np.resize(np.array([1.0]+[0.0]*(self.m-1)),(self.n,self.m))
        self.weights = np.array([1.0]+[0.0]*(self.m))
        self.portfolio_value = 1.0
        
    def _buy(self,index,price,amount):
        raise NotImplementedError

    def _sell(self,index,price,amount):
        raise NotImplementedError
        
    def _price_relative_vector(self):
        '''
        returns a matrix with the division of each assets value by the previous one
        '''
        prices = self.buffer.get_batch()[:,:,0].T
        prices_diff = prices[:-1]/prices[1:]
        prices_diff = np.concatenate((np.ones(shape=(prices_diff.shape[0],1)),prices_diff),axis=1)
        return prices_diff
        
    def _weights_at_end_of_period(self):
        '''
        returns a vector with the weights of the portfolio after the new prices but before taking any action
        '''
        y = self._price_relative_vector()[-1]
        return np.multiply(y,self.weights)/np.dot(y,self.weights)
    
    def _operation_cost(self,weights):
        '''
        weights: vector with the new weights provided by the actor
        returns a scalar value with the cost of doing the buy/sell operations needed to get to those weights
        '''
        w_prime = self._weights_at_end_of_period()[1:]
        return self.fee * np.sum(np.abs(weights[1:]-w_prime))
    
    def _portfolio_value_after_operation(self,weights):
        '''
        weights: vector with the new weights provided by the actor
        returns a scalar with the new value of the portfolio after doing the buy/sell operations needed to get to those weights
        '''
        c = self._operation_cost(weights)
        p0 = self.portfolio_value
        y = self._price_relative_vector()[-1]
        w = self.weights
        return p0 * (1 - c) * np.dot(y, w)
    
    def step(self, action):
        
        p1 = self._portfolio_value_after_operation(action)
        
        reward = np.log(p1/self.portfolio_value) / self.max_steps
        
        self.weights = action
        
        self.portfolio_value = p1
        done = 0 if self.buffer.length-1 > self.buffer.pointer and self.current_step < self.max_steps and self.weights[0] > 0.0 else 1 
        info = {}
        self.current_step += 1
        obs = {"data":self.buffer.get_next_batch(),"weights":self.weights}
        
        return obs, reward, done, info
    
    def reset(self):
        self.weights = np.array([1.0]+[0.0]*(self.m))
        self.portfolio_value = 1.0
        self.current_step = 0
        self.buffer.reset()
        return {"data":self.buffer.get_batch(),"weights":self.weights}
    
    def render(self):
        pass
        
        

Experience replay, guarda las acciones realizadas por el actor los estados y las recompensas

In [81]:
class ReplayBuffer():
    def __init__(self,max_size=1e6):
        self.max_size = max_size
        self.storage = [] # (s,s',a,r) memory
        self.ptr = 0 #memory pointer
    
    def add(self, transition):
        if len(self.storage)==self.max_size:
            self.storage[int(self.ptr)] = transition
            self.ptr = (self.ptr+1)%self.max_size
        else:
            self.storage.append(transition)
    
    def sample(self,batch_size):
        ind = np.random.randint(0,len(self.storage),size=batch_size) 
        batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [],[],[],[],[]
        for i in ind:
            state, next_state, action, reward, done = self.storage[i]
            batch_states.append(np.array(state,copy=False))
            batch_next_states.append(np.array(next_state,copy=False))
            batch_actions.append(np.array(action,copy=False))
            batch_rewards.append(np.array(reward,copy=False))
            batch_dones.append(np.array(done,copy=False))
        return np.array(batch_states),np.array(batch_next_states),np.array(batch_actions),np.array(batch_rewards).reshape(-1,1),np.array(batch_dones).reshape(-1,1)
        

In [82]:
class Actor(tf.keras.Model):
    def __init__(self,state_dim,action_dim,max_action):
        super(Actor,self).__init__()
        self.layer_1 = tf.keras.layers.Dense(state_dim,activation='relu')
        self.layer_2 = tf.keras.layers.Dense(400,activation='relu')
        self.layer_3 = tf.keras.layers.Dense(300,activation='relu')
        self.layer_4 = tf.keras.layers.Dense(action_dim,activation='softmax')
        self.max_action = max_action
        
    def call(self, obs):
        x = self.layer_1(obs)
        x = self.layer_2(x)
        x = self.layer_3(x)
        x = self.layer_4(x)
        x = x * self.max_action
        return x

In [83]:
class Critic(tf.keras.Model):
    def __init__(self,state_dim,action_dim):
        super(Critic,self).__init__()
        self.layer_1 = tf.keras.layers.Dense(state_dim+action_dim,activation='relu')
        self.layer_2 = tf.keras.layers.Dense(400,activation='relu')
        self.layer_3 = tf.keras.layers.Dense(300,activation='relu')
        self.layer_4 = tf.keras.layers.Dense(1)
        self.layer_5 = tf.keras.layers.Dense(state_dim+action_dim,activation='relu')
        self.layer_6 = tf.keras.layers.Dense(400,activation='relu')
        self.layer_7 = tf.keras.layers.Dense(300,activation='relu')
        self.layer_8 = tf.keras.layers.Dense(1)
        
    def call(self, obs,actions):
        x0 = tf.concat([obs, actions], 1)
        x1 = self.layer_1(x0)
        x1 = self.layer_2(x1)
        x1 = self.layer_3(x1)
        x1 = self.layer_4(x1)
        
        x2 = self.layer_5(x0)
        x2 = self.layer_6(x2)
        x2 = self.layer_7(x2)
        x2 = self.layer_8(x2)
        
        return x1, x2
        
    def Q1(self, state, action):
        x0 = tf.concat([state, action], 1)
        x1 = self.layer_1(x0)
        x1 = self.layer_2(x1)
        x1 = self.layer_3(x1)
        x1 = self.layer_4(x1)
        return x1
        

In [84]:
class TD3():
    def __init__(self, state_dim, action_dim, max_action,lr=3e-4):
        self.actor = Actor(state_dim, action_dim, max_action)
        self.actor_target = Actor(state_dim, action_dim, max_action)
        for t, e in zip(self.actor_target.trainable_variables, self.actor.trainable_variables):
            t.assign(e)
        
        self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        
        self.critic = Critic(state_dim, action_dim)
        self.critic_target = Critic(state_dim, action_dim)
        for t, e in zip(self.critic_target.trainable_variables, self.critic.trainable_variables):
            t.assign(e)
        self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        self.critic_loss_fn = tf.keras.losses.Huber()
        self.max_action = max_action
        
    def select_action(self, state):
        state = state.reshape(1, -1)
        action = self.actor.call(state)[0].numpy()
        return action
    
    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clipping=0.5, policy_freq=2):
        for i in range(iterations):
            #get sample (s,s',a,r) from memory
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            
            #predict a' from s'
            next_action = self.actor_target.call(batch_next_states)
            
            #add noise 
            #noise = tf.random.normal(next_action.shape, mean=0, stddev=policy_noise)
            #noise = tf.clip_by_value(noise, -noise_clipping, noise_clipping)
            #next_action = tf.clip_by_value(next_action + noise, 0, self.max_action)
                        
            target_Q1,target_Q2 = self.critic_target.call(batch_next_states,next_action)
            #take minimum Q value
            target_Q = tf.minimum(target_Q1,target_Q2)
            #get final Q target, considering wether the episode has ended or not
            target_Q = tf.stop_gradient(batch_rewards + (1 - batch_dones) * discount * target_Q)    
            
            #critic backpropagation
            
            trainable_critic_variables = self.critic.trainable_variables
            
            with tf.GradientTape(watch_accessed_variables=False) as tape:
                tape.watch(trainable_critic_variables)
                current_Q1, current_Q2 = self.critic.call(batch_states,batch_actions)
                critic_loss = (self.critic_loss_fn(current_Q1,target_Q) + self.critic_loss_fn(current_Q2,target_Q))
            critic_grads = tape.gradient(critic_loss, trainable_critic_variables)
            self.critic_optimizer.apply_gradients(zip(critic_grads, trainable_critic_variables))
                     
            #AUpdate actor model
            if i%policy_freq==0:
                trainable_actor_variables = self.actor.trainable_variables
                with tf.GradientTape(watch_accessed_variables=False) as tape:
                    tape.watch(trainable_actor_variables)
                    #applying gradient ascent by taking de oposit function
                    actor_loss = -tf.reduce_mean(self.critic.Q1(batch_states, self.actor(batch_states))) 
                actor_grads = tape.gradient(actor_loss, trainable_actor_variables)
                self.actor_optimizer.apply_gradients(zip(actor_grads, trainable_actor_variables))
            
                # update the weights in the critic and actor target models, the tau parameter will define how much is going to adjust
                for target_param, param in zip(self.critic_target.trainable_variables, self.critic.trainable_variables):
                    target_param.assign(target_param * (1 - tau) + param * tau)
                for target_param, param in zip(self.actor_target.trainable_variables, self.actor.trainable_variables):
                    target_param.assign(target_param * (1 - tau) + param * tau)
        
        
        
        
    def save(self):
        time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.actor.save_weights(f'./models/{time}/actor')
        self.actor_target.save_weights(f'./models/{time}/actor_target')
        self.critic.save_weights(f'./models/{time}/critic')
        self.critic_target.save_weights(f'./models/{time}/critic_target')

In [85]:
def evaluate_policy(policy, eval_episodes=10):
    avg_reward = 0.0
    for _ in range(eval_episodes):
        state = env.reset()
        done = False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = env.step(action)
            avg_reward += reward
    avg_reward /= eval_episodes
    print(f'Average Reward: {avg_reward}')
    return avg_reward

In [86]:
seed = 0
start_timesteps = 1e4 # Number of timesteps in which the model choose a random action, after that number starts using the policy
eval_freq = 5e3 # policy evaluation frequency in timesteps
max_timesteps = 5e5 
save_models = True 
expl_noise = 0.1 # Exploration noise 
batch_size = 100 
discount = 0.99 # reward Discount factor gamma 
tau = 0.005 # target weights update ratio
policy_noise = 0.2 # std deviation of gaussian noise to be added to the action, for exploration purposes
noise_clip = 0.5 # max value of gaussian noise added to action
policy_freq = 2 # actor model weights update frecuency

In [87]:
aapl = pd.read_csv('AAPL.csv')[['close','high','low']]
xom = pd.read_csv('XOM.csv')[['close','high','low']]
tsla = pd.read_csv('TSLA.csv')[['close','high','low']]
assets_data_list = [aapl,xom,tsla]

In [88]:
env = PortfolioEnvironment(['AAPL','XOM','TSLA'],assets_data_list,fee=0.025,look_back_window=40)
env = gym.wrappers.FlattenObservation(env)

In [89]:
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

In [90]:
policy = TD3(state_dim,action_dim,max_action)
np.seterr('raise')
replay_buffer = ReplayBuffer()
evaluations = [evaluate_policy(policy)]

Average Reward: -0.0001311608837627755


In [91]:
total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True

In [None]:
start_time=time.time()
while total_timesteps < max_timesteps:
    if done:
        if total_timesteps != 0:
            print(f'Total timesteps: {total_timesteps} Episode Num: {episode_num} Reward: {episode_reward}')
            policy.train(replay_buffer,episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
        
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))
            #policy.save(file_name)
            #np.save(f'{file_name}_evaluation',evaluations)
        
        obs = env.reset()
        done = False
        
        episode_reward = 0
        episode_timesteps = 0
        episode_num +=1
        
    if total_timesteps < start_timesteps:
        action = env.action_space.sample()
    else:
        action = policy.select_action(np.array(obs))
        #print(action)
        #if expl_noise != 0: 
        #    action = (action + np.random.normal(0,expl_noise,size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)
    
    next_obs,reward,done,_ = env.step(action)
    done_bool = 0 if episode_timesteps +1 == env.max_steps else float(done)
    #done_bool = 0 if episode_timesteps +1 == env._max_episode_steps else float(done)
    
    episode_reward += reward
    replay_buffer.add((obs,next_obs,action,reward,done_bool))
    
    obs = next_obs
    episode_timesteps +=1
    total_timesteps +=1
    timesteps_since_eval +=1

print(evaluations.append(evaluate_policy(policy)))
#policy.save(file_name)
#np.save(f'{file_name}_evaluation',evaluations)
print(f'Tiempo de entrenamiento: {time.time()-start_time} segundos')

Total timesteps: 201 Episode Num: 1 Reward: 0.649935428234472
Total timesteps: 402 Episode Num: 2 Reward: 0.608969643320114
Total timesteps: 603 Episode Num: 3 Reward: 0.6488049928618003
Total timesteps: 804 Episode Num: 4 Reward: 0.625277419195717
Total timesteps: 1005 Episode Num: 5 Reward: 0.6023878818387706
Total timesteps: 1206 Episode Num: 6 Reward: 0.6004139843550615
Total timesteps: 1345 Episode Num: 7 Reward: 0.4016809124241343
Total timesteps: 1546 Episode Num: 8 Reward: 0.625277634770482
Total timesteps: 1747 Episode Num: 9 Reward: 0.6583797616570931
Total timesteps: 1948 Episode Num: 10 Reward: 0.6209790679867677
Total timesteps: 2149 Episode Num: 11 Reward: 0.6007532760733452
Total timesteps: 2350 Episode Num: 12 Reward: 0.6260874205596533
Total timesteps: 2551 Episode Num: 13 Reward: 0.619657765141251
Total timesteps: 2752 Episode Num: 14 Reward: 0.6077704013549624
Total timesteps: 2953 Episode Num: 15 Reward: 0.6303671696568787
Total timesteps: 3154 Episode Num: 16 Rewar