In [1]:
import os
import time
import numpy as np
#import matplotlib.pyplot as plt
from collections import deque
import gym
import pandas as pd
from pathlib import Path
import sys
import random
import datetime
import tensorflow as tf

from environments import PortfolioEnvironment
from utils import ReplayBuffer, ReplayBufferMultiObs
from policies import TD3

In [25]:
def reshape_dict_state(state):
    """Return a copy of a dict observation with a leading axis of size 1 on every entry.

    Each value is wrapped in a one-element list and converted with
    ``np.array``, so an entry of shape ``s`` comes back with shape ``(1, *s)``.
    The keys are preserved unchanged.
    """
    return {name: np.array([entry]) for name, entry in state.items()}

In [3]:
seed = 0 # Seed applied below to every random number generator this notebook imports
start_timesteps = 1e4 # Number of timesteps in which the model chooses a random action; after that it starts using the policy
eval_freq = 5e3 # Number of timesteps between periodic policy checkpoints in the training loop
max_timesteps = 5e5 # Total number of environment steps to train for
save_models = True # Whether model checkpoints should be written
expl_noise = 0.3 # Std deviation of the Gaussian exploration noise added to the policy's action
batch_size = 100 # Batch size passed to policy.train
discount = 0.99 # Reward discount factor gamma
tau = 0.005 # Target weights update ratio
policy_noise = 0.2 # Std deviation of the Gaussian noise added to the action inside policy.train
noise_clip = 0.5 # Max absolute value of the Gaussian noise added to the action
policy_freq = 2 # Actor model weights update frequency

# Seed the Python, NumPy and TensorFlow generators up front so that network
# initialisation and exploration noise are reproducible from a fresh kernel.
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [37]:
# Symbols of the assets in the portfolio; one CSV per symbol is read from ./data/.
tickers = ['AAPL','AMZN','COIN','FB','GOOG','MSFT','NVDA','SQ','TSLA','V','XOM']

# One DataFrame per ticker, in the same order as `tickers`.
assets_data_list = [pd.read_csv(f'./data/{symbol}.csv') for symbol in tickers]

In [10]:
# Environment built from the per-ticker data frames.
# NOTE(review): fee looks like a proportional transaction cost and
# look_back_window the number of past rows per observation -- confirm against
# PortfolioEnvironment.
env = PortfolioEnvironment(tickers,assets_data_list,fee=0.0025,look_back_window=50,max_steps=200)

# The action space keeps its own random generator, which global seeding does
# not reach; seed it so the random warm-up actions drawn with
# env.action_space.sample() are reproducible. The trailing semicolon hides the
# returned seed list from the cell output.
env.action_space.seed(seed);

In [11]:
# Sizes read from the environment's spaces and passed to the TD3 constructor.
# Shape of the "data" entry of the dict observation space.
state_dim = env.observation_space["data"].shape
# Length of the action vector.
action_dim = env.action_space.shape[0]
# Upper bound of the first action component.
# NOTE(review): assumes every component shares this bound -- confirm.
max_action = float(env.action_space.high[0])

In [12]:
# TD3 agent sized from the environment's observation and action spaces.
policy = TD3(state_dim,action_dim,max_action)

# Buffer that receives the (obs, next_obs, action, reward, done) tuples
# added by the training loop and is handed to policy.train.
replay_buffer = ReplayBufferMultiObs()

In [13]:
# Counters consumed by the training loop.
total_timesteps = 0       # environment steps taken across all episodes
timesteps_since_eval = 0  # steps since the last periodic policy.save()
episode_num = 0           # number of episodes started so far
# Start as "done" so the first loop iteration resets the environment.
done = True

In [14]:
# Main TD3 training loop: interact with the environment one step at a time,
# store every transition in the replay buffer, and train the policy at the end
# of each episode for as many iterations as the episode had steps.
start_time=time.time()
info_list = []  # per-step info dicts returned by env.step, kept for later inspection
while total_timesteps < max_timesteps:
    if done:
        # End-of-episode bookkeeping; skipped on the very first iteration,
        # where `done` is only True to force the initial reset.
        if total_timesteps != 0:
            print(f'Total timesteps: {total_timesteps} Episode Num: {episode_num} Reward: {episode_reward} Portfolio Value:{env.portfolio_value_units}')
            policy.train(replay_buffer,episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # Periodic checkpoint, written only when saving is enabled.
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            if save_models:
                policy.save()

        # Start a new episode.
        obs = env.reset()
        done = False

        episode_reward = 0
        episode_timesteps = 0
        episode_num +=1

    if total_timesteps < start_timesteps:
        # Warm-up phase: random action plus Gaussian noise, passed through a softmax.
        action = tf.nn.softmax(env.action_space.sample()+ np.random.normal(0,1,size=env.action_space.shape[0])).numpy()
    else:
        action = policy.select_action(reshape_dict_state(obs))
        if expl_noise != 0:
            # Exploration: perturb the policy's action and re-apply the softmax.
            action = tf.nn.softmax((action + np.random.normal(0,expl_noise,size=env.action_space.shape[0]))).numpy()

    next_obs,reward,done, info = env.step(action)
    info_list.append(info)
    # An episode cut short by the step limit is not a true terminal state, so
    # it is stored with done=0. The comparison is `>=` rather than `==`
    # because the logged runs show episodes lasting max_steps + 1 steps, so an
    # equality test never matched the step on which the environment stopped.
    done_bool = 0 if episode_timesteps +1 >= env.max_steps else float(done)

    episode_reward += reward
    replay_buffer.add((obs,next_obs,action,reward,done_bool))

    obs = next_obs
    episode_timesteps +=1
    total_timesteps +=1
    timesteps_since_eval +=1


# Final checkpoint once training finishes, again only when saving is enabled.
if save_models:
    policy.save()
print(f'Tiempo de entrenamiento: {time.time()-start_time} segundos')

[0.02540155 0.08445325 0.2194899  0.07599555 0.00829133 0.08853717
 0.07214953 0.05333817 0.13208755 0.05781625 0.02535468 0.15708508]
Total timesteps: 201 Episode Num: 1 Reward: 35351.90640382108 Portfolio Value:135351.90640382108
[0.09889332 0.02946939 0.01734909 0.12300316 0.08241269 0.09045353
 0.02408202 0.21570705 0.08218355 0.05920394 0.01473086 0.16251141]
Total timesteps: 402 Episode Num: 2 Reward: -23660.651810440482 Portfolio Value:76339.34818955952
[0.0752896  0.02576768 0.27065514 0.08212362 0.01582442 0.14798022
 0.09572264 0.02154131 0.09760803 0.01108359 0.06890133 0.08750241]
Total timesteps: 603 Episode Num: 3 Reward: 4095.29390541435 Portfolio Value:104095.29390541435
[0.07558573 0.03588203 0.00641057 0.02848198 0.02082697 0.03637235
 0.04611631 0.01571085 0.68297014 0.01076702 0.00474308 0.03613297]
Total timesteps: 804 Episode Num: 4 Reward: -3284.0774341256038 Portfolio Value:96715.9225658744
[0.04239642 0.03411539 0.04145956 0.03287035 0.12438864 0.1310271
 0.046

  prices_diff = prices[1:]/prices[:-1]


[0.11701467 0.08194769 0.06946536 0.11223808 0.04367974 0.07942618
 0.09046964 0.05690467 0.07479585 0.12047298 0.09111675 0.06246839]
Total timesteps: 178095 Episode Num: 939 Reward: 219699.940104832 Portfolio Value:319699.940104832
[0.21542458 0.05799839 0.08940862 0.06738188 0.05453139 0.09820477
 0.08399458 0.05828881 0.04540968 0.08243858 0.07024586 0.07667286]
Total timesteps: 178296 Episode Num: 940 Reward: 133014.1790037942 Portfolio Value:233014.1790037942
[0.14590651 0.09777034 0.05950586 0.10651562 0.05867043 0.04862424
 0.06447979 0.04535914 0.0942624  0.10257272 0.1040513  0.07228167]
Total timesteps: 178497 Episode Num: 941 Reward: 2699.0648435508047 Portfolio Value:102699.0648435508
[0.17093534 0.07652147 0.08673996 0.06500585 0.08627521 0.07691203
 0.09457875 0.04339766 0.06973378 0.071601   0.09179219 0.06650675]
Total timesteps: 178698 Episode Num: 942 Reward: 2200.349297141147 Portfolio Value:102200.34929714115
[0.16243607 0.11072591 0.11809927 0.07914098 0.10213709 

Inferencia

In [31]:
# Roll out a single episode using the policy's action as-is (no exploration
# noise) and accumulate the rewards returned by the environment.
total_reward = 0
state = env.reset()
done = False

while True:
    action = policy.select_action(reshape_dict_state(state))
    state, reward, done, _ = env.step(action)
    total_reward += reward
    if done:
        break

# Display the episode's cumulative reward as the cell output.
total_reward

[1.0000000e+00 2.3319483e-27 0.0000000e+00 3.1161125e-28 1.8034648e-31
 0.0000000e+00 0.0000000e+00 5.8014698e-35 6.8880719e-21 0.0000000e+00
 6.2885874e-32 7.7115799e-18]


0.0