In [1]:
import os
import time
import numpy as np
#import matplotlib.pyplot as plt
from collections import deque
import gym
import pandas as pd
from pathlib import Path
import sys
import random
#from joblib import dump,load
import datetime
import tensorflow as tf
#import pybullet_envs

from environments import PortfolioEnvironment
from utils import ReplayBuffer
from policies import TD3,TD3p

In [2]:
def evaluate_policy(policy, eval_episodes=10, eval_env=None):
    """Roll the policy out for whole episodes and report the mean episode reward.

    Parameters
    ----------
    policy : object
        Anything exposing ``select_action(state)``; it receives the current
        observation wrapped in ``np.array``.
    eval_episodes : int, optional
        Number of complete episodes to run (default 10).
    eval_env : object, optional
        Environment exposing ``reset()`` and a ``step(action)`` that returns
        ``(state, reward, done, info)``. When omitted, the notebook-level
        ``env`` is used, which is what this function always did before the
        parameter existed.

    Returns
    -------
    float
        Sum of every reward collected, divided by ``eval_episodes``.

    Raises
    ------
    ZeroDivisionError
        If ``eval_episodes`` is 0.
    """
    # Resolve the global lazily so the function can be defined before `env` exists.
    if eval_env is None:
        eval_env = env

    total_reward = 0.0
    for _ in range(eval_episodes):
        state = eval_env.reset()
        done = False
        while not done:
            # The policy's action is passed to the environment unchanged (no exploration noise).
            action = policy.select_action(np.array(state))
            state, reward, done, _ = eval_env.step(action)
            total_reward += reward

    avg_reward = total_reward / eval_episodes
    print(f'Average Reward: {avg_reward}')
    return avg_reward

In [3]:
seed = 0  # base seed for the random number generators seeded below
# Seed the Python and NumPy global generators so the NumPy noise drawn during training is repeatable.
# NOTE(review): TensorFlow and the gym action space keep their own generators; seed those as well
# (e.g. tf.random.set_seed(seed), env.action_space.seed(seed)) if full reproducibility is required.
random.seed(seed)
np.random.seed(seed)

# Step counts are plain ints so counters compared or reduced against them stay ints.
start_timesteps = 10_000  # number of initial timesteps with random actions; afterwards the policy chooses
eval_freq = 5_000  # policy evaluation frequency, in timesteps
max_timesteps = 500_000  # total number of environment steps to train for
save_models = True  # whether the trained policy should be saved
expl_noise = 0  # std of the Gaussian exploration noise added to policy actions (0 disables it)
batch_size = 100  # batch size passed to policy.train
discount = 0.99  # reward discount factor gamma
tau = 0.005  # target weights update ratio
# NOTE(review): in TD3 the next two values drive target-policy smoothing, not exploration;
# confirm against policies.py how TD3p.train uses them.
policy_noise = 0.2  # std deviation of the Gaussian noise handed to policy.train
noise_clip = 0.5  # max absolute value of that Gaussian noise
policy_freq = 2  # actor model weights update frequency

In [4]:
# Price history for each ticker, reduced to the three price columns handed to the environment.
tickers = ['AAPL','XOM','TSLA']
price_columns = ['close','high','low']
aapl, xom, tsla = (
    pd.read_csv(f'./data/{ticker}.csv')[price_columns] for ticker in tickers
)
assets_data_list = [aapl, xom, tsla]

In [5]:
# Daily price history for the same tickers, read from the '*_y.csv' files.
# Alternative inputs tried earlier with the same columns: 'BTC-USD.csv' (as xom_daily)
# and 'DOGE-USD.csv' (as tsla_daily).
daily_columns = ['Adj Close','High','Low']
daily_frames = {
    ticker: pd.read_csv(f'./data/{ticker}_y.csv')[daily_columns]
    for ticker in ('XOM', 'TSLA', 'AAPL')
}
xom_daily = daily_frames['XOM']
tsla_daily = daily_frames['TSLA']
aapl_daily = daily_frames['AAPL']
tickers_daily = ['AAPL','XOM','TSLA']
assets_data_list_daily = [aapl_daily, xom_daily, tsla_daily]

In [6]:
# Build the portfolio environment from `tickers` / `assets_data_list`, then wrap it so
# observations come out flattened.
# Alternatives tried earlier:
#   PortfolioEnvironment(tickers_daily, assets_data_list_daily, fee=0.025, look_back_window=12, max_steps=24)
#   gym.make("HalfCheetahBulletEnv-v0")
portfolio_env = PortfolioEnvironment(
    tickers,
    assets_data_list,
    fee=0.0025,
    look_back_window=40,
    max_steps=200,
)
env = gym.wrappers.FlattenObservation(portfolio_env)

In [7]:
# Sizes passed to the TD3p constructor: first dimension of the observation and action
# spaces, plus the first entry of the action space's upper bound.
state_dim, action_dim = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)
max_action = float(env.action_space.high[0])

In [8]:
policy = TD3p(state_dim, action_dim, max_action)
replay_buffer = ReplayBuffer()

# Score the untrained policy on a single episode; this is the first entry of the evaluation history.
evaluations = []
evaluations.append(evaluate_policy(policy, eval_episodes=1))

Average Reward: 1.3806474577981047


In [9]:
# Counters read and updated by the training loop. `done` starts as True so the loop's
# first iteration resets the environment before stepping it.
total_timesteps = timesteps_since_eval = episode_num = 0
done = True

In [None]:
# Main training loop. It relies on the counters cell having been run immediately before
# (total_timesteps, timesteps_since_eval, episode_num, done); re-running this cell alone
# continues from whatever values those names currently hold.
start_time = time.time()
info_list = []  # one `info` object per environment step; grows to max_timesteps entries
while total_timesteps < max_timesteps:
    if done:
        # An episode just ended, or this is the very first iteration (`done` starts as True).
        if total_timesteps != 0:
            print(f'Total timesteps: {total_timesteps} Episode Num: {episode_num} Reward: {episode_reward} Portfolio Value:{np.exp(episode_reward)}')
            # Train once per finished episode, passing the length of that episode.
            policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

        # Periodic evaluation, done between episodes so the rollout in progress is never disturbed.
        if timesteps_since_eval >= eval_freq:
            timesteps_since_eval %= eval_freq
            evaluations.append(evaluate_policy(policy))
            np.save('policy_evaluation', evaluations)

        obs = env.reset()
        done = False

        episode_reward = 0
        episode_timesteps = 0
        episode_num += 1

    if total_timesteps < start_timesteps:
        # Warm-up phase: softmax over a random action-space sample plus unit Gaussian noise.
        action = tf.nn.softmax(env.action_space.sample() + np.random.normal(0, 1, size=env.action_space.shape[0])).numpy()
    else:
        action = policy.select_action(np.array(obs))
        if expl_noise != 0:
            # Perturb the policy action and pass it through softmax again.
            action = tf.nn.softmax((action + np.random.normal(0, expl_noise, size=env.action_space.shape[0]))).numpy()

    next_obs, reward, done, info = env.step(action)
    info_list.append(info)
    # Store 0 instead of `done` when the episode stops only because the step limit was reached.
    # NOTE(review): the logged runs end after 201 steps while max_steps=200 was given to the
    # environment, so this equality may never hold — confirm how the environment counts steps.
    done_bool = 0 if episode_timesteps + 1 == env.max_steps else float(done)

    episode_reward += reward
    replay_buffer.add((obs, next_obs, action, reward, done_bool))

    obs = next_obs
    episode_timesteps += 1
    total_timesteps += 1
    timesteps_since_eval += 1

# Final evaluation. evaluate_policy prints the score itself; the previous
# print(evaluations.append(...)) always printed `None` because list.append returns None.
evaluations.append(evaluate_policy(policy))
if save_models:
    policy.save()
np.save('policy_evaluation', evaluations)
print(f'Tiempo de entrenamiento: {time.time()-start_time} segundos')

Total timesteps: 201 Episode Num: 1 Reward: -0.001490716200161466 Portfolio Value:0.9985103943653187
Total timesteps: 402 Episode Num: 2 Reward: -0.001585255285695405 Portfolio Value:0.9984160005677613
Total timesteps: 603 Episode Num: 3 Reward: -0.001562334343522515 Portfolio Value:0.9984388854654455
Total timesteps: 804 Episode Num: 4 Reward: -0.0016579010860431338 Portfolio Value:0.9983434724727827
Total timesteps: 1005 Episode Num: 5 Reward: -0.0015642008919238943 Portfolio Value:0.9984370218326792
Total timesteps: 1206 Episode Num: 6 Reward: -0.0015292208056728802 Portfolio Value:0.9984719478566733
Total timesteps: 1407 Episode Num: 7 Reward: -0.0015052136305916937 Portfolio Value:0.9984959186352732
Total timesteps: 1608 Episode Num: 8 Reward: -0.001736385796473157 Portfolio Value:0.9982651208491784
Total timesteps: 1809 Episode Num: 9 Reward: -0.001553571885879434 Portfolio Value:0.998447634282219
Total timesteps: 2010 Episode Num: 10 Reward: -0.0016793450727461239 Portfolio Valu

In [None]:
# Scratch check of the warm-up action expression used in the training loop:
# softmax over one action-space sample plus unit Gaussian noise.
tf.nn.softmax(env.action_space.sample() + np.random.normal(0,1,size=env.action_space.shape[0])).numpy()

In [None]:
# Exponentiate a small negative number, the same transform the training log applies to
# episode_reward to print 'Portfolio Value'.
# NOTE(review): the literal is presumably a reward copied from a run — confirm its origin.
np.exp(-0.00014784459103412868)

In [None]:
# Inspect the `length` attribute of the environment's buffer (its meaning is defined outside this notebook).
env.buffer.length

In [None]:
# Softmax of a single raw action-space sample, kept in `base` and displayed.
raw_sample = env.action_space.sample()
base = tf.nn.softmax(raw_sample)
base

In [13]:
# Softmax over pure unit-Gaussian noise with one entry per action dimension.
tf.nn.softmax(np.random.normal(0,1,size=env.action_space.shape[0])).numpy()

array([0.24344839, 0.29504645, 0.0590641 , 0.40244105])

In [18]:
# Random weight vector with one entry fewer than the action dimension.
# NOTE(review): presumably one weight per asset, leaving out a cash slot — confirm.
n_weights = env.action_space.shape[0] - 1
weight_logits = np.random.normal(0, 1, size=n_weights)
weights = tf.nn.softmax(weight_logits).numpy()

In [92]:
# Slice index 0 of the last buffer axis over the first 40 positions of the middle axis,
# then transpose so axis 0 of the result is the former middle axis.
# NOTE(review): buffer axes appear to be (asset, time, feature) and 40 equals look_back_window
# — confirm in environments.py.
data = env.buffer.data[:,:40,0].T
data

array([[165.14,  78.66, 875.84],
       [165.53,  78.72, 864.9 ],
       [165.68,  79.19, 864.59],
       [165.51,  79.37, 861.35],
       [165.75,  79.29, 863.52],
       [165.96,  79.19, 867.56],
       [166.22,  79.3 , 869.61],
       [166.27,  79.42, 866.45],
       [166.33,  79.54, 869.38],
       [166.26,  79.62, 874.14],
       [166.11,  79.69, 868.82],
       [166.05,  79.8 , 866.31],
       [165.9 ,  80.09, 862.84],
       [165.77,  80.37, 864.16],
       [165.71,  80.37, 862.99],
       [165.83,  80.5 , 862.11],
       [165.71,  80.47, 859.5 ],
       [166.1 ,  80.35, 863.3 ],
       [166.23,  80.48, 864.26],
       [166.41,  80.6 , 865.16],
       [166.23,  80.5 , 863.09],
       [165.77,  80.36, 860.  ],
       [165.75,  79.98, 859.39],
       [165.6 ,  79.8 , 858.78],
       [165.75,  80.06, 860.9 ],
       [165.58,  79.94, 860.91],
       [165.9 ,  79.97, 862.54],
       [165.9 ,  80.05, 863.17],
       [165.92,  79.98, 863.  ],
       [165.74,  79.98, 860.84],
       [16

In [97]:
# Log of the ratio between the `weights`-weighted sums of the last two rows of `data`, scaled by 40.
# NOTE(review): 40 equals look_back_window, but why it is used as a scale is not evident here — confirm.
np.log(np.dot(weights,data[-1])/np.dot(weights,data[-2]))*40

-0.046992145312596836

In [98]:
# Divide every row by the last row, so the last row becomes all ones.
# NOTE: this rebinds `data`; the unscaled values are gone until the cell that builds `data` is re-run.
data = data/data[-1]

In [99]:
# Scaled log-ratio of the weighted sums of the last two rows, evaluated on whatever `data`
# currently holds (the row-normalised version if the preceding cell was run).
np.log(np.dot(weights,data[-1])/np.dot(weights,data[-2]))*40

-0.04410925802795166

In [75]:
# Display the current contents of `data`.
data

array([[0.9977042 , 0.98868778, 1.0169523 ],
       [1.00006042, 0.98944193, 1.00424969],
       [1.00096665, 0.99534942, 1.00388974],
       [0.99993958, 0.99761187, 1.00012772],
       [1.00138956, 0.99660633, 1.00264735],
       [1.00265829, 0.99534942, 1.00733826],
       [1.0042291 , 0.99673203, 1.00971855],
       [1.00453117, 0.99824032, 1.00604942],
       [1.00489367, 0.99974862, 1.00945149],
       [1.00447076, 1.00075415, 1.0149784 ],
       [1.00356452, 1.00163399, 1.00880126],
       [1.00320203, 1.00301659, 1.00588686],
       [1.0022958 , 1.00666164, 1.00185779],
       [1.00151039, 1.010181  , 1.00339046],
       [1.0011479 , 1.010181  , 1.00203195],
       [1.00187289, 1.01181498, 1.00101017],
       [1.0011479 , 1.01143791, 0.99797966],
       [1.00350411, 1.00992961, 1.0023919 ],
       [1.00428951, 1.0115636 , 1.00350657],
       [1.00537699, 1.0130719 , 1.00455158],
       [1.00428951, 1.01181498, 1.00214807],
       [1.00151039, 1.0100553 , 0.99856022],
       [1.

In [45]:
# Current value of the buffer's `pointer` attribute.
env.buffer.pointer

108

In [63]:
# Price-relative vector as computed by the buffer itself (implementation lives outside this notebook).
env.buffer.get_price_relative_vector()

array([1.        , 1.00036153, 0.99925816, 0.99956936])

In [55]:
# Manual ratio of the two most recent buffer rows (feature index 0) for every entry of
# env.buffer.data. NOTE(review): presumably a cross-check of
# env.buffer.get_price_relative_vector() — confirm.
# The loop variables are named so they do not overwrite the notebook-level `data` array
# that other cells read.
batch = np.zeros(shape=(env.buffer.shape[0],2,env.buffer.shape[2]))
for asset_index, asset_history in enumerate(env.buffer.data):
    # Rows pointer-2 and pointer-1; this needs pointer >= 2 for the slice to hold two rows.
    batch[asset_index] = asset_history[env.buffer.pointer-2:env.buffer.pointer]
prices = batch[:,:,0].T
prices_diff = prices[1]/prices[0]
# Prepend a constant 1.0 entry (presumably the cash position, whose price does not move — confirm).
prices_diff = np.concatenate((np.ones(shape=(1)),prices_diff))
prices_diff

array([1.        , 1.00012087, 0.99886321, 0.99978199])

In [52]:
# Everything stored at index 107 of the buffer's middle axis.
# NOTE(review): presumably rows are assets and columns are close/high/low at that timestep — confirm.
env.buffer.data[:,107,:]

array([[165.49, 165.6 , 165.42],
       [ 79.08,  79.18,  79.05],
       [871.31, 872.07, 870.6 ]])

In [5]:
import torch
import numpy as np

In [2]:
# Check whether PyTorch can see a CUDA device.
torch.cuda.is_available()

True

In [3]:
# NOTE(review): this cell raises TypeError because torch.softmax is called without arguments;
# it was presumably run only to read the accepted signatures from the error message.
# Remove it or pass (input, dim) before sharing the notebook.
torch.softmax()

TypeError: softmax() received an invalid combination of arguments - got (), but expected one of:
 * (Tensor input, int dim, torch.dtype dtype)
 * (Tensor input, name dim, *, torch.dtype dtype)


In [14]:
# Softmax over four unit-Gaussian samples via PyTorch's functional API; dim=0 is the tensor's only axis.
torch.nn.functional.softmax(torch.Tensor(np.random.normal(0,1,size=4)),dim=0)

tensor([0.3811, 0.2359, 0.1225, 0.2605])

tensor([-2.3216, -0.6105,  1.0061,  0.5312])