In [1]:
import gym
import numpy as np
import itertools
import datetime
import torch
import torch.nn as nn
import copy
import time

from sac import SAC
from torch.utils.tensorboard import SummaryWriter
from replay_buffer import ReplayMemory

In [2]:
from stock_trading_env import StockTradingEnv

In [3]:
# Locations of the pre-computed NumPy arrays, relative to the notebook directory.
# Feature arrays (the "_standard" suffix suggests standardised features — TODO confirm).
train_feature_path = './train_feature_array_standard.npy'
val_feature_path = './val_feature_array_standard.npy'
# Trade tables; these are concatenated column-wise after the features in a later cell.
train_trade_path = './train_trade_table.npy'
val_trade_path = './val_trade_table.npy'

In [4]:
# Settings
# Every tunable value used by the notebook lives in this cell; each name is
# assigned exactly once so there is a single source of truth.
SEEDS = 666
# Seed the global NumPy and PyTorch generators so that a fresh
# "Restart Kernel -> Run All" reproduces the same run. SEEDS is also handed
# to the replay buffer in a later cell.
np.random.seed(SEEDS)
torch.manual_seed(SEEDS)

replay_size = 10000

look_back = 10       # size of extracted window
updates_per_step = 1 # model updates per simulator step (default: 1)
num_stock = 1
balance = 100000

hidden_size = 64

gamma = 0.99       # discount factor
tau = 0.005        # target function update parameter
lr = 0.0001
alpha = 0.2        # weight of entropy term

h_max = 1          # upper/lower bound of action
batch_size = 12
# The agent-setup cell re-assigns num_episodes = 1000 before building the agent,
# so 1000 is the value that is actually used; keep both places in agreement.
num_episodes = 1000
target_update_interval = 1

automatic_entropy_tuning = True

In [5]:
# Read the four arrays from disk, in the same order as the path settings.
train_trade, val_trade, train_feature, val_feature = (
    np.load(array_path)
    for array_path in (train_trade_path,
                       val_trade_path,
                       train_feature_path,
                       val_feature_path)
)

# Join features and the original trade information column-wise:
# feature columns come first, trade columns last.
train_states = np.concatenate((train_feature, train_trade), axis=1)
val_states = np.concatenate((val_feature, val_trade), axis=1)

# Number of columns in the combined state array.
num_feature = train_states.shape[1]

In [6]:
# Display the column count of the combined state array (feature columns + trade columns).
num_feature

122

In [7]:
# Width of one state row: the feature columns of every stock, one slot for the
# balance, and one shares slot per stock.
state_dim = (num_stock * num_feature) + 1 + num_stock
# Step count handed to the validation environment: the validation table length
# minus the look-back window and one extra row.
val_steps = len(val_trade) - (look_back + 1)

In [8]:
# Keyword arguments that the training and validation environments share.
_shared_env_kwargs = dict(look_back=look_back,
                          feature_num=num_feature,
                          balance=balance)

# Training environment: fixed 1440 steps per episode.
# NOTE(review): valid_env=True is passed to the *training* environment only and
# not to env_val — confirm against StockTradingEnv that this is intended.
env = StockTradingEnv(train_states,
                      steps=1440,
                      valid_env=True,
                      **_shared_env_kwargs)

# Validation environment: one episode spans val_steps steps.
env_val = StockTradingEnv(val_states,
                          steps=val_steps,
                          **_shared_env_kwargs)

In [9]:
# obs, _  = env.reset()
# obs = [obs]

# agent.normalize(obs).shape

In [10]:
# Replay buffer that the training loop pushes transitions into and samples from.
# Built from replay_size and SEEDS (presumably capacity and RNG seed — confirm
# against replay_buffer.ReplayMemory).
memory = ReplayMemory(replay_size,SEEDS)

In [11]:
# Agent and logging setup (the training loop itself is in the next cell).
policy_types = ["Gaussian", "Deterministic"]
policy_type = policy_types[1]

# The run directory name carries an "autotune" suffix only when automatic
# entropy tuning is enabled together with the Gaussian policy.
if automatic_entropy_tuning and policy_type == "Gaussian":
    _tuning_tag = "autotune"
else:
    _tuning_tag = ""
_run_stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
writer = SummaryWriter('runs/{}_SAC_{}_{}_{}'.format(_run_stamp,
                                                     policy_type,
                                                     look_back,
                                                     _tuning_tag))

# Episode budget; the same value is passed to the agent below.
num_episodes = 1000

agent = SAC(
    state_dim,
    env.action_space,
    look_back=look_back,
    hidden_size=hidden_size,
    policy_type=policy_type,
    lr=lr,
    gamma=gamma,
    tau=tau,
    alpha=alpha,
    automatic_entropy_tuning=automatic_entropy_tuning,
    num_episodes=num_episodes,
)



In [None]:
# ---------------------------------------------------------------------------
# SAC training loop with periodic validation.
#
# Each training episode rolls the policy through `env`, pushes every transition
# into `memory` and, once the buffer holds more than `batch_size` transitions,
# performs `updates_per_step` parameter updates per environment step.
# Every EVAL_INTERVAL episodes the policy is rolled out in evaluation mode on
# `env_val`. Losses and rewards are logged to TensorBoard through `writer`, and
# a checkpoint is saved once all `num_episodes` episodes have finished.
# ---------------------------------------------------------------------------
EVAL_INTERVAL = 5    # validate after every 5th training episode
EVAL_EPISODES = 10   # roll-outs averaged per validation pass


def _evaluate_policy(agent, eval_env, episodes):
    """Roll the policy out in evaluation mode and return the mean episode reward.

    Args:
        agent: object exposing ``select_action(states, evaluate=True)``.
        eval_env: environment exposing ``reset()`` -> ``(state, info)`` and
            ``step(action)`` -> ``(next_state, reward, done, info)``.
        episodes: number of roll-outs to average over.

    Returns:
        The sum of per-episode rewards divided by ``episodes``.
    """
    total_reward = 0.
    for _ in range(episodes):
        state, _info = eval_env.reset()
        episode_reward = 0
        done = False
        while not done:
            # Re-wrap the *current* observation on every step so the policy
            # never acts on a stale batch built before the loop started.
            action = agent.select_action([state], evaluate=True)
            state, reward, done, _info = eval_env.step(action)
            episode_reward += reward
        total_reward += episode_reward
    return total_reward / episodes


total_numsteps = 0   # number of completed training episodes (name kept from the original script)
updates = 0          # number of parameter updates performed so far

# Pre-initialised so the per-episode logging below cannot raise NameError when
# an episode ends before the replay buffer is large enough for a first update.
critic_1_loss = critic_2_loss = policy_loss = ent_loss = float('nan')

# Run exactly num_episodes training episodes.
for i_episode in range(1, num_episodes + 1):
    episode_reward = 0
    episode_steps = 0
    done = False
    pre_time = time.time()

    state, _ = env.reset()
    while not done:
        action = agent.select_action([state])  # Sample action from policy

        if len(memory) > batch_size:
            # Number of updates per step in environment
            for _ in range(updates_per_step):
                # Update parameters of all the networks. The returned entropy
                # weight gets its own name so the `alpha` hyperparameter from
                # the settings cell is not overwritten.
                (critic_1_loss, critic_2_loss,
                 policy_loss, ent_loss, entropy_alpha) = agent.update_parameters(memory, batch_size, updates)

                updates += 1
                # NOTE(review): the schedulers are stepped once per parameter
                # update, while the agent is constructed with num_episodes —
                # confirm the schedule horizon in SAC matches this cadence.
                agent.critic_lr_scheduler.step()
                agent.policy_lr_scheduler.step()

        next_state, reward, done, _ = env.step(action) # Step
        episode_reward += reward
        episode_steps += 1
        # Store deep copies of BOTH observations: if the environment updates
        # its observation object in place, a stored reference would silently
        # change inside the replay buffer on later steps.
        memory.push(copy.deepcopy(state), action, reward,
                    copy.deepcopy(next_state), done) # Append transition to memory

        state = next_state

    total_numsteps += 1

    # Sum the per-step 'market_gain' entries collected by the environment and
    # take the first component.
    market_gains = np.sum(np.vstack([info['market_gain'] for info in env.infos]),axis=0)[0]
    print("-------------------------------------------")
    print("Training Episode: {:d}, Avg Episode Reward: {:4f}, Market-Gain: {:4f}, elapase:{:4f}s".format(total_numsteps,
                                                                                episode_reward /episode_steps,
                                                                                market_gains / episode_steps,
                                                                                time.time() - pre_time))
    print("Critic 1 Loss: {:.4e}, Critic 2 loss: {:.4e}, policy_loss: {:.4f}, ent loss: {:.4f}".format(critic_1_loss,
                                                                                        critic_2_loss,
                                                                                        policy_loss,
                                                                                        ent_loss))
    print("-------------------------------------------")
    writer.add_scalar('critic1_loss/train', critic_1_loss, total_numsteps)
    writer.add_scalar('critic2_loss/train', critic_2_loss, total_numsteps)
    writer.add_scalar('policy_loss/train', policy_loss, total_numsteps)
    writer.add_scalar('ent_loss/train', ent_loss, total_numsteps)

    # Validate the current policy every EVAL_INTERVAL episodes.
    if i_episode % EVAL_INTERVAL == 0:
        avg_reward = _evaluate_policy(agent, env_val, EVAL_EPISODES)

        writer.add_scalar('avg_reward/test', avg_reward, i_episode)
        # NOTE(review): this is computed from env_val.infos after all
        # validation roll-outs; whether it covers only the last roll-out
        # depends on how StockTradingEnv.reset() treats `infos` — confirm.
        market_gains = np.sum(np.vstack([info['market_gain'] for info in env_val.infos]),axis=0)[0]
        print("-------------------------------------------")
        print("Testing Episode: {:d}, Episode Reward: {:4f},Market-Gain: {:4f} ".format(i_episode,
                                                                                         avg_reward,
                                                                                       market_gains))
        print("-------------------------------------------")

# Make sure every logged scalar reaches disk before the checkpoint is written.
writer.flush()
agent.save_checkpoint(env_name = policy_type)

-------------------------------------------
Training Episode: 1, Avg Episode Reward: -0.000055, Market-Gain: -0.000057, elapase:58.165006s
Critic 1 Loss: 3.2402e-06, Critic 2 loss: 2.9905e-06, policy_loss: 0.0006, ent loss: 0.0000
-------------------------------------------
-------------------------------------------
Training Episode: 2, Avg Episode Reward: -0.000056, Market-Gain: -0.000057, elapase:58.401585s
Critic 1 Loss: 1.4981e-06, Critic 2 loss: 1.5550e-06, policy_loss: 0.0005, ent loss: 0.0000
-------------------------------------------
-------------------------------------------
Training Episode: 3, Avg Episode Reward: -0.000055, Market-Gain: -0.000057, elapase:56.911356s
Critic 1 Loss: 1.7901e-06, Critic 2 loss: 1.6970e-06, policy_loss: 0.0004, ent loss: 0.0000
-------------------------------------------


In [18]:
# Print the policy network's module structure.
# NOTE(review): the saved output below shows a GaussianPolicy, while the setup
# cell selects policy_types[1] ("Deterministic"), and this cell's execution
# count (18) is out of sequence — the output looks like it comes from a
# different run; re-execute after Restart & Run All to confirm.
print(agent.policy)

GaussianPolicy(
  (lstm): LSTM_Module(
    (lstm): LSTM(124, 64, num_layers=2)
  )
  (linear1): Linear(in_features=640, out_features=64, bias=True)
  (linear2): Linear(in_features=64, out_features=64, bias=True)
  (mean_linear1): Linear(in_features=64, out_features=64, bias=True)
  (mean_linear2): Linear(in_features=64, out_features=1, bias=True)
  (log_std_linear1): Linear(in_features=64, out_features=64, bias=True)
  (log_std_linear2): Linear(in_features=64, out_features=1, bias=True)
)


In [None]:
# Scratch check: shape of agent.normalize() applied to a single validation observation.
state, _ = env_val.reset()
state_ = [state]  # wrapped in a list, the same way the training loop passes states to the agent
agent.normalize(state_).shape

In [None]:
# Dummy input: one row of 30 uniform random values in [0, 1), converted to float32.
input_ = torch.FloatTensor(np.random.random_sample((1, 30)))

In [None]:
# Scratch check: output shape of `linear1` followed by `bn1` on the dummy input.
# NOTE(review): `bn1` and `linear1` are not defined in any cell visible in this
# notebook, so this cell fails on a fresh kernel — define them here or delete the cell.
bn1(linear1(input_)).shape

In [None]:
from torch.nn.utils import clip_grad_norm

In [None]:
# Scratch literal left over from debugging; evaluating it only echoes the value.
# NOTE(review): consider deleting this cell.
-2.6621e+34

In [None]:
# Sanity check: viewing a (1, 2) tensor as (2, 1) gives a first dimension of 2.
x = torch.rand(1, 2)
x.view(2, 1).size(0)