In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Hyperparameters (module-level; ActorCritic reads both names as globals)
learning_rate = 0.0002 # step size handed to optim.Adam in ActorCritic.__init__
#learning_rate = 0.01  # alternative step size, currently disabled
gamma         = 0.99 # discount factor applied to the bootstrapped next-state value in train_net

class ActorCritic(nn.Module):
    """One-hidden-layer actor-critic network with its own rollout buffer and optimizer.

    A shared hidden layer (``fc1``) feeds two heads: ``fc_pi`` (one logit per
    discrete action) and ``fc_v`` (a scalar state value). Transitions are
    accumulated with :meth:`put_data` and consumed by :meth:`train_net`, which
    performs a single one-step TD actor-critic update.

    The class reads two module-level globals: ``learning_rate`` (when the
    optimizer is built) and ``gamma`` (when the TD target is computed).

    Args:
        env: object exposing ``observation_space.shape[0]`` (input width) and
            ``action_space.n`` (number of discrete actions).
    """

    def __init__(self, env):
        super(ActorCritic, self).__init__()
        observation_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        n_fc1 = 256  # width of the shared hidden layer

        self.data = []  # rollout buffer of (s, a, r, s_prime, done) tuples
        self.fc1 = nn.Linear(observation_dim, n_fc1)  # shared hidden layer
        self.fc_pi = nn.Linear(n_fc1, action_dim)  # policy head (action logits)
        self.fc_v = nn.Linear(n_fc1, 1)  # value head
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def _policy_logits(self, x):
        """Return the unnormalised action scores (pre-softmax) for ``x``."""
        return self.fc_pi(F.relu(self.fc1(x)))

    def pi(self, x, softmax_dim=0):
        """Return action probabilities for ``x``.

        Args:
            x: a single observation (use ``softmax_dim=0``) or a 2-D batch of
                observations (use ``softmax_dim=1``).
            softmax_dim: dimension along which the actions are laid out.

        Returns:
            Tensor of probabilities summing to 1 along ``softmax_dim``.
        """
        return F.softmax(self._policy_logits(x), dim=softmax_dim)

    def v(self, x):
        """Return the critic's state-value estimate for ``x`` (last dim of size 1)."""
        return self.fc_v(F.relu(self.fc1(x)))

    def put_data(self, transition):
        """Append one ``(s, a, r, s_prime, done)`` tuple to the rollout buffer."""
        self.data.append(transition)

    def make_batch(self):
        """Convert the buffered transitions to tensors and clear the buffer.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask)``. With N buffered
            transitions, ``a``, ``r`` and ``done_mask`` have shape (N, 1);
            ``s`` and ``s_prime`` stack the stored observations along dim 0.
            ``a`` is int64 (as required by ``gather``), the rest are float32.
            ``done_mask`` is 0.0 where the transition ended the episode and
            1.0 otherwise.
        """
        # Local import: the import block of this cell does not bring numpy in.
        import numpy as np

        s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
        for s, a, r, s_prime, done in self.data:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            # Mask is 0 on terminal transitions so no value is bootstrapped past them.
            done_lst.append([0.0 if done else 1.0])

        # Stack observations into one ndarray first: torch.tensor on a list of
        # ndarrays takes a slow element-wise path and emits a UserWarning.
        s_batch = torch.tensor(np.array(s_lst), dtype=torch.float)
        a_batch = torch.tensor(a_lst, dtype=torch.long)
        r_batch = torch.tensor(r_lst, dtype=torch.float)
        s_prime_batch = torch.tensor(np.array(s_prime_lst), dtype=torch.float)
        done_batch = torch.tensor(done_lst, dtype=torch.float)

        self.data = []  # the buffer is single-use: clear it once converted
        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def train_net(self):
        """Run one actor-critic gradient step on the buffered transitions.

        Does nothing when the buffer is empty (an empty batch cannot be fed
        through the linear layers). Otherwise the buffer is consumed, the
        one-step TD target ``r + gamma * V(s') * done_mask`` is formed, and the
        sum of the policy-gradient loss and the smooth-L1 value loss is
        minimised with a single optimizer step.
        """
        if not self.data:
            return

        s, a, r, s_prime, done = self.make_batch()

        v_s = self.v(s)  # single forward pass, reused by advantage and value loss
        with torch.no_grad():
            # The target is treated as a constant, so no graph is built for it.
            td_target = r + gamma * self.v(s_prime) * done
        delta = td_target - v_s.detach()  # TD error, used as the advantage

        # log_softmax on the logits avoids log(0) = -inf when a probability
        # underflows, which would otherwise turn the loss into inf/NaN.
        log_pi = F.log_softmax(self._policy_logits(s), dim=1)  # dim 1: s is a 2-D batch
        log_pi_a = log_pi.gather(1, a)

        # policy loss (per sample) + value loss (scalar, broadcast)
        loss = -log_pi_a * delta + F.smooth_l1_loss(v_s, td_target)

        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()

In [2]:
from env import TradingSPYEnv
import numpy as np

In [3]:
# Sanity-check the ActorCritic agent on CartPole before using the trading environment.
# NOTE(review): the reset()/step() unpacking below follows the pre-0.26 gym API
# (reset returns the observation, step returns a 4-tuple) — confirm the installed version.
env = gym.make('CartPole-v1')
num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.n
print("Size of Action Space ->  {}".format(num_actions))

model = ActorCritic(env)
print_interval = 100  # report statistics every print_interval episodes
n_rollout = 10  # environment steps collected between two gradient updates
score = 0.0  # reward accumulated since the last report
episodes_in_window = 0  # episodes accumulated into `score` since the last report

for n_epi in range(2000): # small test
    done = False
    s = env.reset()
    a_list = []  # actions taken in the current episode (for the report only)
    while not done:
        for t in range(n_rollout):
            # torch.from_numpy converts the numpy observation to a tensor
            prob = model.pi(torch.from_numpy(s).float())
            m = Categorical(prob)  # categorical distribution over the actions
            a = m.sample().item()  # sample an action; .item() yields a Python int
            a_list.append(a)
            s_prime, r, done, info = env.step(a)
            model.put_data((s,a,r,s_prime,done))  # accumulate the transition

            s = s_prime
            score += r

            if done:
                break

        # One gradient update per (possibly shortened) rollout.
        model.train_net()

    episodes_in_window += 1

    if n_epi%print_interval==0 and n_epi!=0:
        print('action mean max min ', np.mean(a_list), np.max(a_list), np.min(a_list))
        # Divide by the number of episodes actually accumulated: the first
        # window covers episodes 0..print_interval, i.e. print_interval + 1 of them.
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/episodes_in_window))
        score = 0.0
        episodes_in_window = 0
env.close()

Size of State Space ->  4
Size of Action Space ->  2
action mean max min  0.0 0 0
# of episode :100, avg score : 13.2
action mean max min  0.0 0 0
# of episode :200, avg score : 9.4
action mean max min  0.0 0 0
# of episode :300, avg score : 9.3
action mean max min  0.0 0 0
# of episode :400, avg score : 9.3
action mean max min  0.0 0 0
# of episode :500, avg score : 9.2
action mean max min  0.0 0 0
# of episode :600, avg score : 9.4
action mean max min  0.0 0 0
# of episode :700, avg score : 9.3
action mean max min  0.0 0 0
# of episode :800, avg score : 9.4
action mean max min  0.0 0 0
# of episode :900, avg score : 9.4
action mean max min  0.0 0 0
# of episode :1000, avg score : 9.3
action mean max min  0.0 0 0
# of episode :1100, avg score : 9.3
action mean max min  0.0 0 0
# of episode :1200, avg score : 9.5
action mean max min  0.0 0 0
# of episode :1300, avg score : 9.5
action mean max min  0.0 0 0
# of episode :1400, avg score : 9.3
action mean max min  0.0 0 0
# of episode :15

In [None]:
# Build the trading environment. sma_len is passed to TradingSPYEnv unchanged
# (presumably moving-average window lengths — confirm against env.py).
env = TradingSPYEnv(sma_len=[5,25])
num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.n
print("Size of Action Space ->  {}".format(num_actions))

# Fresh network sized from this environment; rebinds the `model` name used by the CartPole cell.
model = ActorCritic(env)    
print_interval = 50 # report statistics every print_interval episodes
n_rollout = 150 # max steps per episode before one training update (author's note: "6 months")
score = 0.0 # reward accumulated since the last report


In [None]:
# Train on the trading environment: each episode collects at most n_rollout
# transitions and is followed by a single gradient update.
score = 0.0  # reset here so re-running this cell starts a clean report window
episodes_in_window = 0  # episodes accumulated into `score` since the last report

for n_epi in range(500): # small test
    s = env.reset()
    a_list = []  # actions stored in the current episode (for the report only)
    for t in range(n_rollout):
        # torch.from_numpy converts the numpy observation to a tensor
        prob = model.pi(torch.from_numpy(s).float())
        m = Categorical(prob)  # categorical distribution over the actions
        a = m.sample().item()  # sample an action; .item() yields a Python int
        s_prime, r, done, info = env.step(a)
        # Truthiness test rather than `done is False`: an identity comparison is
        # never true for numpy.bool_, which would end every episode on step one.
        if done:
            # NOTE(review): the transition that ends the episode is not stored, so
            # this loop never produces a done_mask of 0 — confirm this is intended
            # for TradingSPYEnv.
            break
        a_list.append(a)
        model.put_data((s,a,r,s_prime,done))  # accumulate the transition
        s = s_prime
        score += r

    if model.data:
        model.train_net()
    else:
        # Nothing was collected (episode ended on its first step): skip the update.
        print('empty data!')

    episodes_in_window += 1

    if n_epi%print_interval==0 and n_epi!=0:
        # Divide by the number of episodes actually accumulated: the first
        # window covers episodes 0..print_interval, i.e. print_interval + 1 of them.
        print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/episodes_in_window))
        if a_list:  # np.max / np.min raise ValueError on an empty sequence
            print('action mean max min ', np.mean(a_list), np.max(a_list), np.min(a_list))
        score = 0.0
        episodes_in_window = 0