In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class DeepQNetwork(nn.Module):
    """
    Fully-connected network that maps an observation to one Q value per action.

    The module also owns its optimizer (Adam), its loss (Huber / SmoothL1) and
    the device it lives on, so callers can drive training through this object.

    Parameters
    ----------
    lr : float
        learning rate passed to the Adam optimizer
    input_dims : list
        observation size wrapped in a list, e.g. [8]; it is unpacked into the
        first linear layer
    fc1_dims : int
        width of the first hidden layer
    fc2_dims : int
        width of the second hidden layer
    n_actions : int
        number of actions, i.e. the size of the output layer
    weight_decay : float
        weight decay passed to the Adam optimizer
    p : float
        dropout probability; accepted but currently unused because the
        dropout layer is disabled
    """
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions, weight_decay=1e-5,p=0.5):
        super().__init__()

        # Keep the architecture description on the instance.
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        # Layers are created in input -> output order.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)
        # Dropout (probability p) is intentionally not instantiated at the moment.

        self.optimizer = optim.Adam(self.parameters(), lr=lr, weight_decay=weight_decay)
        # Huber loss; nn.MSELoss() was the previous choice.
        self.loss = nn.SmoothL1Loss()

        # Prefer the first CUDA device when one is available.
        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-normalised) Q values for every action given `state`."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        # No activation on the output layer: Q values are unbounded.
        return self.fc3(hidden)

class Agent():
    """
    Epsilon-greedy deep Q-learning agent with a fixed-size circular replay memory.

    Parameters
    ----------
    gamma : float
        discount factor applied to the bootstrapped next-state value
    epsilon : float
        initial probability of taking a random action
    lr : float
        learning rate forwarded to the Q-network optimizer
    input_dims : list
        observation size wrapped in a list, e.g. [8]
    batch_size : int
        number of transitions sampled per learning step
    n_actions : int
        number of discrete actions
    max_mem_size : int
        capacity of the replay memory; the oldest entries are overwritten
    eps_end : float
        lower bound for epsilon
    eps_dec : float
        amount subtracted from epsilon after every learning step
    weight_decay : float
        weight decay forwarded to the Q-network optimizer
    """
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                max_mem_size=100000, eps_end=0.01, eps_dec=1e-5, weight_decay=1e-5):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions)) # list of actions
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0 # total number of transitions stored so far (never wraps)

        self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions, input_dims=input_dims,
                                  fc1_dims=32, fc2_dims=32, weight_decay=weight_decay)

        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        # Actions are used as indices into the Q-value tensor, so they must be integers.
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # The `np.bool` alias was removed from NumPy; the builtin `bool` is equivalent.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Write one (state, action, reward, next state, done) transition into memory.

        The memory is circular: once `mem_size` transitions have been stored,
        new ones overwrite the oldest slots.
        """
        index = self.mem_cntr % self.mem_size # wrap around: the memory is finite, so slots are reused
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        """Pick an action epsilon-greedily and return it as a Python int.

        With probability `epsilon` a uniformly random action is returned,
        otherwise the action with the largest predicted Q value.
        """
        if np.random.random() > self.epsilon:
            # Add a leading batch dimension and force float32 so the input always
            # matches the network's parameter dtype, whatever the env returns.
            state = T.as_tensor(observation, dtype=T.float32,
                                device=self.Q_eval.device).unsqueeze(0)
            with T.no_grad(): # action selection needs no autograd graph
                actions = self.Q_eval(state)
            action = T.argmax(actions).item() # .item() to get a Python int
        else:
            action = int(np.random.choice(self.action_space))

        return action

    def learn(self):
        """Run one gradient step on a random minibatch from the replay memory.

        Does nothing until at least `batch_size` transitions have been stored.
        Epsilon is decayed by `eps_dec` after every step, never below `eps_min`.
        """
        if self.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device

        # Only the filled part of the memory may be sampled.
        max_mem = min(self.mem_cntr, self.mem_size)
        # Draw batch_size distinct slots out of the max_mem filled ones.
        batch = np.random.choice(max_mem, self.batch_size, replace=False)
        # Row selector [0, 1, ..., batch_size-1], paired with the chosen actions below.
        batch_index = T.arange(self.batch_size, device=device)

        state_batch = T.tensor(self.state_memory[batch]).to(device)
        new_state_batch = T.tensor(self.new_state_memory[batch]).to(device)
        reward_batch = T.tensor(self.reward_memory[batch]).to(device)
        terminal_batch = T.tensor(self.terminal_memory[batch]).to(device)
        # Index tensors must be integral; the cast also covers older float buffers.
        action_batch = T.as_tensor(self.action_memory[batch], dtype=T.long, device=device)

        self.Q_eval.optimizer.zero_grad()

        # Q value of the action that was actually taken in each sampled state.
        q_eval = self.Q_eval(state_batch)[batch_index, action_batch]

        # The bootstrap target is treated as a constant: no gradient may flow
        # through the next-state estimate.
        with T.no_grad():
            q_next = self.Q_eval(new_state_batch)
            q_next[terminal_batch] = 0.0 # terminal states have no future value
            q_target = reward_batch + self.gamma * q_next.max(dim=1).values

        loss = self.Q_eval.loss(q_eval, q_target) # (input, target) order
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear decay, clamped so epsilon never undershoots eps_min.
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)
        

In [None]:
#from env import TradingSPYEnv
from env4 import TradingSPYEnv
import numpy as np

# Build the environment and size the agent from it.
env = TradingSPYEnv(init_invest=100.0, sma_len=[5,25,50,100])
num_states = len(env._get_observation().flatten())
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.n
print("Size of Action Space ->  {}".format(num_actions))

agent = Agent(
    gamma=0.99,
    epsilon=1.0,
    lr=0.003,
    input_dims=[num_states],
    batch_size=64,
    n_actions=num_actions,
    eps_end=0.01,
    weight_decay=1e-5,
)

# Per-episode history, consumed by the plotting cell further down.
scores = []
eps_history = []
n_games = 1000


for i in range(n_games):
    score = 0
    done = False
    observation = env.reset()
    # The replay memory and the network work in float32.
    if observation.dtype == np.float64:
        observation = observation.astype(np.float32)

    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        # Only steps that produced a next observation are scored and stored.
        if observation_ is not None:
            if observation_.dtype == np.float64:
                observation_ = observation_.astype(np.float32)
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)

        # One learning step per environment step, stored or not.
        agent.learn()
        observation = observation_

    scores.append(score)
    eps_history.append(agent.epsilon)

    # Moving average over (at most) the last 100 episodes.
    avg_score = np.mean(scores[-100:])

    print(
        'episode ', i,
        'score %.2f' % score,
        'average score %.2f' % avg_score,
        'epsilon %.2f ' % agent.epsilon,
        'profit_iteration %.2f ' % info['profit_iteration'],
        'iterations %.2f ' % info['iterations'],
        'long_return %.2f ' % info['long_return'],
    )


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Size of State Space ->  404
Size of Action Space ->  3
episode  0 score -58.62 average score -58.62 epsilon 0.97  profit_iteration -0.02  iterations 2920.00  long_return 2.51 
episode  1 score 9.48 average score -24.57 epsilon 0.96  profit_iteration 0.01  iterations 946.00  long_return 1.54 
episode  2 score 79.29 average score 10.05 epsilon 0.93  profit_iteration 0.03  iterations 2813.00  long_return 2.31 
episode  3 score 10.31 average score 10.12 epsilon 0.92  profit_iteration 0.01  iterations 1273.00  long_return 1.93 
episode  4 score -4.50 average score 7.19 epsilon 0.91  profit_iteration -0.01  iterations 1027.00  long_return 1.56 
episode  5 score -4.19 average score 5.30 epsilon 0.91  profit_iteration -0.05  iterations 81.00  long_return 1.02 
episode  6 score -6.65 average score 3.59 epsilon 0.91  profit_iteration -0.02  iterations 252.00  long_return 1.16 
episode  7 score -43.26 average score -2.27 epsilon 0.88  profit_iteration -0.02  iterations 2600.00  long_return 2.61 


episode  69 score 101.72 average score 52.99 epsilon 0.01  profit_iteration 0.05  iterations 2065.00  long_return 2.74 
episode  70 score 179.15 average score 54.76 epsilon 0.01  profit_iteration 0.09  iterations 1938.00  long_return 2.74 
episode  71 score 134.73 average score 55.87 epsilon 0.01  profit_iteration 0.04  iterations 2967.00  long_return 2.64 
episode  72 score 193.87 average score 57.76 epsilon 0.01  profit_iteration 0.06  iterations 3358.00  long_return 3.00 
episode  73 score 169.36 average score 59.27 epsilon 0.01  profit_iteration 0.05  iterations 3250.00  long_return 2.91 
episode  74 score 273.43 average score 62.13 epsilon 0.01  profit_iteration 0.08  iterations 3455.00  long_return 3.31 
episode  75 score 113.71 average score 62.81 epsilon 0.01  profit_iteration 0.07  iterations 1557.00  long_return 2.24 
episode  76 score 325.96 average score 66.22 epsilon 0.01  profit_iteration 0.08  iterations 3994.00  long_return 4.12 
episode  77 score 137.46 average score 6

episode  138 score 85.55 average score 144.26 epsilon 0.01  profit_iteration 0.07  iterations 1211.00  long_return 1.77 
episode  139 score 281.42 average score 146.58 epsilon 0.01  profit_iteration 0.08  iterations 3668.00  long_return 3.44 
episode  140 score 11.18 average score 146.26 epsilon 0.01  profit_iteration 0.05  iterations 199.00  long_return 1.13 
episode  141 score 158.44 average score 145.32 epsilon 0.01  profit_iteration 0.05  iterations 2881.00  long_return 2.44 
episode  142 score 73.21 average score 145.54 epsilon 0.01  profit_iteration 0.06  iterations 1114.00  long_return 1.70 
episode  143 score 36.36 average score 143.89 epsilon 0.01  profit_iteration 0.05  iterations 747.00  long_return 1.40 
episode  144 score 127.67 average score 144.68 epsilon 0.01  profit_iteration 0.04  iterations 2888.00  long_return 2.46 
episode  145 score 470.62 average score 149.32 epsilon 0.01  profit_iteration 0.12  iterations 3954.00  long_return 4.28 
episode  146 score 211.61 aver

episode  206 score 148.12 average score 153.02 epsilon 0.01  profit_iteration 0.05  iterations 2904.00  long_return 2.48 
episode  207 score 27.31 average score 152.19 epsilon 0.01  profit_iteration 0.07  iterations 388.00  long_return 1.26 
episode  208 score 123.11 average score 151.58 epsilon 0.01  profit_iteration 0.08  iterations 1530.00  long_return 2.39 
episode  209 score 78.17 average score 152.25 epsilon 0.01  profit_iteration 0.07  iterations 1086.00  long_return 1.62 
episode  210 score 38.22 average score 150.76 epsilon 0.01  profit_iteration 0.07  iterations 564.00  long_return 1.39 
episode  211 score 66.75 average score 148.65 epsilon 0.01  profit_iteration 0.06  iterations 1132.00  long_return 1.64 
episode  212 score 121.76 average score 148.29 epsilon 0.01  profit_iteration 0.04  iterations 2761.00  long_return 2.30 
episode  213 score 18.11 average score 146.65 epsilon 0.01  profit_iteration 0.08  iterations 228.00  long_return 1.14 
episode  214 score 181.13 averag

In [None]:
# Inspect the last next-observation left in scope by the training loop.
observation_ 

In [None]:
# Number of values in the flattened observation (the same expression sizes the network input).
len(env._get_observation().flatten())

In [None]:
# Shape of the observation before flattening.
env._get_observation().shape

In [None]:
from utils import plot_learning_curve
import os

# Learning curve: 1-based episode numbers on the x axis.
x = list(range(1, n_games + 1))
filename = 'Phil-deepQ-huber.png'
plot_learning_curve(x, scores, eps_history, filename)

# Checkpoint the Q-network weights into the current working directory.
state_dict_path = os.path.join(os.getcwd(), 'test-huber.pth')
T.save(agent.Q_eval.state_dict(), state_dict_path)

In [None]:
import matplotlib.pyplot as plt
# Scatter every stored action against its slot index in the replay memory.
plt.scatter(
    np.arange(len(agent.action_memory)),
    agent.action_memory,
)

In [None]:
# Restore the Q-network weights from the checkpoint file at state_dict_path.
agent.Q_eval.load_state_dict(T.load(state_dict_path)) # load saved model

In [None]:
# Print the last four slots of every replay buffer in a single print call.
print(*(
    buffer[-4:]
    for buffer in (
        agent.state_memory,
        agent.new_state_memory,
        agent.action_memory,
        agent.reward_memory,
        agent.terminal_memory,
    )
))

In [None]:
# Summary statistics over the whole action buffer (unfilled slots included).
tmp = agent.action_memory
print('mean', tmp.mean())
print('max', tmp.max())
print('min', tmp.min())

In [None]:
# Show the info object returned by the most recent env.step call.
info