In [1]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class DeepQNetwork(nn.Module):
    """
    Fully-connected network that maps an observation to one Q value per action.

    The module also owns its optimizer (Adam) and loss (MSE), and moves itself
    to 'cuda:0' when CUDA is available, otherwise to 'cpu'.

    Parameters
    ----------
    lr : float
        learning rate handed to the Adam optimizer
    input_dims : list
        unpacked as the leading argument(s) of the first linear layer, e.g. [8]
    fc1_dims : int
        output width of fully-connected layer 1
    fc2_dims : int
        output width of fully-connected layer 2
    n_actions : int
        the number of actions (width of the output layer)
    """
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super().__init__()
        self.input_dims, self.fc1_dims = input_dims, fc1_dims
        self.fc2_dims, self.n_actions = fc2_dims, n_actions

        # Layers are registered in forward order: fc1 -> fc2 -> fc3.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.fc3 = nn.Linear(fc2_dims, n_actions)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return the raw (un-activated) Q values for `state`."""
        hidden = F.relu(self.fc1(state))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

class Agent():
    """
    Epsilon-greedy deep Q-learning agent with a fixed-size replay buffer.

    Transitions are written into preallocated NumPy arrays whose slots are
    reused cyclically once `max_mem_size` transitions have been stored. One
    DeepQNetwork (fc1_dims=256, fc2_dims=256) is used both to pick greedy
    actions and to build the bootstrap targets.

    Parameters
    ----------
    gamma : float
        discount factor applied to the next-state value
    epsilon : float
        initial probability of taking a random action
    lr : float
        learning rate handed to the network
    input_dims : list
        shape of a single observation, e.g. [num_states]
    batch_size : int
        number of transitions sampled per learning step
    n_actions : int
        the number of actions
    max_mem_size : int
        capacity of the replay buffer
    eps_end : float
        lower bound for epsilon
    eps_dec : float
        amount subtracted from epsilon after every learning step
    """
    def __init__(self, gamma, epsilon, lr, input_dims, batch_size, n_actions,
                max_mem_size=100000, eps_end=0.01, eps_dec=1e-5):
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_min = eps_end
        self.eps_dec = eps_dec
        self.lr = lr
        self.action_space = list(range(n_actions)) # list of actions
        self.mem_size = max_mem_size
        self.batch_size = batch_size
        self.mem_cntr = 0

        self.Q_eval = DeepQNetwork(self.lr, n_actions=n_actions, input_dims=input_dims,
                                  fc1_dims=256, fc2_dims=256)

        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        # Actions are indices into the Q-value vector, so keep them integral.
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        # Builtin `bool` instead of the `np.bool` alias, which is not available
        # in every NumPy release.
        self.terminal_memory = np.zeros(self.mem_size, dtype=bool)

    def store_transition(self, state, action, reward, state_, done):
        """Write one (state, action, reward, next state, done) transition to the buffer."""
        # The memory is finite, so the oldest slot is overwritten once it is full.
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = done

        self.mem_cntr += 1

    def choose_action(self, observation):
        """
        Pick an action epsilon-greedily.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the largest predicted Q value. The result is
        always a Python int.
        """
        if np.random.random() > self.epsilon:
            # Cast to float32 (the network's parameter dtype) and add a batch
            # axis of size 1 in front.
            state = T.as_tensor(np.asarray(observation, dtype=np.float32),
                                device=self.Q_eval.device).unsqueeze(0)
            # Action selection never needs gradients.
            with T.no_grad():
                actions = self.Q_eval(state)
            action = T.argmax(actions).item() # .item() to get integer
        else:
            action = int(np.random.choice(self.action_space))

        return action

    def learn(self):
        """
        Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until at least `batch_size` transitions have been stored.
        After the step, epsilon is decreased by `eps_dec` and clamped at
        `eps_min`.
        """
        if self.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device

        # Only the slots that have actually been written may be sampled.
        max_mem = min(self.mem_cntr, self.mem_size)
        # Sample without replacement so a transition appears at most once per batch.
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        state_batch = T.as_tensor(self.state_memory[batch], device=device)
        new_state_batch = T.as_tensor(self.new_state_memory[batch], device=device)
        reward_batch = T.as_tensor(self.reward_memory[batch], device=device)
        terminal_batch = T.as_tensor(self.terminal_memory[batch], device=device)
        action_batch = T.as_tensor(self.action_memory[batch], dtype=T.long, device=device)

        # Q value of the action that was actually taken in each sampled state.
        q_eval = self.Q_eval(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)

        # The bootstrap target is treated as a constant: building it under
        # no_grad keeps the optimizer from back-propagating through it.
        with T.no_grad():
            q_next = self.Q_eval(new_state_batch).max(dim=1)[0]
            q_next[terminal_batch] = 0.0 # no future value after a terminal step
            q_target = reward_batch + self.gamma * q_next

        self.Q_eval.optimizer.zero_grad()
        loss = self.Q_eval.loss(q_eval, q_target)
        loss.backward()
        self.Q_eval.optimizer.step()

        # Linear decay that can never undershoot the floor.
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)
        

In [None]:
from env import TradingSPYEnv
import numpy as np
# Build the environment and report the sizes used to configure the agent.
env = TradingSPYEnv(sma_len=[5,25])
num_states = env.observation_space.shape[0]
print("Size of State Space ->  {}".format(num_states))
num_actions = env.action_space.n
print("Size of Action Space ->  {}".format(num_actions))

agent = Agent(
    gamma=0.99,
    epsilon=1.0,
    lr=0.003,
    input_dims=[num_states],
    batch_size=64,
    n_actions=num_actions,
    eps_end=0.01,
)
scores = []       # total reward of every finished episode
eps_history = []  # epsilon at the end of every episode
n_games = 1000


for i in range(n_games):
    score, done = 0, False
    observation = env.reset()
    # The replay buffer and network work in float32; downcast float64 observations.
    if observation.dtype == np.float64:
        observation = observation.astype(np.float32)

    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        # A step that yields no next observation is neither scored nor stored.
        has_next = observation_ is not None
        if has_next and observation_.dtype == np.float64:
            observation_ = observation_.astype(np.float32)
        if has_next:
            score += reward
            agent.store_transition(observation, action, reward, observation_, done)

        agent.learn()
        observation = observation_

    scores.append(score)
    eps_history.append(agent.epsilon)

    # Mean over (at most) the 100 most recent episodes.
    avg_score = np.mean(scores[-100:])

    print('episode ', i, 'score %.2f' % score,
          'average score %.2f' % avg_score,
          'epsilon %.2f ' % agent.epsilon)

    


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Size of State Space ->  6
Size of Action Space ->  3
episode  0 score -4555.82 average score -4555.82 epsilon 0.96 
episode  1 score -6007.42 average score -5281.62 epsilon 0.93 
episode  2 score -2208.56 average score -4257.27 epsilon 0.91 
episode  3 score -2567.11 average score -3834.73 epsilon 0.88 
episode  4 score -2457.70 average score -3559.32 epsilon 0.86 
episode  5 score -1660.05 average score -3242.78 epsilon 0.86 
episode  6 score -3215.23 average score -3238.84 epsilon 0.84 
episode  7 score -1770.05 average score -3055.24 epsilon 0.83 
episode  8 score -3015.92 average score -3050.87 epsilon 0.81 
episode  9 score -1884.45 average score -2934.23 epsilon 0.78 
episode  10 score -1347.19 average score -2789.95 epsilon 0.76 
episode  11 score -673.44 average score -2613.58 epsilon 0.75 
episode  12 score -3069.13 average score -2648.62 epsilon 0.74 
episode  13 score -2205.40 average score -2616.96 epsilon 0.71 
episode  14 score -1930.29 average score -2571.18 epsilon 0.69

episode  129 score 2693.29 average score -2141.12 epsilon 0.01 
episode  130 score 7339.11 average score -2045.17 epsilon 0.01 
episode  131 score 18495.86 average score -1780.04 epsilon 0.01 
episode  132 score 31091.44 average score -1448.76 epsilon 0.01 
episode  133 score 29832.65 average score -1109.14 epsilon 0.01 
episode  134 score 14532.67 average score -946.10 epsilon 0.01 
episode  135 score 18400.05 average score -697.61 epsilon 0.01 
episode  136 score 13738.29 average score -492.56 epsilon 0.01 
episode  137 score 12941.23 average score -325.07 epsilon 0.01 
episode  138 score 12442.55 average score -189.79 epsilon 0.01 
episode  139 score 12614.33 average score -43.94 epsilon 0.01 
episode  140 score 12625.39 average score 147.14 epsilon 0.01 
episode  141 score 6219.89 average score 289.63 epsilon 0.01 
episode  142 score 11387.99 average score 466.53 epsilon 0.01 
episode  143 score 27798.30 average score 766.80 epsilon 0.01 
episode  144 score 23155.75 average score 1

episode  257 score 1318.00 average score 13671.55 epsilon 0.01 
episode  258 score 19191.79 average score 13807.14 epsilon 0.01 
episode  259 score 3946.47 average score 13707.85 epsilon 0.01 
episode  260 score 26341.27 average score 13724.70 epsilon 0.01 
episode  261 score 5251.11 average score 13764.95 epsilon 0.01 
episode  262 score 4857.12 average score 13614.54 epsilon 0.01 
episode  263 score 26120.47 average score 13870.72 epsilon 0.01 
episode  264 score 17653.62 average score 13917.97 epsilon 0.01 
episode  265 score 21205.99 average score 14008.02 epsilon 0.01 
episode  266 score 2898.77 average score 13849.02 epsilon 0.01 
episode  267 score 26036.56 average score 14064.14 epsilon 0.01 
episode  268 score 34759.66 average score 14247.04 epsilon 0.01 
episode  269 score 16265.78 average score 14039.91 epsilon 0.01 
episode  270 score 19538.85 average score 14080.24 epsilon 0.01 
episode  271 score 12876.63 average score 13933.20 epsilon 0.01 
episode  272 score 5049.03 ave

episode  385 score 11140.99 average score 11540.87 epsilon 0.01 
episode  386 score 15082.62 average score 11652.92 epsilon 0.01 
episode  387 score 1226.68 average score 11631.60 epsilon 0.01 
episode  388 score 16922.67 average score 11773.69 epsilon 0.01 
episode  389 score 13386.16 average score 11881.62 epsilon 0.01 
episode  390 score 11257.13 average score 11828.95 epsilon 0.01 
episode  391 score 5905.07 average score 11842.23 epsilon 0.01 
episode  392 score 14951.64 average score 11841.10 epsilon 0.01 
episode  393 score 12014.97 average score 11924.01 epsilon 0.01 
episode  394 score 16207.83 average score 12065.20 epsilon 0.01 
episode  395 score 3780.91 average score 12107.63 epsilon 0.01 
episode  396 score 11176.51 average score 12133.41 epsilon 0.01 
episode  397 score 4647.66 average score 12067.94 epsilon 0.01 
episode  398 score 11334.25 average score 12135.03 epsilon 0.01 
episode  399 score 20366.73 average score 12225.89 epsilon 0.01 
episode  400 score 11177.29 a

In [None]:
from utils import plot_learning_curve
import os
# 1-based episode numbers, one per game.
x = list(range(1, n_games + 1))
# `filename` is only consumed by the plotting call below, which is disabled.
filename = 'lunar_lander_2020.png'
#plot_learning_curve(x, scores, eps_history, filename)

# Persist the trained network's weights next to the notebook's working directory.
state_dict_path = os.path.join(os.getcwd(), 'test.pth')
trained_weights = agent.Q_eval.state_dict()
T.save(trained_weights, state_dict_path)


In [None]:
import matplotlib.pyplot as plt
# Scatter every slot of the replay buffer's action array against its slot index.
stored_actions = agent.action_memory
plt.scatter(np.arange(len(stored_actions)), stored_actions)

In [None]:
# Restore the weights stored at `state_dict_path`.
# map_location remaps the stored tensors onto the network's current device, so
# a checkpoint written on a CUDA machine can still be loaded on a CPU-only one.
# NOTE(review): T.load unpickles the file -- only load checkpoints you trust.
saved_weights = T.load(state_dict_path, map_location=agent.Q_eval.device)
agent.Q_eval.load_state_dict(saved_weights) # load saved model

In [None]:
# Show whether PyTorch reports a usable CUDA device in this kernel.
T.cuda.is_available()