In [1]:
import numpy as np
import random
from collections import namedtuple, deque

from model import QNetwork

import torch
import torch.nn.functional as F
import torch.optim as optim

In [2]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [3]:
import multiprocessing as mp

In [4]:
class ReplayBuffer():
    """Bounded store of ``(priority, experience)`` pairs with greedy priority sampling.

    Every experience is an indexable record laid out as
    ``(st_init, st_final, action, done, r_0, ..., r_{N-1})``: positions 0-3 hold
    the initial state, the final state, the action and the done flag, and the
    per-step rewards of the rollout start at position 4.

    Once ``BUF_SIZE`` entries are stored, appending a new one silently drops the
    oldest (``deque(maxlen=...)`` semantics).
    """

    def __init__(self, BUF_SIZE, N):
        """Create an empty buffer.

        Args:
            BUF_SIZE: maximum number of entries kept in the buffer.
            N: default number of reward entries read per experience by ``sample``.
        """
        self.N = N
        self.queue = deque(maxlen=BUF_SIZE)

    def add(self, priority, exp):
        """Append one experience record together with its priority."""
        self.queue.append((priority, exp))

    def sample(self, batch_size, rollout_length=None):
        """Return the ``batch_size`` highest-priority experiences as tensors.

        Selection is deterministic: entries are ranked by priority (highest
        first) and the top ``batch_size`` are taken; entries with equal priority
        keep their insertion order. Fewer entries are returned when the buffer
        holds fewer than ``batch_size``.

        Args:
            batch_size: maximum number of experiences to return.
            rollout_length: number of reward entries to read from each
                experience; defaults to the ``N`` given at construction.

        Returns:
            Tuple ``(st_init, st_final, action, done, reward, priority)`` of
            float32 tensors placed on the module-level ``device``. ``reward``
            has one row per experience and ``rollout_length`` columns.

        Raises:
            IndexError: if an experience holds fewer than ``rollout_length``
                reward entries.
        """
        # A default argument cannot refer to ``self``; resolve it at call time.
        if rollout_length is None:
            rollout_length = self.N

        # Rank on the priority alone. Comparing whole (priority, exp) tuples would
        # fall back to comparing the experiences whenever two priorities tie, which
        # raises for NumPy arrays (ambiguous truth value).
        ranked = sorted(self.queue, key=lambda entry: entry[0], reverse=True)
        chosen = ranked[:batch_size]

        priority = [pr for pr, _ in chosen]
        st_init = [x[0] for _, x in chosen]
        st_final = [x[1] for _, x in chosen]
        action = [x[2] for _, x in chosen]
        done = [x[3] for _, x in chosen]
        # Explicit indexing (not slicing) so a too-short record fails loudly
        # instead of producing ragged reward rows.
        reward = [[x[4 + i] for i in range(rollout_length)] for _, x in chosen]

        def as_tensor(values):
            # float32 conversion happens in NumPy so bool/int inputs are handled too.
            return torch.from_numpy(np.asarray(values, dtype=np.float32)).to(device)

        return (as_tensor(st_init), as_tensor(st_final), as_tensor(action),
                as_tensor(done), as_tensor(reward), as_tensor(priority))
        
        
        
        
        
        
        

In [None]:
class Learner():
    """Actor-critic learner that trains from prioritized N-step rollouts.

    Holds a local and a target copy of both the actor and the critic network,
    one Adam optimizer per local network, and the replay buffer the training
    batches are drawn from.

    The hyper-parameters are read from module-level names that must be defined
    before use: ``N`` (rollout length), ``GAMMA`` (discount), ``BUF_SIZE``
    (replay capacity), ``ALPHA`` / ``BETA`` (actor / critic learning rates),
    ``TAU`` (soft-update rate) and ``batch_size`` (read by ``learn``).
    """

    def __init__(self, state_size, action_size, seed):
        """Build the four networks, their optimizers and the replay buffer.

        Args:
            state_size: forwarded to ``QNetwork``.
            action_size: forwarded to ``QNetwork``.
            seed: forwarded to ``QNetwork``; all four networks receive the same seed.
        """
        self.actor_network = QNetwork(state_size, action_size, seed).to(device)
        self.actor_target_network = QNetwork(state_size, action_size, seed).to(device)

        self.critic_network = QNetwork(state_size, action_size, seed).to(device)
        self.critic_target_network = QNetwork(state_size, action_size, seed).to(device)

        self.rollout_length = N
        self.gamma = GAMMA
        self.replaymem = ReplayBuffer(BUF_SIZE, self.rollout_length)
        # Backward-compatible alias for the original (misspelled) attribute name.
        self.replaymen = self.replaymem

        self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=ALPHA)
        self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=BETA)
        self.tau = TAU

    def learn(self):
        """Run one critic update, one actor update and a soft target update.

        Draws ``batch_size`` experiences from the replay buffer. Does nothing
        when the buffer returns an empty batch.
        """
        st_init, st_final, action, done, reward, priority = self.replaymem.sample(batch_size)

        n_samples = st_init.shape[0]
        if n_samples == 0:
            return

        # Both sides of the MSE are divided by sqrt(priority) so that the squared
        # error ends up scaled by 1/priority: ((a - b)^2)/k == (a/p - b/p)^2, p = sqrt(k).
        # NOTE(review): a priority of 0 makes this division blow up; confirm that
        # priorities are strictly positive.
        # Per-sample vectors are reshaped to columns so they line up with a
        # (batch, k) critic output instead of broadcasting to (batch, batch).
        root_priority = torch.sqrt(priority).reshape(n_samples, 1)
        done = done.reshape(n_samples, 1)

        # Q(s_i, a_i) from the local critic.
        q_initialst = self.critic_network(st_init, action).reshape(n_samples, -1) / root_priority

        # The target is a constant for the critic update: build it without autograd
        # so no gradients are tracked through (or stored on) the target networks.
        with torch.no_grad():
            action_finalst = self.actor_target_network(st_final)            # a' for s_{i+N}
            q_finalst = self.critic_target_network(st_final, action_finalst)
            q_finalst = q_finalst.reshape(n_samples, -1) * (1 - done)       # no bootstrap after terminal

            # [1, gamma, ..., gamma^(N-1)] as a tensor on the rewards' device/dtype
            # (a NumPy array cannot be combined with a tensor that lives on the GPU).
            discounts = torch.tensor(
                [self.gamma ** step for step in range(self.rollout_length)],
                dtype=reward.dtype, device=reward.device)
            disc_reward = reward * discounts
            # N-step return: the bootstrapped value is N steps away, hence gamma^N.
            bootstrap_discount = self.gamma ** self.rollout_length
            y_val = (disc_reward.sum(dim=1, keepdim=True)
                     + bootstrap_discount * q_finalst) / root_priority

        critic_loss = F.mse_loss(q_initialst, y_val) / BUF_SIZE
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()

        # Actor update: ascend the critic's value of the actor's own actions,
        # written as minimizing the negated batch mean (backward() needs a scalar).
        action_init = self.actor_network(st_init)
        actor_loss = -self.critic_network(st_init, action_init).mean()

        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()

        self.soft_update(self.actor_network, self.actor_target_network, self.tau)
        self.soft_update(self.critic_network, self.critic_target_network, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Blend the local weights into the target: target = tau*local + (1 - tau)*target.

        Args:
            local_model: network whose parameters are read.
            target_model: network whose parameters are overwritten in place.
            tau: interpolation factor; 1.0 copies the local weights, 0.0 leaves
                the target unchanged.
        """
        with torch.no_grad():
            for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
                target_param.copy_(tau * local_param + (1.0 - tau) * target_param)
        
        

In [18]:
# NOTE(review): unfinished scratch stub. A class statement with no body is a
# SyntaxError, and once completed this definition would rebind the name
# `QNetwork` that is imported from `model` at the top of the notebook.
# TODO: give it a body under a different name, or delete this cell.
class QNetwork():
    

In [19]:
# Toy queue of (priority, experience) pairs for the sorting experiments.
# Defined here so the cell runs on a fresh kernel; the contents are taken from
# the output recorded for this cell.
q = deque([(2, ('j', 'nana')), (1, ('nb', 'kaka'))])
q

deque([(2, ('j', 'nana')), (1, ('nb', 'kaka'))])

In [22]:
# Tuples compare element by element, so this sorts ascending on the priority
# (first element) and keeps only the lowest-priority entry, still wrapped in a list.
l = sorted(q)[:1]

In [24]:
l

[(1, ('nb', 'kaka'))]

In [28]:
l[0][1][0]

'nb'

In [29]:
# 3x3 grid holding 1..9 row by row, then the same data viewed as a tensor.
a = np.arange(1, 10).reshape(3, 3)
b = torch.from_numpy(a)
b

tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]])

In [30]:
c = np.array([0.1, 0.2, 0.3])
c

array([0.1, 0.2, 0.3])

In [32]:
# Scale column j of `b` by c[j]. The weights are converted to a tensor first so
# the product is an ordinary tensor-tensor broadcast rather than relying on how
# ndarray * Tensor happens to dispatch (NOTE(review): that dispatch appears to
# vary across NumPy/PyTorch versions - confirm on the target version).
d = torch.from_numpy(c) * b

In [41]:
d

tensor([[ 0.1000,  0.4000,  0.9000],
        [ 0.4000,  1.0000,  1.8000],
        [ 0.7000,  1.6000,  2.7000]], dtype=torch.float64)

In [43]:
# Row totals of `d` (reduce over the column axis).
e = d.sum(dim=1)

In [44]:
e

tensor([ 1.4000,  3.2000,  5.0000], dtype=torch.float64)

In [40]:
1/e

tensor([ 2.1429,  0.9375,  0.6000], dtype=torch.float64)

In [42]:
d/e

tensor([[ 0.2143,  0.3750,  0.5400],
        [ 0.8571,  0.9375,  1.0800],
        [ 1.5000,  1.5000,  1.6200]], dtype=torch.float64)