In [1]:
import numpy as np
import random
from collections import namedtuple, deque

from model import QNetwork

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
BUF_SIZE = 1e5
N = 5
TAU = 1e-3
batch_size = 256
GAMMA = 0.99
ALPHA = 1e-4
BETA = 1e-4
TD_EPSILON = 1e-3
NOISE_EPSILON = 0.3

In [3]:
import multiprocessing as mp

In [4]:
class ReplayBuffer():
    def __init__(self, BUF_SIZE,N):
        self.N = N
        self.queue = deque(maxlen = BUF_SIZE)
        
    def add(priority, exp, lock):
        lock.aquire()
        self.queue.append((priority, exp))
        lock.release()
        
    def sample(batch_size, lock, rollout_length = self.N):
        
        lock.aquire()
        w = []
        for e in self.queue:
            w.append(e[0])
            
        exp = random.choices(self.queue, weights = w, k = batch_size)
        
        for e in exp:
            self.queue.remove(e)
            
        lock.release()
        
        st_init = []
        st_final = []
        action = []
        done = []
        reward = []
        priority = []
        for (pr,x) in exp:
            priority.append(pr)
            st_init.append(x[0])
            st_final.append(x[1])
            action.append(x[2])
            done.append(x[3])
            reward.append(x[4])
            
        st_init = torch.from_numpy(np.array(st_init)).float().to(device)
        st_final = torch.from_numpy(np.array(st_final)).float().to(device)
        action = torch.from_numpy(np.array(action)).float().to(device)
        done = torch.from_numpy(np.array(done)).float().to(device)
        reward = torch.from_numpy(np.array(reward)).float().to(device)
        priority = torch.from_numpy(np.array(priority)).float().to(device)
        priority = priority/torch.mean(priority)
        
        return st_init, st_final, action, done, reward, priority
        
        
        
        
        
        
        

In [None]:
class Learner():
    def __init__(self, state_size, action_size, seed):
        
        self.actor_network = ActorNetwork(state_size, action_size, seed).to(device)
        self.actor_target_network = ActorNetwork(state_size, action_size, seed).to(device)
        
        self.critic_network = CriticNetwork(state_size, action_size, seed).to(device)
        self.critic_target_network = CriticNetwork(state_size, action_size, seed).to(device)
        
        self.rollout_length = N
        self.gamma = GAMMA
        self.batch_size = batch_size
        self.replaymem = ReplayBuffer(BUF_SIZE, self.rollout_length)
        
        self.optimizer_actor = optim.Adam(self.actor_network.parameters(), lr=ALPHA)
        self.optimizer_critic = optim.Adam(self.critic_network.parameters(), lr=BETA)
        self.tau = TAU
        
    def learn(lock):
        st_init, st_final, action, done, reward, priority = self.replaymem.sample(self.batch_size, lock)  #sampling from minibatch
        
        root_priority = torch.sqrt(priority)    # squar rooting priorities, since we are using mse loss and 
                                                # i am multiplying both the target and predicted values with 
                                                # priorities beforehand, so ((a-b)^2)/k = (a/p - b/p)^2 
                                                # where p = sqrt(k)
                    
        q_initialst = (self.critic_network(st_init, action))/root_priority  # qvalues of (s(i),a(i))

        action_finalst = self.actor_target_network(st_final)                # action_values of final state(s(i+N))
        q_finalst = self.critic_target_network(st_final, action_finalst)    # q_value of final (s(i+N), afinal)
        q_finalst = q_finalst*(1-done)
        
        disc = 1
        gamma = self.gamma
        g = []
        for _ in range(self.rollout_length):
            g.append(disc)
            disc *= gamma
            
        g = np.array(g)
        
        disc_reward = g*reward                                           # discounted reward
        
        y_val = (torch.sum(disc_reward, dim = 1) + q_finalst)/root_priority  # target value
        
        td_error = torch.abs(y_val - q_initialst)*root_priority + TD_EPSILON
        
        for i in range(self.batch_size):
            self.replaymem.add(td_error[i], (st_init[i], st_final[i], action[i], done[i], reward[i]))
        
        critic_loss = (F.mse_loss(q_initialst, y_val))/BUF_SIZE   # mse loss between target and predicted value
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm(self.critic_network.parameters(), 1)  # gradient clipping
        self.optimizer_critic.step()
        
        
        action_init = self.actor_network(st_init)          # action values as predicted by actor(policy) network
        actor_loss = -self.critic_network(st_init, action_init)   # q_values of (s(i), action_value) by critic network
                                                                  # negative sign because of gradient ascent
        
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()
        
        soft_update(actor_network, actor_target_network, self.tau)
        soft_update(critic_network, critic_target_network, self.tau)
        
    def soft_update(self, local_model, target_model, tau):

        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
        
        

In [56]:
class ActorNetwork(nn.Module):
    
    def __init__(self, state_size, action_size, seed, h1_size = 128, h2_size = 128):
        
        
        super(ActorNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, h1_size)
        self.fc2 = nn.Linear(h1_size, h2_size)
        self.fc3 = nn.Linear(h2_size, action_size)
        
        
    def forward(x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.tanh(self.fc3(x))
        return x
    
    
class CriticNetwork(nn.Module):
    
    def __init__(self, state_size, action_size, seed, hs1_size = 128, ha1_size = 32, h2_size = 64):
        
        
        super(CriticNetwork, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fcs1 = nn.Linear(state_size, hs1_size)
        self.fca1 = nn.Linear(action_size, ha1_size)
        self.fc2 = nn.Linear(hs1_size + ha1_size, h2_size) 
        self.fc3 = nn.Linear(h2_size, action_size)
        
        
    def forward(x,a):
        x = F.relu(self.fcs1(x))
        a = F.relu(self.fca1(a))
        y = torch.cat((x,a), dim = 1)
        y = F.relu(self.fc2(y))
        y = F.relu(self.fc3(y))
        return y

In [None]:
class Agent():
    
    def __init__(state_size, action_size, seed):
        self.actor_network = ActorNetwork(state_size, action_size, seed).to(device)
        self.critic_network = CriticNetwork(state_size, action_size, seed).to(device)
        

In [None]:
a.train()

In [50]:
b = deque(maxlen = 3)
b.append((1,torch.tensor([3, 0.4, 0.256])))
b.append((2, torch.tensor([4, 0.1, 0.4785])))
b.append((3, torch.tensor([5, 0.36, 0.4785])))
batch_size = 2



w = [1/6,2/6,3/6]
e = np.random.choice(np.arange(3), size = 2, replace=False, p = w)

b.remove(b[e[0]])
b
# for i in e:
#     b.remove(i)
#     break



deque([(1, tensor([ 3.0000,  0.4000,  0.2560])),
       (2, tensor([ 4.0000,  0.1000,  0.4785]))])

In [60]:
e = np.sort(e)
len(e)
j = []
j.append(torch.tensor([1,2,3]))
j.append(torch.tensor([4,5,6]))
j.append(torch.tensor([7,8,9]))
j


[tensor([ 1,  2,  3]), tensor([ 4,  5,  6]), tensor([ 7,  8,  9])]

In [62]:
np.vstack(j)

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [52]:
q = deque(maxlen = 3)
q.append((1, 'er'))
q.append((2, 'ol'))
q.append((3, 'kp'))
q


deque([(1, 'er'), (2, 'ol'), (3, 'kp')])

In [53]:
l[0][1][0]

'nb'

In [60]:
a = np.array([[1,2,3],[4,5,6],[7,8,9]])
b = torch.from_numpy(a)
b

tensor([[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]])

In [5]:
l = []
l.append([10, 30])
l.append([20,40])
l

[[10, 30], [20, 40]]

In [6]:
a = torch.tensor(l)
a

tensor([[ 10,  30],
        [ 20,  40]])

In [61]:
a

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [69]:
a[:,1]

array([2, 5, 8])

In [44]:
e

tensor([ 1.4000,  3.2000,  5.0000], dtype=torch.float64)

In [40]:
1/e

tensor([ 2.1429,  0.9375,  0.6000], dtype=torch.float64)

In [42]:
d/e

tensor([[ 0.2143,  0.3750,  0.5400],
        [ 0.8571,  0.9375,  1.0800],
        [ 1.5000,  1.5000,  1.6200]], dtype=torch.float64)

In [21]:
from multiprocessing import Condition, Lock, Process, Queue




def f1(cv,a,q):
    cv.acquire()
    print(1)
    a.append(4)
    q.put(a)
    cv.release()

        
def f2(cv,q):
    cv.acquire()
    print(2)
    a = q.get()
    print(a)
    cv.release()
    
    
    
    
def f():   
    cv = Condition()
    a = [1, 2, 3]
    q = Queue()
    
    p1 = Process(target = f1, args=(cv,a,q,))
    p2 = Process(target = f2, args=(cv,q,))

    p1.start()
    p2.start()

    p1.join()
    p2.join()
    
f()

1
2
[1, 2, 3, 4]


[1, 2, 3]


In [13]:
a = [1,2,3]
a

[1, 2, 3]

In [14]:
l = Lock(a)

TypeError: Lock() takes 1 positional argument but 2 were given

In [68]:
class ReplayBuffer():
    def __init__(self, BUF_SIZE,N):
        self.N = N
        self.queue = deque(maxlen = BUF_SIZE)
        
    def add(self, priority, exp):
        #lock.acquire()
        self.queue.append((priority, exp))
        #lock.release()
    def update(self, indices, td_error):
        for i in range(len(indices)):
            self.queue[indices[i]][0] = td_error[i]
        
    def sample(self, batch_size, rollout_length = N):

    #lock.acquire()
        w = []
        for e in self.queue:
            w.append(e[0])

        w = np.array(w)
        w /= np.sum(w)
        print(self.queue[0])
        print(batch_size)

        indices = np.random.choice(np.arange(len(self.queue)), size = batch_size, replace = False, p = w)

        exp = []
        for i in indices:
            exp.append(self.queue[i])

    #         print(exp[0])
    #         indices = np.sort(indices)[::-1]

    #         print(indices[0])
    #         for i in indices:
    #             self.queue.remove(self.queue[i])


    #lock.release()

        st_init = []
        st_final = []
        action = []
        done = []
        reward = []
        priority = []
        for (pr,x) in exp:
            priority.append(pr)
            st_init.append(x[0])
            st_final.append(x[1])
            action.append(x[2].detach())
            done.append(x[3])
            reward.append(x[4])

    #         print(st_init[0])
    #         print(st_final[0])
    #         print(action[0])
    #         print(done[0])
    #         print(reward[0])
    #         print(priority[0])

        st_init = torch.from_numpy(np.vstack(st_init)).float().to(device)
        st_final = torch.from_numpy(np.vstack(st_final)).float().to(device)
        action = torch.from_numpy(np.vstack(action)).float().to(device)
        dones = torch.from_numpy(np.array(done)).float().to(device)
        reward = torch.from_numpy(np.array(reward)).float().to(device)
        priority = torch.from_numpy(np.array(priority)).float().to(device)
        priority = priority/torch.sum(priority)

        return st_init, st_final, action, done, reward, priority, indices

    def __len__(self):
        return len(self.queue)
        
        
        
        
        
        
        



SyntaxError: invalid syntax (<ipython-input-68-3e96e6062ea6>, line 67)