In [1]:
import math
import time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import random
from collections import namedtuple
from itertools import count
import torch
import torch.nn as nn
import csv
import os
import pandas as pd
import gym

In [2]:
env = gym.make('Pendulum-v0')

In [3]:
def testar_gpu():
    train_on_gpu = torch.cuda.is_available() #Observa se a GPU está disponivel
    if train_on_gpu: #Se sim
        device = torch.device('cuda') #Seleciona o device como GPU
        print("Treinando na GPU") #E manda a mensagem
    else: #Se não
        device = torch.device('cpu') #Seleciona o device como cpu
        print("GPU indisponível, treinando na CPU") #E avisa que a GPU não esta disponível
    return device

In [4]:
device = testar_gpu()


Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

Treinando na GPU


In [5]:
class ReplayMemory(object):
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [6]:
input_size = int(env.observation_space.shape[0])
output_size = int(env.action_space.shape[0])
max_action = float(env.action_space.high[0])

In [7]:
class Actor(nn.Module):
    def __init__(self, input_size, outputs):
        super(Actor, self).__init__()
        self.linear1 = nn.Linear(input_size, 32)
        self.ln1 = nn.LayerNorm(32)

        self.linear2 = nn.Linear(32, 64)
        self.ln2 = nn.LayerNorm(64)

        self.linear3 = nn.Linear(64, outputs)
        #self.linear3.weight.data.mul_(10)
        #self.linear3.bias.data.mul_(10)
        
        self.tanh = nn.Tanh()

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        out = nn.functional.relu(self.ln1(self.linear1(x)))
        out = nn.functional.relu(self.ln2(self.linear2(out)))
        out = max_action*self.tanh(self.linear3(out))
        return out

In [8]:
class Critic(nn.Module):
    def __init__(self, state_size, action_size, outputs):
        super(Critic, self).__init__()
        self.linear1 = nn.Linear(state_size, 32)
        self.ln1 = nn.LayerNorm(32)

        self.linear2 = nn.Linear(32 + action_size, 64)
        self.ln2 = nn.LayerNorm(64)

        self.linear3 = nn.Linear(64, outputs)
        self.linear3.weight.data.mul_(10)
        self.linear3.bias.data.mul_(10)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, state, action):
        s1 = nn.functional.relu(self.ln1(self.linear1(state)))
        out = torch.cat((s1,action), dim=1)
        out = nn.functional.relu(self.ln2(self.linear2(out)))
        out = self.linear3(out)
        return out


In [9]:
BATCH_SIZE = 100
GAMMA = 0.95
tau = 0.05

# CONFIG PARAM NOISE
sigma_0 = 0.30
alfa = 1.01
delta = 1e-1
T_adapt = 5

#CONFIG ACNOISE
desv_pad_ac = 0.3
T_adapt_ac = 20
alfa_ac = 1.01

actor_net = Actor(input_size, output_size).to(device)
actor_noise_net = Actor(input_size, output_size).to(device)
actor_target_net = Actor(input_size, output_size).to(device)

critic_net = Critic(input_size, output_size, 1).to(device)
critic_target_net = Critic(input_size, output_size, 1).to(device)

actor_target_net.load_state_dict(actor_net.state_dict())
actor_target_net.eval()
actor_noise_net.load_state_dict(actor_net.state_dict())
actor_noise_net.eval()
critic_target_net.load_state_dict(critic_net.state_dict())
critic_target_net.eval()

optimizer_critic = torch.optim.AdamW(critic_net.parameters(), lr=0.001)
optimizer_actor = torch.optim.AdamW(actor_net.parameters(), lr=0.0002)

memory = ReplayMemory(5000)

loss_d = nn.MSELoss()
loss_critic = nn.SmoothL1Loss()

actor_loss = None
critic_loss = None

In [10]:
def select_action(state, action_noise=None, param_noise=None):
    actor_net.eval()
    if param_noise is not None: 
        mu = actor_noise_net(state.float())
    else:
        mu = actor_net(state.float())

    mu = mu.data
    if action_noise is not None:
        with torch.no_grad():
            mu = torch.Tensor(mu.cpu().numpy() + np.random.normal(0,desv_pad_ac,output_size)).to(device)

    mu = mu.squeeze()
    return mu

In [11]:
def weightSync(target_model, source_model):
    for parameter_target, parameter_source in zip(target_model.parameters(), source_model.parameters()):
        parameter_target.data.copy_((1 - tau) * parameter_target.data + tau * parameter_source.data)


In [13]:
def optimize_model(t):
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    
    actor_net.train()
    critic_net.train()
    actor_target_net.train()
    critic_target_net.train()

    batch = Transition(*zip(*transitions))

    action_batch, state_batch = [],[]
    next_state_batch = torch.autograd.Variable(torch.cat(batch.next_state)).to(device)
    state_batch = torch.cat(batch.state).to(device)
    for tupla in batch.action:
        action_batch.append(np.array(tupla.cpu()))
    action_batch = torch.from_numpy(np.array(action_batch)).view(-1,1).to(device)
    reward_batch = torch.cat(batch.reward)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_actor_action_value = actor_target_net(next_state_batch.float())
    with torch.no_grad():
        next_state_values = critic_target_net(next_state_batch.float(), next_actor_action_value.float()).squeeze()

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Optimize the model
    optimizer_critic.zero_grad()
    state_action_values = critic_net(state_batch.float(), action_batch.float())
    critic_loss = loss_critic(state_action_values.float(), expected_state_action_values.unsqueeze(1).float())
    if i_episode == 10:
        print(critic_loss)
    if i_episode == 50:
        print(critic_loss)
        
    
    critic_loss.backward()
    optimizer_critic.step()

    optimizer_actor.zero_grad()            
    gradient = -critic_net(state_batch.float(), actor_net(state_batch.float()).float())
    actor_loss = torch.mean(gradient)
    actor_loss.backward()
    optimizer_actor.step()
    
    if t%10 == 0: 
        weightSync(actor_target_net, actor_net)
        weightSync(critic_target_net, critic_net)
    
    return state_batch, action_batch

In [14]:
num_episodes = 1000
list_retorno = []
param_noise, ac_noise = True, None
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = env.reset()
    retorno = 0
    steps=0
    for t in count():
        #env.render()
        # Select and perform an action
        action = select_action(torch.FloatTensor([state]).to(device), ac_noise, param_noise).cpu()
        next_state, reward, done, _ = env.step([action])
        reward = (reward + 8)/8
        retorno += reward

        # Store the transition in memory
        memory.push(torch.FloatTensor([state]),
                     action.to(device),  # action is already a tensor
                     torch.FloatTensor([next_state]),
                     torch.FloatTensor([reward]).to(device))

        # Move to the next state
        state = next_state
        steps += 1

        # Perform one step of the optimization (on the policy network)
        if optimize_model(t) != None:
            state_batch, action_batch = optimize_model(t)
        if done:
            break
    
    if ac_noise == True:
        if (i_episode+1) % T_adapt_ac == 0: 
            desv_pad_ac = desv_pad_ac/alfa_ac
            print('\n Sigma: ', desv_pad_ac)
    if param_noise == True:
        if (i_episode+1) % T_adapt == 0: 
            unpertubed_action = select_action(state_batch)
            perturbed_actions = action_batch
            
            d = loss_d(unpertubed_action, perturbed_actions)
            d = torch.mean(d)
            if d <= delta:
                sigma_0 = sigma_0*alfa
            else:
                sigma_0 = sigma_0/alfa
            print(" \nD: ", d.item(), '   Sigma: ', sigma_0)
        
    print(f'Episodio {i_episode}: retorno={round(retorno,2)}')
    list_retorno.append(retorno)

print('Complete')
env.render()
env.close()

Episodio 0: retorno=27.12
Episodio 1: retorno=45.36
Episodio 2: retorno=12.15
Episodio 3: retorno=85.61


  return F.mse_loss(input, target, reduction=self.reduction)


 
D:  2.013395309448242    Sigma:  0.297029702970297
Episodio 4: retorno=29.68
Episodio 5: retorno=70.21
Episodio 6: retorno=70.98
Episodio 7: retorno=25.1
Episodio 8: retorno=13.67
 
D:  1.4419639110565186    Sigma:  0.29408881482207627
Episodio 9: retorno=28.84
tensor(0.0020, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0023, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0025, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0019, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0022, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0018, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0018, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0017, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0037, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0026, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0022, device='cuda:0', grad_fn=<SmoothL1LossBackward>)
tensor(0.0016, device='cuda:0', 

In [None]:
plt.plot(list_retorno)
plt.show()

<Figure size 432x288 with 1 Axes>