In [28]:
import torch
import gym
import numpy as np
import torch.nn as nn
import os
import torch.nn.functional as F

In [29]:
class ActorNetwork(nn.Module):
    """Deterministic DDPG actor mapping a state to a tanh-bounded action.

    Layout: Linear -> LayerNorm -> ReLU -> Linear -> LayerNorm -> ReLU
    -> Linear -> tanh, so every action component lies in [-1, 1].

    Args:
        alpha: Learning rate of the Adam optimizer owned by this network.
        input_dims: Sequence unpacked as the leading positional argument(s)
            of the first ``nn.Linear`` (e.g. ``[8]`` for an 8-dim state).
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Number of action components produced by the output layer.
        name: Checkpoint base name; the file is ``<chkpt_dir>/<name>_ddpg``.
        chkpt_dir: Directory in which checkpoints are written and read.
    """

    def __init__(self, alpha,  input_dims, fc1_dims, fc2_dims, n_actions, name,
                 chkpt_dir="tmp/ddpg"):
        super().__init__()
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.checkpoint_file = os.path.join(chkpt_dir, name+"_ddpg")

        # NOTE(review): nn.Linear stores its weight as (out_features,
        # in_features), so size()[0] below is the layer's *output* width.
        # The bound is kept as-is to preserve the existing initialisation;
        # confirm whether a fan-in based bound was intended instead.
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        f1 = 1/np.sqrt(self.fc1.weight.data.size()[0])
        torch.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        torch.nn.init.uniform_(self.fc1.bias.data, -f1,  f1)
        # Attribute is called ``bn1`` but holds a LayerNorm; the name is kept
        # because it is part of the state_dict keys of saved checkpoints.
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        f2 = 1/np.sqrt(self.fc2.weight.data.size()[0])
        torch.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        torch.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        # Output layer starts with small weights so initial actions are
        # close to zero before the tanh.
        f3 = 0.003
        self.mu = nn.Linear(self.fc2_dims, self.n_actions)
        torch.nn.init.uniform_(self.mu.weight.data, -f3, f3)
        torch.nn.init.uniform_(self.mu.bias.data, -f3, f3)

        # Move the module to its device first, then build the optimizer over
        # the parameters as they live on that device.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(self.device)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=alpha)

    def forward(self, state):
        """Compute the action for ``state``.

        Args:
            state: Tensor whose last dimension matches the input size of
                ``fc1``; it must already be on ``self.device``.

        Returns:
            Tensor with ``n_actions`` as last dimension, values in [-1, 1].
        """
        x = self.fc1(state)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = torch.tanh(self.mu(x))

        return x

    def save_checkpoint(self):
        """Write the state_dict to ``self.checkpoint_file``.

        The parent directory is created when missing, so the first save on a
        fresh checkout does not fail.
        """
        print("... saving checkpoint ...")
        directory = os.path.dirname(self.checkpoint_file)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        """Restore the state_dict from ``self.checkpoint_file``.

        Tensors are mapped onto ``self.device`` so a checkpoint saved on a
        GPU can be restored on a CPU-only machine.
        """
        print("... loading checkpoint ...")
        self.load_state_dict(
            torch.load(self.checkpoint_file, map_location=self.device)
        )

In [30]:
# Device used for the policy network and for every tensor fed to it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the actor with the layer sizes that match the saved checkpoint
# (8-dim state, 400/300 hidden units, 2 action components). The learning rate
# only configures the network's internal optimizer, which is not stepped in
# this notebook.
Net = ActorNetwork(0.000025, [8], 400, 300, n_actions = 2, name = "pippo")

# map_location lets a checkpoint that was saved on a GPU be restored on a
# CPU-only machine (and vice versa).
Net.load_state_dict(torch.load("tmp/ddpg/Actor_ddpg", map_location=device))

# Switch to inference mode; as the last expression of the cell this also
# displays the module summary.
Net.eval()

ActorNetwork(
  (fc1): Linear(in_features=8, out_features=400, bias=True)
  (bn1): LayerNorm((400,), eps=1e-05, elementwise_affine=True)
  (fc2): Linear(in_features=400, out_features=300, bias=True)
  (bn2): LayerNorm((300,), eps=1e-05, elementwise_affine=True)
  (mu): Linear(in_features=300, out_features=2, bias=True)
)

In [31]:
def choose_action(observation):
    """Return the actor's deterministic action for one observation.

    Args:
        observation: Array-like state; it is converted to a float tensor on
            the module-level ``device`` before being passed to ``Net``.

    Returns:
        numpy.ndarray holding the network output, copied to the CPU.
    """
    # Net.eval() is not repeated here. (Original note, translated: "otherwise
    # it would take the batch norm into account and keep recomputing
    # statistics".) The normalisation layers of ActorNetwork are nn.LayerNorm,
    # which keep no running statistics, and Net is already switched to eval
    # mode when it is loaded.
    state = torch.tensor(observation, dtype = torch.float, device = device)

    # Pure inference: no autograd graph is needed for acting.
    with torch.no_grad():
        mu = Net(state)

    return mu.cpu().numpy()

In [32]:
# Roll out the loaded policy for a fixed number of rendered episodes.
# This uses the classic gym API: reset() returns the observation and step()
# returns a 4-tuple (observation, reward, done, info).
env = gym.make("LunarLanderContinuous-v2")
num_eps = 50
try:
    for i in range(num_eps):
        done = False
        obs = env.reset()
        score = 0
        while not done:
            action = choose_action(obs)
            new_state, reward, done, info = env.step(action)
            score += reward
            obs = new_state
            env.render()
        # Report the return of the episode that just finished.
        print(f"episode {i + 1}/{num_eps}  score {score:.2f}")
finally:
    # Always release the render window / simulator resources, even when the
    # cell is interrupted or an episode raises.
    env.close()