In [0]:
import random
import numpy as np
import os

import torch
from torch import nn
from torch.distributions import Normal
from torch.nn import functional as F

# Seed handed to make_reproducible() below. The numbers in the trailing comment are
# presumably other seeds the author tried -- TODO confirm.
SEED = 423  # 627, 8, 11


def make_reproducible(seed, make_cuda_reproducible):
    """Seed the random number generators used by this notebook.

    Args:
        seed: Value passed to ``random.seed``, ``np.random.seed`` and
            ``torch.manual_seed``; its string form is also stored in the
            ``PYTHONHASHSEED`` environment variable.
        make_cuda_reproducible: When truthy, additionally switch cuDNN to
            deterministic mode and turn its benchmark mode off.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The three seeders are independent of each other, so apply them in one pass.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not make_cuda_reproducible:
        return

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed the Python, NumPy and PyTorch RNGs once, up front; the cuDNN flags are left untouched.
make_reproducible(SEED, make_cuda_reproducible=False)


def transform_state(state, device=None):
    """Convert an observation into a float32 tensor on the requested device.

    Args:
        state: Anything ``torch.tensor`` accepts (list, NumPy array, scalar, ...).
        device: Target ``torch.device``. When ``None`` (the default) the
            module-level ``DEVICE`` is used; it is looked up at call time, so it
            only has to exist once this function is actually invoked.

    Returns:
        A new ``torch.float32`` tensor holding a copy of ``state``.
    """
    if device is None:
        device = DEVICE
    # Build the tensor directly with the final dtype/device instead of
    # creating it first and converting it in two further steps.
    return torch.tensor(state, dtype=torch.float32, device=device)


class Actor(nn.Module):
    """Gaussian policy network with one hidden ReLU layer.

    The network outputs the mean of a normal distribution, squashed into
    ``[-max_action, max_action]`` with ``tanh``, together with a standard
    deviation that is a single learned parameter shared by every input.

    Args:
        hidden: Width of the hidden layer.
        in_dim: Size of the observation vector.
        out_dim: Size of the action vector.
        max_action: Bound applied to the mean; the default of 2.0 reproduces
            the previously hard-coded factor.
    """

    def __init__(self, hidden=400, in_dim=3, out_dim=1, max_action=2.0):
        super().__init__()
        self.fc = nn.Linear(in_dim, hidden)
        self.mu = nn.Linear(hidden, out_dim)
        # Despite its name this parameter holds log(std); forward() exponentiates
        # it, which keeps the resulting standard deviation positive. The name is
        # kept so existing checkpoints (state-dict key "sigma") still load.
        self.sigma = nn.Parameter(torch.full((1,), float(np.log(0.6))), requires_grad=True)
        # Plain attribute (not a parameter/buffer), so the state dict is unchanged.
        self.max_action = max_action

    def forward(self, x):
        """Return ``(mu, sigma)`` of the action distribution for observation(s) ``x``."""
        hidden = F.relu(self.fc(x))
        # torch.tanh replaces F.tanh, which PyTorch 1.x flags as deprecated.
        mu = self.max_action * torch.tanh(self.mu(hidden))
        sigma = self.sigma.expand_as(mu).exp()
        return mu, sigma


class Agent:
    """Inference-only wrapper around a trained ``Actor`` loaded from ``agent.pkl``.

    The checkpoint is expected to sit in the same directory as this source file.
    """

    # The policy is pinned to the CPU in __init__, so observations must be
    # created on the CPU as well (independently of any training-time device).
    _DEVICE = torch.device("cpu")

    def __init__(self):
        self.actor = Agent.generate_model()
        # Resolve the checkpoint relative to this file's directory. This does not
        # depend on the file being called exactly "agent.py" and also works when
        # __file__ is a relative path.
        weights_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "agent.pkl")
        # NOTE(security): torch.load unpickles the file -- only load trusted checkpoints.
        # map_location lets a checkpoint saved from a GPU model load on a CPU-only host.
        self.actor.load_state_dict(torch.load(weights_path, map_location=self._DEVICE))
        self.actor.to(self._DEVICE)
        self.actor.eval()

    @staticmethod
    def generate_model():
        """Build an untrained ``Actor`` with its default architecture."""
        return Actor()

    def act(self, state):
        """Sample one action for ``state`` from the policy's normal distribution.

        Args:
            state: Observation accepted by ``torch.tensor`` (e.g. a NumPy array).

        Returns:
            ``np.ndarray`` of shape ``(1,)`` holding the sampled action.
        """
        state = torch.tensor(state, dtype=torch.float32, device=self._DEVICE)
        # Pure inference: no autograd graph is needed.
        with torch.no_grad():
            mu, sigma = self.actor(state)
        return np.array([Normal(mu, sigma).sample().item()])

    def reset(self):
        """No per-episode state to clear; present for interface compatibility."""
        pass


In [0]:
import time
from collections import deque
from copy import deepcopy

from gym import make
import numpy as np
import torch
import random

from torch.nn import functional as F
from torch import nn
from torch.distributions import Normal
from torch.optim import Adam

# Number of environment steps whose rewards the training loop folds into one
# transition; A2C discounts the bootstrap value by GAMMA ** N_STEP.
N_STEP = 1
# Per-step discount factor.
GAMMA = 0.9
# Use the GPU when torch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# A2C.update runs an optimizer step once this many transitions are buffered
# (or earlier, when an episode ends).
UPDATE_LENGTH = 10
# Weight of the entropy term in the A2C loss.
ENTROPY = 0.01
# The target critic is re-copied from the critic every TARGET_UPDATE calls to A2C.update.
TARGET_UPDATE = 800
# Per-episode step cap used by the training loop. The identifier is a misspelling
# of "EPISODE_LEN"; it is left as is because other code refers to it by this name.
EPOSIODE_LEN = 200


class Critic(nn.Module):
    """State-value network with one hidden ReLU layer.

    Args:
        hidden: Width of the hidden layer.
        in_dim: Size of the observation vector.
        out_dim: Number of value outputs. It was previously accepted but
            ignored (the head was hard-coded to 1); the default of 1 keeps the
            old behaviour.
    """

    def __init__(self, hidden=400, in_dim=3, out_dim=1):
        super().__init__()
        # Attribute names are kept so existing state dicts remain loadable.
        self.fc2 = nn.Linear(in_dim, hidden)
        self.valNet = nn.Linear(hidden, out_dim)

    def forward(self, x):
        """Return the value estimate(s) for observation(s) ``x``."""
        return self.valNet(F.relu(self.fc2(x)))


class A2C:
    """Advantage actor-critic learner with a periodically refreshed target critic.

    Transitions are buffered by ``update`` and turned into one optimizer step
    whenever ``UPDATE_LENGTH`` of them are collected or an episode ends.

    Args:
        state_dim: Accepted for interface compatibility; the networks are built
            with their own default sizes.
        action_dim: Accepted for interface compatibility (see ``state_dim``).
    """

    # Rewards are stored as (reward + REWARD_SHIFT) / REWARD_SHIFT.
    # Presumably tuned to Pendulum's reward range -- TODO confirm.
    REWARD_SHIFT = 8.1

    def __init__(self, state_dim, action_dim):
        self.gamma = GAMMA ** N_STEP
        self.actor = Agent.generate_model().to(DEVICE)  # Torch model
        self.critic = Critic().to(DEVICE)  # Torch model
        self.actor_optimizer = Adam(self.actor.parameters(), lr=0.0001)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=0.001)
        self.update_steps = 0
        self.distribution = None
        self.target = deepcopy(self.critic)
        self._reset_rollout()

    def _reset_rollout(self):
        """Drop every buffered transition and the tensors derived from it."""
        self.states = []
        self.actions = []
        self.next_states = []
        self.rewards = []
        self.log_probs = []
        self.entropies = []
        self.critic_values = []
        self.target_values = []

    def optimizer_step(self, log_probs, entropies, returns, critic_values):
        """Run one joint gradient step on the actor and the critic.

        All arguments are column tensors with one row per buffered transition.

        Args:
            log_probs: Log-probabilities of the taken actions.
            entropies: Entropies of the action distributions.
            returns: Bootstrapped targets for the critic (carry no gradient).
            critic_values: Critic estimates for the visited states.
        """
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        # The advantage is a constant weight for the policy gradient.
        adv = (returns - critic_values).detach()
        policy_loss = -(log_probs * adv).mean()
        entropy_loss = -entropies.mean() * ENTROPY
        value_loss = ((critic_values - returns) ** 2 / 2).mean()
        total_loss = value_loss + entropy_loss + policy_loss
        total_loss.backward()
        self.actor_optimizer.step()
        self.critic_optimizer.step()

    def update(self, transition):
        """Buffer one transition and train once the rollout is complete.

        The log-probability and entropy are taken from the distribution cached
        by the most recent ``act`` call, so ``act`` must have been called for
        this transition's state right before ``update``.

        Args:
            transition: ``(state, action, next_state, reward, done)``.

        Raises:
            RuntimeError: If ``act`` has never been called.
        """
        if self.distribution is None:
            raise RuntimeError("A2C.act() must be called before A2C.update()")

        self.update_steps += 1
        if self.update_steps % TARGET_UPDATE == 0:
            self.target = deepcopy(self.critic)

        state, action, next_state, reward, done = transition
        state_t = transform_state(state)
        # float32 on DEVICE, matching the distribution's parameters (a plain
        # torch.tensor(action) would live on the CPU and break on CUDA).
        action_t = transform_state(action)

        self.states.append(state_t)
        self.actions.append(action_t)
        self.next_states.append(transform_state(next_state))
        self.rewards.append((reward + self.REWARD_SHIFT) / self.REWARD_SHIFT)
        self.log_probs.append(self.distribution.log_prob(action_t))
        self.entropies.append(self.distribution.entropy())
        self.critic_values.append(self.critic(state_t))
        # Target estimates are only used as constants, so skip the autograd graph.
        with torch.no_grad():
            self.target_values.append(self.target(state_t))

        if not (done or len(self.states) == UPDATE_LENGTH):
            return

        with torch.no_grad():
            if done:
                bootstrap = torch.zeros(1, device=DEVICE)
            else:
                bootstrap = self.target(self.next_states[-1])

        # Value of each successor state: the target estimate of the following
        # buffered state, and the bootstrap for the last one. Building the list
        # first also covers a rollout of length one (an episode that ends on the
        # first buffered transition), which used to be discarded.
        next_values = torch.cat(self.target_values[1:] + [bootstrap]).view(-1, 1)
        rewards = torch.tensor(self.rewards, dtype=torch.float32, device=DEVICE).view(-1, 1)
        returns = rewards + self.gamma * next_values

        self.optimizer_step(
            torch.cat(self.log_probs).view(-1, 1),
            torch.cat(self.entropies).view(-1, 1),
            returns,
            torch.cat(self.critic_values).view(-1, 1)
        )
        self._reset_rollout()

    def act(self, state):
        """Sample an action for ``state`` and cache the distribution for ``update``.

        Returns:
            ``np.ndarray`` of shape ``(1,)`` holding the sampled action.
        """
        # Remember: agent is not deterministic, sample actions from distribution (e.g. Gaussian)
        state = transform_state(state)
        mu, sigma = self.actor(state)
        self.distribution = Normal(mu, sigma)
        return np.array([self.distribution.sample().item()])

    def save(self, i):
        """Write the actor's state dict to ``agent_{i}.pkl`` in the working directory."""
        torch.save(self.actor.state_dict(), f'agent_{i}.pkl')

In [0]:
# Train A2C on Pendulum-v0, reporting the mean episode return every REPORT_EVERY
# episodes and checkpointing the actor whenever that mean improves.
env = make("Pendulum-v0")
# make_reproducible() only seeds the Python/NumPy/PyTorch RNGs; the environment
# keeps its own generator and has to be seeded separately for repeatable runs.
env.seed(SEED)
a2c = A2C(state_dim=3, action_dim=1)
episodes = 10000
REPORT_EVERY = 75

scores = []
best_score = -10000.0
# Not read in this cell; retained in case other cells still refer to it.
best_score_25 = -10000.0
total_steps = 0
start = time.time()

for i in range(episodes):
    state = env.reset()
    total_reward = 0
    steps = 0
    done = False
    # Sliding windows over the last N_STEP steps, used to build n-step transitions.
    reward_buffer = deque(maxlen=N_STEP)
    state_buffer = deque(maxlen=N_STEP)
    action_buffer = deque(maxlen=N_STEP)
    while not done and steps < EPOSIODE_LEN:
        total_steps += 1
        action = a2c.act(state)
        next_state, reward, done, _ = env.step(action)
        total_reward += reward
        steps += 1
        reward_buffer.append(reward)
        state_buffer.append(state)
        action_buffer.append(action)
        if len(reward_buffer) == N_STEP:
            # Discounted sum of the windowed rewards, oldest first.
            n_step_reward = sum((GAMMA ** k) * r for k, r in enumerate(reward_buffer))
            a2c.update((state_buffer[0], action_buffer[0], next_state, n_step_reward, done))
        state = next_state
        # env.render()
    scores.append(total_reward)
    if len(reward_buffer) == N_STEP:
        # Flush the shorter tail transitions still in the window (no-op when N_STEP == 1).
        rb = list(reward_buffer)
        for k in range(1, N_STEP):
            tail_reward = sum((GAMMA ** j) * r for j, r in enumerate(rb[k:]))
            a2c.update((state_buffer[k], action_buffer[k], next_state, tail_reward, done))

    if (i + 1) % REPORT_EVERY == 0:
        current_score = np.mean(scores)
        print(f'Current score: {current_score}')
        scores = []
        if current_score > best_score:
            best_score = current_score
            # The argument is only a file-name label: the checkpoint is agent_75.pkl.
            a2c.save(REPORT_EVERY)
            print(f'Best model saved with score: {best_score}')
        end = time.time()
        elapsed = end - start
        start = end
        print(f'Elapsed time: {elapsed}')



Current score: -1419.348671927737
Best model saved with score: -1419.348671927737
Elapsed time: 23.74226450920105
Current score: -1419.9237573494704
Elapsed time: 23.86884880065918
Current score: -1085.568547592379
Best model saved with score: -1085.568547592379
Elapsed time: 23.734857320785522
Current score: -912.845817549795
Best model saved with score: -912.845817549795
Elapsed time: 23.656028270721436
Current score: -882.2347368585605
Best model saved with score: -882.2347368585605
Elapsed time: 23.521143674850464
Current score: -814.7592419184786
Best model saved with score: -814.7592419184786
Elapsed time: 23.611609935760498
Current score: -692.1581229134532
Best model saved with score: -692.1581229134532
Elapsed time: 24.835127592086792
Current score: -583.0617913673863
Best model saved with score: -583.0617913673863
Elapsed time: 24.328096628189087
Current score: -464.2403707840897
Best model saved with score: -464.2403707840897
Elapsed time: 24.05731511116028
Current score: -4