# Imports

In [1]:
import random
import time
import os
import copy
import torch
import numpy               as np
import torch.nn            as nn
import torch.nn.functional as F
import torch.optim         as optim

from unityagents import UnityEnvironment
from collections import (
    deque, 
    namedtuple
)

# Global Constants

In [2]:
SEED = 42

# Actor

In [3]:
class Actor(nn.Module):
    def __init__(self, state_size, action_size, fc1, fc2, leak):
        super(Actor, self).__init__()
        
        self.seed = torch.manual_seed(SEED)
        self.leak = leak
        self.bn   = nn.BatchNorm1d(state_size)

        self.fc1 = nn.Linear(state_size,         fc1)
        self.fc2 = nn.Linear(       fc1,         fc2)
        self.fc3 = nn.Linear(       fc2, action_size)
        
        self.reset_parameters()

        
    def reset_parameters(self):
        torch.nn.init.kaiming_normal_(self.fc1.weight.data, a = self.leak, mode = 'fan_in')
        torch.nn.init.kaiming_normal_(self.fc2.weight.data, a = self.leak, mode = 'fan_in')
        
        torch.nn.init.uniform_(self.fc3.weight.data, -3e-3, 3e-3)

        
    def forward(self, state):
        x = F.leaky_relu(self.fc1(self.bn(state)), negative_slope = self.leak)
        x = F.leaky_relu(self.fc2(x),              negative_slope = self.leak)
        
        return torch.tanh(self.fc3(x))

# Critic

In [4]:
class Critic(nn.Module):
    def __init__(self, state_size, action_size, fc1, fc2, fc3, leak ):
        super(Critic, self).__init__()
        
        self.seed = torch.manual_seed(SEED)
        self.leak = leak
        self.bn   = nn.BatchNorm1d(state_size)
        
        self.fc1 = nn.Linear(       state_size, fc1)
        self.fc2 = nn.Linear(fc1 + action_size, fc2)
        self.fc3 = nn.Linear(              fc2, fc3)
        self.fc4 = nn.Linear(              fc3,   1)
        
        self.reset_parameters()

        
    def reset_parameters(self):
        torch.nn.init.kaiming_normal_(self.fc1.weight.data, a = self.leak, mode = 'fan_in')
        torch.nn.init.kaiming_normal_(self.fc2.weight.data, a = self.leak, mode = 'fan_in')
        
        torch.nn.init.uniform_(self.fc3.weight.data, -3e-3, 3e-3)

        
    def forward(self, state, action):
        x = F.leaky_relu(self.fc1(self.bn(state)), negative_slope = self.leak)
        
        x = torch.cat((x, action), dim = 1)
        
        x = F.leaky_relu(self.fc2(x), negative_slope = self.leak)
        x = F.leaky_relu(self.fc3(x), negative_slope = self.leak)
        
        return self.fc4(x)

# Noise

In [5]:
class OUNoise:
    def __init__(self, size, mu, theta, sigma):
        self.seed = np.random.seed(SEED)
        random.seed(SEED)
        
        self.mu    = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        
        self.reset()
        
        
    def reset(self):
        self.state = copy.copy(self.mu)
        
        
    def sample(self):
        x          = self.state
        dx         = self.theta * (self.mu - x) + self.sigma * np.array([np.random.random() for i in range(len(x))])
        self.state = x + dx
        
        return self.state

# Replay Buffer

In [6]:
class ReplayBuffer:
    def __init__(self, action_size, buffer_size, batch_size, device):
        self.seed = np.random.seed(SEED)
        random.seed(SEED)
        
        self.device      = device
        self.action_size = action_size
        self.memory      = deque(maxlen = buffer_size) 
        self.batch_size  = batch_size
        self.experience  = namedtuple(
            "Experience", 
            field_names = [
                "state", 
                "action", 
                "reward", 
                "next_state", 
                "done"
            ]
        )
        

    def add(self, state, action, reward, next_state, done):
        exp = self.experience(state, action, reward, next_state, done)
        
        self.memory.append(exp)

        
    def sample(self):
        experiences = random.sample(self.memory, k = self.batch_size)
       
        return (
            torch.from_numpy(np.vstack([e.state      for e in experiences if e is not None])                 ).float().to(self.device),
            torch.from_numpy(np.vstack([e.action     for e in experiences if e is not None])                 ).float().to(self.device),
            torch.from_numpy(np.vstack([e.reward     for e in experiences if e is not None])                 ).float().to(self.device),
            torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])                 ).float().to(self.device),
            torch.from_numpy(np.vstack([e.done       for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
        )

    
    def __len__(self):
        return len(self.memory)

# Agent

In [7]:
class Agent():
    def __init__(self, state_size, action_size, n_agents, fc1, fc2, fc3, leakiness, actor_lr, critic_lr, buffer_size, batch_size, gamma, tau, decay, mu, theta, sigma):
        self.seed = np.random.seed(SEED)
        random.seed(SEED)
        
        if torch.cuda.is_available():
            self.device = "cuda:0"
            print(f"[INFO] training on CUDA")
            
        else:
            self.device = "cpu"
            print(f"[INFO] training on CPU")
            
        self.state_size  = state_size
        self.action_size = action_size
        self.n_agents    = n_agents
        self.fc1         = fc1
        self.fc2         = fc2
        self.fc3         = fc3
        self.leakiness   = leakiness
        self.actor_lr    = actor_lr
        self.critic_lr   = critic_lr
        self.buffer_size = buffer_size
        self.batch_size  = batch_size
        self.gamma       = gamma
        self.tau         = tau
        self.decay       = decay
        self.mu          = mu
        self.theta       = theta
        self.sigma       = sigma

        
        # Actor
        # --------------------------------------------------
        self.actor_local  = Actor(self.state_size, self.action_size, self.fc1, self.fc2, self.leakiness).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size, self.fc1, self.fc2, self.leakiness).to(self.device)
        
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.actor_lr)

        
        # Critic
        # --------------------------------------------------
        self.critic_local  = Critic(self.state_size, self.action_size, self.fc1, self.fc2, self.fc3, self.leakiness).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size, self.fc1, self.fc2, self.fc3, self.leakiness).to(self.device)
        
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.critic_lr)

        
        # Noise
        # --------------------------------------------------
        self.noise = OUNoise(self.action_size, self.mu, self.theta, self.sigma)

        
        # Replay Buffer
        # --------------------------------------------------
        self.memory    = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.device)
        self.timesteps = 0  

        
    def step(self, states, actions, rewards, next_states, dones):
        self.timesteps += 1
        
        for i in range(self.n_agents):
            self.memory.add(
                states[i], 
                actions[i], 
                rewards[i], 
                next_states[i], 
                dones[i]
            )

            
        if (len(self.memory) > self.batch_size) and (self.timesteps % self.n_agents == 0):
            for _ in range(10):
                exp = self.memory.sample()
                
                self.learn(exp)

                
    def act(self, states):
        states = torch.from_numpy(states).float().to(self.device)
        
        self.actor_local.eval()
        
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
            
        self.actor_local.train()
        
        actions += [self.noise.sample() for _ in range(self.n_agents)]
            
        return np.clip(actions, -1, 1)

    
    def reset(self):
        self.noise.reset()

        
    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        
        # Critic Update
        # --------------------------------------------------
        actions_next = self.actor_target(next_states)
        
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets      = rewards + (self.gamma * Q_targets_next * (1 - dones))
        Q_expected     = self.critic_local(states, actions)
        
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        
        # Actor Update
        # --------------------------------------------------
        actions_pred =  self.actor_local(states)
        actor_loss   = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        
        # Target Update
        # --------------------------------------------------
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local,  self.actor_target)

        
    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

# Training

### Envorinment

In [8]:
# Environement 
# --------------------------------------------------
SEED   = 42
ENV_FP = f"Reacher_Windows_x86_64/Reacher.exe"
ENV    = UnityEnvironment(file_name = ENV_FP)


# Brain 
# --------------------------------------------------
BRAIN_NAME = ENV.brain_names[0]
BRAIN      = ENV.brains[BRAIN_NAME]


# Environment Data 
# --------------------------------------------------
ENV_INFO    = ENV.reset(train_mode = True)[BRAIN_NAME]
N_AGENTS    = len(ENV_INFO.agents)
ACTION_SIZE = BRAIN.vector_action_space_size
STATE_SIZE  = ENV_INFO.vector_observations.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		goal_speed -> 1.0
		goal_size -> 5.0
Unity brain name: ReacherBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 33
        Number of stacked Vector Observation: 1
        Vector Action space type: continuous
        Vector Action space size (per agent): 4
        Vector Action descriptions: , , , 


### Initialize Agent

In [9]:
AGENT = Agent(
    state_size  = STATE_SIZE, 
    action_size = ACTION_SIZE, 
    n_agents    = N_AGENTS,
    fc1         = 256, 
    fc2         = 128, 
    fc3         = 128,
    leakiness   = 1e-2,
    actor_lr    = 1e-4,
    critic_lr   = 3e-4,
    buffer_size = 1000000,
    batch_size  = 1024,
    gamma       = 0.99,
    tau         = 1e-3,
    decay       = 1e-4,
    mu          = 0.0, 
    theta       = 0.15, 
    sigma       = 0.2
)

[INFO] training on CUDA


### Define Training Scheme

In [10]:
def ddpg(n_episodes = 2000, max_t = 1000, epochs = 1000):
    window_size        = 100
    scores_deque       = deque(maxlen = window_size) 
    scores             = []        
    best_average_score = -np.inf
    
    for i_episode in range(1, epochs + 1):
        env_info       = ENV.reset(train_mode = True)[BRAIN_NAME]
        states         = env_info.vector_observations
        episode_scores = np.zeros(N_AGENTS) 
        
        AGENT.reset()

        for t in range(max_t):
            actions     = AGENT.act(states)
            env_info    = ENV.step(actions)[BRAIN_NAME]
            next_states = env_info.vector_observations
            rewards     = env_info.rewards
            dones       = env_info.local_done

            AGENT.step(
                states      = states, 
                actions     = actions, 
                rewards     = rewards, 
                next_states = next_states, 
                dones       = dones
            )
            
            episode_scores += np.array(rewards)
            states          = next_states
            
            if np.any(dones):
                break

        episode_score = np.mean(episode_scores)
        
        scores_deque.append(episode_score)
        scores.append(episode_score)
        
        average_score = np.mean(scores_deque)
        
        print('\rEpisode: {}\tAverage Score: {:.2f}\tCurrent Score: {:.2f}'.format(i_episode, average_score, episode_score), end="")
        
        if i_episode % 10 == 0:
            print('\rEpisode: {}\tAverage Score: {:.2f}\tCurrent Score: {:.2f}'.format(i_episode, average_score, episode_score))

        if average_score >= 30.0:
            print('\nEnvironment solved in {} episodes!\tAverage Score: {:.2f}'.format(i_episode - window_size, average_score))
            
            torch.save(AGENT.actor_local.state_dict(),  os.path.join("checkpoints", "checkpoint_actor.pth")) 
            torch.save(AGENT.critic_local.state_dict(), os.path.join("checkpoints", "checkpoint_critic.pth"))
            
            break

    np.save('scores/scores.npy', scores)

### Train

In [11]:
ddpg(
    n_episodes = 2000, 
    max_t      = 1000, 
    epochs     = 1000
)

Episode: 10	Average Score: 0.48	Current Score: 0.69
Episode: 20	Average Score: 0.68	Current Score: 0.96
Episode: 30	Average Score: 0.87	Current Score: 1.60
Episode: 40	Average Score: 1.21	Current Score: 2.72
Episode: 50	Average Score: 1.68	Current Score: 4.61
Episode: 60	Average Score: 2.29	Current Score: 5.50
Episode: 70	Average Score: 3.02	Current Score: 8.87
Episode: 80	Average Score: 3.82	Current Score: 9.694
Episode: 90	Average Score: 4.64	Current Score: 13.69
Episode: 100	Average Score: 5.64	Current Score: 15.26
Episode: 110	Average Score: 7.34	Current Score: 19.34
Episode: 120	Average Score: 9.45	Current Score: 23.88
Episode: 130	Average Score: 11.87	Current Score: 25.98
Episode: 140	Average Score: 14.56	Current Score: 30.38
Episode: 150	Average Score: 17.19	Current Score: 29.13
Episode: 160	Average Score: 19.83	Current Score: 31.06
Episode: 170	Average Score: 22.18	Current Score: 31.52
Episode: 180	Average Score: 24.33	Current Score: 28.48
Episode: 190	Average Score: 26.22	Curr