# Imports

In [1]:
import random
import time
import os
import copy
import torch
import numpy               as np
import torch.nn            as nn
import torch.nn.functional as F
import torch.optim         as optim

from unityagents import UnityEnvironment
from collections import (
    deque, 
    namedtuple
)

# Global Constants

In [2]:
# Seed shared by the network, noise, replay-buffer and agent classes below;
# each of them re-seeds its RNG(s) with this value inside its constructor.
SEED = 42

# Actor

In [3]:
class Actor(nn.Module):
    """Policy network: maps a batch of states to actions squashed into [-1, 1].

    The input is batch-normalised, passed through two leaky-ReLU hidden
    layers and finished with a tanh output layer.

    Args:
        state_size:  width of one state vector.
        action_size: number of action components produced per state.
        fc1:         units in the first hidden layer.
        fc2:         units in the second hidden layer.
        leak:        negative slope used both by the leaky-ReLU activations
                     and as the `a` argument of the Kaiming initialisation.
    """

    def __init__(self, state_size, action_size, fc1, fc2, leak):
        super().__init__()

        # The torch RNG is re-seeded on every construction, so two Actors
        # built with the same sizes start from the same weights.
        self.seed = torch.manual_seed(SEED)
        self.leak = leak
        self.bn   = nn.BatchNorm1d(state_size)

        # Layers are created (and therefore registered / randomly
        # initialised) in the order fc1 -> fc2 -> fc3.
        widths = (state_size, fc1, fc2, action_size)
        self.fc1, self.fc2, self.fc3 = (
            nn.Linear(n_in, n_out) for n_in, n_out in zip(widths, widths[1:])
        )

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the weights: Kaiming-normal for hidden layers, small uniform for the output."""
        for hidden_layer in (self.fc1, self.fc2):
            nn.init.kaiming_normal_(hidden_layer.weight.data, a=self.leak, mode='fan_in')

        nn.init.uniform_(self.fc3.weight.data, -3e-3, 3e-3)

    def forward(self, state):
        """Return the tanh-bounded action for each row of `state`."""
        slope  = self.leak
        hidden = F.leaky_relu(self.fc1(self.bn(state)), negative_slope=slope)
        hidden = F.leaky_relu(self.fc2(hidden), negative_slope=slope)

        return self.fc3(hidden).tanh()

# Critic

In [4]:
class Critic(nn.Module):
    """Action-value network: scores a batch of (state, action) pairs.

    The state is batch-normalised and encoded by the first layer; the action
    is concatenated to that encoding before the second layer, and the final
    layer emits one unbounded value per row.

    Args:
        state_size:  width of one state vector.
        action_size: width of one action vector.
        fc1:         units in the state-encoding layer.
        fc2:         units in the layer that sees encoding + action.
        leak:        negative slope used both by the leaky-ReLU activations
                     and as the `a` argument of the Kaiming initialisation.
    """

    def __init__(self, state_size, action_size, fc1, fc2, leak):
        super().__init__()

        # The torch RNG is re-seeded on every construction, so two Critics
        # built with the same sizes start from the same weights.
        self.seed = torch.manual_seed(SEED)
        self.leak = leak
        self.bn   = nn.BatchNorm1d(state_size)

        joint_width = fc1 + action_size  # state encoding with the action appended

        self.fc1 = nn.Linear(state_size, fc1)
        self.fc2 = nn.Linear(joint_width, fc2)
        self.fc3 = nn.Linear(fc2, 1)

        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the weights: Kaiming-normal for hidden layers, small uniform for the output."""
        for hidden_layer in (self.fc1, self.fc2):
            nn.init.kaiming_normal_(hidden_layer.weight.data, a=self.leak, mode='fan_in')

        nn.init.uniform_(self.fc3.weight.data, -3e-3, 3e-3)

    def forward(self, state, action):
        """Return one value per row for the given states and actions."""
        slope   = self.leak
        encoded = F.leaky_relu(self.fc1(self.bn(state)), negative_slope=slope)
        joint   = torch.cat((encoded, action), dim=1)
        hidden  = F.leaky_relu(self.fc2(joint), negative_slope=slope)

        return self.fc3(hidden)

# Noise

In [5]:
class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck style).

    The internal state drifts back towards `mu` at rate `theta` and is
    perturbed on every call to `sample()` by Gaussian noise scaled by `sigma`.

    Args:
        size:  shape of the noise vector (anything accepted by `np.ones`).
        mu:    value the process reverts to; broadcast to `size`.
        theta: mean-reversion rate.
        sigma: used both as the standard deviation of the Gaussian draw and
               as the factor multiplying that draw.
    """

    def __init__(self, size, mu, theta, sigma):
        # np.random.seed() returns None, so `self.seed` holds None; the call
        # is made for its side effect of seeding the global NumPy RNG.
        self.seed = np.random.seed(SEED)
        random.seed(SEED)

        self.mu    = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma

        self.reset()

    def reset(self):
        """Set the internal state back to (a copy of) the mean `mu`."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by one step and return the new state.

        Returns:
            np.ndarray with the same shape as `mu`.
        """
        x = self.state

        # `self.mu` is already an array, so a single call yields one
        # independent draw per component. The previous list comprehension
        # repeated this vector draw len(x) times, which silently turned the
        # state into a (size, size) matrix.
        gaussian = np.random.normal(self.mu, self.sigma)

        dx         = self.theta * (self.mu - x) + self.sigma * gaussian
        self.state = x + dx

        return self.state

# Replay Buffer

In [6]:
class ReplayBuffer:
    """Bounded FIFO store of transitions with uniform random mini-batch sampling.

    Args:
        action_size: stored on the instance; not used by the methods below.
        buffer_size: maximum number of transitions kept (oldest are dropped).
        batch_size:  number of transitions returned by `sample()`.
        device:      torch device the sampled tensors are moved to.
    """

    def __init__(self, action_size, buffer_size, batch_size, device):
        # np.random.seed() returns None, so `self.seed` holds None; the call
        # is made for its side effect of seeding the global NumPy RNG.
        self.seed = np.random.seed(SEED)
        random.seed(SEED)

        self.device      = device
        self.action_size = action_size
        self.memory      = deque(maxlen=buffer_size)
        self.batch_size  = batch_size

        fields = ["state", "action", "reward", "next_state", "done"]
        self.experience = namedtuple("Experience", field_names=fields)

    def add(self, state, action, reward, next_state, done):
        """Append one transition to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw `batch_size` transitions without replacement.

        Returns:
            Tuple `(states, actions, rewards, next_states, dones)` of float
            tensors on `self.device`, each with one row per transition.
        """
        drawn = random.sample(self.memory, k=self.batch_size)
        batch = [entry for entry in drawn if entry is not None]

        def column(field, as_uint8=False):
            # Stack one field of every transition into a 2-D float tensor.
            stacked = np.vstack([getattr(entry, field) for entry in batch])
            if as_uint8:
                stacked = stacked.astype(np.uint8)
            return torch.from_numpy(stacked).float().to(self.device)

        states      = column("state")
        actions     = column("action")
        rewards     = column("reward")
        next_states = column("next_state")
        dones       = column("done", as_uint8=True)

        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# Agent

In [7]:
class Agent():
    """DDPG agent: local/target actor and critic, OU exploration noise and a replay buffer.

    A single set of networks and a single replay buffer are held by the
    instance; `step()` stores one transition per call and periodically runs
    a burst of learning updates.

    Args:
        state_size:  width of one state vector.
        action_size: width of one action vector.
        n_agents:    used only as the period (in timesteps) between learning bursts.
        fc1, fc2:    hidden-layer widths passed to both Actor and Critic.
        leakiness:   leaky-ReLU slope passed to both Actor and Critic.
        actor_lr:    Adam learning rate for the local actor.
        critic_lr:   Adam learning rate for the local critic.
        buffer_size: replay-buffer capacity.
        batch_size:  mini-batch size sampled for each update.
        gamma:       discount factor used in the TD target.
        tau:         interpolation factor of the soft target update.
        decay:       stored on the instance; not used by any method of this class.
        mu, theta, sigma: parameters forwarded to OUNoise.
    """

    def __init__(self, state_size, action_size, n_agents, fc1, fc2, leakiness, actor_lr, critic_lr, buffer_size, batch_size, gamma, tau, decay, mu, theta, sigma):
        # np.random.seed() returns None, so `self.seed` holds None; the call
        # is made for its side effect of seeding the global NumPy RNG.
        self.seed = np.random.seed(SEED)
        random.seed(SEED)

        if torch.cuda.is_available():
            self.device = "cuda:0"
            print("[INFO] training on CUDA")

        else:
            self.device = "cpu"
            print("[INFO] training on CPU")

        self.state_size  = state_size
        self.action_size = action_size
        self.n_agents    = n_agents
        self.fc1         = fc1
        self.fc2         = fc2
        self.leakiness   = leakiness
        self.actor_lr    = actor_lr
        self.critic_lr   = critic_lr
        self.buffer_size = buffer_size
        self.batch_size  = batch_size
        self.gamma       = gamma
        self.tau         = tau
        self.decay       = decay
        self.mu          = mu
        self.theta       = theta
        self.sigma       = sigma

        # Actor
        # --------------------------------------------------
        # No explicit local -> target weight copy is performed here.
        self.actor_local  = Actor(self.state_size, self.action_size, self.fc1, self.fc2, self.leakiness).to(self.device)
        self.actor_target = Actor(self.state_size, self.action_size, self.fc1, self.fc2, self.leakiness).to(self.device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = self.actor_lr)

        # Critic
        # --------------------------------------------------
        self.critic_local  = Critic(self.state_size, self.action_size, self.fc1, self.fc2, self.leakiness).to(self.device)
        self.critic_target = Critic(self.state_size, self.action_size, self.fc1, self.fc2, self.leakiness).to(self.device)

        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr = self.critic_lr)

        # Noise
        # --------------------------------------------------
        self.noise = OUNoise(self.action_size, self.mu, self.theta, self.sigma)

        # Replay Buffer
        # --------------------------------------------------
        self.memory = ReplayBuffer(self.action_size, self.buffer_size, self.batch_size, self.device)

    def step(self, states, actions, rewards, next_states, dones, timesteps):
        """Store one transition and, when due, run a burst of learning updates.

        Learning runs only once the buffer holds more than `batch_size`
        transitions and `timesteps` is a multiple of `n_agents`.
        """
        self.memory.add(
            states,
            actions,
            rewards,
            next_states,
            dones
        )

        if (len(self.memory) > self.batch_size) and (timesteps % self.n_agents == 0):
            # Ten consecutive mini-batch updates per qualifying call.
            for _ in range(10):
                exp = self.memory.sample()

                self.learn(exp)

    def act(self, states):
        """Return noisy actions for a NumPy batch of states, clipped to [-1, 1]."""
        states = torch.from_numpy(states).float().to(self.device)

        # Eval mode so BatchNorm uses its running statistics for inference.
        self.actor_local.eval()

        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        self.actor_local.train()

        # The noise sample is combined with the action batch via NumPy broadcasting.
        actions += self.noise.sample()

        return np.clip(actions, -1, 1)

    def reset(self):
        """Reset the exploration noise process."""
        self.noise.reset()

    def learn(self, experiences):
        """Run one critic update, one actor update and a soft target update.

        Args:
            experiences: tuple `(states, actions, rewards, next_states, dones)`
                of tensors as produced by `ReplayBuffer.sample()`.
        """
        states, actions, rewards, next_states, dones = experiences

        # Critic Update
        # --------------------------------------------------
        # The TD target is a constant with respect to the optimisation, so it
        # is built without autograd. Previously the target networks were part
        # of the graph: backward() traversed them and accumulated gradients on
        # target parameters that no optimizer ever zeroes or steps.
        with torch.no_grad():
            actions_next   = self.actor_target(next_states)
            Q_targets_next = self.critic_target(next_states, actions_next)
            Q_targets      = rewards + (self.gamma * Q_targets_next * (1 - dones))

        Q_expected  = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor Update
        # --------------------------------------------------
        # Maximise the critic's value of the actor's own actions.
        actions_pred =  self.actor_local(states)
        actor_loss   = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Target Update
        # --------------------------------------------------
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local,  self.actor_target)

    def soft_update(self, local_model, target_model):
        """Blend local parameters into the target: target <- tau*local + (1 - tau)*target.

        Only entries yielded by `parameters()` are blended.
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

# Training

### Environment

In [8]:
# Environment
# --------------------------------------------------
# SEED repeats the value assigned in the "Global Constants" cell.
# ENV_FP is a relative path to the Tennis executable (a Windows build, per
# the file name), so it is resolved against the current working directory.
SEED   = 42
ENV_FP = f"Tennis_Windows_x86_64/Tennis.exe"
ENV    = UnityEnvironment(file_name = ENV_FP)


# Brain 
# --------------------------------------------------
# The first brain exposed by the environment is the one used throughout.
BRAIN_NAME = ENV.brain_names[0]
BRAIN      = ENV.brains[BRAIN_NAME]


# Environment Data 
# --------------------------------------------------
# One reset in training mode is performed to read the problem dimensions:
#   N_AGENTS    - number of entries in ENV_INFO.agents
#   ACTION_SIZE - action-space size reported by the brain
#   STATE_SIZE  - second axis of the observation array (per-agent state width)
ENV_INFO    = ENV.reset(train_mode = True)[BRAIN_NAME]
N_AGENTS    = len(ENV_INFO.agents)
ACTION_SIZE = BRAIN.vector_action_space_size
STATE_SIZE  = ENV_INFO.vector_observations.shape[1]

INFO:unityagents:
'Academy' started successfully!
Unity Academy name: Academy
        Number of Brains: 1
        Number of External Brains : 1
        Lesson number : 0
        Reset Parameters :
		
Unity brain name: TennisBrain
        Number of Visual Observations (per agent): 0
        Vector Observation space type: continuous
        Vector Observation space size (per agent): 8
        Number of stacked Vector Observation: 3
        Vector Action space type: continuous
        Vector Action space size (per agent): 2
        Vector Action descriptions: , 


### Initialize Agent

In [9]:
# Build the single shared agent used by the training loop.
#   fc1 / fc2         - hidden-layer widths for both Actor and Critic
#   leakiness         - leaky-ReLU negative slope (also the Kaiming-init `a`)
#   actor_lr/critic_lr- Adam learning rates for the local actor / critic
#   buffer_size       - replay-buffer capacity (deque maxlen)
#   batch_size        - transitions sampled per learning update
#   gamma             - discount factor in the TD target
#   tau               - soft-update interpolation factor
#   decay             - stored on the agent; not read by any Agent method
#   mu, theta, sigma  - OUNoise parameters
AGENT = Agent(
    state_size  = STATE_SIZE, 
    action_size = ACTION_SIZE, 
    n_agents    = N_AGENTS,
    fc1         = 256, 
    fc2         = 128, 
    leakiness   = 1e-2,
    actor_lr    = 1e-4,
    critic_lr   = 3e-4,
    buffer_size = 1000000,
    batch_size  = 256,
    gamma       = 0.99,
    tau         = 1e-3,
    decay       = 1e-4,
    mu          = 0.0, 
    theta       = 0.15, 
    sigma       = 0.2
)

[INFO] training on CUDA


### Define Training Scheme

In [10]:
def ddpg(n_episodes = 2000, max_t = 1000, epochs = 1000):
    """Train the global AGENT on the global ENV and persist scores/checkpoints.

    Each episode's score is the maximum of the per-agent reward sums. Training
    stops early once the mean of the last 100 episode scores reaches 0.5, at
    which point the local actor and critic weights are written to
    `checkpoints/`. The per-episode scores are always written to
    `scores/scores.npy` when the loop ends.

    Args:
        n_episodes: accepted for backward compatibility; the loop length is
                    controlled by `epochs`, not by this argument.
        max_t:      maximum number of environment steps per episode.
        epochs:     maximum number of episodes to run.
    """
    window_size   = 100
    scores_window = deque(maxlen = window_size)
    max_scores    = []

    for i_episode in range(1, epochs + 1):
        env_info       = ENV.reset(train_mode = True)[BRAIN_NAME]
        states         = env_info.vector_observations
        episode_scores = np.zeros(N_AGENTS)

        AGENT.reset()

        for t in range(max_t):
            actions     = AGENT.act(states)
            env_info    = ENV.step(actions)[BRAIN_NAME]
            next_states = env_info.vector_observations
            rewards     = env_info.rewards
            dones       = env_info.local_done

            # Every agent's transition goes through the same shared AGENT.
            for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
                AGENT.step(state, action, reward, next_state, done, t)

            episode_scores += np.array(rewards)
            states          = next_states

            if np.any(dones):
                break

        best_score = np.max(episode_scores)

        max_scores.append(best_score)
        scores_window.append(best_score)

        avg_score = np.mean(scores_window)

        print('\rEpisode {}\tAverage Score: {:.2f}\tMax Episode Score: {:.2f}'.format(i_episode, avg_score, best_score), end="")

        if i_episode % 10 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}\tMax Episode Score: {:.2f}'.format(i_episode, avg_score, best_score))

        # Only a full window counts. Otherwise a single lucky early episode
        # would end training and report a negative "solved in" episode count.
        window_full = len(scores_window) >= window_size

        if window_full and avg_score >= 0.5:
            print("\nEnvironment solved in {} episodes!\tAverage score: {:.2f}".format(i_episode - window_size, avg_score))

            # Create the output directory so a finished run is not lost to a
            # missing-folder error at save time.
            os.makedirs("checkpoints", exist_ok = True)

            torch.save(AGENT.actor_local.state_dict(),  os.path.join("checkpoints", "checkpoint_actor.pth"))
            torch.save(AGENT.critic_local.state_dict(), os.path.join("checkpoints", "checkpoint_critic.pth"))

            break

    os.makedirs("scores", exist_ok = True)
    np.save('scores/scores.npy', max_scores)

# Train

In [11]:
# Start training. The episode loop inside ddpg() iterates over `epochs`;
# `n_episodes` is accepted by the signature but not read by the loop, so this
# run is capped at 1000 episodes of at most 1000 steps each.
ddpg(
    n_episodes = 2000, 
    max_t      = 1000, 
    epochs     = 1000
)

Episode 10	Average Score: 0.02	Max Episode Score: 0.09
Episode 20	Average Score: 0.01	Max Episode Score: 0.00
Episode 30	Average Score: 0.01	Max Episode Score: 0.00
Episode 40	Average Score: 0.01	Max Episode Score: 0.09
Episode 50	Average Score: 0.01	Max Episode Score: 0.10
Episode 60	Average Score: 0.01	Max Episode Score: 0.00
Episode 70	Average Score: 0.02	Max Episode Score: 0.00
Episode 80	Average Score: 0.02	Max Episode Score: 0.10
Episode 90	Average Score: 0.03	Max Episode Score: 0.00
Episode 100	Average Score: 0.02	Max Episode Score: 0.00
Episode 110	Average Score: 0.02	Max Episode Score: 0.00
Episode 120	Average Score: 0.02	Max Episode Score: 0.00
Episode 130	Average Score: 0.03	Max Episode Score: 0.00
Episode 140	Average Score: 0.04	Max Episode Score: 0.10
Episode 150	Average Score: 0.04	Max Episode Score: 0.00
Episode 160	Average Score: 0.03	Max Episode Score: 0.00
Episode 170	Average Score: 0.03	Max Episode Score: 0.00
Episode 180	Average Score: 0.04	Max Episode Score: 0.10
E