# Collaboration and Competition with MADDPG

---
### Start the Environment

If the code cell below returns an error, please double-check that you have installed [Unity ML-Agents](https://github.com/Unity-Technologies/ml-agents/blob/master/docs/Installation.md), [NumPy](http://www.numpy.org/), [PyTorch](https://pytorch.org/) and [Matplotlib](https://matplotlib.org/).

In [None]:
from unityagents import UnityEnvironment
import numpy as np
from torch import nn
import torch
from torch import optim
import torch.nn.functional as F
import random
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import copy
import time

**_Before running the code cell below_**, change the `file_name` parameter to match the location of the Unity environment that you downloaded.

- **Mac**: `"path/to/Tennis.app"`
- **Windows** (x86): `"path/to/Tennis_Windows_x86/Tennis.exe"`
- **Windows** (x86_64): `"path/to/Tennis_Windows_x86_64/Tennis.exe"`
- **Linux** (x86): `"path/to/Tennis_Linux/Tennis.x86"`
- **Linux** (x86_64): `"path/to/Tennis_Linux/Tennis.x86_64"`
- **Linux** (x86, headless): `"path/to/Tennis_Linux_NoVis/Tennis.x86"`
- **Linux** (x86_64, headless): `"path/to/Tennis_Linux_NoVis/Tennis.x86_64"`

For instance, if you are using a Mac, then you downloaded `Tennis.app`.  If this file is in the same folder as the notebook, then the line below should appear as follows:
```
env = UnityEnvironment(file_name="Tennis.app")
```

In [None]:
# Start the Unity environment from the given build path; adjust `file_name`
# to the location/platform of your downloaded Tennis build.
env = UnityEnvironment(file_name="Tennis_Windows_x86_64/Tennis.exe")
# Use the first brain the environment reports; `brain_name` is the key used
# to index the results of env.reset() / env.step() later in the notebook.
brain_name = env.brain_names[0]
brain = env.brains[brain_name]

In [None]:
# Hyperparameters read by ReplayBuffer, Agent and the training loop.
BUFFER_SIZE = int(1e5)  # capacity of the replay buffer (transitions)
BATCH_SIZE = 2 ** 9     # transitions drawn per learning step (512)
GAMMA = 0.99            # discount applied to the bootstrapped next-state value
TAU = 1e-3              # interpolation factor of the soft target update
ACTOR_LR = 1e-3         # Adam learning rate of the actor
CRITIC_LR = 1e-3        # Adam learning rate of the critic
UPDATE_EVERY = 2        # learn once every UPDATE_EVERY calls to Agent.step

In [None]:
# Ornstein-Uhlenbeck noise process by Alexis Cook from Udacity
# https://github.com/udacity/deep-reinforcement-learning/tree/master/ddpg-bipedal
class OUNoise:
    """Ornstein-Uhlenbeck process used as temporally correlated action noise.

    Each call to :meth:`sample` moves the internal state a fraction ``theta``
    back towards ``mu`` and adds Gaussian noise scaled by ``sigma``.

    The noise is drawn from a generator owned by the instance, so two
    instances built with the same ``seed`` yield the same sequence and the
    global ``random`` / ``np.random`` states are left untouched.
    """

    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process.

        Params
        ======
            size (int): number of noise components per sample
            seed (int or None): seed for this instance's generator; ``None``
                lets NumPy pick a non-reproducible seed
            mu (float): value the process reverts to
            theta (float): strength of the pull towards ``mu``
            sigma (float): scale of the Gaussian perturbation
        """
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        # Keep the seed value itself: random.seed() returns None, so storing
        # its return value would always lose the seed.
        self.seed = seed
        self.size = size
        # A private generator makes `seed` actually control the noise.
        self._rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = self.mu.copy()

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = (self.theta * (self.mu - x)
              + self.sigma * self._rng.standard_normal(self.size))
        # x + dx allocates a new array, so earlier samples are not mutated.
        self.state = x + dx
        return self.state

In [None]:
# define actor and critic model architecture
class Critic(nn.Module):
    """Q-network: maps a batch of (state, action) pairs to one value each.

    The state passes through the first layer alone; the action is
    concatenated to that layer's activations before the second layer.
    The defaults reproduce the original fixed architecture
    (24 -> 400, 400 + 2 -> 300, 300 -> 1). Layer attribute names are
    unchanged, so previously saved state_dicts still load.
    """

    def __init__(self, state_size=24, action_size=2,
                 fc1_units=400, fc2_units=300):
        """Build the three linear layers.

        Params
        ======
            state_size (int): length of one state vector
            action_size (int): length of one action vector
            fc1_units (int): width of the first hidden layer
            fc2_units (int): width of the second hidden layer
        """
        super().__init__()
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, x, action):
        """Return the value estimate for each (state, action) row.

        ``x`` and ``action`` must be 2-D with matching first dimension,
        because they are concatenated along dimension 1.
        """
        x = F.relu(self.fc1(x))
        x = torch.cat([x, action], 1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)


class Actor(nn.Module):
    """Deterministic policy network: maps a state to an action in [-1, 1].

    The defaults reproduce the original fixed architecture
    (24 -> 200 -> 150 -> 2). Layer attribute names are unchanged, so
    previously saved state_dicts still load.
    """

    def __init__(self, state_size=24, action_size=2,
                 fc1_units=200, fc2_units=150):
        """Build the three linear layers.

        Params
        ======
            state_size (int): length of one state vector
            action_size (int): length of one action vector
            fc1_units (int): width of the first hidden layer
            fc2_units (int): width of the second hidden layer
        """
        super().__init__()
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, x):
        """Return the action for ``x``; tanh bounds every component to [-1, 1]."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return torch.tanh(self.fc3(x))

In [None]:
# replay buffer by Timo P. Gros
# https://github.com/TimoPGros/
class ReplayBuffer:
    """Fixed-size FIFO store of transitions with uniform random sampling.

    Sampling uses a generator owned by the instance, so it is reproducible
    for a given ``seed`` even if other code reseeds the global ``random``
    module.
    """

    def __init__(self, buffer_size, batch_size, seed):
        """Create an empty buffer.

        Params
        ======
            buffer_size (int): maximum number of stored transitions; the
                oldest ones are dropped first once the buffer is full
            batch_size (int): number of transitions returned by sample()
            seed: seed for this buffer's private random generator
        """
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        # Keep the seed value itself: random.seed() returns None, so storing
        # its return value would always lose the seed.
        self.seed = seed
        self._rng = random.Random(seed)

    def add(self, state, action, reward, next_state, done):
        """Append one transition; ``done`` is stored as 0 or 1."""
        self.memory.append(
            [state, action, reward, next_state, int(bool(done))])

    @staticmethod
    def _stack(items):
        """Stack array-likes into one float32 tensor with a leading batch dim."""
        # One NumPy stack plus from_numpy avoids the very slow
        # torch.tensor(list_of_ndarrays) conversion.
        stacked = np.stack([np.asarray(item, dtype=np.float32)
                            for item in items])
        return torch.from_numpy(stacked)

    def sample(self):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns a list ``[states, actions, rewards, next_states, dones]`` of
        float32 tensors. ``states``, ``actions`` and ``next_states`` have
        shape (batch, -1); ``rewards`` and ``dones`` have shape (batch,).

        Raises ValueError (from ``random.sample``) when fewer than
        ``batch_size`` transitions are stored.
        """
        batch = self._rng.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        n = len(batch)

        return [
            self._stack(states).view(n, -1),
            self._stack(actions).view(n, -1),
            torch.tensor(rewards, dtype=torch.float32),
            # Flattened like `states` so both can feed the same network.
            self._stack(next_states).view(n, -1),
            torch.tensor(dones, dtype=torch.float32),
        ]

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.memory)

In [None]:
# Agent class largely inspired by Alexis Cook from Udacity
# https://github.com/udacity/deep-reinforcement-learning/tree/master/ddpg-bipedal
class Agent():
    """DDPG learner: actor/critic pairs, a replay buffer and OU exploration noise.

    The local networks are trained; the target networks only follow them
    through soft updates and supply the bootstrapped critic targets.
    """

    def __init__(self, seed):
        """Create networks, optimizers, replay buffer and noise process.

        Params
        ======
            seed: seed handed to the global ``random`` module, the replay
                buffer and the noise process
        """
        self.critic_local = Critic()
        self.critic_target = Critic()
        self.actor_local = Actor()
        self.actor_target = Actor()
        # random.seed() returns None; store the seed value itself so it can
        # be forwarded. Forwarding None would make downstream code reseed
        # from OS entropy and break reproducibility.
        random.seed(seed)
        self.seed = seed

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LR)
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

        # 2 = number of action components produced by Actor().
        self.noise = OUNoise(2, seed)

        # Counts calls to step() modulo UPDATE_EVERY.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn on every UPDATE_EVERY-th call.

        Learning is skipped until the buffer holds more than BATCH_SIZE
        transitions.
        """
        self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            self.learn(self.memory.sample(), GAMMA)

    def act(self, state):
        """Return the noisy action for one state, clamped to [-1, 1].

        Params
        ======
            state (numpy.ndarray): a single observation

        Returns a torch tensor; callers convert with ``.numpy()``.
        """
        state = torch.from_numpy(state).float()
        with torch.no_grad():
            action = self.actor_local(state)

        action = action + torch.from_numpy(self.noise.sample()).float()
        # torch.clamp keeps the result a tensor without relying on NumPy's
        # dispatch for torch inputs.
        return torch.clamp(action, -1, 1)

    def learn(self, samples, gamma):
        """Run one critic update, one actor update and both soft updates.

        Params
        ======
            samples: ``[states, actions, rewards, next_states, dones]`` as
                returned by ``ReplayBuffer.sample``
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = samples
        # Make rewards/dones (batch, 1) so they broadcast against the
        # critic's (batch, 1) output.
        rewards = rewards.unsqueeze(1)
        dones = dones.unsqueeze(1)

        # Targets are constants for the critic loss: no graph is needed
        # through the target networks.
        with torch.no_grad():
            next_actions = self.actor_target(next_states)
            q_values_next_states = self.critic_target(next_states, next_actions)
            targets = rewards + gamma * q_values_next_states * (1 - dones)

        predictions = self.critic_local(states, actions)
        critic_loss = F.mse_loss(predictions, targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # The actor is trained to maximise the critic's value of its actions.
        actor_loss = -self.critic_local(states, self.actor_local(states)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [None]:
# plot scores over time with box showing parameter set
def plot_scores(scores, last_100_scores_rolling_means, episode_count,
                buffer_size, batch_size, lr_actor, lr_critic, tau,
                actor, critic, filename='scores_over_episodes.png'):
    """Plot per-episode scores and their rolling mean, and save the figure.

    A text box in the upper-left corner lists the best rolling mean and the
    given parameter values. ``actor`` and ``critic`` are rendered with
    ``str.format``.

    Params
    ======
        scores (sequence): score of every episode so far
        last_100_scores_rolling_means (sequence): rolling mean per episode
        episode_count (int): number of episodes played
        buffer_size, batch_size, lr_actor, lr_critic, tau: values shown in
            the text box
        actor, critic: objects whose string form is shown in the text box
        filename (str): where the figure is written
    """
    fig, ax = plt.subplots(figsize=(20, 10))
    # This function may be called once per episode; always close the figure,
    # even when formatting or saving fails, so figures cannot pile up.
    try:
        # np.max raises on an empty sequence; show NaN instead.
        if len(last_100_scores_rolling_means):
            best_mean = round(np.max(last_100_scores_rolling_means), 2)
        else:
            best_mean = float('nan')
        textstr = 'max(last_100_scores_means): {}\nepisode_count: {}\nbuffer_size: {}\nbatch_size: {}\nlr_actor: {}\nlr_critic: {}\ntau: {}\nactor: {}\ncritic: {} '
        textstr = textstr.format(best_mean,
                                 episode_count, buffer_size, batch_size,
                                 lr_actor, lr_critic, tau, actor, critic)
        ax.plot(np.arange(len(scores)), scores, 'o', label='Single episode score')
        ax.plot(np.arange(len(last_100_scores_rolling_means)),
                last_100_scores_rolling_means,
                label='Last 100 scores rolling mean')
        props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)
        ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
                verticalalignment='top', bbox=props)
        ax.legend(loc='upper right')
        ax.set_title('Score over number of episodes')
        ax.set_xlabel('Episode number')
        ax.set_ylabel('Score')
        plt.savefig(filename)
    finally:
        plt.close(fig)

In [None]:
# run MADDPG algorithm
def run_maddpg(n_episodes=8000, max_t=1000):
    """Train one shared Agent on the Unity environment until it is solved.

    Every player's observation is passed to the same agent, and every
    player's transition is stored in that agent's replay buffer.

    An episode's score is the larger of the players' undiscounted returns.
    The environment counts as solved once the mean score of the last 100
    episodes reaches 0.5. At that point the local actor and critic weights
    are written to 'actor_weights.pth' and 'critic_weights.pth' and training
    stops. The score plot is rewritten after every episode.

    Params
    ======
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of environment steps per episode

    Returns the list of episode scores.
    """
    agent = Agent(0)
    scores = []
    last_100_scores = deque(maxlen=100)
    last_100_scores_rolling_means = []

    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]
        states = env_info.vector_observations
        # Start each episode's exploration noise from its mean.
        agent.noise.reset()
        # One running return per player.
        agent_scores = np.zeros(len(states))

        for _ in range(max_t):
            actions = [agent.act(state).numpy() for state in states]
            env_info = env.step(actions)[brain_name]
            next_states = env_info.vector_observations
            rewards = env_info.rewards
            dones = env_info.local_done
            for experience in zip(states, actions, rewards, next_states, dones):
                agent.step(*experience)
            states = next_states
            agent_scores += rewards
            # Stop as soon as any player reports the end of the episode
            # rather than relying on the last inner-loop variable.
            if np.any(dones):
                break

        # Max over players of summed rewards, not a sum of per-step maxima,
        # which would ignore the other player's penalties.
        score = np.max(agent_scores)
        last_100_scores.append(score)
        scores.append(score)
        mean_score = np.mean(last_100_scores)
        last_100_scores_rolling_means.append(mean_score)
        plot_scores(scores, last_100_scores_rolling_means, i_episode, BUFFER_SIZE, BATCH_SIZE, ACTOR_LR, CRITIC_LR, TAU,
                    agent.actor_local, agent.critic_local)

        if i_episode % 100 == 0:
            print('\rEpisode {}\tAverage Score: {:.2f}'.format(i_episode, mean_score))
        # Require a full 100-episode window so one lucky early episode
        # cannot end training.
        is_problem_solved = (len(last_100_scores) == last_100_scores.maxlen
                             and mean_score >= 0.5)
        if is_problem_solved:
            solved_str = '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
            print(solved_str.format(i_episode, mean_score))
            torch.save(agent.actor_local.state_dict(), 'actor_weights.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_weights.pth')
            break

    return scores

In [None]:
# track start time
start_time = time.time()
local_time = time.localtime(int(start_time))
local_time_str = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
print('\nStarted training on '+local_time_str)

# train the agent
run_maddpg()

# track end time and print training time
end_time = time.time()
training_time = round((end_time - start_time) / 60, 1)
# Format the *end* time here; reusing start_time would print the start
# timestamp as the finish timestamp.
local_time = time.localtime(int(end_time))
local_time_str = time.strftime("%Y-%m-%d %H:%M:%S", local_time)
print('\nFinished training on '+local_time_str)
print('\nTotal training time: {} minutes'.format(training_time))

In [None]:
# Close the Unity environment opened earlier in the notebook.
# NOTE(review): presumably `env` cannot be used again after this call without
# re-creating it (and possibly restarting the kernel) — confirm against the
# installed unityagents version.
env.close()