In [None]:
# !pip -q install ./python

In [None]:
from unityagents import UnityEnvironment
import numpy as np
from torch import nn
import torch
from torch import optim
import torch.nn.functional as F
import random
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import copy
import time

**_Before running the code cell below_**, change the `filename` variable (it is passed to `UnityEnvironment` as the `file_name` parameter) to match the location of the Unity environment that you downloaded.

- **Mac**: `"path/to/Reacher.app"`
- **Windows** (x86): `"path/to/Reacher_Windows_x86/Reacher.exe"`
- **Windows** (x86_64): `"path/to/Reacher_Windows_x86_64/Reacher.exe"`
- **Linux** (x86): `"path/to/Reacher_Linux/Reacher.x86"`
- **Linux** (x86_64): `"path/to/Reacher_Linux/Reacher.x86_64"`
- **Linux** (x86, headless): `"path/to/Reacher_Linux_NoVis/Reacher.x86"`
- **Linux** (x86_64, headless): `"path/to/Reacher_Linux_NoVis/Reacher.x86_64"`

For instance, if you are using a Mac, then you downloaded `Reacher.app`.  If this file is in the same folder as the notebook, then the line below should appear as follows:
```
env = UnityEnvironment(file_name="Reacher.app")
```

In [None]:
# Environment configuration for the Reacher task.
# The Actor/Critic layers defined further down use these same
# dimensions (33 inputs for the state, 4 for the action).
state_size = 33
action_size = 4

# Name of the brain used to index the results of env.reset()/env.step().
brain_name = 'ReacherBrain'

# Location of the Unity environment binary that gets launched here.
filename = '/data/Reacher_One_Linux_NoVis/Reacher_One_Linux_NoVis.x86_64'
env = UnityEnvironment(file_name=filename)

In [None]:
# DDPG hyperparameters
BUFFER_SIZE = 2 ** 20   # replay buffer capacity (1,048,576 transitions)
BATCH_SIZE = 128        # number of transitions drawn per learning step
GAMMA = 0.99            # discount factor applied to the next-state value
TAU = 1e-3              # interpolation factor for the soft target updates
ACTOR_LR = 1e-3         # Adam learning rate of the actor
CRITIC_LR = 1e-3        # Adam learning rate of the critic
WEIGHT_DECAY = 0        # weight decay passed to the critic's Adam optimizer
UPDATE_EVERY = 10       # a learning step is attempted when (t_step + 1) % UPDATE_EVERY == 0

In [None]:
# Ornstein-Uhlenbeck noise process by Alexis Cook from Udacity
# https://github.com/udacity/deep-reinforcement-learning/tree/master/ddpg-bipedal
class OUNoise:
    """Ornstein-Uhlenbeck process.

    Produces temporally correlated noise: every sample pulls the internal
    state back towards ``mu`` by a factor ``theta`` and adds Gaussian noise
    scaled by ``sigma``.

    Params
    ======
        size (int or tuple): shape of the noise vector
        seed (int or None): seed of the noise generator; ``None`` leaves the
            process unseeded. Integer seeds must be valid for
            ``np.random.RandomState`` (0 <= seed < 2**32).
        mu (float): long-running mean the state reverts to
        theta (float): strength of the pull towards ``mu``
        sigma (float): scale of the Gaussian perturbation
    """
    def __init__(self, size, seed, mu=0., theta=0.15, sigma=0.2):
        """Initialize parameters and noise process."""
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = seed
        # Keep seeding Python's global RNG as before, but never with None:
        # random.seed(None) would re-seed it from system entropy and silently
        # undo any seeding done earlier by the caller.
        if seed is not None:
            random.seed(seed)
        # The samples are drawn from NumPy, so the seed has to be applied to
        # a NumPy generator to have any effect on the noise. A private
        # generator keeps the process reproducible and independent of other
        # users of the global np.random state.
        self._rng = np.random.RandomState(seed)
        self.size = size
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * self._rng.standard_normal(self.size)
        # x + dx allocates a new array, so previously returned samples are
        # not modified by later calls.
        self.state = x + dx
        return self.state

In [None]:
# define actor and critic model architecture
class Critic(nn.Module):
    """Q-network: maps a (state, action) pair to a single value.

    The state passes through the first layer alone; the action is
    concatenated to that hidden representation before the second layer.

    Params
    ======
        state_size (int): width of the state input
        action_size (int): width of the action input
        fc1_units (int): units of the first (state-only) hidden layer
        fc2_units (int): units of the second hidden layer
    """
    def __init__(self, state_size=33, action_size=4, fc1_units=512, fc2_units=256):
        super().__init__()
        self.fc1 = nn.Linear(state_size, fc1_units)
        # The second layer sees the hidden state features plus the raw action.
        self.fc2 = nn.Linear(fc1_units + action_size, fc2_units)
        self.fc3 = nn.Linear(fc2_units, 1)

    def forward(self, x, action):
        """Return the value estimate for a batch of states ``x`` and ``action``.

        Both inputs need a batch dimension: they are concatenated along
        dim 1. The result has one column per row of the batch.
        """
        x = F.relu(self.fc1(x))
        x = torch.cat([x, action], dim=1)
        x = F.relu(self.fc2(x))
        return self.fc3(x)
                           
class Actor(nn.Module):
    """Policy network: maps a state to an action with components in [-1, 1].

    Params
    ======
        state_size (int): width of the state input
        action_size (int): width of the action output
        fc1_units (int): units of the first hidden layer
        fc2_units (int): units of the second hidden layer
    """
    def __init__(self, state_size=33, action_size=4, fc1_units=256, fc2_units=128):
        super().__init__()
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)

    def forward(self, x):
        """Return the action for state(s) ``x``; tanh bounds it to [-1, 1]."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # torch.tanh instead of the legacy F.tanh alias.
        return torch.tanh(self.fc3(x))

In [None]:
# replay buffer by Timo P. Gros
# https://github.com/TimoPGros/
class ReplayBuffer:
    """Fixed-capacity store of transitions with uniform random sampling.

    Params
    ======
        buffer_size (int): maximum number of stored transitions; once full,
            the oldest transition is dropped for every new one
        batch_size (int): number of transitions returned by ``sample``
        seed: value used to seed Python's global ``random`` module, which
            drives the sampling
    """

    def __init__(self, buffer_size, batch_size, seed):
        self.batch_size = batch_size
        self.memory = deque(maxlen=buffer_size)
        random.seed(seed)
        self.seed = seed

    def add(self, state, action, reward, next_state, done):
        """Append one transition; ``done`` is stored as the integer 1 or 0."""
        self.memory.append([state, action, reward, next_state, 1 if done else 0])

    @staticmethod
    def _to_batch(items):
        """Stack array-likes / tensors into one float32 tensor, one flattened row each.

        torch.as_tensor avoids the slow list-of-ndarrays path of torch.tensor
        and the copy-construct warning raised for tensor inputs; detach()
        keeps the batch free of any autograd history, as torch.tensor did.
        """
        rows = [
            torch.as_tensor(item, dtype=torch.float32).detach().reshape(-1)
            for item in items
        ]
        return torch.stack(rows)

    def sample(self):
        """Draw ``batch_size`` distinct transitions uniformly at random.

        Returns
        =======
            list: ``[states, actions, rewards, next_states, dones]`` as
            float32 tensors. ``states``, ``actions`` and ``next_states`` have
            one flattened row per transition; ``rewards`` and ``dones`` are
            one-dimensional.

        Raises
        ======
            ValueError: (from ``random.sample``) if fewer than ``batch_size``
            transitions are stored.
        """
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return [
            self._to_batch(states),
            self._to_batch(actions),
            torch.tensor(rewards, dtype=torch.float32),
            # Flattened like the states so both always share one layout.
            self._to_batch(next_states),
            torch.tensor(dones, dtype=torch.float32),
        ]

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

In [None]:
# agent class largely inspired by Alexis Cook from Udacity
# https://github.com/udacity/deep-reinforcement-learning/tree/master/ddpg-bipedal
class Agent():
    """DDPG agent: local/target actor and critic, replay buffer and OU noise.

    Params
    ======
        seed: seed for Python's ``random`` module, the replay buffer and the
            exploration noise
    """
    def __init__(self, seed):
        self.critic_local = Critic()
        self.critic_target = Critic()
        self.actor_local = Actor()
        self.actor_target = Actor()
        # Start every target network as an exact copy of its local network;
        # without this the first TD targets come from unrelated random weights.
        self.critic_target.load_state_dict(self.critic_local.state_dict())
        self.actor_target.load_state_dict(self.actor_local.state_dict())

        random.seed(seed)
        self.seed = seed
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LR)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=CRITIC_LR,
            weight_decay=WEIGHT_DECAY
        )

        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)
        # Pass the real seed: random.seed() returns None, and forwarding that
        # None made the noise process re-seed the global RNG from entropy.
        self.noise = OUNoise(4, seed)
        # Step counter; it is reset and advanced by the training loop,
        # not by step() itself.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Store one transition and learn from a sampled batch when due.

        Learning happens when ``(t_step + 1)`` is a multiple of
        ``UPDATE_EVERY`` and the buffer holds more than ``BATCH_SIZE``
        transitions.
        """
        self.memory.add(state, action, reward, next_state, done)
        is_time_to_update = (self.t_step + 1) % UPDATE_EVERY == 0
        if is_time_to_update:
            # check if enough experiences in replay buffer
            if len(self.memory) > BATCH_SIZE:
                samples = self.memory.sample()
                self.learn(samples, GAMMA)

    def act(self, state):
        """Return the noisy action for ``state`` as a tensor clipped to [-1, 1].

        Params
        ======
            state (numpy.ndarray): current observation
        """
        state = torch.from_numpy(state).float()
        with torch.no_grad():
            action_values = self.actor_local(state)
        noise = torch.as_tensor(self.noise.sample(), dtype=torch.float32)
        # torch.clamp keeps the result a tensor (the caller uses .numpy())
        # without relying on NumPy's handling of torch tensors in np.clip.
        return torch.clamp(action_values + noise, -1.0, 1.0)

    def learn(self, samples, gamma):
        """Run one critic update, one actor update and the soft target updates.

        Params
        ======
            samples (list): ``[states, actions, rewards, next_states, dones]``
                as produced by the replay buffer; ``rewards`` and ``dones``
                are one-dimensional and get a trailing dimension added here
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = samples
        rewards = rewards.unsqueeze(1)
        dones = dones.unsqueeze(1)

        # compute targets; they are constants for the critic loss, so no
        # graph is built through the target networks and no gradients pile
        # up on their parameters.
        with torch.no_grad():
            q_values_next_states = self.critic_target(
                next_states,
                self.actor_target(next_states)
            )
            targets = rewards + (gamma * q_values_next_states * (1 - dones))
        predictions = self.critic_local(states, actions)

        loss = F.mse_loss(predictions, targets)
        self.critic_optimizer.zero_grad()
        loss.backward()
        self.critic_optimizer.step()

        # The actor is trained to maximize the critic's value of its own
        # actions, i.e. to minimize the negated mean value.
        actor_losses = self.critic_local(states, self.actor_local(states))
        actor_loss = - actor_losses.mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)

In [None]:
# plot scores over time with box showing parameter set
def plot_scores(last_100_scores_means, episode_count, buffer_size, batch_size, lr_actor, lr_critic, tau, actor, critic):
    """Save a plot of the rolling mean scores to 'scores_over_episodes.png'.

    The curve is annotated with a text box listing the best rolling mean so
    far and the settings passed in. X tick labels are shifted by 100
    relative to the index into ``last_100_scores_means``.

    Params
    ======
        last_100_scores_means (sequence of float): rolling means to plot
        episode_count (int): episode number shown in the text box
        buffer_size, batch_size, lr_actor, lr_critic, tau: values shown in
            the text box
        actor, critic: objects whose string form is shown in the text box
    """
    figure, axes = plt.subplots(figsize=(20, 10))

    # Summary text; the template is the concatenation of one piece per line.
    template = (
        'max(last_100_scores_means): {}\n'
        'episode_count: {}\n'
        'buffer_size: {}\n'
        'batch_size: {}\n'
        'lr_actor: {}\n'
        'lr_critic: {}\n'
        'tau: {}\n'
        'actor: {}\n'
        'critic: {} '
    )
    best_mean = round(np.max(last_100_scores_means), 2)
    summary = template.format(
        best_mean, episode_count, buffer_size, batch_size,
        lr_actor, lr_critic, tau, actor, critic
    )

    n_points = len(last_100_scores_means)
    axes.plot(np.arange(n_points), last_100_scores_means)
    axes.text(
        0.05, 0.95, summary,
        transform=axes.transAxes,
        verticalalignment='top',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5)
    )

    axes.set_title('Score over number of episodes')
    axes.set_xlabel('Episode number')
    axes.set_ylabel('Score')

    # One tick every 100 entries, labelled with the position plus 100.
    tick_positions = np.arange(0, n_points, 100).astype('int')
    axes.set_xticks(tick_positions)
    axes.set_xticklabels(tick_positions + 100, rotation=45)

    plt.savefig('scores_over_episodes.png')
    plt.close()

In [None]:
# run DDPG algorithm
def run_ddpg(start_time, n_episodes=5000, max_t=1000):
    """Train a DDPG agent on the global ``env`` until solved or out of episodes.

    The environment counts as solved once the mean score of the last 100
    episodes reaches 30.0. At that point the local actor and critic weights
    are saved to 'actor_weights.pth' / 'critic_weights.pth' and training
    stops. From episode 100 on, the score plot is refreshed after every
    episode.

    Params
    ======
        start_time (float): ``time.time()`` reference for the elapsed-minutes
            display
        n_episodes (int): maximum number of training episodes
        max_t (int): maximum number of steps per episode

    Returns
    =======
        list: the score of every episode played
    """
    agent = Agent(0)
    scores = []
    last_100_scores = deque(maxlen=100)
    last_100_scores_means = []
    tracker_str = '\repisode_count: {}, step_count: {}, last_100_scores_mean: {}, minutes_elapsed: {}'

    for episode_count in range(1, n_episodes+1):
        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        score = 0
        agent.t_step = 0
        # The score window only changes between episodes, so its mean is
        # computed once per episode. Before the first episode has finished
        # it is empty: show 0.0 instead of taking the mean of nothing
        # (which yields nan and a RuntimeWarning).
        if last_100_scores:
            displayed_mean = round(np.mean(last_100_scores), 2)
        else:
            displayed_mean = 0.0

        for t in range(max_t):
            action = agent.act(state)
            env_info = env.step(action.numpy())[brain_name]
            next_state = env_info.vector_observations[0]
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            agent.step(state, action, reward, next_state, done)
            state = next_state
            score += reward
            minutes_elapsed = round((time.time() - start_time) / 60, 1)
            agent.t_step += 1
            print(
                tracker_str.format(
                    episode_count,
                    agent.t_step,
                    displayed_mean,
                    minutes_elapsed
                ),
                end=''
            )
            if done:
                break
        last_100_scores.append(score)
        scores.append(score)
        window_mean = np.mean(last_100_scores)

        if episode_count >= 100:
            last_100_scores_means.append(window_mean)
            plot_scores(
                last_100_scores_means,
                episode_count,
                BUFFER_SIZE,
                BATCH_SIZE,
                ACTOR_LR,
                CRITIC_LR,
                TAU,
                agent.actor_local,
                agent.critic_local
            )

        # Only a full 100-episode window counts; a few lucky early episodes
        # must not be reported as solving the environment.
        window_is_full = len(last_100_scores) == last_100_scores.maxlen
        if window_is_full and window_mean >= 30.0:
            solved_message = '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
            print(solved_message.format(episode_count, window_mean))
            torch.save(agent.actor_local.state_dict(), 'actor_weights.pth')
            torch.save(agent.critic_local.state_dict(), 'critic_weights.pth')
            # Stop here instead of repeating the message and re-saving the
            # weights after every remaining episode.
            break

    return scores

In [None]:
from workspace_utils import active_session
 
# NOTE(review): active_session comes from workspace_utils; presumably it keeps
# the hosted workspace alive during long runs - confirm against that module.
with active_session():
    # Wall-clock reference, also used by run_ddpg for its progress display.
    start_time = time.time()

    # Train the DDPG agent.
    run_ddpg(start_time)

    # Report the total duration in minutes, rounded to one decimal.
    end_time = time.time()
    elapsed_seconds = end_time - start_time
    training_time = round(elapsed_seconds / 60, 1)
    print('\nTotal training time: {} minutes'.format(training_time))