# Actor Critic - Syft Duet - Data Scientist 🥁

Contributed by [@Koukyosyumei](https://github.com/Koukyosyumei)

## PART 1: Connect to a Remote Duet Server

As the Data Scientist, you want to perform data science on data that is sitting in the Data Owner's Duet server in their Notebook.

In order to do this, we must run the code that the Data Owner sends us, which includes their Duet Server ID. The code will look like the snippet below, but with their real Server ID in place of the placeholder.

```
import syft as sy
duet = sy.duet('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx')
```

This will create a direct connection from my notebook to the remote Duet server. Once the connection is established, all traffic is sent directly between the two nodes.

Paste the code or Server ID that the Data Owner gives you and run it in the cell below. It will return your Client ID which you must send to the Data Owner to enter into Duet so it can pair your notebooks.

In [None]:
from itertools import count
from collections import namedtuple

import numpy as np
import torch
import syft as sy
import gym
# Join the Data Owner's Duet session; `duet` is the client handle used by the
# rest of this notebook to reach remote torch, gym, python types and the store.
# NOTE(review): loopback=True presumably targets a Duet running on the same
# machine (local testing) - confirm before using against a real Data Owner.
duet = sy.join_duet(loopback=True)
# Also write syft's log output to a file next to this notebook.
sy.logger.add(sink="./syft_ds.log")

### <img src="https://github.com/OpenMined/design-assets/raw/master/logos/OM/mark-primary-light.png" alt="he-black-box" width="100"/> Checkpoint 0 : Now STOP and run the Data Owner notebook until Checkpoint 1.

In [None]:
# Hyper-parameters and run-time switches used throughout this notebook.
# NOTE: "log_interval" used to be listed twice (10, then 1). Python keeps only
# the last value of a duplicated key, so the effective value was always 1;
# the dead entry has been removed and the effective value kept.
config = {
    "gamma": 0.99,  # discount factor applied to future rewards
    "seed": 543,  # seed for the remote torch RNG and the remote environment
    "render": False,  # not read anywhere in this part of the notebook
    "log_interval": 1,  # print training progress every N episodes
    "no_cuda": False,  # set to True to stay on the CPU even if CUDA exists
    "wait_interval": 1,  # not read anywhere in this part of the notebook
    "dry_run": True,  # stop the training loop after the first episode
}

# Handle to the torch module living on the Data Owner's side.
remote_torch = duet.torch
remote_torch.manual_seed(config["seed"])

In [None]:
# Assume "no GPU" until the Data Owner tells us otherwise.
has_cuda = False
has_cuda_ptr = remote_torch.cuda.is_available()

# Ask the Data Owner whether CUDA is available on their machine. The result
# lives remotely, so we have to request it explicitly.
cuda_reply = has_cuda_ptr.get(
    request_block=True,
    reason="To run test and inference locally",
    timeout_secs=3,  # increase if the Data Owner needs longer to respond
)
has_cuda = bool(cuda_reply)
print("Is cuda available ? : ", has_cuda)

# Use the GPU only when it has not been disabled in the config AND it exists.
use_cuda = not config["no_cuda"] and has_cuda

# (Re-)seed the remote torch RNG.
remote_torch.manual_seed(config["seed"])

device_name = "cuda" if use_cuda else "cpu"
device = remote_torch.device(device_name)
# print(f"Data Owner device is {device.type.get()}")

In [None]:
# Record stored per step: the log-probability of the chosen action and the
# critic's value estimate for the state the action was taken in.
SavedAction = namedtuple("SavedAction", "log_prob value")

In [None]:
# Module-level per-episode buffers: one SavedAction and one reward per step.
# Both are filled during an episode and emptied by finish_episode().
buffer_saved_actions, buffer_rewards = [], []

class Policy(sy.Module):
    """
    Implements both actor and critic in one model.

    A shared hidden layer feeds two heads: the actor head produces a
    probability for each action, the critic head produces a single value
    estimate for the input state.

    All torch operations go through ``self.torch_ref`` so that the module
    uses whichever torch it was constructed with, instead of depending on
    a notebook-level global.
    """

    def __init__(self, torch_ref):
        """Create the layers using the torch module given as ``torch_ref``."""
        super().__init__(torch_ref=torch_ref)
        # shared layer: 4 input features -> 128 hidden units
        self.affine1 = self.torch_ref.nn.Linear(4, 128)
        # actor's layer: one output per action (2 actions)
        self.action_head = self.torch_ref.nn.Linear(128, 2)
        # critic's layer: a single state-value output
        self.value_head = self.torch_ref.nn.Linear(128, 1)

    def forward(self, x):
        """
        Forward pass of both actor and critic.

        Returns a tuple of 2 values:
        1. the probability of each action over the action space
        2. the value estimate for the input state
        """
        torch_ref = self.torch_ref
        hidden = torch_ref.relu(self.affine1(x))
        # actor: chooses the action to take from state s_t
        # by returning the probability of each action
        action_prob = torch_ref.softmax(self.action_head(hidden), dim=-1)
        # critic: evaluates being in the state s_t
        state_values = self.value_head(hidden)
        return action_prob, state_values

In [None]:
# Build the model locally on top of plain torch, then send it to the
# Data Owner; `remote_policy` is what we use from here on.
policy = Policy(torch_ref=torch)
remote_policy = policy.send(duet)

# The optimizer is created through remote torch and works on the remote
# model's parameters.
remote_params = remote_policy.parameters()
optimizer = remote_torch.optim.Adam(remote_params, lr=3e-2)

# Smallest float32 increment, as a plain Python float; used to avoid a
# division by zero when normalising the returns.
eps = float(np.finfo(np.float32).eps)

In [None]:
# Move the remote model to the device selected earlier. Branch on `use_cuda`
# (CUDA available AND not disabled via config["no_cuda"]) rather than on
# `has_cuda`, so the model never gets sent to the GPU while `device` is "cpu".
if use_cuda:
    remote_policy.cuda(device)
else:
    remote_policy.cpu()

In [None]:
# You cannot see the state
def select_action(state):
    """
    Sample an action for ``state`` from the remote policy.

    Everything happens on the Data Owner's side - you cannot see the state.
    The log-probability of the sampled action and the critic's value
    estimate are appended to ``buffer_saved_actions`` for use in
    ``finish_episode``.

    Returns the result of ``.item()`` on the sampled action (left or right).
    """
    state_tensor = remote_torch.from_numpy(state).float()
    action_probs, state_value = remote_policy(state_tensor)

    # categorical distribution over the action probabilities
    policy_dist = remote_torch.distributions.Categorical(action_probs)

    # draw one action from that distribution
    sampled = policy_dist.sample()

    # remember what is needed later to compute the losses
    log_prob = policy_dist.log_prob(sampled)
    buffer_saved_actions.append(SavedAction(log_prob, state_value))

    return sampled.item()


def finish_episode():
    """
    Training code. Calculates actor and critic loss and performs backpropagation.

    Reads the module-level buffers ``buffer_rewards`` and
    ``buffer_saved_actions`` filled during the episode, runs one optimizer
    step on the summed actor + critic loss, and empties both buffers.
    Takes no arguments and returns nothing.
    """
    global buffer_saved_actions
    global buffer_rewards
    
    # discount factor, created as a Float through the duet client
    gamma = duet.python.Float(config["gamma"])
    
    # running discounted return and the containers for losses / returns,
    # all created through the duet client
    R = duet.python.Float(0)
    policy_losses = duet.python.List([])
    value_losses = duet.python.List([])
    returns = duet.python.List([])
    
    # walk the rewards backwards, accumulating the discounted return;
    # inserting at index 0 leaves `returns` in chronological order
    for r in buffer_rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
        
    # normalise the returns: subtract the mean, divide by the standard
    # deviation (eps guards against dividing by zero)
    returns = remote_torch.Tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    
    for (log_prob, value), R in zip(buffer_saved_actions, returns):
        # how much better the return was than the critic's estimate
        advantage = R - value.item()
        
        # calculate actor (policy) loss
        policy_losses.append(-log_prob * advantage)
        
        # calculate critic (value) loss using L1 smooth loss
        value_losses.append(remote_torch.nn.functional.smooth_l1_loss(value,
                                                                R.reshape(1)))
    # reset gradients
    optimizer.zero_grad()
    
    # sum up all the values of policy_losses and value_losses
    loss = remote_torch.stack(policy_losses).sum() + remote_torch.stack(value_losses).sum()
    
    # perform backprop
    loss.backward()
    optimizer.step()
    
    # reset rewards and action buffer (in place, so the module-level lists
    # are emptied rather than rebound)
    del buffer_saved_actions[:]
    del buffer_rewards[:]

In [None]:
# Look up the object stored under "reward_threshold" in the Data Owner's store.
reward_threshold_ptr = duet.store["reward_threshold"]
# Request its value; delete_obj=False is passed so the stored object is kept.
reward_threshold = reward_threshold_ptr.get(request_block=True, delete_obj=False)
print(f"reward_threshold is {reward_threshold}")

In [None]:
# Handle to the gym module on the Data Owner's side.
remote_gym = duet.gym
# Create the CartPole environment through remote gym and seed it with the
# same seed used for remote torch.
remote_env = remote_gym.make("CartPole-v0")
remote_env.seed(config["seed"])

In [None]:
# exponentially smoothed episode reward, used as the "solved" criterion
running_reward = 10

# run infinitely many episodes (the loop only ends via a `break` below)
for i_episode in count(1):

    # reset environment and episode reward
    state = remote_env.reset()
    ep_reward = duet.python.Float(0)

    # for each episode, only run 9999 steps so that we don't
    # infinite loop while learning
    for t in range(1, 10000):
        # select action from policy
        action = select_action(state)

        # take the action
        state, reward, done, _ = remote_env.step(action)

        buffer_rewards.append(reward)
        ep_reward += reward

        # `done` lives on the Data Owner's side, so it has to be requested
        if done.get(request_block=True):
            break

    # Fetch the episode reward once and reuse it below; every `.get()` is a
    # separate blocking request to the Data Owner, and the value does not
    # change between the running-average update and the log line.
    last_reward = ep_reward.get(request_block=True, delete_obj=False)

    # update cumulative reward
    running_reward = 0.05 * last_reward + (1 - 0.05) * running_reward

    # perform backprop
    finish_episode()

    # log results
    if i_episode % config["log_interval"] == 0:
        print(
            f"Episode {i_episode}\tLast reward: {last_reward:.2f}"
            f"\tAverage reward: {running_reward:.2f}"
        )

    # check if we have "solved" the cart pole problem
    if running_reward > reward_threshold:
        print(
            f"Solved! Running reward is now {running_reward} and "
            f"the last episode runs to {t} time steps!"
        )
        break

    # in dry-run mode a single episode is enough
    if config["dry_run"]:
        break


### <img src="https://github.com/OpenMined/design-assets/raw/master/logos/OM/mark-primary-light.png" alt="he-black-box" width="100"/> Checkpoint 1 : Now STOP and run the Data Owner notebook until Checkpoint 2.