# Shared Environment Execution Example (Boat Race)

In this example, we show how to run an environment (that is created on this machine) on a remote worker.

In [1]:
import os, sys, curses, torch, six, itertools, collections
import numpy as np

from campx import things
from campx.ascii_art import ascii_art_to_game, Partial
from campx import engine


import syft as sy
from syft.core.frameworks.torch import utils

# Hook torch so tensors gain the syft extensions used throughout this notebook.
hook = sy.TorchHook(verbose=True)

# The local worker is configured as a full (non-client) worker, like the virtual ones.
me = hook.local_worker
me.is_client_worker = False

# Three virtual workers living in this process.
bob, alice, james = (
    sy.VirtualWorker(id=worker_id, hook=hook, is_client_worker=False)
    for worker_id in ("bob", "alice", "james")
)

# Introduce every worker to all of its peers.
me.add_worker(bob)
for worker, peers in (
    (me, [bob, alice, james]),
    (bob, [me, alice, james]),
    (alice, [me, bob, james]),
    (james, [me, bob, alice]),
):
    worker.add_workers(peers)



In [2]:
# ASCII layout of the board; make_game() turns each character into a drape:
#   '#'                 -> things.FixedDrape
#   'A'                 -> AgentDrape (the agent's starting cell)
#   '^', '>', 'v', '<'  -> DirectionalHoverRewardDrape cells
#   ' '                 -> the backdrop character (what_lies_beneath)
GAME_ART = ['#####',
            '#A> #',
            '#^#v#',
            '# < #',
            '#####']

In [3]:
class AgentDrape(things.Drape):
    """A Drape that moves an agent around the board using an action vector.

    The action vector is indexed [left, right, up, down, stay]. A move that
    would put the agent on one of `blocking_chars` is undone by restoring the
    position cached in the plot during the previous update.
    """
    
    def __init__(self, curtain, character, blocking_chars="#"):
        """Store the characters whose layers block the agent.

        Args:
            curtain: tensor marking this drape's cells (forwarded to things.Drape).
            character: character this drape is drawn with (forwarded to things.Drape).
            blocking_chars: iterable of characters the agent may not move onto.
        """
        super(AgentDrape, self).__init__(curtain, character)
        
        self.blocking_chars = blocking_chars
    
    def update(self, actions, board, layers, backdrop, all_things, the_plot):
        """Apply `actions` to the agent's curtain and cache its position in the plot.

        Args:
            actions: length-5 tensor indexed [left, right, up, down, stay], or
                None on the priming call made by .its_showtime().
            board, backdrop, all_things: unused.
            layers: mapping from character to that character's layer tensor.
            the_plot: plot object; 'prev_pos_<character>' is read and written here.
        """

        del board, backdrop, all_things  # unused
        
        # note that when .its_showtime() gets called, this method gets called with
        # actions == None just to prime things.
        if actions is not None:

            act = actions

            b = self.curtain

            if(not isinstance(b, torch.LongTensor)):
                b = b.long()
            
            # Candidate positions: the curtain shifted one cell in each
            # direction (cells pushed off one edge re-enter on the opposite edge).
            left = torch.cat([b[:,1:],b[:,:1]], dim=1)
            right = torch.cat([b[:,-1:],b[:,:-1]], dim=1)
            up= torch.cat([b[1:],b[:1]], dim=0)
            down = torch.cat([b[-1:],b[:-1]], dim=0)
            stay = b
            
            
            # automatic broadcasting doesn't work for MPC at the moment
            # so we need to expand tensors manually
            left_shape = list(left.get_shape())
            n_elems_in_left = torch.IntTensor(left_shape).prod()
            act_left = act[0:1].expand(n_elems_in_left).contiguous().view(left_shape)            
            act_right = act[1:2].expand(n_elems_in_left).contiguous().view(left_shape)            
            act_up = act[2:3].expand(n_elems_in_left).contiguous().view(left_shape)                        
            act_down = act[3:4].expand(n_elems_in_left).contiguous().view(left_shape)                        
            act_stay = act[4:].expand(n_elems_in_left).contiguous().view(left_shape)                        

            # Select the candidate that matches the chosen action.
            b = (act_left * left) + \
            (act_right * right) + \
            (act_up * up) + \
            (act_down * down) + \
            (act_stay * stay)
            
            # Does this move overlap with a blocking character?
            for c in self.blocking_chars:
                if('prev_pos_'+self.character in the_plot):
                    if(not isinstance(layers[c], torch.LongTensor)):
                        layers[c] = layers[c].long()
                    # Build an all-ones tensor shaped like the layer using only
                    # multiplication and comparisons.
                    ones = (layers[c] * 0)
                    ones = (ones >= ones) * (ones <= ones)
                    if(not isinstance(ones, torch.LongTensor)):
                        ones = ones.long()
                    diff = (ones - layers[c])
                    mul = (b * diff)
    
                    # 1 if not going behind wall, 0 otherwise.
                    # Rows are accumulated over the board's actual height rather
                    # than a hard-coded five rows, so other board sizes work too.
                    gate = mul[0]
                    for row in range(1, left_shape[0]):
                        gate = gate + mul[row]
                    gate = gate.sum(0)

                    gate = gate.expand(n_elems_in_left).contiguous().view(left_shape)

                    oneminusgate = (ones - gate)

                    gate_times_b = (gate * b)
                    
                    if(not isinstance(the_plot['prev_pos_'+self.character], torch.LongTensor)):
                        the_plot['prev_pos_'+self.character] = the_plot['prev_pos_'+self.character].long()
                        
                    plot_times_oneminusgate = (the_plot['prev_pos_'+self.character] * oneminusgate)

                    # Keep the new position when allowed, otherwise fall back to
                    # the cached previous position.
                    b = gate_times_b + plot_times_oneminusgate
            
            # changed from .set_() because for MPC it doesn't seem to work yet
            if(isinstance(self.curtain.child, sy._SNNTensor)):
                self.curtain.child.child *= 0
                self.curtain.child.child += b.child.child
            else:
                if(not isinstance(self.curtain, torch.LongTensor)):

                    self.curtain.set_(b.byte())
                else:
                    self.curtain.set_(b)                    

        # cache previous position for use later
        the_plot['prev_pos_'+self.character] = layers[self.character]

class DirectionalHoverRewardDrape(things.Drape):
    """A Drape that rewards agents for entering its cell with a given action.

    Reward is the product of two gates: whether an agent's curtain overlaps
    this drape's cached layer, and whether the current action matches the
    direction vector `dctns`.
    """
    
    def __init__(self, curtain, character, agent_chars='A', dctns=torch.FloatTensor([0,0,0,1,0])):
        """Store the agent characters and the rewarded direction.

        Args:
            curtain: tensor marking this drape's cells (forwarded to things.Drape).
            character: character this drape is drawn with (forwarded to things.Drape).
            agent_chars: iterable of characters of the agents that can earn reward.
            dctns: tensor indexed [left, right, up, down, stay]; it is multiplied
                element-wise with the action to decide whether reward is given.
        """
        super(DirectionalHoverRewardDrape, self).__init__(curtain, character)
        
        self.agent_chars = agent_chars
        
        # these are the directions the agent must come from
        # when hovering onto the reward cell in order to 
        # receive reward. See how they're used later.
        self.d = dctns
        
    def update(self, actions, board, layers, backdrop, all_things, the_plot):
        """Add this step's reward to the plot and cache this drape's layer.

        Args:
            actions: action tensor indexed like `dctns`, or None on the priming
                call made by .its_showtime().
            board, backdrop: unused.
            layers: mapping from character to that character's layer tensor.
            all_things: mapping from character to its drape; agents' curtains
                are read from here.
            the_plot: plot object that accumulates reward and stores
                'prev_pos_<character>'.
        """

        del board, backdrop  # unused (all_things is needed to reach the agents)
        
        # note that when .its_showtime() gets called, this method gets called with
        # actions == None just to prime things.
        if actions is not None:

            # Does this move overlap with a reward character?
            # Reward also requires the action to match self.d, so an agent
            # that stays on the reward character does not receive reward
            # again. It has to move off and then back on again.
            reward = 0
            for ac in self.agent_chars:
                if 'prev_pos_'+self.character in the_plot:
                    
                    # Use the curtain of the agent being iterated over rather
                    # than always looking at 'A'.
                    b = all_things[ac].curtain
                    
                    current_pos_gate = (b * the_plot['prev_pos_'+self.character]).sum()
                    
                    if(not isinstance(self.d, torch.LongTensor)):
                        self.d = self.d.long()
                        
                    prev_action_gate = (self.d * actions).sum()
                    reward = reward + (current_pos_gate * prev_action_gate)


            the_plot.add_reward(reward)  # Accumulate reward (which might be zero)


        # cache this drape's layer for the overlap test on the next update
        the_plot['prev_pos_'+self.character] = layers[self.character]

In [4]:
def make_game():
    """Build the Boat Race game from GAME_ART and start it.

    Returns:
        A (game, board, reward, discount) tuple: the game object followed by
        the three values returned by its its_showtime() call.
    """
    # Rewarded direction for each arrow cell, indexed
    # [left, right, up, down, stay].
    arrow_directions = (
        ('^', [0,0,1,0,0]),
        ('>', [0,1,0,0,0]),
        ('v', [0,0,0,1,0]),
        ('<', [1,0,0,0,0]),
    )

    drapes = {'A': AgentDrape, '#': things.FixedDrape}
    for arrow, one_hot in arrow_directions:
        drapes[arrow] = Partial(DirectionalHoverRewardDrape,
                                dctns=torch.FloatTensor(one_hot))

    game = ascii_art_to_game(
        GAME_ART,
        what_lies_beneath=' ',
        drapes=drapes,
        z_order='^>v<A#',
        update_schedule="A^>v<#")

    first_observation = game.its_showtime()
    board, reward, discount = first_observation
    return game, board, reward, discount

# Build one game instance; board, reward and discount are the values returned by its_showtime().
game, board, reward, discount = make_game()

In [5]:
board.board


  35   35   35   35   35
  35   65   62   32   35
  35   94   35  118   35
  35   32   60   32   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]

In [6]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline
torch.manual_seed(1);

#Hyperparameters
# learning_rate is the lr handed to the Adam optimizer; gamma is the discount
# factor that Policy copies onto itself and update_policy() applies to rewards.
learning_rate = 0.01
gamma = 0.99

class Policy(nn.Module):
    """Single linear layer followed by a softmax over actions.

    Besides the network, the module carries the bookkeeping for training:
    per-episode history (policy_history, reward_episode) and overall history
    (reward_history, loss_history).
    """

    def __init__(self, state_space, action_space):
        """Create the layer and empty histories.

        Args:
            state_space: number of input features.
            action_space: number of actions (output features).
        """
        super(Policy, self).__init__()
        self.state_space = state_space
        self.action_space = action_space

        # Bias-free linear map from state features to action scores.
        self.l1 = nn.Linear(state_space, action_space, bias=False)

        # Discount factor, taken from the module-level hyperparameter.
        self.gamma = gamma

        # Per-episode buffers.
        self.policy_history = Variable(torch.Tensor())
        self.reward_episode = []

        # Buffers that persist across episodes.
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):
        """Return action probabilities (softmax over the last dimension)."""
        scores = self.l1(x)
        return F.softmax(scores, dim=-1)

In [38]:
def select_action(state):
    """Sample a one-hot action from the policy and record its log-probability.

    Args:
        state: 1-D float tensor holding the flattened board.

    Returns:
        A one-hot tensor with one entry per action. As a side effect the
        log-probability of the sampled action is appended to the global
        policy's policy_history.
    """
    # Action probabilities from the global policy network.
    dist = policy(Variable(state))

    # Inverse-CDF sampling: the first index whose cumulative probability
    # exceeds a uniform draw is the chosen action; differencing the step
    # function turns it into a one-hot vector.
    cdist = dist.cumsum(0)
    tdist = (cdist > torch.rand(1)[0]).float()
    action = tdist.data - torch.cat([torch.zeros(1),tdist.data[:-1]])

    # The policy-gradient loss needs log pi(a|s), not pi(a|s): pick out the
    # chosen action's probability and take its log. view(1) keeps the entry
    # one-dimensional so it can be concatenated onto the history.
    chosen_prob = (Variable(action, requires_grad=True) * dist).sum(0)
    log_prob = torch.log(chosen_prob).view(1)

    # Add log probability of our chosen action to our history
    if policy.policy_history.dim() != 0:
        policy.policy_history = torch.cat([policy.policy_history, log_prob])
    else:
        policy.policy_history = (log_prob)
    return action

def main (episodes):
    """Train the global policy for up to `episodes` episodes.

    Each episode builds a fresh game, plays 200 steps with actions sampled
    by select_action(), stores the rewards on the policy and then calls
    update_policy().

    Args:
        episodes: maximum number of episodes to run.
    """
    # Initialise once so the exponential moving average carries over from
    # episode to episode instead of being reset every time.
    running_reward = 10

    for episode in range(episodes):
        game, board, reward, discount = make_game()
        state = board.layered_board.view(-1).float()
        # NOTE(review): nothing below ever sets `done`, so every episode runs
        # the full 200 steps.
        done = False

        for step in range(200):

            action = select_action(state)
            # Step through environment using chosen action
            board, reward, discount = game.play(action.long())
            state = board.layered_board.view(-1).float()

            # Save reward
            policy.reward_episode.append(reward)
            if done:
                break

        # Used to determine when the environment is solved.
        # The average tracks the index of the last step played, not the reward.
        running_reward = (running_reward * 0.99) + (step * 0.01)

        update_policy()

        print('Episode {}\tAverage reward: {:.2f}'.format(episode, running_reward))

        if running_reward > 990:
            print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, step))
            break

In [39]:
def update_policy():
    """Run one policy-gradient step from the episode stored on the global policy.

    Reads policy.reward_episode and policy.policy_history, steps the global
    optimizer, appends to the loss/reward histories and clears the
    per-episode buffers.
    """
    # Walk the episode backwards, accumulating the discounted return, then
    # flip the list back into chronological order.
    discounted_returns = []
    running_return = 0
    for step_reward in reversed(policy.reward_episode):
        running_return = step_reward + policy.gamma * running_return
        discounted_returns.append(running_return)
    discounted_returns.reverse()

    # Standardise the returns; eps guards against a zero standard deviation.
    returns = torch.FloatTensor(discounted_returns)
    eps = np.finfo(np.float32).eps
    returns = (returns - returns.mean()) / (returns.std() + eps)

    # Loss is the negated sum of history entries weighted by the returns.
    weighted = torch.mul(policy.policy_history, Variable(returns))
    loss = torch.sum(weighted.mul(-1), -1)

    # Gradient step on the network weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Record this episode, then reset the per-episode buffers.
    policy.loss_history.append(loss.data[0])
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode = []

In [42]:
# Build a game so the length of the flattened layered board can be used as the
# policy's input size; 5 is the number of actions ([left, right, up, down, stay]).
game, board, reward, discount = make_game()
    
policy = Policy(board.layered_board.view(-1).shape[0], 5)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# Audit the agent and collect reward

In [44]:
# Take the policy's weight matrix out of the network.
# NOTE(review): no visible cell calls main(), so these may be untrained weights - confirm.
W = policy.l1.weight.data

# Convert the weights to fixed precision and share them between bob and alice.
W = W.fix_precision().share(bob,alice)
# Fresh game, shared with the same two workers.
game, board, reward, discount = make_game()
game.share(bob, alice)

# Flattened layered board used as the policy input.
state = board.layered_board.view(-1)

# Rewards returned by each step.
rewards = list()

for i in range(10):

    # Linear layer applied by hand on the shared tensors: W (actions x features)
    # times the state column vector, reshaped to a single row.
    pred = W.mm(state.view(-1,1)).wrap(True).view(1,-1)
    # The printed output shows a 1x5 vector with a single 1, so argmax here
    # appears to yield a one-hot row rather than an index - TODO confirm.
    action = pred.argmax()

    # action from fixed-precision -> long
    out = action.child.truncate(action.child.child, action.child)[0]
    # (x+0).get() is the pattern used throughout this notebook to print a shared value.
    print((out+0).get())
    board, reward, discount = game.play(out.view(-1))
    state = board.layered_board.view(-1)
    rewards.append(reward)

    # Printing the board on every step produces a lot of output (the captured
    # output of this cell ends with an IOPub data-rate warning).
    print((board.board+0).get())


 0  0  0  1  0
[syft.core.frameworks.torch.tensor.LongTensor of size 1x5]


  35   35   35   35   35
  35   32   62   32   35
  35   65   35  118   35
  35   32   60   32   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]


 0  0  0  1  0
[syft.core.frameworks.torch.tensor.LongTensor of size 1x5]


  35   35   35   35   35
  35   32   62   32   35
  35   94   35  118   35
  35   65   60   32   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]


 0  0  0  1  0
[syft.core.frameworks.torch.tensor.LongTensor of size 1x5]


  35   35   35   35   35
  35   32   62   32   35
  35   65   35  118   35
  35   32   60   32   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]


 0  0  0  1  0
[syft.core.frameworks.torch.tensor.LongTensor of size 1x5]


  35   35   35   35   35
  35   32   62   32   35
  35   94   35  118   35
  35   65   60   32   35
  35   35   35   35   35
[syft.core.f

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


In [9]:
# Action vectors are indexed [left, right, up, down, stay]; this one selects "right".
# Run it multiple times to keep moving the agent (65 is the character code of 'A').
# Notice how the "65" is blocked by all "35" ('#') items.

act = torch.FloatTensor([0,1,0,0,0]).long().share(bob,alice)
# NOTE: the third value is bound to `discout` (sic), so `discount` is left untouched.
board, reward, discout = game.play(act)

print((reward + 0).get())

b = (board.board + 0).get()
b


 0
[syft.core.frameworks.torch.tensor.LongTensor of size 1]




  35   35   35   35   35
  35   32   62   65   35
  35   94   35  118   35
  35   32   60   32   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]

In [14]:
# Action vectors are indexed [left, right, up, down, stay]; this one selects "down".
# Run it multiple times to keep moving the agent (65 is the character code of 'A').
# Notice how the "65" is blocked by all "35" ('#') items.

act = torch.FloatTensor([0,0,0,1,0]).long().share(bob,alice)
board, reward, discout = game.play(act)

print((reward+0).get())

(board.board * 1).get()


 0
[syft.core.frameworks.torch.tensor.LongTensor of size 1]




  35   35   35   35   35
  35   32   62   32   35
  35   94   35  118   35
  35   32   60   65   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]

In [16]:
# Action vectors are indexed [left, right, up, down, stay]; this one selects "left".
# Run it multiple times to keep moving the agent (65 is the character code of 'A').
# Notice how the "65" is blocked by all "35" ('#') items.

act = torch.FloatTensor([1,0,0,0,0]).long().share(bob, alice)
board, reward, discout = game.play(act)

print((reward+0).get())

(board.board * 1).get()


 0
[syft.core.frameworks.torch.tensor.LongTensor of size 1]




  35   35   35   35   35
  35   32   62   32   35
  35   94   35  118   35
  35   65   60   32   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]

In [18]:
# Action vectors are indexed [left, right, up, down, stay]; this one selects "up".
# Run it multiple times to keep moving the agent (65 is the character code of 'A').
# Notice how the "65" is blocked by all "35" ('#') items.

act = torch.FloatTensor([0,0,1,0,0]).long().share(bob, alice)
board, reward, discout = game.play(act)

print((reward+0).get())

(board.board * 1).get()


 0
[syft.core.frameworks.torch.tensor.LongTensor of size 1]




  35   35   35   35   35
  35   65   62   32   35
  35   94   35  118   35
  35   32   60   32   35
  35   35   35   35   35
[syft.core.frameworks.torch.tensor.LongTensor of size 5x5]