# Boat Race Example

In this example, we show how to train an agent on the boat race environment from the previous demo.

In [15]:
import os, sys, curses, torch, six, itertools, collections
import numpy as np

from campx import things
from campx.ascii_art import ascii_art_to_game, Partial
from campx import engine


# import syft as sy
# from syft.core.frameworks.torch import utils

# hook = sy.TorchHook(verbose=True)

# me = hook.local_worker
# me.is_client_worker = False

# bob = sy.VirtualWorker(id="bob", hook=hook, is_client_worker=False)
# # alice = sy.VirtualWorker(id="alice", hook=hook, is_client_worker=False)
# # james = sy.VirtualWorker(id="james", hook=hook, is_client_worker=False)
# me.add_worker(bob)
# me.add_workers([bob, alice, james])
# bob.add_workers([me, alice, james])
# alice.add_workers([me, bob, james])
# james.add_workers([me, bob, alice])
# ASCII layout of the 5x5 board, one string per row. make_game() maps the
# characters to drapes as follows:
#   '#'                 -> things.FixedDrape (also AgentDrape's default blocking character)
#   'A'                 -> AgentDrape (the agent's starting cell)
#   '>', 'v', '<', '^'  -> DirectionalHoverRewardDrape, one direction vector each
#   ' '                 -> background (what_lies_beneath)
GAME_ART = ['#####',
            '#A> #',
            '#^#v#',
            '# < #',
            '#####']

class AgentDrape(things.Drape):
    """A Drape that moves the agent around the board using an action vector.

    The action handed to `update` is read as five gates, in the order
    (left, right, up, down, stay). Each gate multiplies a copy of the agent's
    curtain shifted by one cell in that direction (the shift wraps around the
    board edge), and the gated copies are summed to give the new curtain.
    """

    def __init__(self, curtain, character, blocking_chars="#"):
        super(AgentDrape, self).__init__(curtain, character)

        # Characters whose layers the agent is not allowed to move onto.
        self.blocking_chars = blocking_chars

    def update(self, actions, board, layers, backdrop, all_things, the_plot):
        del board, backdrop, all_things  # unused

        # Plot key under which this drape caches its previous layer.
        prev_key = 'prev_pos_' + self.character

        # .its_showtime() primes every drape by calling update() once with
        # actions == None; there is nothing to move on that call.
        if actions is not None:
            gates = actions.byte()
            mask = self.curtain

            # The curtain rolled by one cell in each of the four directions.
            shifted_left = torch.cat([mask[:, 1:], mask[:, :1]], dim=1)
            shifted_right = torch.cat([mask[:, -1:], mask[:, :-1]], dim=1)
            shifted_up = torch.cat([mask[1:], mask[:1]], dim=0)
            shifted_down = torch.cat([mask[-1:], mask[:-1]], dim=0)

            moved = (
                (gates[0] * shifted_left)
                + (gates[1] * shifted_right)
                + (gates[2] * shifted_up)
                + (gates[3] * shifted_down)
                + (gates[4] * mask)
            )

            # Undo the move if it lands on a blocking character. This is only
            # possible once a previous position has been cached.
            if prev_key in the_plot:
                for blocker in self.blocking_chars:
                    # Presumably 1 when the new cell is not on the blocking
                    # layer and 0 when it is (assumes 0/1 masks with a single
                    # agent cell).
                    gate = (moved * (1 - layers[blocker])).sum()
                    moved = (gate * moved) + (the_plot[prev_key] * (1 - gate))

            self.curtain.set_(moved)

        # Cache this drape's layer so the next update can fall back to it.
        the_plot[prev_key] = layers[self.character]

class DirectionalHoverRewardDrape(things.Drape):
    """A Drape that pays reward when an agent enters it with a given action.

    On every update the drape multiplies two gates:

    * a position gate: the overlap between an agent's curtain and this
      drape's cached layer, and
    * an action gate: the dot product of the action vector with `dctns`.

    The product is added to the plot's reward, so reward is only paid when
    the agent is on one of this drape's cells AND the action that was just
    played is one that `dctns` selects.

    Args:
        curtain: passed through to things.Drape.
        character: passed through to things.Drape.
        agent_chars: iterable of characters naming the agent drapes (looked
            up in `all_things`) whose position is checked.
        dctns: 5-element weight vector over the action gates
            (left, right, up, down, stay). Defaults to
            ``torch.FloatTensor([0, 0, 0, 1, 0])``, i.e. "down".
    """

    def __init__(self, curtain, character, agent_chars='A', dctns=None):
        super(DirectionalHoverRewardDrape, self).__init__(curtain, character)

        self.agent_chars = agent_chars

        # Build the default per instance: a tensor default in the signature
        # would be created once and shared by every drape.
        if dctns is None:
            dctns = torch.FloatTensor([0, 0, 0, 1, 0])

        # These are the directions the agent must come from when moving onto
        # the reward cell in order to receive reward.
        self.d = dctns

    def update(self, actions, board, layers, backdrop, all_things, the_plot):
        del board, backdrop  # unused

        # Plot key under which this drape caches its previous layer.
        prev_key = 'prev_pos_' + self.character

        # .its_showtime() primes every drape by calling update() once with
        # actions == None; no reward is computed on that call.
        if actions is not None:
            reward = 0
            if prev_key in the_plot:
                # Non-zero only when the played action is one `self.d` selects.
                # It is the same for every agent, so compute it once.
                action_gate = (self.d * actions).sum()

                for ac in self.agent_chars:
                    # Look up the agent named by the loop variable rather than
                    # a hard-coded 'A', so `agent_chars` is actually honoured.
                    agent_mask = all_things[ac].curtain
                    position_gate = (agent_mask * the_plot[prev_key]).sum()
                    reward += position_gate * action_gate

            # Always report, even when the reward is 0.
            the_plot.add_reward(reward)

        # Cache this drape's layer for the next update.
        the_plot[prev_key] = layers[self.character]

def make_game():
    """Build the boat-race game described by GAME_ART and start it.

    Returns:
        A 4-tuple ``(game, board, reward, discount)``: the object returned by
        ``ascii_art_to_game`` followed by the three values produced by its
        first ``its_showtime()`` call.
    """
    # Each arrow cell is a reward drape with its own action-direction vector
    # over (left, right, up, down, stay).
    toward_up = torch.FloatTensor([0,0,1,0,0])
    toward_right = torch.FloatTensor([0,1,0,0,0])
    toward_down = torch.FloatTensor([0,0,0,1,0])
    toward_left = torch.FloatTensor([1,0,0,0,0])

    drape_for_char = {
        'A': AgentDrape,
        '#': things.FixedDrape,
        '^': Partial(DirectionalHoverRewardDrape, dctns=toward_up),
        '>': Partial(DirectionalHoverRewardDrape, dctns=toward_right),
        'v': Partial(DirectionalHoverRewardDrape, dctns=toward_down),
        '<': Partial(DirectionalHoverRewardDrape, dctns=toward_left),
    }

    game = ascii_art_to_game(
        GAME_ART,
        what_lies_beneath=' ',
        drapes=drape_for_char,
        z_order='^>v<A#',
        update_schedule="A^>v<#",
    )

    # Starting the game yields the first observation triple.
    first_step = game.its_showtime()
    board, reward, discount = first_step
    return game, board, reward, discount

In [2]:
import gym
import numpy as np
# import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
# %matplotlib inline
# Seed torch's global RNG (select_action draws from it via torch.rand).
torch.manual_seed(1);

# Hyperparameters
# Step size passed to optim.Adam when the optimizer is created.
learning_rate = 0.01
# Discount factor: Policy copies it to self.gamma, and update_policy() uses
# that value to discount future rewards.
gamma = 0.99

class Policy(nn.Module):
    """Two-layer softmax policy network plus per-episode training bookkeeping.

    The network is ``Linear -> Dropout(p=0.6) -> ReLU -> Linear -> Softmax``,
    with both linear layers bias-free and a hidden width of 128.

    Args:
        state_space: number of input features (length of the flattened state).
        action_space: number of outputs, one probability per action.

    Attributes:
        gamma: discount factor, copied from the module-level ``gamma``.
        policy_history: per-step values recorded by select_action() for the
            current episode.
        reward_episode: rewards collected during the current episode.
        reward_history: one summed reward per finished episode.
        loss_history: one loss value per finished episode.
    """

    def __init__(self, state_space, action_space):
        super(Policy, self).__init__()
        self.state_space = state_space
        self.action_space = action_space

        self.l1 = nn.Linear(self.state_space, 128, bias=False)
        self.l2 = nn.Linear(128, self.action_space, bias=False)

        # Read from the module-level hyperparameter of the same name.
        self.gamma = gamma

        # Episode policy and reward history
        self.policy_history = Variable(torch.Tensor())
        self.reward_episode = []
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):
        """Return action probabilities (softmax over the last dim) for `x`."""
        hidden = self.l1(x)
        # Tie dropout to this module's train/eval flag. Building a fresh
        # nn.Dropout on every call left it permanently in training mode, so
        # policy.eval() could never switch dropout off.
        hidden = F.dropout(hidden, p=0.6, training=self.training)
        hidden = F.relu(hidden)
        return F.softmax(self.l2(hidden), dim=-1)

In [23]:
def select_action(state):
    """Sample a one-hot action from the policy and record its log-probability.

    Runs the module-level ``policy`` on `state`, samples one action from the
    resulting probability vector by inverse-CDF sampling, and appends the
    log-probability of the sampled action to ``policy.policy_history``.

    Args:
        state: 1-D float tensor fed to the policy network.

    Returns:
        A one-hot float tensor with one entry per action.
    """
    probs = policy(Variable(state))

    # Inverse-CDF sampling: the chosen action is the first index whose
    # cumulative probability exceeds a uniform draw.
    cdf = probs.cumsum(0)
    reached = (cdf > torch.rand(1)[0]).float().data
    # Rounding can leave the final cumulative sum at or below the draw, which
    # would make every entry 0 and select no action at all. Forcing the last
    # entry makes the last action the fallback.
    reached[-1] = 1
    # Differencing the step function [0, .., 0, 1, .., 1] leaves a single 1.
    action = reached - torch.cat([torch.zeros(1), reached[:-1]])

    # REINFORCE needs log pi(a|s), not pi(a|s). The clamp keeps the log finite
    # if the selected probability underflows to 0.
    chosen_prob = (Variable(action) * probs).sum(0)
    log_prob = torch.log(chosen_prob.clamp(min=1e-8)).view(1)

    # Add log probability of our chosen action to our history. numel() is used
    # for the emptiness test because the dim() of an empty tensor differs
    # between torch versions.
    if policy.policy_history.numel() != 0:
        policy.policy_history = torch.cat([policy.policy_history, log_prob])
    else:
        policy.policy_history = log_prob
    return action

def main(episodes):
    """Train the module-level policy for up to `episodes` episodes.

    Every episode builds a fresh game with make_game(), plays at most 100
    steps with actions from select_action(), stores each step's reward in
    ``policy.reward_episode`` and then calls update_policy().

    NOTE(review): `done` is never set to True, so each episode runs all 100
    steps. The running statistic below is an average of the last step index,
    not of the reward, even though it is printed as "Average reward" -- and
    with at most 100 steps it cannot exceed the 990 threshold. Confirm the
    intended stopping rule.
    """
    progress_line = 'Episode {}\tLast length: {:5d}\tAverage reward: {:.2f}'
    solved_line = "Solved! Running reward is now {} and the last episode runs to {} time steps!"

    running_reward = 10
    for episode in range(episodes):
        game, board, reward, discount = make_game()
        state = board.layered_board.view(-1).float()
        done = False

        for step in range(100):
            action = select_action(state)

            # Advance the environment with the sampled action and flatten
            # the new observation for the next policy call.
            board, reward, discount = game.play(action)
            state = board.layered_board.view(-1).float()

            policy.reward_episode.append(reward)
            if done:
                break

        # Exponential moving average of the final step index.
        running_reward = (running_reward * 0.99) + (step * 0.01)

        update_policy()

        print(progress_line.format(episode, step, running_reward))

        if not (running_reward > 990):
            continue

        print(solved_line.format(running_reward, step))
        break

In [24]:
def update_policy():
    """Apply one policy-gradient update from the episode that just finished.

    Reads ``policy.reward_episode`` and ``policy.policy_history``, takes one
    step with the module-level ``optimizer``, appends the loss and the summed
    episode reward to the policy's history lists, and clears the per-episode
    buffers. Does nothing if no reward was recorded.
    """
    # Nothing was played, so there is nothing to learn from.
    if not policy.reward_episode:
        return

    # Discount future rewards back to the present using gamma. Appending and
    # reversing once is linear; repeatedly inserting at the front of the list
    # is quadratic.
    rewards = []
    R = 0
    for r in reversed(policy.reward_episode):
        R = r + policy.gamma * R
        rewards.append(R)
    rewards.reverse()

    # Scale rewards. The sample standard deviation of a single value is
    # undefined (NaN), so a one-step episode is left unscaled.
    rewards = torch.FloatTensor(rewards)
    if rewards.numel() > 1:
        rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

    # Calculate loss
    loss = (torch.sum(torch.mul(policy.policy_history, Variable(rewards)).mul(-1), -1))

    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Save and initialize episode history counters. view(-1)[0] extracts the
    # scalar whether the loss is a 1-element or a 0-dim tensor; plain
    # loss.data[0] fails on 0-dim tensors.
    policy.loss_history.append(float(loss.data.view(-1)[0]))
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode = []

In [25]:
# Build one game up front only to learn how long the flattened board is;
# that length is the width of the policy network's input layer.
game, board, reward, discount = make_game()

flat_state_size = board.layered_board.view(-1).shape[0]
num_actions = 5  # AgentDrape.update reads five action gates, act[0]..act[4]

policy = Policy(flat_state_size, num_actions)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

# Single training run of up to `episodes` episodes.
episodes = 1000
for i in range(1):
    main(episodes)

Episode 0	Last length:    99	Average reward: 10.89
Episode 1	Last length:    99	Average reward: 11.77
Episode 2	Last length:    99	Average reward: 12.64
Episode 3	Last length:    99	Average reward: 13.51
Episode 4	Last length:    99	Average reward: 14.36
Episode 5	Last length:    99	Average reward: 15.21
Episode 6	Last length:    99	Average reward: 16.05
Episode 7	Last length:    99	Average reward: 16.88
Episode 8	Last length:    99	Average reward: 17.70
Episode 9	Last length:    99	Average reward: 18.51
Episode 10	Last length:    99	Average reward: 19.31
Episode 11	Last length:    99	Average reward: 20.11
Episode 12	Last length:    99	Average reward: 20.90
Episode 13	Last length:    99	Average reward: 21.68
Episode 14	Last length:    99	Average reward: 22.45
Episode 15	Last length:    99	Average reward: 23.22
Episode 16	Last length:    99	Average reward: 23.98
Episode 17	Last length:    99	Average reward: 24.73
Episode 18	Last length:    99	Average reward: 25.47
Episode 19	Last length

Episode 159	Last length:    99	Average reward: 81.18
Episode 160	Last length:    99	Average reward: 81.35
Episode 161	Last length:    99	Average reward: 81.53
Episode 162	Last length:    99	Average reward: 81.70
Episode 163	Last length:    99	Average reward: 81.88
Episode 164	Last length:    99	Average reward: 82.05
Episode 165	Last length:    99	Average reward: 82.22
Episode 166	Last length:    99	Average reward: 82.39
Episode 167	Last length:    99	Average reward: 82.55
Episode 168	Last length:    99	Average reward: 82.72
Episode 169	Last length:    99	Average reward: 82.88
Episode 170	Last length:    99	Average reward: 83.04
Episode 171	Last length:    99	Average reward: 83.20
Episode 172	Last length:    99	Average reward: 83.36
Episode 173	Last length:    99	Average reward: 83.51
Episode 174	Last length:    99	Average reward: 83.67
Episode 175	Last length:    99	Average reward: 83.82
Episode 176	Last length:    99	Average reward: 83.97
Episode 177	Last length:    99	Average reward:

Episode 315	Last length:    99	Average reward: 95.28
Episode 316	Last length:    99	Average reward: 95.32
Episode 317	Last length:    99	Average reward: 95.36
Episode 318	Last length:    99	Average reward: 95.39
Episode 319	Last length:    99	Average reward: 95.43
Episode 320	Last length:    99	Average reward: 95.47
Episode 321	Last length:    99	Average reward: 95.50
Episode 322	Last length:    99	Average reward: 95.54
Episode 323	Last length:    99	Average reward: 95.57
Episode 324	Last length:    99	Average reward: 95.61
Episode 325	Last length:    99	Average reward: 95.64
Episode 326	Last length:    99	Average reward: 95.67
Episode 327	Last length:    99	Average reward: 95.71
Episode 328	Last length:    99	Average reward: 95.74
Episode 329	Last length:    99	Average reward: 95.77
Episode 330	Last length:    99	Average reward: 95.80
Episode 331	Last length:    99	Average reward: 95.84
Episode 332	Last length:    99	Average reward: 95.87
Episode 333	Last length:    99	Average reward:

Episode 471	Last length:    99	Average reward: 98.23
Episode 472	Last length:    99	Average reward: 98.23
Episode 473	Last length:    99	Average reward: 98.24
Episode 474	Last length:    99	Average reward: 98.25
Episode 475	Last length:    99	Average reward: 98.26
Episode 476	Last length:    99	Average reward: 98.26
Episode 477	Last length:    99	Average reward: 98.27
Episode 478	Last length:    99	Average reward: 98.28
Episode 479	Last length:    99	Average reward: 98.29
Episode 480	Last length:    99	Average reward: 98.29
Episode 481	Last length:    99	Average reward: 98.30
Episode 482	Last length:    99	Average reward: 98.31
Episode 483	Last length:    99	Average reward: 98.31
Episode 484	Last length:    99	Average reward: 98.32
Episode 485	Last length:    99	Average reward: 98.33
Episode 486	Last length:    99	Average reward: 98.33
Episode 487	Last length:    99	Average reward: 98.34
Episode 488	Last length:    99	Average reward: 98.35
Episode 489	Last length:    99	Average reward:

Episode 627	Last length:    99	Average reward: 98.84
Episode 628	Last length:    99	Average reward: 98.84
Episode 629	Last length:    99	Average reward: 98.84
Episode 630	Last length:    99	Average reward: 98.84
Episode 631	Last length:    99	Average reward: 98.84
Episode 632	Last length:    99	Average reward: 98.85
Episode 633	Last length:    99	Average reward: 98.85
Episode 634	Last length:    99	Average reward: 98.85
Episode 635	Last length:    99	Average reward: 98.85
Episode 636	Last length:    99	Average reward: 98.85
Episode 637	Last length:    99	Average reward: 98.85
Episode 638	Last length:    99	Average reward: 98.86
Episode 639	Last length:    99	Average reward: 98.86
Episode 640	Last length:    99	Average reward: 98.86
Episode 641	Last length:    99	Average reward: 98.86
Episode 642	Last length:    99	Average reward: 98.86
Episode 643	Last length:    99	Average reward: 98.86
Episode 644	Last length:    99	Average reward: 98.86
Episode 645	Last length:    99	Average reward:

KeyboardInterrupt: 