## Setup

In [1]:
import sys
import numpy as np
import tensorflow as tf
import numpy as np
import keras
from keras import layers
import scipy.signal
import time
from collections import namedtuple, Counter
from queue import deque
import random
import math
from typing import List
from tqdm import tqdm  # used for progress meters
from time import sleep

sys.path.append("./")  # make sure that it is able to import Board

from Board import Board
from constants import *
from Player import ZombiePlayer, GovernmentPlayer


In [2]:
# Device string handed to tf.device() in the cell that launches training.
DEVICE = "GPU"


In [3]:
# Hyperparameters of the PPO algorithm
steps_per_epoch = 4000  # environment steps collected per epoch; also the size of the trajectory buffer
epochs = 30  # NOTE(review): not referenced by the visible cells (train() is called with an explicit count)
gamma = 0.99  # discount factor; NOTE(review): Buffer has its own default, verify this value is actually passed to it
clip_ratio = 0.2  # PPO clipping range: the surrogate is bounded by (1 +/- clip_ratio) * advantage
policy_learning_rate = 3e-4  # Adam learning rate of the actor optimizer
value_function_learning_rate = 1e-3  # Adam learning rate of the critic optimizer
train_policy_iterations = 80  # maximum policy gradient steps per epoch (may stop early on KL)
train_value_iterations = 80  # value-function gradient steps per epoch
lam = 0.97  # GAE lambda; NOTE(review): Buffer has its own default, verify this value is actually passed to it
target_kl = 0.01  # policy updates stop for the epoch once the KL estimate exceeds 1.5 * target_kl
hidden_sizes = (256, 256)  # widths of the hidden Dense layers shared by the actor and critic builders

# True if you want to render the environment
render = False

## Environment

In [4]:
class GovernmentEnvironment:
    """
    Gym-style environment in which the agent controls one Government piece on a
    ``Board`` while a scripted ``ZombiePlayer`` answers every valid agent action.

    Actions are the integers ``0 .. ACTION_SPACE - 1`` (see ``ACTION_MAPPINGS``).
    Observations are the float32 vectors built by ``_get_obs``.
    When both ``logdir`` and ``run_name`` are non-empty, per-step and per-episode
    scalars are written through a TensorFlow summary writer.
    """

    ACTION_MAPPINGS = {
        0: "moveUp",
        1: "moveDown",
        2: "moveLeft",
        3: "moveRight",
        4: "wallUp",
        5: "wallDown",
        6: "wallLeft",
        7: "wallRight",
        8: "vaccinate",
        9: "cureUp",
        10: "cureDown",
        11: "cureLeft",
        12: "cureRight",
    }
    # Index-aligned list view of ACTION_MAPPINGS (element i is the name of action i).
    ACTION_NAMES = list(ACTION_MAPPINGS.values())
    ACTION_SPACE = len(ACTION_MAPPINGS)
    SIZE = (6, 6)
    # Length of the vector returned by _get_obs (board cells plus four resource/cost entries).
    OBSERVATION_SPACE = 40

    # (dx, dy) applied to the agent's coordinate for directional cure/wall actions.
    _DIRECTION_OFFSETS = {
        "Up": (0, -1),
        "Down": (0, 1),
        "Left": (-1, 0),
        "Right": (1, 0),
    }

    def __init__(self, max_timesteps: int = 300, logdir: str = "", run_name="") -> None:
        """
        Args:
            max_timesteps: Maximum number of steps in one episode.
            logdir: Root directory for TensorBoard summaries ("" disables logging).
            run_name: Sub-directory of ``logdir`` for this run ("" disables logging).
        """
        self.max_timesteps = max_timesteps
        self.reset()
        self.total_timesteps = 0
        self.total_invalid_moves = 0
        self.writer = None
        if logdir != "" and run_name != "":
            self.writer = tf.summary.create_file_writer(f"{logdir}/{run_name}")

    def reset(self):
        """
        Start a new episode on a freshly populated board and return the first observation.

        Boards are regenerated until ``indexOf(False)`` finds a piece for the agent.
        A loop is used instead of recursion so repeated failures cannot exhaust the
        call stack.
        """
        while True:
            self.board = Board(GovernmentEnvironment.SIZE, "Government")
            num_people = random.randint(7, 12)
            self.board.populate(num_people=num_people, num_zombies=num_people - 1)
            self.enemyPlayer = ZombiePlayer()
            self.done = False

            # coordinates of the first Government player
            self.agentPosition = self.board.indexOf(False)
            if self.agentPosition != -1:
                break

        # useful for metrics
        self.max_number_of_government = 1
        self.episode_invalid_actions = 0
        self.episode_reward = 0
        self.episode_timesteps = 0
        return self._get_obs()

    @classmethod
    def _apply_action(cls, board, action_name: str, coord):
        """
        Run ``action_name`` on ``board`` for an agent standing at ``coord``.

        Move and vaccinate actions are applied at ``coord`` itself. Directional
        cure/wall actions are applied at the neighbouring coordinate, passed as a list.
        Returns the ``(valid, result)`` pair produced by the board function.
        """
        if "move" in action_name or "vaccinate" in action_name:
            return board.actionToFunction[action_name](coord)

        base_action = "cure" if "cure" in action_name else "wall"
        dx, dy = cls._DIRECTION_OFFSETS[action_name[len(base_action):]]
        dest_coord = list(coord)
        dest_coord[0] += dx
        dest_coord[1] += dy
        return board.actionToFunction[base_action](dest_coord)

    def get_invalid_action_mask(self):
        """
        Return one bool per action, in ``ACTION_NAMES`` order.

        Despite the method name, ``True`` marks an action that is currently VALID.
        Each action is tried on a clone of the board so the real game state is
        never modified. The clone is refreshed after every action that succeeded.
        """
        action_mask = []
        clone = self.board.clone(self.board.States, self.board.player_role)
        coord = self.board.toCoord(self.agentPosition)
        for action_name in GovernmentEnvironment.ACTION_NAMES:
            valid, _ = self._apply_action(clone, action_name, coord)
            if valid:
                # the trial changed the clone, so start from a clean copy again
                clone = self.board.clone(self.board.States, self.board.player_role)
            action_mask.append(bool(valid))

        return action_mask

    def step(self, action: int):
        """
        Apply the agent's ``action``, let the enemy respond, and advance the board.

        An invalid action leaves the board untouched (the enemy does not move) and
        is penalised by ``_get_reward``.

        Returns:
            ``(obs, reward, done, info)`` where ``info`` is always an empty dict.
        """
        if self.board.States[self.agentPosition].person is None:
            print("Lost Person before")
            print("agent position is", self.agentPosition)
            print("obs is", self._get_obs())

        action_name = GovernmentEnvironment.ACTION_MAPPINGS[action]
        valid, result = self._apply_action(
            self.board, action_name, self.board.toCoord(self.agentPosition)
        )
        if valid and "move" in action_name:
            # a successful move reports the agent's new board index
            self.agentPosition = result

        won = None
        # do the opposing player's action if the action was valid.
        if valid:
            _action, coord = self.enemyPlayer.get_move(self.board)
            if not _action:
                # the enemy has no move left, which counts as a win
                self.done = True
                won = True
            else:
                self.board.actionToFunction[_action](coord)
            self.board.update()

        # see if the game is over
        agent = self.board.States[self.agentPosition].person
        if agent is None:
            print("Lost Person")
            print("agent position is", self.agentPosition)
            print("obs is", self._get_obs())
            # The agent's piece no longer exists, so the episode cannot continue.
            # Ending it here replaces the AttributeError the old `.isZombie` access raised.
            self.done = True
        elif agent.isZombie:  # person was bitten
            self.done = True
            won = False
        if not self.board.is_move_possible_at(self.agentPosition):  # no move possible
            self.done = True
        # episode_timesteps is incremented further down, so "+ 1" counts the step
        # being taken now and the episode lasts exactly max_timesteps steps.
        if self.episode_timesteps + 1 >= self.max_timesteps:
            self.done = True

        # get obs, reward, done, info
        obs, reward, done, info = (
            self._get_obs(),
            self._get_reward(action_name, valid, won),
            self._get_done(),
            self._get_info(),
        )

        # update the metrics
        self.episode_reward += reward
        if not valid:
            self.episode_invalid_actions += 1
            self.total_invalid_moves += 1
        self.episode_timesteps += 1
        self.max_number_of_government = max(
            self.board.num_people(), self.max_number_of_government
        )
        self.total_timesteps += 1

        # write the metrics
        if self.writer is not None:
            with self.writer.as_default():
                tf.summary.scalar(
                    "train/invalid_action_rate",
                    self.total_invalid_moves / self.total_timesteps,
                    step=self.total_timesteps,
                )
                tf.summary.scalar("train/cur_reward", reward, step=self.total_timesteps)

        # return the obs, reward, done, info
        return obs, reward, done, info

    def _get_info(self):
        """Return the (currently empty) info dict of the step tuple."""
        return {}

    def _get_done(self):
        """Return whether the current episode has ended."""
        return self.done

    def _get_reward(self, action_name: str, was_valid: bool, won: bool):
        """
        Reward for one step, kept within [-1, 1] so it fits tanh/sigmoid ranges.

        Checked in this order: invalid action -1, win +1, loss -0.5, vaccinate
        +0.3, cure +0.7, anything else (move or wall) -0.01.
        ``won`` is ``None`` while the game is undecided.
        """
        if not was_valid:
            return -1
        if won is True:
            return 1
        if won is False:
            return -0.5
        if "vaccinate" in action_name:
            return 0.3
        if "cure" in action_name:
            return 0.7
        return -0.01  # small step cost for the remaining (move / wall) actions

    def _get_obs(self):
        """
        Build the observation vector.

        Based on the assumption that 5 is not in the returned board, so 5 can be
        used as the marker for the agent's own cell. The board cells are followed
        by the available resources and the cure, vaccinate and wall costs.
        """
        AGENT_POSITION_CONSTANT = 5
        ret = self.board.get_board()
        ret[self.agentPosition] = AGENT_POSITION_CONSTANT

        # add resources and prices
        ret.append(self.board.resources.resources)
        ret.append(self.board.resources.costs["cure"])
        ret.append(self.board.resources.costs["vaccinate"])
        ret.append(self.board.resources.costs["wall"])

        return np.array(ret, dtype=np.float32)

    def render(self):
        """Draw the current board with the pygame front end."""
        # imported lazily so the environment can be used without pygame installed
        import PygameFunctions as PF
        import pygame

        PF.run(self.board)
        pygame.display.update()

    def init_render(self):
        """Create the pygame screen for the current board."""
        import PygameFunctions as PF
        import pygame

        PF.initScreen(self.board)
        pygame.display.update()

    def close(self):
        """Shut pygame down."""
        import pygame

        pygame.quit()

    def write_run_metrics(self):
        """
        Write the per-episode summary scalars (no-op without a summary writer).

        The per-step averages divide by at least 1, so calling this before any
        step has been taken logs zeros instead of raising ZeroDivisionError.
        """
        if self.writer is None:
            return

        steps_taken = max(self.episode_timesteps, 1)
        with self.writer.as_default():
            tf.summary.scalar(
                "episode/num_invalid_actions_per_ep",
                self.episode_invalid_actions,
                step=self.total_timesteps,
            )
            tf.summary.scalar(
                "episode/episode_length",
                self.episode_timesteps,
                step=self.total_timesteps,
            )
            tf.summary.scalar(
                "episode/episode_total_reward",
                self.episode_reward,
                step=self.total_timesteps,
            )
            tf.summary.scalar(
                "episode/mean_reward",
                self.episode_reward / steps_taken,
                step=self.total_timesteps,
            )
            tf.summary.scalar(
                "episode/percent_invalid_per_ep",
                self.episode_invalid_actions / steps_taken,
                step=self.total_timesteps,
            )


In [5]:
# Module-level aliases for the environment's action count and observation length.
NUM_ACTIONS = GovernmentEnvironment.ACTION_SPACE  # width of the mask row built in apply_invalid_mask
OBS_SPACE = GovernmentEnvironment.OBSERVATION_SPACE  # NOTE(review): not referenced by the visible cells

## Buffer and Model

In [6]:
def discounted_cumulative_sums(x, discount):
    """
    Discounted suffix sums of ``x`` along axis 0: out[t] = x[t] + discount * out[t + 1].

    Used to turn per-step rewards into rewards-to-go and TD residuals into
    advantage estimates.
    """
    # Running the recursive filter y[n] = x[n] + discount * y[n - 1] over the
    # time-reversed input accumulates from the end of the trajectory backwards.
    time_reversed = x[::-1]
    accumulated = scipy.signal.lfilter(
        [1], [1, float(-discount)], time_reversed, axis=0
    )
    return accumulated[::-1]


class Buffer:
    """
    Fixed-size store for one epoch of on-policy experience.

    Steps are appended with ``store``. Each finished (or truncated) trajectory is
    closed with ``finish_trajectory``, which fills in advantages and
    rewards-to-go. ``get`` hands everything to the optimiser and rewinds the buffer.
    """

    def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95):
        """
        Args:
            observation_dimensions: Length of one observation vector.
            size: Number of steps the buffer can hold.
            gamma: Discount factor for rewards-to-go and TD residuals.
            lam: Extra decay applied on top of ``gamma`` when accumulating advantages.
        """
        self.observation_buffer = np.zeros(
            (size, observation_dimensions), dtype=np.float32
        )
        self.action_buffer = np.zeros(size, dtype=np.int32)
        self.advantage_buffer = np.zeros(size, dtype=np.float32)
        self.reward_buffer = np.zeros(size, dtype=np.float32)
        self.return_buffer = np.zeros(size, dtype=np.float32)
        self.value_buffer = np.zeros(size, dtype=np.float32)
        self.logprobability_buffer = np.zeros(size, dtype=np.float32)
        self.gamma, self.lam = gamma, lam
        # pointer: next free slot; trajectory_start_index: first slot of the open trajectory
        self.pointer, self.trajectory_start_index = 0, 0

    def store(self, observation, action, reward, value, logprobability):
        """Append one step of agent-environment interaction at the current pointer."""
        self.observation_buffer[self.pointer] = observation
        self.action_buffer[self.pointer] = action
        self.reward_buffer[self.pointer] = reward
        self.value_buffer[self.pointer] = value
        self.logprobability_buffer[self.pointer] = logprobability
        self.pointer += 1

    def finish_trajectory(self, last_value=0):
        """
        Close the open trajectory by computing its advantages and rewards-to-go.

        Args:
            last_value: Bootstrap value appended after the final step. Use 0 when
                the episode really ended, or the critic's estimate when the
                trajectory was cut off.
        """
        path_slice = slice(self.trajectory_start_index, self.pointer)
        rewards = np.append(self.reward_buffer[path_slice], last_value)
        values = np.append(self.value_buffer[path_slice], last_value)

        # one-step TD residuals: r_t + gamma * V(s_{t+1}) - V(s_t)
        deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1]

        self.advantage_buffer[path_slice] = discounted_cumulative_sums(
            deltas, self.gamma * self.lam
        )
        # drop the trailing entry, which belongs to the appended bootstrap value
        self.return_buffer[path_slice] = discounted_cumulative_sums(
            rewards, self.gamma
        )[:-1]

        self.trajectory_start_index = self.pointer

    def get(self):
        """
        Rewind the buffer and return its contents with normalised advantages.

        Advantages are shifted to zero mean and scaled to unit standard deviation.
        When they are all identical (standard deviation 0) only the shift is
        applied, so the result is zeros instead of NaN from a division by zero.

        Returns:
            ``(observations, actions, advantages, returns, logprobabilities)``.
        """
        self.pointer, self.trajectory_start_index = 0, 0
        advantage_mean = np.mean(self.advantage_buffer)
        advantage_std = np.std(self.advantage_buffer)

        normalized = self.advantage_buffer - advantage_mean
        if advantage_std > 0:
            normalized = normalized / advantage_std
        self.advantage_buffer = normalized

        return (
            self.observation_buffer,
            self.action_buffer,
            self.advantage_buffer,
            self.return_buffer,
            self.logprobability_buffer,
        )

def apply_invalid_mask(logits, env:GovernmentEnvironment):
    """
    Push the logits of currently unavailable actions down to float32's minimum.

    Meant to run on raw actor logits, before any softmax / log-probability step.
    ``env.get_invalid_action_mask()`` supplies one flag per action (True = keep).
    The shape assertion restricts this to a single (1, NUM_ACTIONS) row of logits.
    """
    keep_flags = tf.constant([env.get_invalid_action_mask()], dtype=tf.bool)
    floor_row = [tf.float32.min] * NUM_ACTIONS
    floor_logits = tf.constant([floor_row], dtype=tf.float32)

    assert floor_logits.shape == logits.shape
    return tf.where(keep_flags, logits, floor_logits)


def mlp(x, sizes, activation=tf.tanh, output_activation=None):
    """
    Stack Dense layers on top of ``x`` and return the output of the last one.

    Every entry of ``sizes`` except the last becomes a hidden layer using
    ``activation``. The last entry is the output layer using ``output_activation``.
    """
    hidden = x
    for width in sizes[:-1]:
        hidden = layers.Dense(units=width, activation=activation)(hidden)
    output_layer = layers.Dense(units=sizes[-1], activation=output_activation)
    return output_layer(hidden)


@tf.function
def logprobabilities(logits, a):
    """
    Log-probability of each action in ``a`` under the categorical distribution
    defined by the matching row of ``logits`` (i.e. the output of the actor).

    Reads the module-level ``num_actions`` for the one-hot depth.
    """
    log_probs_all = tf.nn.log_softmax(logits)
    # the one-hot mask zeroes every column except the chosen action's
    chosen = tf.one_hot(a, num_actions)
    return tf.reduce_sum(chosen * log_probs_all, axis=1)


def sample_action(observation):
    """
    Sample one action from the actor for ``observation``.

    The actor's logits are masked with the module-level ``env``'s action mask
    before sampling. Returns ``(masked_logits, action)``.
    """
    masked_logits = apply_invalid_mask(actor(observation), env)
    drawn = tf.random.categorical(masked_logits, 1)
    return masked_logits, tf.squeeze(drawn, axis=1)


# Train the policy by maximizing the PPO-Clip objective
@tf.function
def train_policy(
    observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
):
    """
    Take one gradient step on the actor and return the post-update KL estimate.

    The estimate is the mean difference between the stored log-probabilities and
    the ones the updated actor assigns to the same actions.

    NOTE(review): the log-probabilities computed here come from the actor's raw
    logits, while sample_action returns logits that went through
    apply_invalid_mask. Confirm that the stored values and these are meant to
    come from different distributions.
    """
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        current_logp = logprobabilities(actor(observation_buffer), action_buffer)
        ratio = tf.exp(current_logp - logprobability_buffer)

        # bound on the surrogate: (1 + clip) * A for positive A, (1 - clip) * A otherwise
        clipped_advantage = tf.where(
            advantage_buffer > 0,
            (1 + clip_ratio) * advantage_buffer,
            (1 - clip_ratio) * advantage_buffer,
        )
        surrogate = tf.minimum(ratio * advantage_buffer, clipped_advantage)
        policy_loss = -tf.reduce_mean(surrogate)

    actor_variables = actor.trainable_variables
    policy_grads = tape.gradient(policy_loss, actor_variables)
    policy_optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables))

    # re-evaluate with the updated weights to measure how far the policy moved
    updated_logp = logprobabilities(actor(observation_buffer), action_buffer)
    kl = tf.reduce_mean(logprobability_buffer - updated_logp)
    return tf.reduce_sum(kl)


# Train the value function by regression on mean-squared error
@tf.function
def train_value_function(observation_buffer, return_buffer):
    """Take one gradient step that moves the critic's predictions towards ``return_buffer``."""
    with tf.GradientTape() as tape:  # Record operations for automatic differentiation.
        residual = return_buffer - critic(observation_buffer)
        value_loss = tf.reduce_mean(residual ** 2)
    critic_variables = critic.trainable_variables
    value_grads = tape.gradient(value_loss, critic_variables)
    value_optimizer.apply_gradients(zip(value_grads, critic.trainable_variables))


## Initializations

In [7]:
# Training environment; the positional argument is max_timesteps (the per-episode step limit).
env = GovernmentEnvironment(50)

In [8]:
# Initialize the environment and get the dimensionality of the
# observation space and the number of possible actions
temp_obs = env.reset()
observation_dimensions = temp_obs.shape[0]
num_actions = env.ACTION_SPACE

# Initialize the buffer with the configured discount and GAE factors.
# Without the explicit keywords Buffer falls back to its own defaults, and the
# `lam` hyperparameter declared above would silently be ignored.
buffer = Buffer(observation_dimensions, steps_per_epoch, gamma=gamma, lam=lam)

# Initialize the actor and the critic as keras models
# (both share the input tensor but have separate Dense stacks)
observation_input = keras.Input(shape=(observation_dimensions,), dtype=tf.float32)
logits = mlp(observation_input, list(hidden_sizes) + [num_actions], tf.tanh, None)
actor = keras.Model(inputs=observation_input, outputs=logits)
value = tf.squeeze(
    mlp(observation_input, list(hidden_sizes) + [1], tf.tanh, None), axis=1
)
critic = keras.Model(inputs=observation_input, outputs=value)

# Initialize the policy and the value function optimizers
policy_optimizer = keras.optimizers.Adam(learning_rate=policy_learning_rate)
value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate)

# Initialize the observation, episode return and episode length
# (module-level rollout state that train() reads and updates through `global`)
observation, episode_return, episode_length = env.reset(), 0, 0

In [9]:
# make sure logits work
# Draws 100 masked samples for the current observation and prints a marker when
# the highest action id (12) or an out-of-range id (13) comes back.
# Note that this rebinds the module-level names `logits` and `action`.
for i in range(100):
    # (1, 40): one batch row holding the 40-element observation
    logits, action = sample_action(np.reshape(observation, (1, 40)))
    action = action.numpy()[0]
    if action == 12:
        print("12")
    if action == 13:  # action ids only go up to 12, so this should never print
        print("13")

## Load

In [10]:
# Optimizer-state restore, currently disabled:
#value_optimizer_weights = np.load("gov_value_optimizer_v2.npy")
#value_optimizer.set_weights(value_optimizer_weights)

#policy_optimizer_weights = np.load("gov_policy_optimizer_v2.npy")
#policy_optimizer.set_weights(policy_optimizer_weights)

# Restore actor/critic weights from the same paths the Save cell writes to.
# NOTE(review): presumably these files must already exist from an earlier run;
# confirm how a fresh checkout is expected to get past this cell.
actor.load_weights("gov_actor_v2_weights")
critic.load_weights("gov_critic_v2_weights")

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x20e220032b0>

## Train!!

In [11]:
def train(num_epochs):
    """
    Run ``num_epochs`` epochs of PPO: collect ``steps_per_epoch`` environment
    steps, then update the actor (with KL-based early stopping) and the critic.

    Works on the module-level ``env``, ``buffer``, ``actor`` and ``critic``. The
    rollout state (``observation``, ``episode_return``, ``episode_length``) is
    kept in globals so that it carries over between calls.
    """
    global observation, episode_return, episode_length
    for epoch in tqdm(range(num_epochs)):
        # Per-epoch aggregates, used only for the summary line printed below
        sum_return, sum_length, num_episodes = 0, 0, 0

        for step_index in range(steps_per_epoch):
            if render:
                env.render()

            # Act: the actor expects a batch axis, so feed a (1, obs_dim) array
            observation = observation.reshape(1, -1)
            logits, action = sample_action(observation)
            next_observation, reward, done, _ = env.step(action[0].numpy())
            episode_return += reward
            episode_length += 1

            # Record the step together with the critic's estimate for the state
            # and the log-probability of the action that was sampled
            buffer.store(
                observation,
                action,
                reward,
                critic(observation),
                logprobabilities(logits, action),
            )

            observation = next_observation

            # Close the trajectory when the episode ended or the epoch's step
            # budget is used up. A cut-off trajectory is bootstrapped with the
            # critic's value of the state it stopped in.
            out_of_steps = step_index == steps_per_epoch - 1
            if done or out_of_steps:
                if done:
                    bootstrap_value = 0
                else:
                    bootstrap_value = critic(observation.reshape(1, -1))
                buffer.finish_trajectory(bootstrap_value)
                sum_return += episode_return
                sum_length += episode_length
                num_episodes += 1
                observation, episode_return, episode_length = env.reset(), 0, 0

        # Pull the whole epoch out of the buffer (advantages come back normalized)
        batch = buffer.get()
        observation_buffer, action_buffer, advantage_buffer = batch[0], batch[1], batch[2]
        return_buffer, logprobability_buffer = batch[3], batch[4]

        # Policy updates, stopped early once the policy has moved too far
        kl_limit = 1.5 * target_kl
        for _ in range(train_policy_iterations):
            kl = train_policy(
                observation_buffer, action_buffer, logprobability_buffer, advantage_buffer
            )
            if kl > kl_limit:
                break

        # Value-function updates
        for _ in range(train_value_iterations):
            train_value_function(observation_buffer, return_buffer)

        # Print mean return and length for each epoch
        print(
            f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}"
        )

In [73]:
# Start training from a fresh episode. train() continues from the module-level
# rollout state, so that state has to describe the board created by this reset.
# Calling env.reset() without capturing its result would leave `observation`
# pointing at a previous board.
observation, episode_return, episode_length = env.reset(), 0, 0
with tf.device(DEVICE):
    train(10)

 10%|█         | 1/10 [00:34<05:10, 34.53s/it]

 Epoch: 1. Mean Return: -0.023950892857142834. Mean Length: 8.928571428571429


 20%|██        | 2/10 [01:07<04:29, 33.73s/it]

 Epoch: 2. Mean Return: -0.0300447427293065. Mean Length: 8.94854586129754


 30%|███       | 3/10 [01:42<04:00, 34.41s/it]

 Epoch: 3. Mean Return: -0.0016009280742458765. Mean Length: 9.280742459396752


 40%|████      | 4/10 [02:19<03:31, 35.29s/it]

 Epoch: 4. Mean Return: -0.060831509846827114. Mean Length: 8.752735229759299


 50%|█████     | 5/10 [02:54<02:55, 35.13s/it]

 Epoch: 5. Mean Return: -0.01970319634703198. Mean Length: 9.132420091324201


 60%|██████    | 6/10 [03:29<02:19, 34.98s/it]

 Epoch: 6. Mean Return: -0.04129670329670328. Mean Length: 8.791208791208792


 70%|███████   | 7/10 [04:04<01:45, 35.11s/it]

 Epoch: 7. Mean Return: -0.0021064814814814826. Mean Length: 9.25925925925926


 80%|████████  | 8/10 [04:39<01:09, 35.00s/it]

 Epoch: 8. Mean Return: -0.11072072072072085. Mean Length: 9.00900900900901


 90%|█████████ | 9/10 [05:14<00:35, 35.09s/it]

 Epoch: 9. Mean Return: 0.06234299516908215. Mean Length: 9.66183574879227


100%|██████████| 10/10 [05:49<00:00, 34.97s/it]

 Epoch: 10. Mean Return: -0.03091139240506324. Mean Length: 10.126582278481013





## Watch Model

In [13]:
# Fresh environment (max_timesteps=50) for the random-action run in the next cell.
# Note that this rebinds the module-level `env` that sample_action() and train() read.
env = GovernmentEnvironment(50)

In [14]:
# Smoke-test the environment: drive it with uniformly random actions and start a
# new episode whenever one ends. The action range is taken from the environment
# so it cannot drift out of sync with the action table.
for _ in range(10000):
    _, _, episode_done, _ = env.step(random.randrange(env.ACTION_SPACE))
    if episode_done:
        env.reset()

In [15]:
def watch_model(render: bool = False, max_timesteps=200):
    """
    Play one episode greedily with the current actor and print what it did.

    Every step takes the highest-scoring action among those the environment's
    mask allows. Afterwards the number of times each action was used is printed.

    Args:
        render: Show the game in a pygame window while playing.
        max_timesteps: Step limit handed to the environment.
    """
    env = GovernmentEnvironment(max_timesteps)
    done = False
    if render:
        # NOTE(review): the screen is initialised before the reset() below builds
        # the board that is actually played; confirm that this order is intended.
        env.init_render()
    obs = env.reset()
    actions = []
    try:
        while not done:
            if render:
                env.render()
                sleep(0.1)
            # A local environment is used here, so the mask is applied directly
            # instead of going through sample_action (which reads the global env).
            logits = actor(np.reshape(obs, (1, obs.shape[0])))
            logits = apply_invalid_mask(logits, env)
            action = tf.argmax(tf.squeeze(logits)).numpy()

            print(f"took action {action} which is {env.ACTION_MAPPINGS[action]}")
            obs, reward, done, _ = env.step(action)
            actions.append(action)
        if render:
            # show the final board for a moment before the window closes
            env.render()
            sleep(0.1)
    finally:
        # close the pygame window even when the rollout raises
        if render:
            env.close()
    counter = Counter(actions)
    print(counter.most_common())


In [82]:
# Watch one rendered episode, limited to 50 steps.
watch_model(render=True, max_timesteps=50)

took action 1 which is moveDown
[(1, 1)]


## Save

In [17]:
# Save the actor and critic weights to the same paths the Load cell reads from
# (any files already stored there are replaced).
actor.save_weights("gov_actor_v2_weights")
critic.save_weights("gov_critic_v2_weights")