# SGAI models (DQN)

This notebook is based off of the pytorch tutorial [here](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html). It is intended to both create and train models for Courtney2-Outbreak

### Setup

In [1]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [2]:
import sys
import numpy as np
import tensorflow as tf
import keras.layers as layers
import keras.models as models
import keras
from collections import namedtuple, Counter
from queue import deque
import random
import math
from typing import List
from tqdm import tqdm  # used for progress meters

PREFIX = "SGAI_MK3"
# make sure that it is able to import Board
sys.path.append(PREFIX)

from Board import Board
from constants import *
from Player import ZombiePlayer, GovernmentPlayer


In [3]:
DEVICE = "CPU"
# tf.debugging.set_log_device_placement(True)
devices = tf.config.list_physical_devices(DEVICE)
print(devices)
if DEVICE == "GPU":
    tf.config.experimental.set_memory_growth(devices[0], True)


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


### Training Environments

In [4]:
class GovernmentEnvironment:
    ACTION_MAPPINGS = {
        0: "moveUp",
        1: "moveDown",
        2: "moveLeft",
        3: "moveRight",
        4: "wallUp",
        5: "wallDown",
        6: "wallLeft",
        7: "wallRight",
        8: "vaccinate",
        9: "cureUp",
        10: "cureDown",
        11: "cureLeft",
        12: "cureRight",
    }
    ACTION_SPACE = len(ACTION_MAPPINGS.keys())
    SIZE = (6, 6)

    def __init__(self, max_timesteps: int = 300, logdir: str = "", run_name="") -> None:
        self.max_timesteps = max_timesteps
        self.reset()
        self.total_timesteps = 0
        self.total_invalid_moves = 0
        self.writer = None
        if logdir != "" and run_name != "":
            self.writer = tf.summary.create_file_writer(f"{logdir}/{run_name}")

    def reset(self):
        self.board = Board(GovernmentEnvironment.SIZE, "Government")
        self.board.populate(num_people=4, num_zombies=3)
        self.enemyPlayer = ZombiePlayer()
        self.done = False

        # coordinates of the first Government player
        self.agentPosition = self.board.indexOf(False)

        # useful for metrics
        self.max_number_of_government = 1
        self.episode_invalid_actions = 0
        self.episode_reward = 0
        self.episode_timesteps = 0
        return self._get_obs()

    def step(self, action: int):
        action_name = GovernmentEnvironment.ACTION_MAPPINGS[action]
        # print("Before: ", end = str(self.agentPosition))
        # print()
        # print(action_name)
        if "move" in action_name:
            # print(self.board.get_board())
            valid, new_pos = self.board.actionToFunction[action_name](
                self.board.toCoord(self.agentPosition)
            )
            if valid:
                # print(self.board.get_board())
                # print(self.agentPosition)
                self.agentPosition = new_pos
                # print("After: ", end = str(self.agentPosition))
                # print()
        elif "vaccinate" in action_name:
            valid, _ = self.board.actionToFunction[action_name](
                self.board.toCoord(self.agentPosition)
            )
            if valid:
                self.board.vaccinate(self.agentPosition)
        elif "cure" in action_name:
            dest_coord = list(self.board.toCoord(self.agentPosition))
            if action_name == "cureUp":
                dest_coord[1] -= 1
            elif action_name == "cureDown":
                dest_coord[1] += 1
            elif action_name == "cureRight":
                dest_coord[0] += 1
            else:
                dest_coord[0] -= 1
            valid, _ = self.board.actionToFunction["cure"](dest_coord)
        else:  # wall variation
            dest_coord = list(self.board.toCoord(self.agentPosition))
            if action_name == "wallUp":
                dest_coord[1] -= 1
            elif action_name == "wallDown":
                dest_coord[1] += 1
            elif action_name == "wallRight":
                dest_coord[0] += 1
            else:
                dest_coord[0] -= 1
            valid, _ = self.board.actionToFunction["wall"](dest_coord)
    
        won = None
        # do the opposing player's action if the action was valid.
        if valid:
            _action, coord = self.enemyPlayer.get_move(self.board)
            if not _action:
                self.done = True
                won = True
            else:
                self.board.actionToFunction[_action](coord)
            self.board.update()

        # see if the game is over
        # print(self.agentPosition)
        # print(self.board.get_board())
        # print(self._get_obs())
        if self.board.States[self.agentPosition].person is None:
            print("Lost Person")
            
        if self.board.States[
            self.agentPosition
        ].person.isZombie:  # zombie was cured
            self.done = True
            won = False
        if not self.board.is_move_possible_at(self.agentPosition):  # no move possible
            self.done = True
        if self.episode_timesteps > self.max_timesteps:
            self.done = True

        # get obs, reward, done, info
        obs, reward, done, info = (
            self._get_obs(),
            self._get_reward(action_name, valid, won),
            self._get_done(),
            self._get_info(),
        )

        # update the metrics
        self.episode_reward += reward
        if not valid:
            self.episode_invalid_actions += 1
            self.total_invalid_moves += 1
        self.episode_timesteps += 1
        self.max_number_of_government = max(
            self.board.num_people(), self.max_number_of_government
        )
        self.total_timesteps += 1

        # write the metrics
        if self.writer is not None:
            with self.writer.as_default():
                tf.summary.scalar(
                    "train/invalid_action_rate",
                    self.total_invalid_moves / self.total_timesteps,
                    step=self.total_timesteps,
                )
                tf.summary.scalar("train/cur_reward", reward, step=self.total_timesteps)

        # return the obs, reward, done, info
        return obs, reward, done, info

    def _get_info(self):
        return {}

    def _get_done(self):
        return self.done

    def _get_reward(self, action_name: str, was_valid: bool, won: bool):
        """
        Gonna try to return reward between [-1, 1]
        This fits w/i tanh and sigmoid ranges
        """
        if not was_valid:
            return -10
        if won is True:
            return 10
        if won is False:
            return -2
        if "wall" in action_name:
            return 3
        if "vaccinate" in action_name:
            return 6
        if "cure" in action_name:
            return 7
        return 0.01  # this is the case where it was move

    def _get_obs(self):
        """
        Is based off the assumption that 5 is not in the returned board.
        Uses 5 as the key for current position.
        """
        AGENT_POSITION_CONSTANT = 5
        ret = self.board.get_board()
        ret[self.agentPosition] = AGENT_POSITION_CONSTANT
        
        """# normalize observation to be be centered at 0
        ret = np.array(ret, dtype=np.float32)
        ret /= np.float32(AGENT_POSITION_CONSTANT)
        ret -= np.float32(0.5)"""
        return np.array(ret)

    def render(self):
        import PygameFunctions as PF
        import pygame

        PF.run(self.board)
        pygame.display.update()

    def init_render(self):
        import PygameFunctions as PF
        import pygame

        PF.initScreen(self.board)
        pygame.display.update()

    def close(self):
        import pygame

        pygame.quit()

    def write_run_metrics(self):
        if self.writer is not None:
            with self.writer.as_default():
                tf.summary.scalar(
                    "episode/num_invalid_actions_per_ep",
                    self.episode_invalid_actions,
                    step=self.total_timesteps,
                )
                tf.summary.scalar(
                    "episode/episode_length",
                    self.episode_timesteps,
                    step=self.total_timesteps,
                )
                tf.summary.scalar(
                    "episode/episode_total_reward",
                    self.episode_reward,
                    step=self.total_timesteps,
                )
                tf.summary.scalar(
                    "episode/mean_reward",
                    self.episode_reward / self.episode_timesteps,
                    step=self.total_timesteps,
                )
                tf.summary.scalar(
                    "episode/percent_invalid_per_ep",
                    self.episode_invalid_actions / self.episode_timesteps,
                    step=self.total_timesteps,
                )


### Make models

In [5]:
GOVERNMENT_OUTPUT_SIZE = len(GovernmentEnvironment.ACTION_SPACE)
INPUT_SHAPE = (ROWS * COLUMNS,)


In [6]:
# from torch import conv2d


def make_government_model():
    """
    makes the model that will be used for zombies
    The output of the model will be the predicted q value
    for being in a certain state.
    """
    model = models.Sequential()
    model.add(layers.Reshape((6,6,1), input_shape = INPUT_SHAPE))
    model.add(layers.Conv2DTranspose(256, (3,3)))
    model.add(layers.Activation("relu"))
    model.add(layers.MaxPooling2D(pool_size=(2,2)))
    model.add(layers.Dropout(0.2))

    model.add(layers.Conv2DTranspose(256, (3,3)))
    model.add(layers.Activation("relu"))
    model.add(layers.MaxPooling2D(pool_size=(2,2)))
    model.add(layers.Dropout(0.2))

    model.add(layers.Flatten())
    model.add(layers.Dense(64))

    model.add(layers.Dense(GOVERNMENT_OUTPUT_SIZE, activation="linear"))

    return model


    # model = models.Sequential()
    # model.add(layers.InputLayer(INPUT_SHAPE))
    # model.add(layers.Flatten())
    # model.add(layers.Dense(36 * 2))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(36 * 4))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(36 * 8))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(36 * 16))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(36 * 32))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(GOVERNMENT_OUTPUT_SIZE * 16))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(GOVERNMENT_OUTPUT_SIZE * 8))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense( * 4))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(GOVERNMENT_OUTPUT_SIZE * 2))
    # model.add(layers.LeakyReLU())
    # model.add(layers.Dense(GOVERNMENT_OUTPUT_SIZE, activation='tanh'))
    # return model


In [7]:
with tf.device(DEVICE):
    gov_policy = make_government_model()
    gov_target = make_government_model()


In [8]:
print(gov_policy.input_shape)
gov_policy.summary()


(None, 36)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 reshape (Reshape)           (None, 6, 6, 1)           0         
                                                                 
 conv2d_transpose (Conv2DTra  (None, 8, 8, 256)        2560      
 nspose)                                                         
                                                                 
 activation (Activation)     (None, 8, 8, 256)         0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 4, 4, 256)        0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 4, 4, 256)         0         
                                                                 
 conv2d_transpose_1 (Conv2DT  (None, 6, 6, 25

In [9]:
# make sure the output is correct shape
with tf.device(DEVICE):
    temp = gov_policy(tf.random.normal((1, 36)), training=False)
print(temp.shape)


(1, 8)


### Load saved model

In [10]:
# gov_policy.load_weights("gov_policy_weights")
# gov_target.load_weights("gov_policy_weights")


### DQN utilities

In [11]:
# this acts as a class; useful in the training
Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))


class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)


### Optimizers and Loss

In [12]:
with tf.device(DEVICE):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    loss = keras.losses.MeanSquaredError()


### Training loop

In [13]:
BATCH_SIZE = 256
GAMMA = 0.999
EPSILON_MAX = 0.9  # exploration rate maximum
EPSILON_MIN = 0.05  # exploration rate minimum
EPS_DECAY = 1000  # decay rate, in steps
TARGET_UPDATE = 10  # how many episodes before the target is updated

BUFFER_CAPACITY = 10000
memory = ReplayMemory(BUFFER_CAPACITY)


In [14]:
def select_gov_action(state, steps_done: int = -1, writer=None):
    """
    If no steps are provided, assuming not going to do
    random exploration
    """
    sample = random.random()
    eps_threshold = 0
    if steps_done != -1:
        eps_threshold = EPSILON_MIN + (EPSILON_MAX - EPSILON_MIN) * math.exp(
            -1.0 * steps_done / EPS_DECAY
        )
    if writer is not None:
        with writer.as_default():
            tf.summary.scalar('exploration rate', eps_threshold, step=steps_done)
    if sample > eps_threshold:
        # Pick the action with the largest expected reward.
        temp = gov_policy(state, training=False)
        numpy = temp.numpy().flatten()
        return tf.constant([tuple(numpy).index(max(numpy))], dtype=tf.int32)
    else:
        return tf.constant([random.randrange(GOVERNMENT_OUTPUT_SIZE)], dtype=tf.int32)


In [15]:
@tf.function
def train_on_batch(
    state_batch: tf.Tensor,
    action_batch: tf.Tensor,
    reward_batch: tf.Tensor,
    non_final_next_states: tf.Tensor,
    non_final_mask: tf.Tensor,
):
    with tf.GradientTape() as policy_tape:
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        action_batch = tf.expand_dims(action_batch, 1)
        state_action_values = tf.gather_nd(
            gov_policy(state_batch, training=True), action_batch, 1
        )

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = tf.scatter_nd(
            tf.expand_dims(non_final_mask, 1),
            tf.reduce_max(gov_target(non_final_next_states, training=False), 1),
            tf.constant([BATCH_SIZE]),
        )

        # Compute the expected Q values
        expected_state_action_values = tf.squeeze(
            (next_state_values * GAMMA) + reward_batch
        )

        # compute loss (mean squared error)
        assert state_action_values.shape == expected_state_action_values.shape
        _loss = loss(state_action_values, expected_state_action_values)

    # Optimize the model
    policy_gradient = policy_tape.gradient(_loss, gov_policy.trainable_variables)

    # apply gradient
    optimizer.apply_gradients(zip(policy_gradient, gov_policy.trainable_variables))


In [16]:
def train(epochs, max_timesteps=200, render=False, logdir="", run_name=""):
    env = GovernmentEnvironment(max_timesteps, logdir, run_name)
    if render:
        env.init_render()

    for episode in tqdm(range(epochs)):
        # Initialize the environment and state
        prev_obs = env.reset()
        done = False
        timesteps = 0
        while not done:
            if render:
                env.render()

            # Select and perform an action
            action = select_gov_action(
                tf.constant([prev_obs]), env.total_timesteps, env.writer
            )
            action = action.numpy()[0]  # "flatten" the tensor and take the item
            new_obs, reward, done, _ = env.step(action)
            # reward = tf.constant([reward])

            # Observe new state
            if not done:
                next_state = new_obs
            else:
                next_state = None

            # Store the transition in memory
            memory.push(prev_obs, action, next_state, reward)

            # Move to the next state
            prev_obs = next_state

            # Perform one step of the optimization (on the policy network)
            if len(memory) >= BATCH_SIZE:
                # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
                # detailed explanation). This converts batch-array of Transitions
                # to Transition of batch-arrays.
                batch = Transition(*zip(*memory.sample(BATCH_SIZE)))

                # compute the states that aren't terminal states
                non_final_mask = tf.constant(
                    tuple(
                        idx
                        for state, idx in zip(
                            batch.next_state, range(len(batch.next_state))
                        )
                        if state is not None
                    ),
                )
                non_final_next_states = tf.cast(
                    tuple(state for state in batch.next_state if state is not None),
                    dtype=tf.float32,
                )

                train_on_batch(
                    tf.cast(batch.state, dtype=tf.float32),
                    tf.cast(batch.action, dtype=tf.int32),
                    tf.cast(batch.reward, dtype=tf.float32),
                    non_final_next_states,
                    non_final_mask,
                )

        env.write_run_metrics()

        # Update the target network, copying all weights and biases in DQN
        if episode % TARGET_UPDATE == 0:
            gov_policy.save_weights(PREFIX+"/gov_policy_weights")
            gov_target.load_weights(PREFIX+"/gov_policy_weights")
    # env.close()
    gov_policy.save_weights(PREFIX+"/gov_policy_weights")


### Start Training!

In [17]:
RUN_NUMBER = 1

In [18]:
for i in range(5):
    train(175, 100, render=False, logdir="GovernmentEnvironment", run_name=f"run{RUN_NUMBER}")
    RUN_NUMBER+=1


  3%|▎         | 6/175 [00:00<00:07, 23.58it/s]2022-07-28 14:42:14.042182: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
 98%|█████████▊| 171/175 [19:23<00:34,  8.51s/it]

### View Model Playing

In [None]:
def watch_model(max_timesteps=200):
    env = GovernmentEnvironment(max_timesteps)
    done = False
    #env.init_render()
    obs = env.reset()
    actions = []
    while not done:
        #env.render()
        action = select_gov_action(tf.constant([obs])).numpy()[0]
        obs, reward, done, _ = env.step(action)
        actions.append(action)
    #env.close()
    counter = Counter(actions)
    print(counter.most_common())


In [None]:
watch_model()


[(5, 202)]
