<h1>Information</h1>

<h3>Paper used as outline & inspiration</h3>
https://towardsdatascience.com/explained-curiosity-driven-learning-in-rl-exploration-by-random-network-distillation-72b18e69eb1b

In [1]:
#? imports
from nes_py.wrappers import JoypadSpace
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT

from collections import deque
import numpy as np
import random
import datetime

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, Flatten, Reshape, GRU, Dense
from tensorflow.keras.optimizers import Adam

# one fixed seed shared by Python's `random`, NumPy and TensorFlow
SEED = 42

# seed each library's global generator in turn (random -> numpy -> tensorflow)
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# list the local devices reported by TensorFlow (e.g. to see whether a GPU is visible)
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8730353498045840482
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 2203243613317093611
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 4940566368
locality {
  bus_id: 1
  links {
  }
}
incarnation: 15627421989151166618
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2060, pci bus id: 0000:06:00.0, compute capability: 7.5"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 18096483059872053751
physical_device_desc: "device: XLA_GPU device"
]


In [2]:
#? initialize the environment and environment variables
ACTION_SPACE = SIMPLE_MOVEMENT
ENVIRONMENT = 'SuperMarioBros-v3'

env = gym_super_mario_bros.make(ENVIRONMENT)
# wrap the environment with JoypadSpace using the ACTION_SPACE button list
env = JoypadSpace(env, ACTION_SPACE)

# shape of a single observation; read it from the space itself instead of
# drawing a full random observation just to measure it
OBSERVATION_SPACE = env.observation_space.shape

In [3]:
#? agent static vars
# width of the final Dense layer of the curiosity (target / predictor) networks
CURIOSITY_EMBEDDING = 128

# number of episodes (environment resets) the training loop runs
EPOCHS = 10
# mini-batch size
# NOTE(review): not referenced by the visible cells -- Agent falls back to its own batch_size default
BATCH_SIZE = 32

In [4]:
#? build the target and predictor networks
def build_curiosity_network(trainable):
    """Build the convolutional network used for the curiosity signal.

    The same architecture is used for both curiosity models: three
    Conv2D(5x5, relu) + MaxPool2D(4x4) stages with 4, 4 and 8 filters,
    followed by Flatten and a relu Dense layer of CURIOSITY_EMBEDDING units.

    Args:
        trainable: forwarded to every Conv2D / Dense layer; pass False to
            freeze the weights, True to allow them to be trained.

    Returns:
        An uncompiled Keras Model mapping an input of shape OBSERVATION_SPACE
        to a CURIOSITY_EMBEDDING-wide vector.
    """
    frame_input = Input(OBSERVATION_SPACE)

    # conv + pool stages; only the filter count differs between them
    features = frame_input
    for n_filters in (4, 4, 8):
        features = Conv2D(n_filters, (5, 5), activation='relu', trainable=trainable)(features)
        features = MaxPool2D((4, 4))(features)

    features = Flatten()(features)
    embedding = Dense(CURIOSITY_EMBEDDING, activation='relu', trainable=trainable)(features)

    return Model(frame_input, embedding)

# frozen network: its layers are created with trainable=False
target_model = build_curiosity_network(trainable=False)

# trainable network with the same architecture, compiled with a mean-squared-error loss
predictor_model = build_curiosity_network(trainable=True)
predictor_model.compile(optimizer=Adam(), loss='mse')

In [5]:
#? build the actor network
frame_input = Input(OBSERVATION_SPACE)

# each stage is Conv2D(filters, kernel, same-padding, relu) followed by MaxPool2D(pool)
features = frame_input
for n_filters, kernel, pool in (
    (4, (8, 8), (4, 4)),
    (8, (6, 6), (2, 2)),
    (32, (3, 3), (2, 2)),
    (64, (3, 3), (2, 2)),
):
    features = Conv2D(n_filters, kernel, padding='same', activation='relu')(features)
    features = MaxPool2D(pool)(features)

# a recurrent head (Reshape((-1, 1)) followed by GRU(128)) is left disabled here;
# the features are flattened and fed straight to the output layer instead
features = Flatten()(features)

# one linear output per entry of ACTION_SPACE
action_values = Dense(len(ACTION_SPACE))(features)

actor_model = Model(frame_input, action_values)
actor_model.compile(optimizer=Adam(), loss='mse')

In [6]:
#? build the agent
class Agent:
    """Curiosity-driven agent built from three Keras-style models.

    * ``target``    -- maps an observation to an embedding; it is only ever
      used through ``predict`` here (never fitted).
    * ``predictor`` -- fitted to reproduce the target's embedding; its squared
      error on an observation is used as that observation's novelty.
    * ``actor``     -- maps an observation to one value per action and is
      fitted on novelty-based targets.

    Args:
        target_model, predictor_model, actor_model: objects exposing
            ``predict`` (and ``fit`` for the predictor and actor).
        batch_size: batch size passed to both ``fit`` calls.
        Y: blend weight between the immediate novelty and the best predicted
            value of the next observation (see ``train``).
        epsilon: probability threshold for taking a random action; values
            >= 1 mean every action is random.
        epsilon_decay: factor applied to epsilon after each actor update.
        epsilon_min: lower bound for epsilon.
        action_space: number of discrete actions. ``None`` (the default)
            resolves to ``len(ACTION_SPACE)`` when the agent is constructed,
            so the value follows the current ACTION_SPACE instead of the one
            that existed when this class was defined.
        min_memory_size: the actor is not trained until the memory holds at
            least this many experiences.
        memory_length: maximum number of experiences kept (oldest dropped first).
    """

    def __init__(self, target_model, predictor_model, actor_model, batch_size=32, Y=0.9, epsilon=2, epsilon_decay=0.998, epsilon_min=0.2, action_space=None, min_memory_size=1000, memory_length=100000):

        # the target model acts as a hash function, creating a unique, unknown output for any observation
        self.target = target_model
        # the predictor tries to match the output of the target.  This means that its error can act as a function for how 'familiar' an observation is
        self.predictor = predictor_model

        # model that actually selects what action to do
        self.actor = actor_model

        self.batch_size = batch_size
        self.Y = Y
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # resolve the default lazily so it is not frozen at class-definition time
        self.action_space = len(ACTION_SPACE) if action_space is None else action_space
        self.min_memory_size = min_memory_size
        self.memory = deque(maxlen=memory_length)

    def add_experience(self, observation, action, new_observation):
        """Append one (observation, action, new_observation) transition to the memory."""
        self.memory.append([
            observation,
            action,
            new_observation
        ])

    def act(self, obs):
        """Choose an action for ``obs`` with an epsilon-greedy rule.

        Returns:
            A 3-tuple ``(predicted, action, extra)``:
            * random action:    ``(0, action, self.action_space - 1)``
            * predicted action: ``(1, action, prediction)`` where ``prediction``
              is the raw actor output for the single-observation batch.
            ``action`` is always an index in ``[0, action_space)``.
        """
        # randomly explore or take predicted action
        if random.uniform(0, 1) < self.epsilon:
            return (0, random.randint(0, self.action_space - 1), self.action_space - 1)

        prediction = self.actor.predict(np.asarray([obs]))

        # argmax already is the 0-based action index; subtracting 1 would shift
        # every choice and turn index 0 into the invalid action -1
        return (1, int(prediction.argmax()), prediction)

    def train(self):
        """Fit the predictor (and, once the memory is large enough, the actor) on the whole memory.

        Returns:
            The mean novelty (predictor squared error, measured before this
            call's predictor update) over all stored observations.

        Raises:
            ValueError: if the memory is empty.
        """
        if not self.memory:
            raise ValueError("cannot train on an empty memory")

        # extract the columns of the memory in a single pass
        observations, actions, next_observations = (
            np.array(column) for column in zip(*self.memory)
        )

        # calculate the novelty of each observation
        y_true = self.target.predict(observations)
        y_pred = self.predictor.predict(observations)

        novelty = np.mean(np.square(y_true - y_pred), axis=1)
        mean_novelty = np.mean(novelty)

        # train the predictor
        self.predictor.fit(observations, y_true, batch_size=self.batch_size, verbose=0)

        # don't train the actor unless there is sufficient memory
        if len(self.memory) < self.min_memory_size:
            return mean_novelty

        # calculate the new q values for the actor
        current_qs = self.actor.predict(observations)

        # best predicted value of each transition's own next observation;
        # a global .max() would give every transition the same bootstrap value
        max_future_reward = self.actor.predict(next_observations).max(axis=1)

        new_qs = (1 - self.Y) * novelty + self.Y * max_future_reward

        # overwrite only the q value of the action actually taken in each transition
        current_qs[np.arange(len(actions)), actions] = new_qs

        # train the actor
        self.actor.fit(observations, current_qs, batch_size=self.batch_size, verbose=0)

        # decay the epsilon value, never going below epsilon_min
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

        return mean_novelty

In [7]:
#? begin the training process
agent = Agent(target_model, predictor_model, actor_model, min_memory_size=10000, memory_length=20000, epsilon=1.2, epsilon_decay=0.95)

# cadence, in environment steps counted across all episodes
RENDER_EVERY = 20    # draw a frame every N steps
TRAIN_EVERY = 2000   # run agent.train() every N steps

j = 0  # total number of environment steps taken so far
for i in range(1, EPOCHS+1):
    obs = env.reset()
    done = False
    t = datetime.datetime.now()

    # play one episode, driven by the `done` flag returned from env.step
    while not done:
        # take an action
        action = agent.act(obs)
        try:
            new_obs, reward, done, info = env.step(action[1])
        except Exception as err:
            # abandon this episode but keep the run alive, and say why;
            # np.shape also copes with the plain int returned for random actions
            print(f'env.step({action[1]!r}) failed: {err!r}')
            print(obs.shape)
            print(np.shape(action[2]))
            break

        if j % RENDER_EVERY == 0:
            env.render()

        # add the experience to the agent's memory
        agent.add_experience(obs, action[1], new_obs)

        obs = new_obs

        j += 1
        if j % TRAIN_EVERY == 0:
            # train the agent on its past experiences
            mean_novelty = agent.train()

            # seconds since the episode started or since the previous training report
            elapsed = int((datetime.datetime.now() - t).total_seconds())
            actor_training = len(agent.memory) >= agent.min_memory_size
            # epsilon truncated (not rounded) to two decimals for display
            eps = int(agent.epsilon * 100) / 100

            print(f'Training | Epoch time: {elapsed}s | Actor: {actor_training} | Epsilon: {eps} | Mean Novelty: {int(mean_novelty)}')
            t = datetime.datetime.now()

Training | Epoch time: 7s | Actor: False | Epsilon: 1.2 | Mean Novelty: 1586
Training | Epoch time: 7s | Actor: False | Epsilon: 1.2 | Mean Novelty: 600
Training | Epoch time: 9s | Actor: False | Epsilon: 1.2 | Mean Novelty: 563
Training | Epoch time: 11s | Actor: False | Epsilon: 1.2 | Mean Novelty: 603
Training | Epoch time: 23s | Actor: True | Epsilon: 1.13 | Mean Novelty: 657
Training | Epoch time: 25s | Actor: True | Epsilon: 1.08 | Mean Novelty: 640
Training | Epoch time: 29s | Actor: True | Epsilon: 1.02 | Mean Novelty: 600
Training | Epoch time: 33s | Actor: True | Epsilon: 0.97 | Mean Novelty: 591
Training | Epoch time: 40s | Actor: True | Epsilon: 0.92 | Mean Novelty: 549
Training | Epoch time: 45s | Actor: True | Epsilon: 0.88 | Mean Novelty: 593
Training | Epoch time: 49s | Actor: True | Epsilon: 0.83 | Mean Novelty: 569
Training | Epoch time: 52s | Actor: True | Epsilon: 0.79 | Mean Novelty: 561
Training | Epoch time: 56s | Actor: True | Epsilon: 0.75 | Mean Novelty: 556
T