In [8]:
import tensorflow as tf
from tensorflow import keras
from Maze import Maze
from time import time
import numpy as np

In [9]:
optimizer = keras.optimizers.Adam(learning_rate=0.01)
huber_loss = keras.losses.Huber()
action_probs_history = []
critic_value_history = []
rewards_history = []

In [10]:
def train_reinforcement_network(n_neurons, maze_height=5, maze_width=10, gamma=0.3, eps=np.finfo(np.float32).eps.item()):
    """
    Training algorithm for the maze
    :param gamma: The discount value for past rewards
    :param eps: The epsilon value (small number!0)
    :param n_neurons: The number of neurons for the common layer
    :param maze_height: The starting maze height (increases by 1 each training run)
    :param maze_width: The starting maze width (increases by 1 each training run)
    :return:
    """
    if maze_height == 'random' and maze_width == 'random':
        import random
        maze_height = random.randint(5,30)
        maze_width = random.randint(5,30)

    inputs = keras.layers.Input(shape=(Maze.N_INPUTS,))

    common = keras.layers.Dense(n_neurons, activation="relu")(inputs)
    action = keras.layers.Dense(Maze.N_INPUTS, activation="softmax")(common)
    critic = keras.layers.Dense(1)(common)

    # The full RL model with the action layer and the critic
    model = keras.Model(inputs=inputs, outputs=[action, critic])

    running_reward = 0


    mazes_complete = 0
    while True:
        # Reward for the current run
        episode_reward = 0

        # Create a maze and display it!
        my_maze = Maze(maze_height, maze_width)
        my_maze.create_maze()
        my_maze.print_maze()

        state: list

        maze_height += 1
        maze_width += 1

        # Loop through the steps of one maze
        start_time = time()
        iteration = 0
        prog_bar = keras.utils.Progbar(1)
        while not my_maze.is_maze_complete():
            iteration += 1
            # print(f"Iteration: {iteration}", my_maze.player.r, my_maze.player.c)
            with tf.GradientTape() as tape:
                state = tf.convert_to_tensor([int(x) for x in my_maze.get_maze_options()])
                state = tf.expand_dims(state, 0)

                action_probs, critic_value = model(state)
                critic_value_history.append(critic_value[0, 0])


                p = np.squeeze(action_probs)

                action = np.random.choice(Maze.N_INPUTS, p=p)
                action_probs_history.append(tf.math.log(action_probs[0, action]))

                reward = my_maze.take_action(action)
                prog_bar.update(reward)
                rewards_history.append(reward)
                episode_reward += reward


                running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward

                # Calculate expected value from rewards
                returns = []
                discounted_sum = 0
                for r in rewards_history[::-1]:
                    discounted_sum = r + gamma * discounted_sum
                    returns.insert(0, discounted_sum)

                # Normalize
                returns = np.array(returns)
                returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
                returns = returns.tolist()

                # Calculating loss values to update our network
                history = zip(action_probs_history, critic_value_history, returns)
                actor_losses = []
                critic_losses = []
                for log_prob, value, ret in history:
                    diff = ret - value
                    actor_losses.append(-log_prob * diff)  # actor loss

                    critic_losses.append(huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

                # Backpropagation
                loss_value = sum(actor_losses) + sum(critic_losses)
                grads = tape.gradient(loss_value, model.trainable_variables)
                optimizer.apply_gradients(zip(grads, model.trainable_variables))
        print(f'Training time: {time()-start_time}')
        mazes_complete += 1


In [None]:
train_reinforcement_network(n_neurons=5, maze_height=4, maze_width=4)


█ ▢ █ █ 
█     █ 
█   █ █ 
█ △ █ █ 




█ ▢ █ █ █ 
█   █   █ 
█   █   █ 
█       █ 
█ █ █ △ █ 




█ ▢ █ █ █ █ 
█         █ 
█ █   █ █ █ 
█         █ 
█ █   █   █ 
█ █ █ █ △ █ 




█ ▢ █ █ █ █ █ 
█       █   █ 
█   █ █ █   █ 
█     █ █   █ 
█   █ █ █   █ 
█           █ 
█ █ █ █ █ △ █ 



0/1 [..............................] - ETA: 27:41