In [1]:
import os
import random

import numpy as np
import tensorflow as tf
from tqdm import tqdm

from dqnAgent import DQN
from mazegen import MazeEnv


# Convolutional Q-network: three Conv2D + MaxPooling2D stages, then a dense
# head. The input shape is declared with an explicit Input object instead of
# the `input_shape=` layer argument, which Keras 3 warns against for
# Sequential models; the resulting architecture is unchanged.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(36, 36, 3)),
    tf.keras.layers.Conv2D(16, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(64, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    # Linear output layer with 4 units; the training loop takes the argmax of
    # the agent's output as the action, so this must match the action count.
    tf.keras.layers.Dense(4, activation='linear'),
])

# Hand the network to the project's DQN wrapper.
agent = DQN(model)






In [2]:
# Number of discrete actions the agent can choose from.
N_ACTION = 4

# Environment used for training (constructed without a render mode).
env = MazeEnv(size=7)

# Separate environment of the same size for evaluation, created with
# render_mode="human".
test_env = MazeEnv(size=7, render_mode="human")

In [3]:
# --- Exploration (epsilon-greedy) ---
# Probability of taking a random action; starts fully random.
epsilon = 1
# Factor epsilon is multiplied by after every episode.
EPSILON_DECAY = 0.99
# Lower bound epsilon is never decayed below.
MIN_EPSILON = 0.001

# --- Visualization ---
# Render the training environment after every step.
PREVIEW_TRAIN = False
# Passed as `render` to the periodic policy evaluation.
PREVIEW_EVAL = True

# --- Episodes ---
# Total number of training episodes.
N_EPISODES = 20_000
# Aggregate reward statistics and evaluate the policy every this many episodes.
AGGREGATE_STATS_EVERY = 100
# Checkpoint to load before training; an empty string disables loading.
LOAD_PATH = "best_model_maze.keras"

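Further reading (Advantage Actor-Critic / A2C, a different algorithm from the DQN agent trained in this notebook): https://medium.com/data-science-in-your-pocket/advantage-actor-critic-a2c-algorithm-in-reinforcement-learning-with-codes-and-examples-using-e810273c0c9e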

In [None]:
# Warm-start from a saved checkpoint when one is configured and present.
# An empty LOAD_PATH means "do not load". A configured but missing path is
# reported and skipped, so a first run (before any checkpoint has been saved)
# does not abort here.
if LOAD_PATH:
    if os.path.exists(LOAD_PATH):
        agent.load(LOAD_PATH)
    else:
        print(f"Checkpoint '{LOAD_PATH}' not found; keeping the agent's current weights.")

# Rewards of the episodes played since the last statistics aggregation.
current_ep_rewards = []
# One [average, min, max] reward entry per aggregation.
ep_rewards = []


In [22]:
# Reset exploration to fully random before (re)running the training loop:
# with epsilon at 1 the greedy branch (random.random() > epsilon) is never taken.
epsilon = 1

In [23]:
# Main training loop: play N_EPISODES episodes with an epsilon-greedy policy,
# feeding every transition to the agent and training it after each step.
for episode in tqdm(range(N_EPISODES), ascii=True, unit='episodes'):

    # Accumulated reward of the current episode
    episode_reward = 0

    # Reset Environment
    current_state, _ = env.reset()

    # Iterate until the episode ends: either the environment reports a terminal
    # state (done) or it cuts the episode short (truncated). Stepping an
    # environment after it has signalled truncation would keep the loop running
    # on a finished episode.
    done = False
    truncated = False
    while not (done or truncated):

        # Epsilon-greedy policy: draw a random number and compare it to epsilon
        if random.random() > epsilon:
            # Greedy action: index of the largest value returned by the agent
            # for the current state (batched as a single-element array)
            action = int(np.argmax(agent.get_qtable(np.array([current_state]))))
        else:
            # Random action in [0, N_ACTION)
            action = random.randrange(N_ACTION)

        # Perform the step and get data
        new_state, reward, done, truncated, info = env.step(action)

        # Sum the reward
        episode_reward += reward

        # Show preview if conditions are met
        if PREVIEW_TRAIN:
            env.render()

        # Every step we update replay memory and train.
        # NOTE(review): only the terminal flag `done` is stored and passed to
        # train(), exactly as before; confirm whether DQN.train should also be
        # told about truncated episode ends.
        agent.append_replay_memory((current_state, action, new_state, reward, done))
        agent.train(done)

        current_state = new_state

    current_ep_rewards.append(episode_reward)

    if episode % AGGREGATE_STATS_EVERY == 0:
        print("AGGREGATING EPISODE: ", episode) 
        # Average, minimum and maximum reward over the episodes played since
        # the previous aggregation (a single episode for episode 0)
        average_reward = sum(current_ep_rewards) / len(current_ep_rewards)
        min_reward = min(current_ep_rewards)
        max_reward = max(current_ep_rewards)

        ep_rewards.append([average_reward, min_reward, max_reward])
        current_ep_rewards = []

        # Evaluation
        print("Reward: ", agent.evaluate_policy(test_env, render=PREVIEW_EVAL))

    # Decay epsilon once per episode, never going below MIN_EPSILON
    if epsilon > MIN_EPSILON:
        epsilon = max(MIN_EPSILON, epsilon * EPSILON_DECAY)

  0%|          | 0/20000 [00:00<?, ?episodes/s]

AGGREGATING EPISODE:  0


  0%|          | 1/20000 [00:01<8:10:30,  1.47s/episodes]

Reward:  -1.0500000000000003


  0%|          | 100/20000 [02:19<12:02:06,  2.18s/episodes]

AGGREGATING EPISODE:  100


  1%|          | 101/20000 [02:21<12:40:37,  2.29s/episodes]

Reward:  -1.0


  1%|1         | 200/20000 [06:13<12:35:46,  2.29s/episodes]

AGGREGATING EPISODE:  200


  1%|1         | 201/20000 [06:17<14:30:05,  2.64s/episodes]

Reward:  -1.0


  1%|1         | 224/20000 [07:21<10:49:52,  1.97s/episodes]


KeyboardInterrupt: 

In [24]:
# Save the agent under the same file name that LOAD_PATH is configured with,
# so the checkpoint-loading cell can pick it up on a later run.
agent.save("best_model_maze.keras")

In [20]:
# Evaluate the current agent on test_env (the environment created with
# render_mode="human"). As the cell's last expression, the returned value is
# shown as the cell output.
agent.evaluate_policy(test_env)

-1.1