# ICA - CIS4049-N - Artificial Intelligence Foundations
## Video Game AI Solution Experiment - Q Learning, Deep Reinforcement Learning & A* Search
*Student Name - Thomas Popham*  
*Student Number - B1662096*  
*Student Email - b1662096@live.tees.ac.uk*   
*Module Leader - Dr. Alessandro Di Stefano*

## 1. Install Required Python Libaries

In [1]:
# Make sure that all necessary Python packages are installed correctly through pip install when testing through Google Colab 
import sys
!{sys.executable} -m pip install numpy scipy pandas seaborn pygame stable-baselines3 networkx plotly pydot
# If pip fails to install the necessary packages, try installing the packages through Anaconda on the local machine
# %conda install --yes --prefix {sys.prefix} numpy pandas seaborn pygame stable-baselines3 networkx plotly pydot

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Downloading pygame-2.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.8/21.8 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting stable-baselines3
  Downloading stable_baselines3-1.7.0-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.8/171.8 KB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
Collecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m64.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting importlib-metadata~=4.13
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Crea

### 1.1 Import Required Python Libraries

In [4]:
%matplotlib inline
# Core Python Libraries
import os
import sys
import math
import random
import gzip
from queue import PriorityQueue
from timeit import timeit

# Anaconda/Pip Python Libraries
from IPython import display
import numpy as np
import pandas as pd
import scipy as sci
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import seaborn as sns

# Reinforcement Learning Python Libraries
import gym as gym
import pygame
import stable_baselines3 as sb3
from stable_baselines3.a2c.a2c import A2C
from stable_baselines3.ppo.ppo import PPO
from stable_baselines3.dqn.dqn import DQN
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.vec_env.dummy_vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Informed Search Python Libraries
import networkx as nx
from networkx.drawing.nx_pydot import graphviz_layout

# # Google Drive folder path access
# from google.colab import drive

# # Access the folder in Google Drive to save training logs
# drive.mount('/content/drive/')

## 2. Reinforcement Learning/Q Learning Algorithm - Evaluation

In [3]:
# Initialise the environments with the necessary environment parameters set

# Frozen Lake Environment - 4x4
frozen_maze_env = gym.make('FrozenLake-v1', map_name="4x4", is_slippery=True)

# Cliff Walking Environment
cliff_walking_env = gym.make('CliffWalking-v0')

# Taxi Environment
taxi_env = gym.make('Taxi-v3')

### 2.1 Q Learning Evaluation with Epsilon Greedy Method

In [None]:
# Start training based on the environment's action sample and the necessary hyperparameters
def trainWithRandomActions(env, num_obs, num_steps, learning_rate, gamma, max_epsilon, min_epsilon, prob_decay):

  # Check the actions, reward range and observation space for the environment that is passed
  print("Number of possible actions to take in environment: {}".format(env.action_space.n))
  print("Number of possible states in environment: {}".format(env.observation_space.n))
  print("Action space: {}".format(env.action_space))
  print("Observation space: {}".format(env.observation_space))
  print("Observation space shape: {}".format(env.observation_space.shape))

  # Create a Q table to track the number of states and actions that are required by the model based on the environment
  q_table = np.zeros((env.observation_space.n, env.action_space.n))

  total_obs_rewards = []

  epsilon = max_epsilon

  for obs in range(num_obs):
    # Reset the environment before stepping through it for every observation
    init_obs = env.reset()
    # Define the rewards list that gains the rewards from the steps made
    obs_rewards = []
    
    # Step through the environment and retrieve observations from the environment
    for step in range(num_steps):

        # Visualise the environment as a pop-up window
        env.render()

        # Randomly generate a number from 0 to 1
        random_int_value = random.uniform(0,1)

        # If the random number is greater than the epsilon value
        if random_int_value < epsilon:
          # Create a random action to pass when making each step
          action = env.action_space.sample()
        else:
          # Get the action on the highest value based on the state
          action = np.argmax(q_table[init_obs, :])
      
        # Check the action sample gained from the epsilon greedy method
        print("Action Sample: {}".format(action))

        # For every step through the environment, retrieve the observation, reward, if its done and environment info
        new_obs, reward, done, info = env.step(action)
        # Check the contents of the steps taken
        print("Observations: {}".format(new_obs))
        print("Observations Data Type: {}".format(type(new_obs)))
        print("Reward: {}".format(reward))
        print("If done: {}".format(done))
        print("Environment info: {}".format(info))

        # Check the probability of the state transitioning to another state
        print("Transition Probability based on the environment {}".format(env.P[new_obs][action]))

        # Update the Q table with the action and observation (state) values
        q_table[init_obs][action] = q_table[init_obs][action] + learning_rate * (reward + gamma * np.max(q_table[new_obs, :]) - q_table[init_obs][action])

        # Add the reward based on the environment and its state
        obs_rewards.append(reward)

        # Make sure that the state is always updated when stepping through the envinroment
        obs = new_obs

        # Check the contents of the q table based on the random actions based on the environment's action space
        with np.printoptions(precision=5, suppress=True):
          print("Q Table After Random Sampling: {}".format(q_table))

        if done:
          total_obs_rewards.append(obs_rewards)
          # Reduce the value of epsilon, to ensure that the agent explores a good amount of state space
          epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-prob_decay*obs)
          break

  # Close the environment
  env.close()
  
  # Return the q_table
  return q_table


In [None]:
# Start training based on the environment's action sample and the necessary hyperparameters
def evaluateTrainedQTable(env, num_obs, num_steps, q_table):

  # Check the actions, reward range and observation space for the environment that is passed
  print("Number of possible actions to take in environment: {}".format(env.action_space.n))
  print("Number of possible states in environment: {}".format(env.observation_space.n))
  print("Action space: {}".format(env.action_space))
  print("Observation space: {}".format(env.observation_space))
  print("Observation space shape: {}".format(env.observation_space.shape))

  total_obs_rewards = []

  for obs in range(num_obs):
    # Reset the environment before stepping through it for every observation
    init_obs = env.reset()
    # Define the rewards list that gains the rewards from the steps made
    obs_rewards = 0
    
    # Step through the environment and retrieve observations from the environment
    # Whether the agent fails to reach the goal in each observation and returns done
    for step in range(num_steps):
        # Visualise the environment as a pop-up window
        env.render()
        # Get an action from the greedy method
        action = np.argmax(q_table[init_obs, :])
        # Check the action sample gained from the epsilon greedy method
        print("Action Sample: {}".format(action))
        # For every step through the environment, retrieve the observation, reward, if its done and environment info
        new_obs, reward, done, info = env.step(action)
        # Check the contents of the steps taken
        print("Observations: {}".format(new_obs))
        print("Observations Data Type: {}".format(type(new_obs)))
        print("Reward: {}".format(reward))
        print("If done: {}".format(done))
        print("Environment info: {}".format(info))

        # Check the probability of the state transitioning to another state
        print("Transition Probability based on the environment {}".format(env.P[init_obs][action]))

        # Add the reward based on the environment and its state
        obs_rewards += reward

        if done:
          # Accumulate all the rewards from the observation to the overall total
          total_obs_rewards.append(obs_rewards)
          if reward == 1:
            print("Goal reached")
          else:
            print("Failed to reach the goal")

  # Close the environment
  env.close()

  # Overall success rate
  print('Overall Success rate: {0:.2f} %'.format(100*np.sum(total_obs_rewards)/len(total_obs_rewards)))

  # Average number of steps to take when reaching the goal
  print('Average number of steps taken to reach the goal: {0:.2f}'.format(np.mean(num_steps)))

  # Get the overall mean and std of all the gained rewards from each observation
  mean_reward = np.mean(total_obs_rewards)
  std_reward = np.std(total_obs_rewards)
  
  # Return the mean_reward and the std reward
  return mean_reward, std_reward


In [81]:
# Set the inital learning rate and gamma for the model and custom evaluation method
learning_rate = 0.8
num_obs = 1000
num_steps = 124
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.01
prob_decay = 0.001

# Train the environment with specific number of observations
trained_Q_table = trainWithRandomActions(frozen_maze_env, num_obs, num_steps, learning_rate, gamma, max_epsilon, min_epsilon, prob_decay)
# Print mean reward from the custom training method
print("Trained Q Table from random action sample - Frozen Lake:- {}".format(trained_Q_table))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  (Up)
S[41mF[0mFF
FHFH
FFFH
HFFG
Action Sample: 3
Observations: 0
Observations Data Type: <class 'int'>
Reward: 0.0
If done: False
Environment info: {'prob': 0.3333333333333333}
Transition Probability based on the environment [(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False)]
Q Table After Random Sampling: [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
  (Up)
[41mS[0mFFF
FHFH
FFFH
HFFG
Action Sample: 2
Observations: 4
Observations Data Type: <class 'int'>
Reward: 0.0
If done: False
Environment info: {'prob': 0.3333333333333333}
Transition Probability based on the environment [(0.3333333333333333, 8, 0.0, False), (0.3333333333333333, 5, 0.0, True), (0.3333333333333333

In [None]:
# Evaluate the environment with specific number of observations
Q_frozen_lake_mean_reward, std_reward = evaluateTrainedQTable(frozen_maze_env, 1000, num_steps, trained_Q_table)
# Print mean reward from the custom training method
print("Mean Reward from random action sample - Frozen Lake:- {}".format(Q_frozen_lake_mean_reward))
print("\n")
# Print std reward from the custom training method
print("Std Reward from random action sample - Frozen Lake:- {}".format(std_reward))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
F[41mH[0mFH
FFFH
HFFG
Action Sample: 1
Observations: 5
Observations Data Type: <class 'int'>
Reward: 0
If done: True
Environment info: {'prob': 1.0}
Transition Probability based on the environment [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False)]
Failed to reach the goal
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Action Sample: 1
Observations: 5
Observations Data Type: <class 'int'>
Reward: 0
If done: True
Environment info: {'prob': 1.0}
Transition Probability based on the environment [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 4, 0.0, False), (0.3333333333333333, 1, 0.0, False)]
Failed to reach the goal
  (Down)
SFFF
F[41mH[0mFH
FFFH
HFFG
Action Sample: 1
Observations: 5
Observations Data Type: <class 'int'>
Reward: 0
If done: True
Environment info: {'prob': 1.0}
Transition Probability based on the environment [(0.3333333333333333, 0, 0.0, False)

In [82]:
# Set the inital learning rate and gamma for the model and custom training method
learning_rate = 0.8
num_obs = 1000
num_steps = 100
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.01
prob_decay = 0.001

# Train the environment with specific number of observations
trained_Q_table = trainWithRandomActions(cliff_walking_env, num_obs, num_steps, learning_rate, gamma, max_epsilon, min_epsilon, prob_decay)
print("Q Table After Training {}".format(trained_Q_table))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [ -1.       -1.00005 -18.51962  -1.00006]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]
 [  0.        0.        0.        0.     ]]
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  x  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

Action Sample: 0
Observations: 15
Observations Data Type: <class 'int'>
Reward: -1
If done: False
Environ

KeyboardInterrupt: ignored

In [None]:
Q_cliff_walking_mean_reward, std_reward = evaluateTrainedQTable(cliff_walking_env, 1000, num_steps, learning_rate, trained_Q_table)

# Print mean reward from the custom evaluation method
print("Mean Reward from random action sample - Cliff Walking:- {}".format(Q_cliff_walking_mean_reward))
print("\n")
# Print std reward from the custom evaluation method
print("Std Reward from random action sample - Cliff Walking:- {}".format(std_reward))

In [None]:
# Set the inital learning rate and gamma for the model and custom training method
learning_rate = 0.8
num_obs = 1000
num_steps = 100
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.01
prob_decay = 0.001

# Train the environment with specific number of observations
trained_Q_table = trainWithRandomActions(taxi_env, num_obs, num_steps, learning_rate, gamma, max_epsilon, min_epsilon, prob_decay)
print("Q Table After Training {}".format(trained_Q_table))

In [None]:
Q_taxi_mean_reward, std_reward = evaluateTrainedQTable(taxi_env, num_obs, 1000, learning_rate, trained_Q_table)

# Print mean reward from the custom evaluation method
print("Mean Reward from random action sample - Taxi:- {}".format(Q_taxi_mean_reward))
print("\n")
# Print std reward from the custom evaluation method
print("Std Reward from random action sample - Taxi:- {}".format(std_reward))

In [None]:
# Visualise the overall results from each of the Q Learning rewards gained on each of the 3 environments
sns.set_theme()
Q_Learning_Environment_Labels = ['Frozen Lake', 'Cliff Walking', 'Taxi']
x_labels = np.arange(len(Q_Learning_Environment_Labels))
width = 0.35
fig, ax = plt.subplots()
Q_learning_rect1 = ax.bar(x_labels - width/3, Q_frozen_lake_mean_reward, width, label='Q Learning Mean Score - Frozen Lake')
Q_learning_rect2 = ax.bar(x_labels - width/3, Q_cliff_walking_mean_reward, width, label='Q Learning Mean Score - Cliff Walking')
Q_learning_rect3 = ax.bar(x_labels - width/3, Q_taxi_mean_reward, width, label='Q Learning Mean Score - Taxi')

ax.set_ylabel('Mean Scores')
ax.set_title('Q Learning - Overall Mean Scores')
ax.set_xticks(x_labels, Q_Learning_Environment_Labels)
ax.legend()

ax.bar_label(Q_learning_rect1, padding=3)
ax.bar_label(Q_learning_rect2, padding=3)
ax.bar_label(Q_learning_rect3, padding=3)

fig.tight_layout()

plt.show()

### 2.2 Stable Baselines RL Model Evaluation Method

In [None]:
# Set a reward-based callback method for every instance of training an algorithm
# When the evaluation reaches its maximum reward, the algorithm (for example A2C) should stop training
reward_stop_callback = StopTrainingOnRewardThreshold(reward_threshold=100, verbose=1)

In [None]:
# Create callback helper methods from Stable Baselines 3 to properly evaluate each environment with each model under specific conditions
frozen_maze_eval_callback = EvalCallback(frozen_maze_env, deterministic=False, best_model_save_path=saved_models_path, log_path=training_log_path, n_eval_episodes=10000)

cliff_walking_eval_callback = EvalCallback(cliff_walking_env, deterministic=False, best_model_save_path=saved_models_path, log_path=training_log_path, n_eval_episodes=10000)

taxi_eval_callback = EvalCallback(taxi_env, deterministic=False, best_model_save_path=saved_models_path, log_path=training_log_path, n_eval_episodes=10000)

### 2.1.1 PPO Model Evaluation - Frozen Maze Environment

In [84]:
# Vectorize the environment in order to work with Stable Baselines during training
vec_frozen_maze_env = DummyVecEnv([lambda:frozen_maze_env])

# Set the inital learning rate and gamma for the model and custom evaluation method
learning_rate = 2.5e-4
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.05
prob_decay = 0.0005

# Perform the same training and evaluation with the PPO model
ppo_model_frozen_maze = PPO("MlpPolicy", vec_frozen_maze_env, verbose=1, learning_rate=learning_rate, gamma=gamma, batch_size=128, clip_range=0.3, n_steps=128, max_grad_norm=0.9, vf_coef=0.045, ent_coef=1e-7)

Using cpu device


In [85]:
# Apply the untrained model with the environment over a certain number of observations
mean_reward, std_reward = evaluate_policy(model=ppo_model_frozen_maze, env=vec_frozen_maze_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", sum(mean_reward), "Number of observations made: ", num_obs)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", num_obs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
HFFG
  (Right)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Up)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Right)
SFF[41mF[0m
FHFH
FFFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Right)
SFFF
FHFH
F[41mF[0mFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Right)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Right)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Up)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Right)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Up)
SFFF
FH[41mF[0mH
FFFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG
  (Up)
S

In [86]:
# Train the model with the specified number of timesteps
trained_ppo_model_frozen_maze = ppo_model_frozen_maze.learn(total_timesteps=30000, callback=frozen_maze_eval_callback)

-----------------------------
| time/              |      |
|    fps             | 1383 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 128  |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1072         |
|    iterations           | 2            |
|    time_elapsed         | 0            |
|    total_timesteps      | 256          |
| train/                  |              |
|    approx_kl            | 4.807813e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.3          |
|    entropy_loss         | -1.39        |
|    explained_variance   | -1.94        |
|    learning_rate        | 0.00025      |
|    loss                 | -0.00308     |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00174     |
|    value_loss           | 0.0147       |
------------------------------------------
----------------

In [89]:
# Apply the trained model with the environment and the evaluation callback method
ppo_mean_reward_frozen_maze, std_reward = evaluate_policy(model=trained_ppo_model_frozen_maze, env=vec_frozen_maze_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", sum(ppo_mean_reward_frozen_maze), "Number of observations made: ", 10000)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", 10000)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
SFFF
[41mF[0mHFH
FFFH
HFFG
  (Down)
SFFF
FHFH
[41mF[0mFFH
HFFG
  (Down)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG

[41mS[0mFFF
FHFH
FFFH
HFFG
  (Down)
S[41mF[0mFF
FHFH
FFFH
HFFG
  (Down)
SF[41mF[0mF
FHFH
FFFH
HFFG
  (Down)
SFFF
FH[41mF[0mH
FFFH
HFFG
  (Left)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FF[41mF[0mH
HFFG
  (Down)
SFFF
FHFH
F[41mF[0mFH
HFFG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
HF[41mF[0mG
  (Down)
SFFF
FHFH
FFFH
H[41mF[0mFG
  (D

### 2.1.2 PPO Model Evaluation - Cliff Walking Environment

In [91]:
# Vectorize the environment in order to work with Stable Baselines during training
vec_cliff_walking_env = DummyVecEnv([lambda:cliff_walking_env])

# Set the inital learning rate and gamma for the model and custom evaluation method
learning_rate = 2.5e-4
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.05
prob_decay = 0.0005

# Perform the same training and evaluation with the PPO model
ppo_model_cliff_walking = PPO("MlpPolicy", vec_cliff_walking_env, verbose=1, learning_rate=learning_rate, gamma=gamma, batch_size=128, clip_range=0.3, n_steps=128, max_grad_norm=0.9, vf_coef=0.045, ent_coef=0.04)

Using cpu device


In [92]:
mean_reward, std_reward = evaluate_policy(model=ppo_model_cliff_walking, env=vec_cliff_walking_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", sum(mean_reward), "Number of observations made: ", 10000)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", 10000)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  x  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  x  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  x  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  x  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  x  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  x  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  o  x  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o

KeyboardInterrupt: ignored

In [93]:
# Train the model with the specified number of timesteps
trained_ppo_model_cliff_walking = ppo_model_cliff_walking.learn(total_timesteps=30000, callback=cliff_walking_eval_callback)

-----------------------------
| time/              |      |
|    fps             | 1390 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 128  |
-----------------------------
-------------------------------------------
| time/                   |               |
|    fps                  | 1175          |
|    iterations           | 2             |
|    time_elapsed         | 0             |
|    total_timesteps      | 256           |
| train/                  |               |
|    approx_kl            | 0.00011416804 |
|    clip_fraction        | 0             |
|    clip_range           | 0.3           |
|    entropy_loss         | -1.39         |
|    explained_variance   | -8.37e-05     |
|    learning_rate        | 0.00025       |
|    loss                 | 996           |
|    n_updates            | 10            |
|    policy_gradient_loss | -0.00343      |
|    value_loss           | 2.21e+04      |
------------------------------------------

In [None]:
# Apply the trained model with the environment and the evaluation callback method
ppo_mean_reward_cliff_walking, std_reward = evaluate_policy(model=trained_ppo_model_cliff_walking, env=vec_cliff_walking_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", sum(ppo_mean_reward_cliff_walking), "Number of observations made: ", num_obs)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", num_obs)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
o  o  o  o  o  o  o  o  o  o  o  o
x  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  o  o  o  o  o  o  o
o  x  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  x  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  x  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  x  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  x  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  C  C  C  C  C  C  C  C  C  C  T

o  o  o  o  o  x  o  o  o  o  o  o
o  o  o  o  o  o  o  o  o  o  o  o
o  o  o  o  o  o  o

### 2.1.3 PPO Model Evaluation - Taxi Environment

In [None]:
# Vectorize the environment in order to work with Stable Baselines during training
vec_taxi_env = DummyVecEnv([lambda:taxi_env])

# Set the inital learning rate and gamma for the model and custom evaluation method
learning_rate = 2.5e-4
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.05
prob_decay = 0.0005

# Perform the same training and evaluation with the PPO model
ppo_model_taxi = PPO("MlpPolicy", vec_taxi_env, verbose=1, learning_rate=learning_rate, gamma=gamma, batch_size=128, clip_range=0.3, n_steps=128, max_grad_norm=0.9, create_eval_env=True, vf_coef=0.045, ent_coef=0.04))

Using cuda device


In [None]:
# Apply the untrained model with the environment over a certain number of observations
mean_reward, std_reward = evaluate_policy(model=ppo_model_taxi, env=vec_taxi_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", mean_reward, "Number of observations made: ", 10000)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", 10000)

In [None]:
# Train the model with the specified number of timesteps
trained_ppo_model_taxi = ppo_model_taxi.learn(total_timesteps=30000, callback=taxi_eval_callback)

-----------------------------
| time/              |      |
|    fps             | 891  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 673       |
|    iterations           | 2         |
|    time_elapsed         | 6         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 520062.62 |
|    clip_fraction        | 0.997     |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.0056   |
|    explained_variance   | -0.000317 |
|    learning_rate        | 2e+04     |
|    loss                 | 1.24e+10  |
|    n_updates            | 10        |
|    policy_gradient_loss | 0.747     |
|    value_loss           | 6.24e+10  |
---------------------------------------
--------------------------------------
| time/                   |    



Eval num_timesteps=10000, episode_reward=-1998.20 +/- 3.60
Episode length: 200.00 +/- 0.00
---------------------------------------
| eval/                   |           |
|    mean_ep_length       | 200       |
|    mean_reward          | -2e+03    |
| time/                   |           |
|    total_timesteps      | 10000     |
| train/                  |           |
|    approx_kl            | 0.0       |
|    clip_fraction        | 0         |
|    clip_range           | 0.2       |
|    entropy_loss         | 0         |
|    explained_variance   | 0         |
|    learning_rate        | 2e+04     |
|    loss                 | 1.22e+10  |
|    n_updates            | 40        |
|    policy_gradient_loss | -8.24e-09 |
|    value_loss           | 5.59e+10  |
---------------------------------------
New best mean reward!
------------------------------
| time/              |       |
|    fps             | 553   |
|    iterations      | 5     |
|    time_elapsed    | 18    |
|    total_t

KeyboardInterrupt: ignored

In [None]:
# Apply the trained model with the environment and the evaluation callback method
ppo_mean_reward_taxi, std_reward = evaluate_policy(model=trained_ppo_model_taxi, env=vec_taxi_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", sum(ppo_mean_reward_taxi), "Number of observations made: ", 10000)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", 10000)

### 2.2.1 DQN Model Evaluation - Frozen Maze Environment

In [None]:
# Vectorize the environment in order to work with Stable Baselines during training
vec_frozen_maze_env = DummyVecEnv([lambda:frozen_maze_env])

# Set the inital learning rate and gamma for the model and custom evaluation method
learning_rate = 2.5e-4
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.05
prob_decay = 0.0005

# Perform the same training and evaluation with the DQN model
dqn_model_frozen_maze = DQN("MlpPolicy", vec_frozen_maze_env, verbose=1, buffer_size=50000, learning_rate=learning_rate, batch_size=32, max_grad_norm=0.9)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 33.8     |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.936    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 8437     |
|    time_elapsed     | 0        |
|    total_timesteps  | 135      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 34.4     |
|    ep_rew_mean      | 0        |
|    exploration_rate | 0.869    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 7714     |
|    time_elapsed     | 0        |
|    total_timesteps  | 275      |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 33       |
|    ep_rew_mean      | 0      

In [None]:
# Apply the untrained model with the environment over a certain number of observations
mean_reward, std_reward = evaluate_policy(model=dqn_model_frozen_maze, env=vec_frozen_maze_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", mean_reward, "Number of observations made: ", 10000)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", 10000)

In [None]:
# Train the model with the specified number of timesteps
trained_dqn_model_frozen_maze = dqn_model_frozen_maze.learn(total_timesteps=30000, callback=frozen_maze_eval_callback)

In [None]:
# Apply the trained model with the environment over a certain number of observations
dqn_mean_reward_frozen_maze, std_reward = evaluate_policy(model=trained_dqn_model_frozen_maze, env=vec_frozen_maze_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", dqn_mean_reward_frozen_maze, "Number of observations made: ", 10000)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", 10000)

### 2.2.2 DQN Model Evaluation - Cliff Walking Environment

In [None]:
# Vectorize the environment in order to work with Stable Baselines during training
vec_cliff_walking_env = DummyVecEnv([lambda:cliff_walking_env])

# Set the inital learning rate and gamma for the model and custom evaluation method
learning_rate = 2.5e-4
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.05
prob_decay = 0.0005

# Perform the same training and evaluation with the DQN model
dqn_model_cliff_walking = DQN("MlpPolicy", vec_cliff_walking_env, verbose=1, buffer_size=50000, learning_rate=learning_rate, batch_size=32, max_grad_norm=0.9)reate_eval_env=True, vf_coef=0.045, ent_coef=0.04)

In [None]:
# Apply the untrained model with the environment over a certain number of observations
mean_reward, std_reward = evaluate_policy(model=dqn_model_cliff_walking, env=vec_cliff_walking_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", mean_reward, "Number of observations made: ", num_obs)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", num_obs)

In [None]:
# Train the model with the specified number of timesteps
trained_dqn_model_cliff_walking = dqn_model_cliff_walking.learn(total_timesteps=30000, callback=cliff_walking_eval_callback)

In [None]:
# Apply the trained model with the environment over a certain number of observations
dqn_mean_reward_cliff_walking, std_reward = evaluate_policy(model=trained_dqn_model_cliff_walking, env=vec_cliff_walking_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", dqn_mean_reward_cliff_walking, "Number of observations made: ", num_obs)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", num_obs)

### 2.2.3 DQN Model Evaluation - Taxi Environment

In [None]:
# Vectorize the environment in order to work with Stable Baselines during training
vec_taxi_env = DummyVecEnv([lambda:taxi_env])

# Set the inital learning rate and gamma for the model and custom evaluation method
learning_rate = 5e-4
gamma = 0.95
max_epsilon = 1.0
min_epsilon = 0.05
prob_decay = 0.0005

# Perform the same training and evaluation with the DQN model
dqn_model_taxi = DQN("MlpPolicy", vec_taxi_env, verbose=1, buffer_size=50000, learning_rate=learning_rate, batch_size=32, max_grad_norm=0.9)

In [None]:
# Apply the untrained model with the environment over a certain number of observations
mean_reward, std_reward = evaluate_policy(model=dqn_model_taxi, env=vec_taxi_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", mean_reward, "Number of observations made: ", num_obs)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", num_obs)

In [None]:
# Train the model with the specified number of timesteps
trained_dqn_model_taxi = dqn_model_taxi.learn(total_timesteps=30000, callback=taxi_eval_callback)

In [None]:
# Apply the trained model with the environment over a certain number of observations
dqn_mean_reward_taxi, std_reward = evaluate_policy(model=trained_dqn_model_taxi, env=vec_taxi_env, deterministic=False, warn=True, render=True, return_episode_rewards=True, n_eval_episodes=10000)

print("Mean Reward: ", dqn_mean_reward_taxi, "Number of observations made: ", num_obs)
print("\n")
print("Std Reward: ", std_reward, "Number of observations made: ", num_obs)

In [None]:
# Visualise the overall results from each of the Stable Baselines 3 models on each of the 3 environments
sns.set_theme()
SB3_Models = ['A2C', 'PPO', 'DQN']
ppo_mean_scores = [sum(ppo_mean_reward_frozen_maze), sum(ppo_mean_reward_cliff_walking), sum(ppo_mean_reward_taxi)]
dqn_mean_scores = [sum(dqn_mean_reward_frozen_maze), sum(dqn_mean_reward_cliff_walking), sum(dqn_mean_reward_taxi)]

x_labels = np.arange(len(SB3_Models))
width = 0.35
fig, ax = plt.subplots()
ppo_rect = ax.bar(x_labels - width/3, ppo_mean_scores, width, label='PPO Mean Scores')
dqn_rect = ax.bar(x_labels - width/3, dqn_mean_scores, width, label='DQN Mean Scores')

ax.set_ylabel('Mean Scores')
ax.set_title('Stable Baselines 3 Scores - After Training')
ax.set_xticks(x_labels, SB3_Models)
ax.legend()

ax.bar_label(ppo_rect, padding=3)
ax.bar_label(dqn_rect, padding=3)

fig.tight_layout()

plt.show()

## 3. A* Search Algorithm

In [1]:
# Visualise the graph with the graph and position nodes
def drawGraph(graph, pos_nodes, edge_cost):
        nx.draw_networkx_nodes(graph, pos_nodes)
        nx.draw_networkx_edges(graph, pos_nodes)
        nx.draw_networkx_labels(graph, pos_nodes)
        nx.draw_networkx_edge_labels(graph, pos_nodes, edge_cost)
        plt.show()

In [2]:
# Define the first graph for this pathfinding solution
# Create a new graph
searchGraph1 = nx.Graph()
# Define a list of nodes with weights attached to the edges between each node
searchGraph1.add_edges_from([
    ('A', 'B', {"weight": 2}),
    ('A', 'C', {"weight": 4}),
    ('A', 'D', {"weight": 5}),
    ('G', 'F', {"weight": 3}),
    ('B', 'C', {"weight": 4}),
    ('F', 'M', {"weight": 4}),
    ('C', 'G', {"weight": 2}),
    ('B', 'C', {"weight": 2}),
    ('C', 'D', {"weight": 2}),
    ('D', 'H', {"weight": 2}),
    ('G', 'H', {"weight": 2}),
    ('J', 'K', {"weight": 2}),
    ('H', 'I', {"weight": 1}),
    ('E', 'J', {"weight": 2}),
    ('H', 'F', {"weight": 2}),
    ('E', 'G', {"weight": 2}),
    ('K', 'I', {"weight": 3}),
    ('K', 'L', {"weight": 4}),
    ('L', 'N', {"weight": 2}),
    ('M', 'G', {"weight": 2}),
    ('N', 'O', {"weight": 2}),
    ('P', 'N', {"weight": 3}),
    ('O', 'M', {"weight": 2}),
    ('M', 'N', {"weight": 3})
])

# Check the number of nodes and edges
print("Search Graph 1: {}".format(searchGraph1))

print("\n")

nxPosNodes1 = nx.spring_layout(searchGraph1)

# Check the position of the nodes
print("Node position for searchGraph1: {}".format(nxPosNodes1))

graphEdgeCostLabel1 = nx.get_edge_attributes(searchGraph1, 'weight')

print("Cost of edges: {}".format(graphEdgeCostLabel1))

drawGraph(searchGraph1, nxPosNodes1, graphEdgeCostLabel1)

NameError: name 'nx' is not defined

In [None]:
# Define the second graph for this pathfinding solution

# Create the directed graph using the nodes and weighted edges in a list
searchGraph2 = nx.Graph()
searchGraph2.add_edges_from([
    ('A', 'B', {"weight": 4}),
    ('A', 'C', {"weight": 2}),
    ('B', 'F', {"weight": 3}),
    ('F', 'G', {"weight": 5}),
    ('B', 'C', {"weight": 6}),
    ('C', 'G', {"weight": 2}),
    ('B', 'C', {"weight": 7}),
    ('C', 'D', {"weight": 9}),
    ('D', 'H', {"weight": 4}),
    ('G', 'H', {"weight": 1}),
    ('J', 'K', {"weight": 8}),
    ('H', 'I', {"weight": 2}),
    ('E', 'J', {"weight": 4}),
    ('H', 'F', {"weight": 3}),
    ('E', 'G', {"weight": 6}),
    ('K', 'I', {"weight": 4}),
    ('K', 'L', {"weight": 5}),
    ('L', 'N', {"weight": 2}),
    ('M', 'G', {"weight": 7}),
    ('N', 'O', {"weight": 1}),
    ('P', 'N', {"weight": 8}),
    ('O', 'M', {"weight": 3}),
    ('M', 'N', {"weight": 2})
])

# Check the number of nodes and edges
print("Search Graph 2: {}".format(searchGraph2))

print("\n")

nxPosNodes2 = nx.spring_layout(searchGraph2)

# Check the position of the nodes
print("Node position for searchGraph2: {}".format(nxPosNodes2))

graphEdgeCostLabel2 = nx.get_edge_attributes(searchGraph2, 'weight')

print("Cost of edges: {}".format(graphEdgeCostLabel2))

drawGraph(searchGraph2, nxPosNodes2, graphEdgeCostLabel2)

In [None]:
# Evaluate the shortest distance between two nodes in the graph
def evaluateAStar(graph, node_pos, current_node, goal_node):

  # Initialise the costs (weights) and estimated costs from the euclideanDistance method
  total_cost = dict()
  # Set the cost for the starting node
  total_cost[current_node] = 0

  priority_cost = dict()

  # Define a set that tracks the nodes that will be visited in the graph
  # Add the starting node to the new_nodes set
  new_nodes = set()
  new_nodes.add(current_node)

  node_path = dict()
  node_path[current_node] = current_node

  # Define a dict that records the previous nodes
  previous_nodes = set()

  # While the 'new' priority queue is not empty
  while not len(new_nodes) < 0:

        previous_nodes.add(current_node)

        # If the current node is equal to the goal, which is P, end the search method and return the path
        if current_node == goal_node:
          return returnPath(current_node, previous_nodes)

        first_node = []
        end_node = []

        # for node in graph.nodes:
        #   print(graph.nodes)
        #   first_node.append(node[0])
        #   end_node.append(node[1])

        node_path = set(first_node).union(set(end_node))

        for neighbours in graph.neighbors(current_node):
          # Search through the neighbouring nodes of the neighbours
          for next_node in neighbours:
            if next_node not in previous_nodes:
              previous_nodes.add(next_node)
            print("Next node: {}".format(next_node))
            costs = nx.get_edge_attributes(graph, "weight")
            print(costs)
            new_cost = total_cost[current_node] + costs[current_node, next_node]
            print("The cost between " + str(current_node) + " and " + str(next_node) + " = " + str(new_cost))
            if next_node not in total_cost or new_cost < total_cost[next_node]:
              # The current node in the graph is visited and is recorded as such
              previous_nodes.add(current_node)
              print("Previous Nodes", previous_nodes)
              # Add the new costs (as a value) to the total gathered cost of the next node (as a key)
              total_cost[next_node] = new_cost
              print("Minimum cost from start to " + str(next_node) + " has been found")
              # Add the next node in the graph
              new_nodes.add(next_node)
              print("New nodes in open list: {}".format(new_nodes))
            # Get the maximum cost of the current node
            total_cost[current_node] = 999999
            # Get the lowest cost based on the maximum cost and the current node
            lowest_cost_node = min(total_cost, key=total_cost.get)
            # If the node has a cost that is not the lowest cost, run the pathfinding method again
            if lowest_cost_node not in previous_nodes:
              evaluateAStar(graph, total_cost, lowest_cost_node, goal_node)
            # Define an empty dictionary that records the path with the least cost
            least_cost_path = dict()
            for node in node_path:
              least_cost_path[node] = '  '


In [None]:
# Show the graph as an external window
drawGraph(searchGraph1, nxPosNodes1, graphEdgeCostLabel1)

# Get the starting node from the input
start_node = input("Enter the starting node: ")

# Get the goal node from the input
goal_node = input("Enter the goal node: ")

# Run the evaluation method with the first graph and the necessary parameters
evaluateAStar(searchGraph1, nxPosNodes1, start_node, goal_node)

In [None]:
# Show the graph as an external window
drawGraph(searchGraph2, nxPosNodes2, graphEdgeCostLabel2)

# Get the starting node from the input
start_node = input("Enter the starting node: ")

# Get the goal node from the input
goal_node = input("Enter the goal node: ")

# Run the evaluation method with the second graph and the necessary parameters
evaluateAStar(searchGraph2, nxPosNodes2, start_node, goal_node)