# **Task 2.2C**


GitHub: https://github.com/StrikerXYZ/Knots-And-Crosses

In [None]:
# install required system dependencies
!apt-get install -y xvfb x11-utils  
!apt-get install x11-utils > /dev/null 2>&1
!pip install PyOpenGL==3.1.* \
            PyOpenGL-accelerate==3.1.* \
            gym[box2d]==0.17.* \
!pip install pyglet
!pip install ffmpeg
!pip install pyvirtualdisplay
!pip install Image
!pip install gym-maze-trustycoder83

In [None]:
!mkdir ./vid
!rm ./vid/*.*

In [None]:
import sys
import numpy as np
import gym
import gym_maze
import base64
import io

from IPython import display
from pyvirtualdisplay import Display
from gym.wrappers.monitoring import video_recorder

# Start a pyvirtualdisplay display before the environment is created
# (presumably so rendering works without a physical screen -- confirm).
d = Display()
d.start()

# Maze environment shared by the functions below through this global.
env = gym.make("maze-sample-10x10-v0")
# Path the recorder writes to and that endVideo() reads back for playback.
video_name = "./vid/Practical_2.mp4"
# Placeholder; a recorder from startVideo() is assigned before the experiments run.
vid = None

def startVideo():
  """Create a recorder for the global ``env`` that writes to ``video_name``."""
  recorder = video_recorder.VideoRecorder(env, video_name)
  return recorder

def endVideo(vid):
  """Close the recorder and embed the recorded MP4 in the notebook output.

  Args:
    vid: Recorder returned by ``startVideo``; it is closed and disabled here.
  """
  vid.close()
  vid.enabled = False
  # Read-only binary mode is enough to load the video, and the context
  # manager guarantees the file handle is released.
  with io.open(video_name, 'rb') as video_file:
    encoded = base64.b64encode(video_file.read())
  display.display(display.HTML(data="""
    <video alt="test" controls>
    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
    </video>
    """.format(encoded.decode('ascii'))))

def test(vid, n_episodes, max_iter_episode, epsilon, gamma, optimisticInit):
  """Run epsilon-greedy tabular learning on the global maze ``env``.

  Every step is rendered and captured on ``vid``. The Q-table is rebuilt on
  each call, so separate calls do not share learned values.

  Args:
    vid: Recorder whose ``capture_frame()`` is called after every render.
    n_episodes: Number of episodes to run.
    max_iter_episode: Maximum number of steps per episode.
    epsilon: Probability of taking a random action instead of the greedy one.
    gamma: Factor applied to the current Q estimate in the update rule.
    optimisticInit: If true, every Q value starts at 1 instead of 0.

  Returns:
    A list with the summed reward of each episode, in episode order.
  """
  current_state = env.reset()

  # Map each (x, y) cell of the 10x10 maze to its row index in the Q-table.
  states_dic = {(i, j): i * 10 + j for i in range(10) for j in range(10)}
  n_actions = env.action_space.n

  # Optimistic initialisation starts every estimate at 1, otherwise at 0.
  make_table = np.ones if optimisticInit else np.zeros
  Q_table = make_table((len(states_dic), n_actions))

  # NOTE: `gamma` is taken from the caller. A previous hard-coded
  # `gamma = 0.7` here silently overrode the argument, which made the
  # gamma sweep a no-op.

  rewards_per_episode = []

  for _ in range(n_episodes):
    total_episode_reward = 0
    # Step counter for this episode; drives the 1/n step size below.
    n_a = 0

    for _ in range(max_iter_episode):
      env.unwrapped.render()
      vid.capture_frame()
      state_index = states_dic[int(current_state[0]), int(current_state[1])]

      # Epsilon-greedy action selection.
      if np.random.uniform(0, 1) < epsilon:
        action = env.action_space.sample()
      else:
        action = int(np.argmax(Q_table[state_index]))

      next_state, reward, done, _info = env.step(action)

      n_a += 1
      alpha = 1 / n_a

      # NOTE(review): this is not the standard Q-learning update
      #   q += alpha * (reward + gamma * max_a Q(s', a) - q)
      # because the next state's values are not used. The rule is kept as
      # written since the reported results were produced with it; confirm
      # the intended update.
      q = Q_table[state_index, action]
      Q_table[state_index, action] = q + alpha * (reward - gamma * q)

      total_episode_reward += reward
      if done:
        break
      current_state = next_state

    # Reset the environment for the next episode.
    current_state = env.reset()
    rewards_per_episode.append(total_episode_reward)

  return rewards_per_episode

#vid = startVideo()
#test(vid, 10, 1000, 0.5, 1, True)
#endVideo(vid)

We can now play the video using the following code

In [None]:
import matplotlib.pyplot as mp

vid = startVideo()

# Number of episodes per run and the step cap for each episode.
n_episodes = 10
max_iter_episode = 1000

# Shared x-axis for every plot: one point per episode.
episodes = np.arange(0, n_episodes)

# Epsilon sweep: 0.0, 0.2, ..., 1.0 with gamma fixed at 0.5.
start = 0
inc = 0.2
epsilonList = [start + i * inc for i in range(6)]

print("Epsilon Report:")
labelTxt = "eps = {v:.1f}"
for eps in epsilonList:
  epsilonResult = test(vid, n_episodes, max_iter_episode, eps, 0.5, False)
  mp.plot(episodes, epsilonResult, 'o--', linewidth=1, markersize=10, label=labelTxt.format(v=eps))
mp.xlabel('episodes')
mp.ylabel('reward')
mp.legend(loc='best')
mp.show()


# Gamma sweep: 0.0, 0.2, ..., 1.0 with epsilon fixed at 0.5.
start = 0
inc = 0.2
gammaList = [start + i * inc for i in range(6)]

print("Gamma Report:")
labelTxt = "gamma = {v:.1f}"
for gam in gammaList:
  gammaResult = test(vid, n_episodes, max_iter_episode, 0.5, gam, False)
  mp.plot(episodes, gammaResult, 'o--', linewidth=1, markersize=10, label=labelTxt.format(v=gam))
mp.xlabel('episodes')
mp.ylabel('reward')
mp.legend(loc='best')
mp.show()

print("Optimistic vs Realistic:")

realisticResult = test(vid, n_episodes, max_iter_episode, 0.5, 0.5, False)
mp.plot(episodes, realisticResult, 'o--', linewidth=1, markersize=10, label="realistic")
# The optimistic run must enable optimisticInit; passing False here would
# simply repeat the realistic run under a different label.
optimisticResult = test(vid, n_episodes, max_iter_episode, 0.5, 0.5, True)
mp.plot(episodes, optimisticResult, 'o--', linewidth=1, markersize=10, label="optimistic")
mp.xlabel('episodes')
mp.ylabel('reward')
mp.legend(loc='best')
mp.show()

endVideo(vid)

Discussion:

- In the first evaluation, lower epsilon values seemed to provide better results in the earliest iterations, while higher epsilon values provided better results in later iterations. A lower epsilon results in less exploration but seems more likely to find a local minimum quickly, which in this maze may be the global minimum. This is most likely why a lower epsilon may find better results in the early iterations. A higher epsilon may explore the grid a lot more and would seem to take more educated actions in the later iterations.

- In the second evaluation, the lower gamma/learning rate seems to provide a smoother curve compared to the higher gamma results. The lower gamma seems to converge on the results more quickly, while the higher gamma seems to make larger jumps before it settles towards better rewards.

- The last evaluation, realistic vs. optimistic, shows that the realistic method may find the rewards more consistently, while the optimistic approach is able to find higher rewards very early on.