Maze solver using ε-greedy Q-learning, comparing realistic vs. optimal (i.e. optimistic) initialization of the Q-values.

In [None]:
# Install the system packages and Python libraries used by this notebook.
# Every line starting with `!` is executed as a shell command.
#
# System packages: xvfb and x11-utils (presumably needed for headless
# rendering through pyvirtualdisplay -- confirm; the Display setup further
# down is currently commented out).
!apt-get install -y xvfb x11-utils  
# Same x11-utils install again, this time with all output discarded.
!apt-get install x11-utils > /dev/null 2>&1
# OpenGL bindings pinned to 3.1.x and gym (with the box2d extra) pinned to 0.17.x.
!pip install PyOpenGL==3.1.* \
            PyOpenGL-accelerate==3.1.* \
            gym[box2d]==0.17.* 
# Unpinned helper packages.
!pip install pyglet
!pip install ffmpeg
! pip install pyvirtualdisplay
!pip install Image
# Presumably the distribution that provides the `gym_maze` module imported
# later in the notebook -- confirm.
!pip install gym-maze-trustycoder83
# Disabled: the `plotting` package is not installed.
#!pip install plotting


In [None]:
!mkdir ./vid
!rm ./vid/*.*

In [None]:
import sys
# import pygame
import numpy as np
# import math
# import base64
# import io
# import IPython
import gym
import gym_maze

import itertools


# from gym.wrappers import Monitor
# from IPython import display
#from pyvirtualdisplay import Display
#from gym.wrappers.monitoring import video_recorder

#d = Display()
#d.start()

# Recording filename
#video_name = "./vid/Practical_2.mp4"

# Create the maze environment from its gym id. The id is presumably registered
# as a side effect of `import gym_maze` -- confirm.
env = gym.make("maze-sample-10x10-v0")

# Video recording is currently disabled; the recorder/monitor lines are kept
# for reference only.
#vid = None
#vid = video_recorder.VideoRecorder(env,video_name)

# env = gym.wrappers.Monitor(env,'./vid',force=True)
# Reset the environment and keep the returned observation as the agent's
# starting state.
current_state = env.reset()


1. Realistic Initialization

In [None]:
# Tabular Q-learning with an epsilon-greedy policy and "realistic" initial
# Q-values, run once for every (discount factor, exploration probability) pair.
#
# Outputs of this cell:
#   Q_table_1                - final Q-table of each run, keyed "<proba>,<gamma>"
#   rewards_per_episode_dict - average reward per step of every episode, same keys

# Map every (x, y) maze cell of the 10x10 grid to a row index of the Q-table.
states_dic = {}
count = 0
for i in range(10):
    for j in range(10):
        states_dic[i, j] = count
        count += 1

n_actions = env.action_space.n

# Final Q-table of every run
Q_table_1 = {}

# Number of episodes per (gamma, proba) configuration
n_episodes = 300

# Exploration probabilities (epsilon) and discount factors to sweep over
exploration_proba = [0.2, 0.4, 0.8]
gamma_list = [0.1, 0.4, 0.8]

# Learning rate
alpha = 1 / n_actions

# Initial value of every Q-table entry. The original statement had lost its
# multiplier (`Q_table = * np.ones(...)`, a syntax error); zero matches the
# "initialize the Q-table to 0" intent -- NOTE(review): adjust if a different
# realistic baseline was meant.
realistic_init_value = 0.0

rewards_per_episode_dict = {}

for gamma in gamma_list:
    for proba in exploration_proba:
        run_key = '{},{}'.format('%f' % proba, '%f' % gamma)
        rewards_per_episode_dict[run_key] = []
        Q_table = realistic_init_value * np.ones((len(states_dic), n_actions))

        # Begin every run from a fresh reset so the cell does not depend on a
        # `current_state` left behind by another cell or an interrupted run.
        current_state = env.reset()

        # Iterate over episodes
        for e in range(n_episodes):

            done = False
            steps = 0
            # Sum of the rewards collected during this episode
            total_episode_reward = 0

            for i in itertools.count():
                env.unwrapped.render()

                # Q-table row of the state the agent is currently in
                current_coordinate_x = int(current_state[0])
                current_coordinate_y = int(current_state[1])
                current_Q_table_coordinates = states_dic[current_coordinate_x, current_coordinate_y]

                # Epsilon-greedy: explore with probability `proba`, else act greedily
                if np.random.uniform(0, 1) < proba:
                    action = env.action_space.sample()
                else:
                    action = int(np.argmax(Q_table[current_Q_table_coordinates]))

                next_state, reward, done, _ = env.step(action)

                # Q-table row of the state the agent landed in
                next_coordinate_x = int(next_state[0])
                next_coordinate_y = int(next_state[1])
                next_Q_table_coordinates = states_dic[next_coordinate_x, next_coordinate_y]

                # Q-learning target: bootstrap from the best action available in
                # the next state (not from the action just taken), and do not
                # bootstrap at all once the episode has ended.
                if done:
                    td_target = reward
                else:
                    td_target = reward + gamma * np.max(Q_table[next_Q_table_coordinates])
                td_error = td_target - Q_table[current_Q_table_coordinates, action]
                Q_table[current_Q_table_coordinates, action] += alpha * td_error

                total_episode_reward = total_episode_reward + reward

                # `i` is zero-based, so the number of steps taken so far is i + 1;
                # this also keeps the average below from dividing by zero.
                steps = i + 1

                # If the episode is finished, we leave the step loop
                if done:
                    break
                current_state = next_state

            # Show the total episode reward
            print("Total episode reward:", total_episode_reward)

            # Reset environment for next episode
            current_state = env.reset()

            rewards_per_episode_dict[run_key].append(total_episode_reward / steps)
        Q_table_1[run_key] = Q_table

# Video recording is disabled; when re-enabled, close the recorder here.
#print("Video successfuly saved.")
#vid.close()
#vid.enabled = False

# Embed the recorded episode video inline in the notebook, if one exists.
import base64
import io
from IPython import display

video_name = "./vid/Practical_2.mp4"

try:
    # Read-only binary mode is sufficient, and the context manager closes the
    # file handle even if reading fails.
    with io.open(video_name, 'rb') as video_file:
        video = video_file.read()
except FileNotFoundError:
    # The recorder calls in this notebook are commented out, so the file may
    # never have been written; skip playback instead of aborting the cell.
    print("No video found at {}; skipping inline playback.".format(video_name))
else:
    encoded = base64.b64encode(video)

    # Inline the video as a base64 data URI inside an HTML <video> element.
    display.display(display.HTML(data="""
  <video alt="test" controls>
  <source src="data:video/mp4;base64,{0}" type="video/mp4" />
  </video>
  """.format(encoded.decode('ascii'))))

In [None]:
import pickle

# Persist the results of the first (realistic initialization) sweep:
# the per-run Q-tables go to q.pkl and the per-episode average rewards to r.pkl.
for pickle_path, payload in (('q.pkl', Q_table_1), ('r.pkl', rewards_per_episode_dict)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
import matplotlib.pyplot as plt

from scipy.signal import savgol_filter

# One smoothed learning curve per run; each key of rewards_per_episode_dict
# becomes the legend label of its curve.
plt.figure(figsize=(15,10))
for run_label, episode_rewards in rewards_per_episode_dict.items():
    # Savitzky-Golay smoothing with a 51-episode window and polynomial order 1.
    smoothed = savgol_filter(episode_rewards, 51, 1)
    episode_numbers = range(1, len(episode_rewards) + 1)
    plt.plot(episode_numbers, smoothed, label='%s' % run_label)

plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.legend()
plt.show()





In [None]:
# Mean over all entries of the Q-table stored under the key "0.200000,0.800000",
# i.e. the run with exploration probability 0.2 and discount factor 0.8
# (keys are formatted as "<proba>,<gamma>").
np.mean(Q_table_1['0.200000,0.800000'])

2. Optimal (Optimistic) Initialization

In [None]:
# Tabular Q-learning with an epsilon-greedy policy where every Q-value starts
# from a fixed positive constant, run once for every
# (discount factor, exploration probability) pair.
#
# Outputs of this cell:
#   Q_table_2                  - final Q-table of each run, keyed "<proba>,<gamma>"
#   rewards_per_episode_dict_1 - average reward per step of every episode, same keys

# Map every (x, y) maze cell of the 10x10 grid to a row index of the Q-table.
states_dic = {}
count = 0
for i in range(10):
    for j in range(10):
        states_dic[i, j] = count
        count += 1

n_actions = env.action_space.n

# Final Q-table of every run
Q_table_2 = {}

# Number of episodes per (gamma, proba) configuration
n_episodes = 300

# Exploration probabilities (epsilon) and discount factors to sweep over
exploration_proba = [0.2, 0.4, 0.8]
gamma_list = [0.1, 0.4, 0.8]

# Learning rate
alpha = 1 / n_actions

# Initial value of every Q-table entry.
# NOTE(review): the origin of this constant is not shown in the notebook; it is
# presumably an optimistic estimate taken from an earlier run -- confirm.
optimistic_init_value = 1.6420482407868309

rewards_per_episode_dict_1 = {}

for gamma in gamma_list:
    for proba in exploration_proba:
        run_key = '{},{}'.format('%f' % proba, '%f' % gamma)
        rewards_per_episode_dict_1[run_key] = []
        Q_table = optimistic_init_value * np.ones((len(states_dic), n_actions))

        # Begin every run from a fresh reset so the cell does not depend on a
        # `current_state` left behind by another cell or an interrupted run.
        current_state = env.reset()

        # Iterate over episodes
        for e in range(n_episodes):

            done = False
            steps = 0
            # Sum of the rewards collected during this episode
            total_episode_reward = 0

            for i in itertools.count():
                env.unwrapped.render()

                # Q-table row of the state the agent is currently in
                current_coordinate_x = int(current_state[0])
                current_coordinate_y = int(current_state[1])
                current_Q_table_coordinates = states_dic[current_coordinate_x, current_coordinate_y]

                # Epsilon-greedy: explore with probability `proba`, else act greedily
                if np.random.uniform(0, 1) < proba:
                    action = env.action_space.sample()
                else:
                    action = int(np.argmax(Q_table[current_Q_table_coordinates]))

                next_state, reward, done, _ = env.step(action)

                # Q-table row of the state the agent landed in
                next_coordinate_x = int(next_state[0])
                next_coordinate_y = int(next_state[1])
                next_Q_table_coordinates = states_dic[next_coordinate_x, next_coordinate_y]

                # Q-learning target: bootstrap from the best action available in
                # the next state (not from the action just taken). Once the
                # episode has ended there is no future value, so the never-updated
                # initial value of the final state must not leak into the target.
                if done:
                    td_target = reward
                else:
                    td_target = reward + gamma * np.max(Q_table[next_Q_table_coordinates])
                td_error = td_target - Q_table[current_Q_table_coordinates, action]
                Q_table[current_Q_table_coordinates, action] += alpha * td_error

                total_episode_reward = total_episode_reward + reward

                # `i` is zero-based, so the number of steps taken so far is i + 1;
                # this also keeps the average below from dividing by zero.
                steps = i + 1

                # If the episode is finished, we leave the step loop
                if done:
                    break
                current_state = next_state

            # Show the total episode reward
            print("Total episode reward:", total_episode_reward)

            # Reset environment for next episode
            current_state = env.reset()

            rewards_per_episode_dict_1[run_key].append(total_episode_reward / steps)
        Q_table_2[run_key] = Q_table

In [None]:
# Persist the results of the second sweep: the per-run Q-tables go to q1.pkl
# and the per-episode average rewards to r1.pkl.
for pickle_path, payload in (('q1.pkl', Q_table_2), ('r1.pkl', rewards_per_episode_dict_1)):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# One smoothed learning curve per run of the second sweep; each key of
# rewards_per_episode_dict_1 becomes the legend label of its curve.
plt.figure(figsize=(15,10))
for run_label, episode_rewards in rewards_per_episode_dict_1.items():
    # Savitzky-Golay smoothing with a 51-episode window and polynomial order 1.
    smoothed = savgol_filter(episode_rewards, 51, 1)
    episode_numbers = range(1, len(episode_rewards) + 1)
    plt.plot(episode_numbers, smoothed, label='%s' % run_label)

plt.xlabel('Episode')
plt.ylabel('Average Reward')
plt.legend()
plt.show()
