In [None]:
import numpy as np
import cv2
from google.colab.patches import cv2_imshow
from IPython.display import clear_output
from IPython.display import Video
import os

class Cell:
  """One square of the maze grid.

  Attributes:
    reward: value added to the agent's reward when Maze.state() is
      evaluated on this cell.
    walls: initialised to 0 for every cell.
    cellType: identifier of the kind of cell (0 by default).
    available: whether the agent may step onto this cell.
    terminal: whether reaching this cell ends the game.
  """

  # Class-level default; every instance overrides it in __init__.
  cellType = 0

  def __init__(self, cellType=0, reward=0, available=True, terminal=False):
    self.reward = reward
    self.walls = 0
    self.cellType = cellType
    self.available = available
    self.terminal = terminal

  def __str__(self):
    """Return the cell type as text."""
    # __str__ must return a str; handing back the raw (integer) cellType
    # made str(cell) / print(cell) raise TypeError.
    return str(self.cellType)

class Sprite:
  """Image stamp that is blended into a frame with a mask.

  Attributes:
    img: pixel array of the sprite, OR-ed into the frame.
    mask: array AND-ed into the frame before the sprite is OR-ed in.
    width: number of columns of `img`.
    height: number of rows of `img`.
  """

  def __init__(self, img, mask):
    self.img = img
    self.mask = mask
    # Array shapes are ordered (rows, columns, ...), i.e. (height, width, ...).
    self.height, self.width = img.shape[:2]

  def draw(self, img, position, vnum):
    """Stamp the sprite onto `img` in place.

    Args:
      img: frame array to modify.
      position: (x, y) of the sprite's top-left corner in `img`.
      vnum: accepted for interface compatibility; not used when drawing.
    """
    x, y = position
    # Take the extents from the image itself so that instances restored from
    # an older pickle (with swapped width/height attributes) still draw right.
    height, width = self.img.shape[:2]
    # Basic slicing yields a view, so the in-place operators update `img`.
    region = img[y : y + height, x : x + width]
    region &= self.mask
    region |= self.img

class Maze:
  """Grid maze environment that also keeps a table of state values.

  Actions are bit codes: 1 = right (x+1), 2 = left (x-1), 4 = down (y+1),
  8 = up (y-1). Positions are reported as (y, x).

  Class attributes:
    vtable: 8x8 table of state values indexed [y][x]; -1 marks a cell that
      has no value yet and is treated like a wall by the greedy choice.
    ctable: 8x8 table of lists; state() appends (y, x, reward) records.
    GLIE_ENABLED: when False, action() re-draws a direction whose target is
      already recorded in ctable for the current cell.
    STEP_PENALTY: constant added to the reward on every state() call.
    GREEDY_CHANCE: threshold used by findDir() for the greedy override.
    LEARNING: when False the threshold is lowered by 1.
  """

  # NOTE: both tables live on the class, so they are shared by every instance
  # that does not assign its own; they are kept here because instances restored
  # by pickle never run __init__ and rely on these defaults.
  vtable = [[-1 for x in range(8)] for y in range(8)]
  ctable = [[[] for x in range(8)] for y in range(8)]

  GLIE_ENABLED = True
  STEP_PENALTY = 0
  GREEDY_CHANCE = 0.25
  LEARNING = True

  def __init__(self, maze, colors, player, background, resolution, epsilon=0):
    """Store the maze description and place the agent.

    Args:
      maze: 2-D array of cell objects exposing `available`, `terminal` and
        `reward`, indexed [y, x].
      colors: stored as given; not read by any method of this class.
      player: object with a `draw(img, position, vnum)` method.
      background: image array the player is drawn onto.
      resolution: size of one cell in pixels.
      epsilon: probability that findDir() replaces the requested direction.
    """
    self.maze = maze
    self.colors = colors
    self.player = player
    self.resolution = resolution
    self.epsilon = epsilon
    self.background = background
    self.mazeHeight, self.mazeWidth = maze.shape
    self.imgWidth = self.mazeWidth * self.resolution
    self.imgHeight = self.mazeHeight * self.resolution
    self.reset()

  def reset(self, fixedPosition=True):
    """Start a new game.

    Args:
      fixedPosition: True starts at (1, 1); False starts at (0, 0), which is
        then replaced by random positions until an available cell is hit.
    """
    self.reward = 0.0
    self.cumulativeReward = 0.0
    self.steps = 1
    start = 1 * fixedPosition
    self.x = start
    self.y = start
    # Re-draw until the agent stands on a cell it is allowed to occupy.
    while not self.maze[self.y, self.x].available:
      self.x = np.random.randint(1, self.mazeWidth)
      self.y = np.random.randint(1, self.mazeHeight)
    self.replay_path = [(self.y, self.x)]

  def info(self):
    """Return a dict with the maze shape, epsilon and the action codes."""
    return {
        "dimensions": self.maze.shape,
        "epsilon": self.epsilon,
        "actions": (1, 2, 4, 8),
    }

  def render(self):
    """Return an image of the current state with the outer margin cropped."""
    border = 10
    img = self.background.copy()
    pixel_pos = (self.x * self.resolution, self.y * self.resolution)
    # vtable is indexed [y][x] everywhere else; the previous [x][y] lookup
    # passed the value of a different cell.
    self.player.draw(img, pixel_pos, self.vtable[self.y][self.x])
    margin = self.resolution - border
    return img[margin:-margin, margin:-margin]

  def display(self):
    """Show the rendered image in the notebook."""
    cv2_imshow(self.render())

  def showReplay(self):
    """Encode the positions of the last game into a video and display it."""
    # Imported locally so the method does not depend on the notebook having
    # injected `display` as a global name.
    from IPython.display import display as show_in_notebook

    video_path = "tmp.mp4"
    # Render through `self`: the method previously reached for a global named
    # `env`, which only worked when the instance happened to have that name.
    frame_size = self.render().shape[1::-1]  # (width, height)
    video = cv2.VideoWriter(video_path, cv2.VideoWriter_fourcc(*"JPEG"), 10.0, frame_size)
    for pos in self.replay_path:
      self.y, self.x = pos
      video.write(self.render())
    video.release()
    fileToDisplay = "video.mp4"
    # Re-encode with ffmpeg before embedding the file in the notebook.
    os.system(f"ffmpeg -y -i {video_path} -vcodec libx264 -x264opts keyint=123:min-keyint=120 -an {fileToDisplay}")
    show_in_notebook(Video(fileToDisplay, embed=True))

  def findDir(self, dir):
    """Turn a requested direction into the direction actually taken.

    First, with probability `epsilon`, the request is replaced by a random
    direction on the other axis (horizontal <-> vertical). Then, if a random
    draw exceeds GREEDY_CHANCE - (1 - LEARNING), the result is replaced by the
    direction of the neighbouring cell with the highest vtable value.

    Args:
      dir: requested action code (1, 2, 4 or 8).

    Returns:
      The action code to execute.
    """
    # (dir | dir >> 1) & 5 is 1 for horizontal and 4 for vertical requests, so
    # the random vertical code (4 or 8) is either kept or scaled to 1 or 2.
    axis = (dir | dir >> 1) & 5
    dir_e = int((np.random.randint(1, 3) << 2) / axis * (np.random.random() < self.epsilon))
    dir = dir * (not dir_e) | dir_e

    if np.random.random() + 0.001 > self.GREEDY_CHANCE - (1 - self.LEARNING):
      neighbours = {
          4: self.vtable[self.y + 1][self.x],
          8: self.vtable[self.y - 1][self.x],
          1: self.vtable[self.y][self.x + 1],
          2: self.vtable[self.y][self.x - 1],
      }
      # A value of -1 marks an unvalued cell; rank it below everything else.
      scores = {d: (-99999 if v == -1 else v) for d, v in neighbours.items()}
      # max() keeps the first best key, i.e. ties resolve in order 4, 8, 1, 2.
      dir = max(scores, key=scores.get)
    return dir

  def vtableUpdate(self):
    """Apply a discounted-return update to the first visit of each state.

    NOTE(review): reads self.last_episode, self.gamma and self.alpha, none of
    which are assigned anywhere in this class; they must be set on the
    instance before this method can run.
    """
    G = 0
    for t in reversed(range(len(self.last_episode))):
        state, reward = self.last_episode[t]
        G = self.gamma * G + reward
        # Only update when the state does not occur earlier in the episode.
        if state not in [x[0] for x in self.last_episode[:t]]:
            self.vtable[state[0]][state[1]] += self.alpha * (G - self.vtable[state[0]][state[1]])

  def _target(self, dir):
    """Return the (y, x) cell reached from the current one by action `dir`."""
    return (
        self.y + (dir >> 2 & 1) - (dir >> 3 & 1),
        self.x + (dir & 1) - (dir >> 1 & 1),
    )

  def _targetRecorded(self, dir):
    """Tell whether the target of `dir`, with its current value, is in ctable for this cell."""
    ty, tx = self._target(dir)
    return (ty, tx, self.vtable[ty][tx]) in self.ctable[self.x][self.y]

  def action(self, dir):
    """Move the agent and return the action code that was executed.

    Args:
      dir: requested action code; findDir() may replace it.
    """
    max_tries = 10
    temp_tries = 0
    dir = self.findDir(dir)

    if not self.GLIE_ENABLED:
      # Re-draw the direction (a bounded number of times) while its target is
      # already recorded for the current cell.
      while self._targetRecorded(dir):
        temp_tries += 1
        dir = self.findDir(dir)
        if temp_tries > max_tries:
          break

    y, x = self._target(dir)
    # Only move when the target cell may be entered; otherwise stay put.
    if self.maze[y, x].available:
      self.x, self.y = x, y

    self.steps += 1

    self.replay_path.append((self.y, self.x))
    return dir

  def state(self, lastAction=0):
    """Evaluate the current cell and update the tables.

    The reward is the cell's reward plus the cell's current vtable value plus
    STEP_PENALTY; that sum is written back to vtable and recorded in ctable.

    Args:
      lastAction: action code echoed back in the result.

    Returns:
      ((y, x), lastAction, reward, done)
    """
    cell = self.maze[self.y, self.x]
    done = cell.terminal

    reward = cell.reward + self.vtable[self.y][self.x] + self.STEP_PENALTY
    self.vtable[self.y][self.x] = reward

    self.ctable[self.x][self.y].append((self.y, self.x, reward))
    return (self.y, self.x), lastAction, reward, done

  def getVtable(self):
    """Return the state-value table."""
    return self.vtable

  def getCTable(self):
    """Return the table of recorded (y, x, reward) entries."""
    return self.ctable

  def step(self, action):
    """Execute `action` and return the resulting state tuple."""
    action = self.action(action)
    return self.state(action)


#The environment
The environment was defined in the Maze class.<br><br>
##The most important elements:
###State
State is returned by methods step(action) and state().<br>
State consists of 4 values:
*position, lastAction, reward, done*:
* position --- tuple of two integers (y, x),
* lastAction --- last action performed (it may differ from the one passed to step() when the environment overrides the direction),
* reward --- real number,
* done --- boolean value: True if the game is over.

###Info
Method *info()* returns a dictionary containing information about the environment:
 * dimensions --- shape of the environment,
 * epsilon --- the probability of taking a random step,
 * actions --- available actions.

###Action
Action is determined by value: 1, 2, 4 or 8:
*   1 --- right,
*   2 --- left,
*   4 --- down,
*   8 --- up.

###Reset
reset(fixedPosition=True) resets the environment.<br>
fixedPosition:
* True --- agent starts at position (1, 1)
* False --- agent starts at a random available position

###Visualisation
* display() --- display an image of the current environment state
* showReplay() --- display video of the last game

#Loading environment

---



In [None]:
from urllib.request import urlopen
import pickle

def loadEnv(path="./DRL_a1.pickle"):
  """Load a pickled environment object from disk.

  Args:
    path: location of the pickle file; defaults to the file shipped with
      this notebook.

  Returns:
    The object stored in the file.
  """
  # SECURITY: unpickling can execute arbitrary code, so only load files that
  # come from a trusted source.
  with open(path, 'rb') as pickle_file:
    return pickle.load(pickle_file)

# Load the pickled environment once; the following cells use this global `env`.
env = loadEnv()


#Printing environment and the state details

In [None]:
# Print the static description of the environment.
info = env.info()
print("info", info)
print("environment dimensions", info["dimensions"])
print("probability of random step", info["epsilon"])
print("allowed actions", info["actions"])
print("vtable")


# state() returns exactly four values (position, last action, reward, done);
# unpacking a fifth one raised ValueError. The value table is read through its
# accessor instead. Note that state() itself updates the table entry of the
# current cell, so the table is printed after the call.
actPos, action, reward, done = env.state()
for row in env.getVtable():
    print(list(row))
print("\nagent position", actPos)
print("performed action", action)
print("last reward", reward)
print("the game is over", done)


info {'dimensions': (7, 8), 'epsilon': 0.0, 'actions': (1, 2, 4, 8)}
environment dimensions (7, 8)
probability of random step 0.0
allowed actions (1, 2, 4, 8)
vtable
(1, 1)
[0, 0, 0, 0, 0, 0, 0]
[0, -1, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0]

agent position (1, 1)
performed action 0
last reward -1
the game is over False


#Visualization


###Training

In [None]:
class RandomAgent:
  """Agent that picks one of its actions at random, ignoring the state."""

  def __init__(self, actions):
    # Collection of action codes to sample from.
    self.actions = actions

  def play(self, state):
    """Return a randomly drawn action; `state` is accepted but not used."""
    chosen = np.random.choice(self.actions)
    return chosen

# Training: run epochs of episodes with the random agent; the environment
# updates its value table on every state() call.
info = env.info()

agent = RandomAgent(info["actions"])

# NOTE(review): alpha, gamma and epsilon are defined here but nothing in this
# cell passes them to `env` or `agent`.
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate

env.STEP_PENALTY = -0.5
env.GREEDY_CHANCE = 0.75
env.GLIE_ENABLED = True

epochn = 50     # number of epochs
spetn = 1000    # step budget per episode; grows after each finished episode
episodes = 10   # episodes per epoch

steptemp = 0    # steps taken in the current episode
rewardtemp = 10  # final reward of the previously played episode

success_episode = 0

episodes_to_skip = 0
epochs_to_skip = 0

for epoch in range(epochn):

  if epochs_to_skip > 0:
    epochs_to_skip -= 1
    continue

  for episode in range(episodes):

    # reset()/state() run even for skipped episodes: state() updates the
    # value table of the start cell as a side effect.
    env.reset()
    state = env.state()

    if episodes_to_skip > 0:
      episodes_to_skip -= 1
      continue

    for step in range(spetn + 5):
      action = agent.play(state)
      state = env.step(action)
      steptemp += 1

      if state[3]:  # done flag
        spetn = max(spetn, steptemp) + 10
        success_episode += 1
        break

    # Skip upcoming episodes/epochs when the final reward repeats or improves.
    if rewardtemp == state[2]:
      episodes_to_skip = 3
      epochs_to_skip = 5
    elif rewardtemp < state[2]:
      episodes_to_skip = 1
      epochs_to_skip = 1
    rewardtemp = state[2]
    steptemp = 0

    # The Maze class defined in this notebook has no `update_vtable` method,
    # so an unconditional call raised AttributeError after the first episode.
    # Call it only when the loaded environment actually provides it.
    update_vtable = getattr(env, "update_vtable", None)
    if callable(update_vtable):
      update_vtable()

    if state[2] > 0:
      print(f"Yupi :) Epoch: {epoch} Episode: {episode} Reward: {state[2]}")
    if state[2] < 0:
      print(f":< Epoch: {epoch} Episode: {episode} Reward: {state[2]}")

  # NOTE(review): reassigning `epoch` does not change what range() yields next,
  # so this line does not repeat epochs.
  epoch -= int(episodes/(success_episode+1))
  success_episode = 0
  episodes = 10

def _run_and_report(env, agent, max_steps=100):
  """Play one game, print the value table and show the replay.

  Args:
    env: environment providing reset/state/step/getVtable/showReplay.
    agent: object whose play(state) returns an action.
    max_steps: upper bound on the number of steps played.

  Returns:
    The last state tuple returned by the environment.
  """
  env.reset()
  state = env.state()
  for _ in range(max_steps):
    state = env.step(agent.play(state))
    if state[3]:  # done flag
      break
  for row in env.getVtable():
    for value in row:
      # -1 marks a cell without a value; everything else is shown rounded.
      print('XXXX' if value == -1 else str(round(value)), end=",")
    print('\n')
  env.showReplay()
  return state


# First game with the environment still in learning mode.
state = _run_and_report(env, agent)

# With LEARNING off the environment lowers its greedy threshold by 1, so with
# GREEDY_CHANCE <= 1 it always follows the best-valued neighbouring cell.
env.LEARNING = False
state = _run_and_report(env, agent)

Yupi :) Epoch: 0 Episode: 0 Reward: 169.0
Yupi :) Epoch: 0 Episode: 2 Reward: 169.5
Yupi :) Epoch: 0 Episode: 4 Reward: 170.0
:< Epoch: 0 Episode: 6 Reward: -134.5
Yupi :) Epoch: 0 Episode: 7 Reward: 170.5
Yupi :) Epoch: 0 Episode: 9 Reward: 171.0
Yupi :) Epoch: 2 Episode: 1 Reward: 171.5
Yupi :) Epoch: 2 Episode: 3 Reward: 172.0
Yupi :) Epoch: 2 Episode: 5 Reward: 172.5
Yupi :) Epoch: 2 Episode: 7 Reward: 173.0
Yupi :) Epoch: 2 Episode: 9 Reward: 173.5
:< Epoch: 4 Episode: 1 Reward: -136.0
Yupi :) Epoch: 4 Episode: 2 Reward: 174.0
Yupi :) Epoch: 4 Episode: 4 Reward: 174.5
Yupi :) Epoch: 4 Episode: 6 Reward: 175.0
Yupi :) Epoch: 4 Episode: 8 Reward: 175.5
Yupi :) Epoch: 6 Episode: 0 Reward: 176.0
Yupi :) Epoch: 6 Episode: 2 Reward: 176.5
Yupi :) Epoch: 6 Episode: 4 Reward: 177.0
:< Epoch: 6 Episode: 6 Reward: -137.5
Yupi :) Epoch: 6 Episode: 7 Reward: 177.5
Yupi :) Epoch: 6 Episode: 9 Reward: 178.0
Yupi :) Epoch: 8 Episode: 1 Reward: 178.5
Yupi :) Epoch: 8 Episode: 3 Reward: 179.0
Yupi

XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,

XXXX,-656,-334,-250,-240,-204,-127,XXXX,

XXXX,-334,-250,XXXX,XXXX,-128,-130,XXXX,

XXXX,-250,XXXX,XXXX,XXXX,-120,-123,XXXX,

XXXX,-208,-158,-73,-96,-111,-118,XXXX,

XXXX,-158,-124,228,-90,-98,-107,XXXX,

XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,

XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,XXXX,

