In [1]:
# TODO - use a GYM
# TODO - complexe... first without the maze, find the place to go

# TODO - move through a maze where 1 are blocked, 0 are free, and you must find the end
# TODO - use convolution net to find the right decision
# TODO - reward is -1 for each time your are in the maze

import abc
from collections import *
from dataclasses import dataclass
import enum
import numpy as np
from typing import *

%matplotlib inline
import matplotlib
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
class Move(enum.Enum):
    UP = 0
    DOWN = 1
    LEFT = 2
    RIGHT = 3
    
    @staticmethod
    def all():
        return [Move.UP, Move.DOWN, Move.LEFT, Move.RIGHT]


@dataclass(frozen=True)
class State:
    i: int
    j: int


class MazeEnv:
    def __init__(self, maze: np.ndarray, start_pos: Tuple[int, int], end_pos: Tuple[int, int]):
        self.maze: np.ndarray = maze
        self.start_pos: Tuple[int, int] = start_pos
        self.end_pos: Tuple[int, int] = end_pos
        self.i = self.start_pos[0]
        self.j = self.start_pos[1]
    
    def reset(self):
        self.i = self.start_pos[0]
        self.j = self.start_pos[1]
        
    def get_state(self) -> State:
        return State(i=self.i, j=self.j)
    
    def get_actions(self) -> List[Move]:
        return Move.all()
    
    def is_done(self) -> bool:
        return self.i == self.end_pos[0] and self.j == self.end_pos[1]
    
    def step(self, action):
        if self.is_done():
            raise Exception("Game is over")
        
        h, w = self.maze.shape
        if action == Move.UP:
            self.i = max(0, self.i - 1)
        elif action == Move.DOWN:
            self.i = min(h - 1, self.i + 1)
        elif action == Move.LEFT:
            self.j = max(0, self.j - 1)
        elif action == Move.RIGHT:
            self.j = min(w - 1, self.j + 1)
        return -1

In [5]:
class ActionValues(abc.ABC):
    @abc.abstractmethod
    def get_actions(self, state):
        pass
    
    @abc.abstractmethod
    def get_action_value(self, state, action) -> float:
        pass
    
    def get_best_action(self, state):
        best_action = None
        best_score = float('-inf')
        for action in self.get_actions(state):
            score = self.get_action_value(state, action)
            if score > best_score:
                best_score = score
                best_action = action
        return best_action


class DiscreteActionValues(ActionValues):
    def __init__(self, learning_rate: float = 0.1):
        self.learning_rate = learning_rate
        self.values = defaultdict(lambda: defaultdict(float))
    
    def add(self, state, action, score: float) -> float:
        self.values[state][action] += self.learning_rate * (score - self.values[state][action])
    
    def get_actions(self, state):
        return self.values[state].keys()
    
    def get_action_value(self, state, action) -> float:
        return self.values[state][action]


class SARSA:
    def __init__(self,
                 default_value: float = 0.,
                 learning_rate: float = 0.1,
                 reward_discount: float = 1.,
                 epsilon: float = 0.1
                ):
        # a big default_value might help favoring exploration at early stages (but biase the results)
        self.q_values = DiscreteActionValues(learning_rate)
        self.reward_discount = reward_discount  # discount factor of future reward taken into account at present time
        self.epsilon = epsilon                  # probability to take a random action
    
    def step(self, env) -> float:
        state = env.get_state()
        action = self._behavior_policy_action(env, state)
        reward = env.step(action)
        new_state = env.get_state()
        new_action = self._target_policy_action(env, new_state)
        score = reward + self.reward_discount * self.q_values.get_action_value(new_state, new_action)
        self.q_values.add(state, action, score)
        return reward
    
    def _target_policy_action(self, env, state):
        return self._behavior_policy_action(env, state)
    
    def _behavior_policy_action(self, env, state):
        if self.epsilon > 0. and np.random.random() < self.epsilon:
            return np.random.choice(env.get_actions())
        return self.q_values.get_best_action(state)


class QLearning(SARSA):
    def __init__(self,
                 default_value: float = 0.,
                 learning_rate: float = 0.1,
                 reward_discount: float = 1.,
                 epsilon: float = 0.1
                ):
        super().__init__(default_value=default_value, learning_rate=learning_rate, reward_discount=reward_discount, epsilon=epsilon)
    
    def _target_policy_action(self, env, state):
        return self.q_values.get_best_action(state)
    

"""
Training Loop
"""


class RunningAverage:
    def __init__(self):
        self.average = 0.
        self.count = 0
    
    def add(self, value):
        self.average += 1 / (self.count + 1) * (value - self.average)
        self.count += 1
    
    def reset(self):
        self.average = 0.
        self.count = 0
    
    def __call__(self):
        return self.average
    

def simulate_episode(env, agent) -> float:
    total_reward = 0.
    env.reset()
    while not env.is_done():
        total_reward += agent.step(env)
    return total_reward


def train_agent(env, agent, nb_episodes: int):
    running_average = RunningAverage()
    temperature_decrease_period = nb_episodes // 21
    temperature_decrease = agent.epsilon / 20
    for episode in range(1, nb_episodes + 1):
        reward = simulate_episode(env, agent)
        running_average.add(reward)
        if episode % temperature_decrease_period == 0:
            print("Episode", episode, ":", running_average(), " (epsilon " + str(agent.epsilon) + ")")
            agent.epsilon -= temperature_decrease
            running_average.reset()

In [6]:
maze = MazeEnv(
    maze=np.array([[0, 0, 0, 0, 0],
                   [0, 0, 0, 0, 0],
                   [0, 0, 0, 0, 0],
                   [0, 0, 0, 0, 0]]),
    start_pos=(0, 0),
    end_pos=(3, 4)
)

agent = QLearning(
     default_value = 0.,
     learning_rate = 0.1,
     reward_discount = 1.,
     epsilon = 0.1
)

train_agent(env=maze, agent=QLearning(), nb_episodes=500)

Episode 23 : -63.3478260869565  (epsilon 0.1)
Episode 46 : -26.434782608695652  (epsilon 0.095)
Episode 69 : -15.56521739130435  (epsilon 0.09)
Episode 92 : -14.217391304347826  (epsilon 0.08499999999999999)
Episode 115 : -15.304347826086955  (epsilon 0.07999999999999999)
Episode 138 : -12.695652173913043  (epsilon 0.07499999999999998)
Episode 161 : -9.217391304347826  (epsilon 0.06999999999999998)
Episode 184 : -9.826086956521738  (epsilon 0.06499999999999997)
Episode 207 : -9.869565217391305  (epsilon 0.05999999999999998)
Episode 230 : -9.21739130434783  (epsilon 0.05499999999999998)
Episode 253 : -8.826086956521744  (epsilon 0.04999999999999998)
Episode 276 : -8.695652173913041  (epsilon 0.044999999999999984)
Episode 299 : -7.956521739130436  (epsilon 0.03999999999999999)
Episode 322 : -8.260869565217392  (epsilon 0.03499999999999999)
Episode 345 : -7.869565217391305  (epsilon 0.02999999999999999)
Episode 368 : -7.739130434782608  (epsilon 0.024999999999999988)
Episode 391 : -7.6086