In [11]:
import numpy as np

In [20]:
# Environment
# Observation:
#    state
#    action
#    reward
#    done
class Environment():

    # Encoding:
    # "*": agent position
    # " ": empty square
    # "T": Table
    # "G": Goal
    def __init__(self):
        self.agent_position = (0, 0)
        self.map = [
            [" ", " ", " ", " ", "G"],
            [" ", " ", " ", " ", " "],
            [" ", " ", " ", " ", " "],
            [" ", " ", " ", " ", " "],
            [" ", " ", " ", " ", " "]
        ]

    def draw_env(self):

        x = self.agent_position[1]
        y = self.agent_position[0]

        last_token = self.map[y][x]

        self.map[y][x] = "*"
        print('----------------------')
        for l in self.map:
            print(l)

        self.map[y][x] = last_token

    # get the token from the current position
    def get_token(self):
        x = self.agent_position[1]
        y = self.agent_position[0]

        return self.map[y][x]

    # reward mapping:
    #  " " ->  0
    #  "T" -> -1
    #  "G" -> +1
    def reward(self):
        token = self.get_token()

        if token == " ":
            return -0.1

        if token == "T":
            return -1

        if token == "G":
            return 100

        return 0

    # clamp a value between 0 and 4
    def clamp_to_map(self, value):
        if value < 0:
            return 0

        if value > 4:
            return 4

        return value

    # action:
    #   UP, DOWN, LEFT, RIGHT
    #   state_position, action, reward, done
    def next(self, action):

        start_position = self.agent_position

        x = self.agent_position[1]
        y = self.agent_position[0]

        # move the agent
        if action == "U":
            y = y - 1

        if action == "D":
            y = y + 1

        if action == "L":
            x = x - 1

        if action == "R":
            x = x + 1

        # clamp it to the environment
        x = self.clamp_to_map(x)
        y = self.clamp_to_map(y)

        self.agent_position = (y, x)

        # determine the reward
        reward = self.reward()

        # is episode complete ?
        token = self.get_token()
        done = (token == "1" or token == "T")

        return (start_position, action, reward, done)

    # sets the agent position back to (0,0)
    def reset(self):
        self.agent_position = (0, 0)

In [5]:
# Agent:
#   model as Model
#   state as State
#   exploration as Float
class Agent():

    # needs a model to represent the rewards
    def __init__(self, model, start_state, exploration):
        self.model = model
        self.state = start_state
        self.exploration = exploration

    # encoding
    #   0 <- UP
    #   1 <- RIGHT
    #   2 <- LEFT
    #   3 <- DOWN
    def get_action(self, action_id):
        if action_id == 0:
            return "U"

        if action_id == 1:
            return "R"

        if action_id == 2:
            return "D"

        return "L"

    def next_action(self, env):
        # test against the current exploration constant
        prob = np.random.random()
        action_id = None

        if prob < self.exploration:
            action_id = np.random.choice(4)
        else:
            action_id = self.model.predict(self.state)

        # get the action token
        action = self.get_action(action_id)
        observation = env.next(action)

        self.state = observation[0]

        # return the observation
        return observation

    def reduce_exploration(self):
        self.exploration = self.exploration ** 0.99

In [6]:
# Model
class Model():

    def __init__(self, discount_factor, alpha):
        self.discount_factor = discount_factor
        self.actions_options = ("U", "R", "D", "L")
        self.alpha = alpha
        self.Q = {}

        # initialize the actions for all states to zero
        for y in range(5):
            for x in range(5):
                state = (y, x)

                self.Q[state] = {}

                for a in self.actions_options:
                    self.Q[state][a] = 0


    def predict(self, state):

        actions = self.Q[state]

        max_key = None
        max_val = float('-inf')
        for k, v in actions.items():
            if v > max_val:
                max_val = v
                max_key = k

        return max_key

    def update(self, state, action, reward, state2, action2):
        lastQ = self.Q[state][action]
        self.Q[state][action] = self.Q[state][action] + self.alpha * (reward + self.discount_factor * self.Q[state2][action2] - self.Q[state][action])

        return np.abs(lastQ - self.Q[state][action])

    def policy(self, map):
        policy = []

        for y in range(5):
            l = []

            for x in range(5):
                action = self.predict((y, x))

                if map[y][x] != " ":
                    action = map[y][x]

                l.append(action)

            policy.append(l)

        return policy

In [8]:
def episode(agent, env):

    done = False

    state = (0, 0)
    observation = agent.next_action(env)
    action = observation[1]

    highest_delta = 0

    while not done:
        #  state_position, action, reward, done
        observation = agent.next_action(env)

        state2 = observation[0]
        action2 = observation[1]
        reward = observation[2]
        done = observation[3]

        delta = agent.model.update(state, action, reward, state2, action2)
        highest_delta = max(delta, highest_delta)

        state = state2
        action = action2

        if done:
            agent.model.Q[state][action] = reward

    return highest_delta

def train_agent(agent, env):
    done = False
    max_iterations = 1000
    i = 0

    while not done:
        change = episode(agent, env)
        env.reset()
        done = (change < 0.005)

        i = i + 1
        if i == max_iterations:
            done = True

        agent.reduce_exploration()

    policy = agent.model.policy(env.map)

    for l in policy:
        print(l)

In [22]:
# grid_world = Environment()
# agent_model = Model(discount_factor=0.98, alpha=0.1)
# agent = Agent(agent_model, (0,0), 1.0)
# train_agent(agent, grid_world)

In [19]:
agent_model.Q

{(0, 0): {'U': 61.05048114314936,
  'R': 64.75281822208728,
  'D': 58.69488618959197,
  'L': 61.90057745957728},
 (0, 1): {'U': 66.4048918619842,
  'R': 90.54349006476626,
  'D': 63.114264483608935,
  'L': 61.589424566442474},
 (0, 2): {'U': 86.78094798620326,
  'R': 145.9536423765898,
  'D': 80.55690534528776,
  'L': 69.10096140282273},
 (0, 3): {'U': 141.35741470293084,
  'R': 100,
  'D': 101.51841338708829,
  'L': 83.86474510752114},
 (0, 4): {'U': 0, 'R': 0, 'D': 0, 'L': 0},
 (1, 0): {'U': 61.26512971757272,
  'R': 67.30804336610764,
  'D': 53.04006561380301,
  'L': 61.0224626456537},
 (1, 1): {'U': 70.92454467023319,
  'R': 77.67107027571464,
  'D': 57.376310697481415,
  'L': 56.328755661162},
 (1, 2): {'U': 96.10112867204197,
  'R': 93.47538274406405,
  'D': 70.12883609089943,
  'L': 61.58467666861638},
 (1, 3): {'U': 120.7285992406861,
  'R': 114.03711327456833,
  'D': 79.29018926206925,
  'L': 83.26805318048483},
 (1, 4): {'U': 100,
  'R': 117.3475898072442,
  'D': 84.148472454