In [47]:
from copy import deepcopy
import numpy as np
import random


# Single-character markers for what a grid cell can contain.
TRAFFIC, AGENT, HOME, EMPTY = "T", "A", "H", "E"

# The world is a 2x2 board stored as a list of rows; grid[row][col].
grid = [[HOME, EMPTY], [TRAFFIC, AGENT]]

# Actions are small integers so they can index directly into a row of Q-values.
UP, DOWN, LEFT, RIGHT = range(4)

ACTIONS = [UP, DOWN, LEFT, RIGHT]


class State:
    """A snapshot of the grid world: the board layout plus the agent's position.

    Instances compare equal when both the grid and the agent position match,
    and hash consistently with that equality so they can be used as dict keys.
    """

    def __init__(self, grid, agent_pos):
        """Store the grid (list of rows) and the agent position ([row, col])."""
        self.grid = grid
        self.agent_pos = agent_pos

    def __eq__(self, other):
        """Return True only for another State with the same grid and position."""
        if not isinstance(other, State):
            return False
        same_grid = self.grid == other.grid
        return same_grid and self.agent_pos == other.agent_pos

    def __hash__(self):
        """Hash the textual form of grid and position (both are unhashable lists)."""
        key = "".join((str(self.grid), str(self.agent_pos)))
        return hash(key)

    def __str__(self):
        """Human-readable description of the state."""
        return f"State(grid={self.grid}, agent_pos={self.agent_pos})"
    
# Locate the AGENT marker in `grid` instead of hard-coding its coordinates, so
# the starting position cannot drift out of sync with the grid layout.
# `next` raises StopIteration if the grid contains no AGENT cell.
start_state = State(
    grid=grid,
    agent_pos=next(
        [row_idx, col_idx]
        for row_idx, row in enumerate(grid)
        for col_idx, cell in enumerate(row)
        if cell == AGENT
    ),
)



def path(state, action):
    """Apply ``action`` to ``state`` and return the resulting transition.

    The agent moves one cell in the requested direction; moves that would
    leave the grid are clamped, so the agent stays where it is.

    Args:
        state: the current ``State``; it is not modified.
        action: one of ``UP``, ``DOWN``, ``LEFT``, ``RIGHT``.

    Returns:
        A tuple ``(next_state, reward, is_done)``:
        * entering TRAFFIC gives reward -10 and ends the episode,
        * entering HOME gives reward 100 and ends the episode,
        * entering an EMPTY cell, or staying put, gives reward -1.
        On a terminal move the destination cell becomes the original marker
        with ``AGENT`` appended (e.g. ``"HA"``).

    Raises:
        ValueError: if ``action`` is unknown or the destination cell holds an
            unrecognised marker.
    """

    def new_pos(state, action):
        """Return the agent position after ``action``, clamped to the grid."""
        p = list(state.agent_pos)
        if action == UP:
            p[0] = max(0, p[0] - 1)
        elif action == DOWN:
            p[0] = min(len(state.grid) - 1, p[0] + 1)
        elif action == LEFT:
            p[1] = max(0, p[1] - 1)
        elif action == RIGHT:
            p[1] = min(len(state.grid[0]) - 1, p[1] + 1)
        else:
            raise ValueError(f"Unknown action {action}")
        return p

    p = new_pos(state, action)
    grid_item = state.grid[p[0]][p[1]]
    old = state.agent_pos

    # Work on a copy so the caller's state (and any Q-table key) is untouched.
    new_grid = deepcopy(state.grid)

    if grid_item == TRAFFIC:
        reward = -10
        is_done = True
        # Vacate the previous cell; previously the agent marker was left
        # behind, so the terminal grid showed the agent in two places.
        new_grid[old[0]][old[1]] = EMPTY
        new_grid[p[0]][p[1]] += AGENT
    elif grid_item == HOME:
        reward = 100
        is_done = True
        new_grid[old[0]][old[1]] = EMPTY
        new_grid[p[0]][p[1]] += AGENT
    elif grid_item == EMPTY:
        reward = -1
        is_done = False
        new_grid[old[0]][old[1]] = EMPTY
        new_grid[p[0]][p[1]] = AGENT
    elif grid_item == AGENT:
        # The move was clamped at a border: the agent stays on its own cell.
        reward = -1
        is_done = False
    else:
        raise ValueError(f"Unknown grid item {grid_item}")

    return State(grid=new_grid, agent_pos=p), reward, is_done


# Number of cells in the 2x2 grid (not referenced by the code in this cell).
N_STATES = 4
# Number of training episodes, and the cap on steps within one episode.
N_EPISODES = 10

MAX_EPISODE_STEPS = 10

# Learning rate of the final episode.
MIN_ALPHA = 0.02

# Seed the RNG used for epsilon-greedy exploration so that re-running this
# cell reproduces the same episodes and the same learned Q-values.
SEED = 42
random.seed(SEED)

# One learning rate per episode, decaying linearly from 1.0 to MIN_ALPHA.
alphas = np.linspace(1.0, MIN_ALPHA, N_EPISODES)
# Discount factor applied to the best next-state value in the Q update.
gamma = 1.0
# Probability of picking a random action instead of the greedy one.
eps = 0.2

# Maps a State to a NumPy array holding one Q-value per action.
q_table = dict()

def q(state, action=None):
    """Look up Q-values for ``state``, creating a zeroed row on first access.

    With ``action`` omitted the whole row (one value per action) is returned;
    the row is the array stored in ``q_table``, so writes to it update the
    table. With ``action`` given, only that action's value is returned.
    """
    values = q_table.get(state)
    if values is None:
        values = q_table[state] = np.zeros(len(ACTIONS))

    return values if action is None else values[action]

def choose_action(state):
    """Pick an action epsilon-greedily.

    With probability ``eps`` a uniformly random action is returned; otherwise
    the index of the largest Q-value for ``state`` is returned.
    """
    explore = random.uniform(0, 1) < eps
    if explore:
        return random.choice(ACTIONS)
    return np.argmax(q(state))

for e in range(N_EPISODES):
    # Every episode restarts from the initial state and uses its own
    # learning rate from the precomputed schedule.
    state, total_reward, alpha = start_state, 0, alphas[e]

    for i in range(MAX_EPISODE_STEPS):
        action = choose_action(state)
        next_state, reward, done = path(state, action)
        total_reward += reward

        # Q-learning update: nudge the current estimate towards the
        # temporal-difference target built from the best next-state value.
        action_values = q(state)
        td_target = reward + gamma * np.max(q(next_state))
        action_values[action] = action_values[action] + alpha * (
            td_target - action_values[action]
        )

        state = next_state
        if done:
            break
    print(f"Episode {e + 1}: total reward -> {total_reward}")

Episode 1: total reward -> -14
Episode 2: total reward -> 97
Episode 3: total reward -> 99
Episode 4: total reward -> 98
Episode 5: total reward -> 98
Episode 6: total reward -> 99
Episode 7: total reward -> 99
Episode 8: total reward -> 99
Episode 9: total reward -> 99
Episode 10: total reward -> 97


In [48]:
# Show the learned Q-values of the start state, one per action.
r = q(start_state)
print(f"up={r[UP]}, down={r[DOWN]}, left={r[LEFT]}, right={r[RIGHT]}")

up=97.62234223747662, down=48.39273733150436, left=-10.0, right=45.0056329218107


In [50]:
# Take one UP step from the start state and show the Q-values learned for the
# state that results. Note: this rebinds the module-level `reward` and `done`
# names that the training loop also assigns.
new_state, reward, done = path(start_state, UP)
r = q(new_state)
print(f"up={r[UP]}, down={r[DOWN]}, left={r[LEFT]}, right={r[RIGHT]}")

up=-1.0, down=0.9519170608828851, left=99.92190645654732, right=0.0
