In [1]:
# initializing the grid environment:
DESERT = "desert"
AGENT = "agent"
WATER = "water"
EMPTY = "*"

grid = [[AGENT, EMPTY, DESERT], [EMPTY, EMPTY, WATER]]

# initializing the agent's movement:
UP = 0
DOWN = 1
LEFT = 2
RIGHT = 3

ACTIONS = [UP, DOWN, LEFT, RIGHT]

In [2]:
class State:

    def __init__(self, grid, agent_pos):
        self.grid = grid
        self.agent_pos = agent_pos

    def __eq__(self, other):
        return (
            isinstance(other, State)
            and self.grid == other.grid
            and self.agent_pos == other.agent_pos
        )

    def __hash__(self):
        return hash(str(self.grid) + str(self.agent_pos))

    def __str__(self):
        return f"State(grid={self.grid}, agent_pos={self.agent_pos})"


# initializing the start state:
start_state = State(grid=grid, agent_pos=[0, 0])

In [3]:
from copy import deepcopy

def new_agent_pos(state, action):
    p = deepcopy(state.agent_pos)
    if action == UP:
        p[0] = max(0, p[0] - 1)
    elif action == DOWN:
        p[0] = min(len(state.grid) - 1, p[0] + 1)
    elif action == LEFT:
        p[1] = max(0, p[1] - 1)
    elif action == RIGHT:
        p[1] = min(len(state.grid[0]) - 1, p[1] + 1)
    else:
        raise ValueError(f"Unknown action {action}")
    return p

# inspecting the new position:
p = new_agent_pos(State(grid=grid, agent_pos=[1, 0]), RIGHT)
print(p)

[1, 1]


In [4]:
def act(state, action):
            
    p = new_agent_pos(state, action)
    grid_item = state.grid[p[0]][p[1]]
    
    new_grid = deepcopy(state.grid)
    
    if grid_item == DESERT:
        reward = -100
        is_done = True
        new_grid[p[0]][p[1]] += AGENT
        
    elif grid_item == WATER:
        reward = 1000
        is_done = True
        new_grid[p[0]][p[1]] += AGENT
        
    elif grid_item == EMPTY:
        reward = -1
        is_done = False
        old = state.agent_pos
        new_grid[old[0]][old[1]] = EMPTY
        new_grid[p[0]][p[1]] = AGENT        
        
    elif grid_item == AGENT:
        reward = -1
        is_done = False
        
    else:
        raise ValueError(f"Unknown grid item {grid_item}")
    
    return State(grid=new_grid, agent_pos=p), reward, is_done

# inspecting the act function:
s, r, d = act(start_state, RIGHT)
print(s)

State(grid=[['*', 'agent', 'desert'], ['*', '*', 'water']], agent_pos=[0, 1])


In [5]:
import numpy as np
import random

random.seed(2024)

N_STATES = 6
N_EPISODES = 20

MAX_EPISODE_STEPS = 100

MIN_ALPHA = 0.02

alphas = np.linspace(1.0, MIN_ALPHA, N_EPISODES)
gamma = 1.0
eps = 0.2

q_table = dict()

In [6]:
def q(state, action=None):
    
    if state not in q_table:
        q_table[state] = np.zeros(len(ACTIONS))
        
    if action is None:
        return q_table[state]
    
    return q_table[state][action]


def choose_action(state):
    if random.uniform(0, 1) < eps:
        return random.choice(ACTIONS) 
    else:
        return np.argmax(q(state))

In [7]:
# training the agent:
for e in range(N_EPISODES):
    
    state = start_state
    total_reward = 0
    alpha = alphas[e]
    
    for _ in range(MAX_EPISODE_STEPS):
        action = choose_action(state)
        next_state, reward, done = act(state, action)
        total_reward += reward
        
        q(state)[action] = q(state, action) + \
                alpha * (reward + gamma *  np.max(q(next_state)) - q(state, action))
        state = next_state
        if done:
            break
    print(f"Episode {e + 1}: total reward -> {total_reward}")

Episode 1: total reward -> 979
Episode 2: total reward -> 997
Episode 3: total reward -> -101
Episode 4: total reward -> 995
Episode 5: total reward -> 997
Episode 6: total reward -> 998
Episode 7: total reward -> 998
Episode 8: total reward -> 998
Episode 9: total reward -> 998
Episode 10: total reward -> 998
Episode 11: total reward -> 997
Episode 12: total reward -> 998
Episode 13: total reward -> 998
Episode 14: total reward -> 998
Episode 15: total reward -> 997
Episode 16: total reward -> 996
Episode 17: total reward -> 998
Episode 18: total reward -> 996
Episode 19: total reward -> 998
Episode 20: total reward -> 998


In [8]:
# inspecting the q_table:
r = q(start_state)
print(f"up={r[UP]}, down={r[DOWN]}, left={r[LEFT]}, right={r[RIGHT]}")

up=-2.0, down=261.56121883656516, left=479.65705625534633, right=997.8221292574273


In [9]:
# since the right action has the highest value, the agent should move right:
new_state, reward, done = act(start_state, RIGHT)

In [10]:
# inspecting the new state:
r = q(new_state)
print(f"up={r[UP]}, down={r[DOWN]}, left={r[LEFT]}, right={r[RIGHT]}")

up=-1.845263157894737, down=998.9890982508236, left=318.69261805835083, right=-89.6842105263158


In [11]:
# since the down action has the highest value, the agent should move right:
new_state, reward, done = act(start_state, DOWN)

In [12]:
# inspecting the new state:
r = q(new_state)
print(f"up={r[UP]}, down={r[DOWN]}, left={r[LEFT]}, right={r[RIGHT]}")

up=-2.0, down=261.56121883656516, left=-2.0, right=961.7545706371192


*Yeah! The agent moves to WATER finally which is expected.*