In [1]:
#!pip install gridworld

In [8]:
from gridworld import gridworld
import numpy as np
import matplotlib.pyplot as plt

In [10]:
def standard_grid():
    """
    Build the standard grid, the one used most often throughout the examples
    in this document.

    The grid has 3 rows and 4 columns, a wall at (1, 1), end states at (0, 3)
    (reward 1) and (1, 3) (reward -1), and the start position at (2, 0).
    """
    walls = [(1, 1)]
    end_states = [(0, 3), (1, 3)]
    start_position = (2, 0)
    rewards = {(0, 3): 1, (1, 3): -1}
    return gridworld(3, 4, walls, end_states, start_position, rewards)


class gridworld:
    """
    Minimal grid-world environment on a rows x columns board.

    ``self.grid`` holds 1 for walkable cells and 0 for walls. The agent's
    position is kept as a (row, col) pair and is changed through ``move`` or
    ``new_position``. The built-in actions are 'U', 'D', 'L' and 'R'.

    Note: this class has the same name as the ``gridworld`` imported at the
    top of the notebook, so once this cell runs the name refers to this class.
    """

    # (d_rows, d_cols) displacement of every action the grid knows how to apply.
    _ACTION_DELTAS = {'U': (-1, 0), 'D': (1, 0), 'L': (0, -1), 'R': (0, 1)}

    def __init__(self, rows, columns, walls, end_states, start_position, rewards,
                 allowed_actions=('U', 'D', 'L', 'R')):
        """
        Args:
            rows, columns: board dimensions.
            walls: iterable of (row, col) cells that cannot be entered.
            end_states: collection of (row, col) cells that end the game.
            start_position: (row, col) where the agent starts.
            rewards: mapping (row, col) -> reward; stored as-is.
            allowed_actions: actions the agent may take. Copied into a new
                list so instances never share (or mutate) a common default.
        """
        self.grid = np.ones((rows, columns))
        self.max_rows = rows
        self.max_cols = columns

        for wall in walls:
            self.grid[wall[0], wall[1]] = 0

        self.end_states = end_states
        self.current_position_row = start_position[0]
        self.current_position_col = start_position[1]
        self.rewards = rewards
        self.allowed_actions = list(allowed_actions)
        self.all_positions = self.possible_states()
        self.start_position = start_position

    def _action_delta(self, action):
        """Return the (d_rows, d_cols) step for `action`, or (0, 0) if it is not allowed or not known."""
        if action in self.allowed_actions:
            return self._ACTION_DELTAS.get(action, (0, 0))
        return (0, 0)

    def change_position(self, d_rows, d_cols):
        """Shift the agent by (d_rows, d_cols) if the target cell is allowed; otherwise stay put."""
        target = (self.current_position_row + d_rows,
                  self.current_position_col + d_cols)
        if self.is_position_allowed(target):
            self.current_position_row, self.current_position_col = target

    def check_if_game_over(self):
        """Return True when the agent is standing on one of the end states."""
        return self.current_position() in self.end_states

    def move(self, action, epsilon=0.05):
        """
        Apply `action` to the agent.

        With probability `epsilon` the requested action is replaced by one
        drawn at random from ``allowed_actions``. Actions that are not in
        ``allowed_actions`` are ignored, and moves into a wall or off the
        board leave the position unchanged.
        """
        if np.random.rand() < epsilon:
            action = np.random.choice(self.allowed_actions)

        if action in self.allowed_actions:
            self.change_position(*self._action_delta(action))

    def possible_states(self):
        """Return every non-wall cell as a list of (row, col) tuples, in row-major order."""
        return [
            (row, col)
            for row in range(self.max_rows)
            for col in range(self.max_cols)
            if self.grid[row, col]
        ]

    def is_position_allowed(self, position):
        """
        Return True if `position` is inside the board and is not a wall.

        The bounds are checked before indexing, so negative coordinates are
        rejected instead of wrapping around through NumPy indexing.
        """
        p_row, p_col = position[0], position[1]
        in_bounds = 0 <= p_row < self.max_rows and 0 <= p_col < self.max_cols
        return bool(in_bounds and self.grid[p_row, p_col])

    def new_position(self, new_position):
        """
        Place the agent at `new_position` if that cell is allowed.

        Positions that are out of bounds or on a wall are ignored; the bounds
        check prevents negative indices from wrapping and large ones from
        raising IndexError.
        """
        if self.is_position_allowed(new_position):
            self.current_position_row = new_position[0]
            self.current_position_col = new_position[1]

    def current_position(self):
        """Return the agent's current (row, col)."""
        return (self.current_position_row, self.current_position_col)

    def what_if_move(self, start, action):
        """
        Return the cell reached by taking `action` from `start`, without
        moving the agent and without any randomness.

        Returns `start` when the action is not allowed or when the target
        cell is a wall or off the board.
        """
        d_rows, d_cols = self._action_delta(action)
        target_position = (start[0] + d_rows, start[1] + d_cols)
        if self.is_position_allowed(target_position):
            return target_position
        return start

In [13]:
# Helpers for printing the value function and the policy.

def print_politica(P, grid):
    """
    Print the policy P as a text table, one grid row per output line.

    P maps (row, col) -> action; cells missing from P are shown blank. The
    table dimensions come from ``grid.grid.shape``. The dashed separator above
    each row is as wide as the rendered row, so it lines up for any number of
    columns rather than only for four.
    """
    rows, cols = grid.grid.shape
    for row in range(rows):
        actions = [P.get((row, col), ' ') for col in range(cols)]
        line = "".join(f' {a} |' for a in actions)
        print("-" * len(line))
        print(line)
    
def print_grid(grid):
    """
    Print the grid as a text table in which every cell shows its own
    (row,col) coordinates in the matrix.

    The table dimensions come from ``grid.grid.shape``. The dashed separator
    above each row is as wide as the rendered row, so it stays aligned for
    any number of columns and for multi-digit coordinates.
    """
    rows, cols = grid.grid.shape
    for row in range(rows):
        line = "".join(f' ({row},{col}) |' for col in range(cols))
        print("-" * len(line))
        print(line)

def print_value(V, grid):
    """
    Print the value function V as a text table, one grid row per output line.

    V maps (row, col) -> value; cells missing from V are shown as 0. Each
    value is printed with two decimals and a leading '-' or ' ' sign column.
    The table dimensions come from ``grid.grid.shape``. The dashed separator
    above each row is as wide as the rendered row, so it lines up for any
    number of columns.
    """
    rows, cols = grid.grid.shape
    for row in range(rows):
        line = ""
        for col in range(cols):
            v = V.get((row, col), 0)
            # Round the magnitude first and print the sign separately so that
            # positive and negative values occupy the same width.
            magnitude = np.round(np.abs(v), 2)
            prefix = "-" if v < 0 else " "
            line += "%s%.2f |" % (prefix, magnitude)
        print("-" * len(line))
        print(line)

# Hand-written policy used to try out print_politica.
policy = {
     (0, 0): 'R',
     (0, 1): 'R',
     (0, 2): 'R',
     (1, 0): 'U',
     (1, 2): 'R',
     (2, 0): 'U',
     (2, 1): 'R',
     (2, 2): 'U',
     (2, 3): 'L'
}

# Sample values used to try out print_value.
V = {
     (0, 0): -1,
     (0, 1): 2,
     (0, 2): 4,
     (1, 0): 7,
     (1, 2): 0.2,
     (2, 0): 4,
     (2, 1): 1,
     (2, 2): 9,
     (2, 3): 0.5
}

# Build the grid before it is first used: without this line the calls below
# (and any later definition that reads `grid`) raise NameError on a fresh
# kernel, because nothing earlier in the notebook assigns `grid`.
grid = standard_grid()

print_politica(policy, grid)

print_grid(grid)

print_value(V, grid)

def politica_random(grid):
    """
    Build a random policy for `grid`.

    Every state in ``grid.all_positions`` that is not an end state gets one
    action drawn with ``np.random.choice`` from ``grid.allowed_actions``.
    Returns a dict mapping state -> action.
    """
    terminal_states = grid.end_states
    actions = grid.allowed_actions
    return {
        state: np.random.choice(actions)
        for state in grid.all_positions
        if state not in terminal_states
    }

def epsilon_action(a, eps=0.1, grid=None):
    """
    Return action `a` with probability 1 - eps; otherwise return an action
    drawn at random from ``grid.allowed_actions`` (which may again be `a`).

    Args:
        a: the action proposed by the policy.
        eps: probability of replacing `a` with a random action.
        grid: grid whose ``allowed_actions`` are sampled. When omitted, the
            notebook-level variable ``grid`` is looked up at call time.

    Raises:
        NameError: if `grid` is omitted and no notebook-level ``grid`` exists.
    """
    p = np.random.random()
    if p < (1 - eps):
        return a

    if grid is None:
        # Resolve the default when the function is called, not when it is
        # defined: a `grid=grid` default needs `grid` to exist already when
        # this cell runs and keeps pointing at that object after `grid` is
        # reassigned.
        try:
            grid = globals()["grid"]
        except KeyError:
            raise NameError(
                "epsilon_action needs a grid: pass grid=... or define a "
                "notebook-level variable named 'grid'"
            ) from None
    return np.random.choice(grid.allowed_actions)


def td_game(grid, policy):
    """
    Play one episode on `grid` following `policy` and return the trajectory.

    Similar to the Monte Carlo game, but here the randomness comes from
    epsilon-greedy action selection instead of from a random start position:
    the episode always starts at ``grid.start_position``.

    Args:
        grid: environment to play on; its position is reset to the start.
        policy: mapping state -> action; must contain every state visited
            before the episode ends.

    Returns:
        List of (state, reward) pairs. The first entry is the start state
        with reward 0; each later entry is the state reached after a move and
        the reward ``grid.rewards`` assigns to it (0 if it has none).
    """
    s = grid.start_position
    grid.new_position(s)
    states_and_rewards = [(s, 0)]
    while not grid.check_if_game_over():
        # Sample exploratory actions from the grid being played rather than
        # from whatever grid epsilon_action's default argument refers to.
        a = epsilon_action(policy[s], grid=grid)
        # NOTE(review): grid.move is called with its own default epsilon; in
        # the gridworld class of this notebook that is 0.05, which adds a
        # second layer of random actions on top of epsilon_action. Confirm
        # that this is intended.
        grid.move(a)
        s = grid.current_position()
        r = grid.rewards.get(s, 0)
        states_and_rewards.append((s, r))
    return states_and_rewards

# TD(0) evaluation of a fixed policy on the standard grid.
SEED = 42            # seed for NumPy's global RNG, so re-runs print the same values
N_EPISODES = 100000  # number of episodes played
gamma = 0.9          # discount factor
lr = 0.1             # learning rate (step size of the TD update)

# The episodes are stochastic (epsilon-greedy actions), so fix the seed.
np.random.seed(SEED)

grid = standard_grid()

# policy to evaluate
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'U',
    (2, 1): 'R',
    (2, 2): 'U',
    (2, 3): 'L',
}

# Start every state (end states included) at value 0.
states = grid.all_positions
V = {s: 0 for s in states}

for i in range(N_EPISODES):

    states_and_rewards = td_game(grid, policy)
    # Walk over consecutive (state, next state) pairs of the episode and move
    # V[s] towards the one-step target r + gamma * V[s2].
    for t in range(len(states_and_rewards) - 1):
        s, _ = states_and_rewards[t]
        s2, r = states_and_rewards[t+1]
        V[s] += lr * (r + gamma * V[s2] - V[s])

print("values:")
print_value(V, grid)
print("policy:")
print_politica(policy, grid)

----------------
 R | R | R |   |
----------------
 U |   | R |   |
----------------
 U | R | U | L |
--------------------------------
 (0,0) | (0,1) | (0,2) | (0,3) |
--------------------------------
 (1,0) | (1,1) | (1,2) | (1,3) |
--------------------------------
 (2,0) | (2,1) | (2,2) | (2,3) |
----------------------------
-1.00 | 2.00 | 4.00 | 0.00 |
----------------------------
 7.00 | 0.00 | 0.20 | 0.00 |
----------------------------
 4.00 | 1.00 | 9.00 | 0.50 |
values:
----------------------------
 0.76 | 0.85 | 0.98 | 0.00 |
----------------------------
 0.68 | 0.00 | 0.76 | 0.00 |
----------------------------
 0.60 | 0.59 | 0.63 | 0.57 |
policy:
----------------
 R | R | R |   |
----------------
 U |   | U |   |
----------------
 U | R | U | L |
