In [1]:
import numpy as np

In [2]:
class Cell:
    """A single square of the grid world.

    Instance fields:
      i, j    -- row and column index of the cell (default None)
      value   -- current value estimate, initialised to 0
      actions -- dict mapping an action key to the reward stored for it
      policy  -- action key currently selected for the cell (None until
                 `set_actions` is called)
    """

    # Coordinate transform applied by each movement key; any other key
    # (for example the '.' "stay" action) leaves the coordinates unchanged.
    _MOVES = {
        'r': lambda row, col: (row, col + 1),
        'l': lambda row, col: (row, col - 1),
        'u': lambda row, col: (row - 1, col),
        'd': lambda row, col: (row + 1, col),
    }

    def __init__(self, i=None, j=None):
        self.i = i
        self.j = j
        self.value = 0
        self.policy = None
        self.actions = {}

    def __repr__(self):
        template = "location: ({i},{j}), value: {v}, policy: {p}, actions: {a}"
        return template.format(i=self.i, j=self.j, v=self.value,
                               p=self.policy, a=self.actions)

    def set_actions(self, actions):
        """Store `actions` (action key -> reward) and select its first key as the policy."""
        self.actions = actions
        keys = list(actions.keys())
        self.policy = keys[0]

    def next_state(self, action):
        """Return the (row, column) reached by taking `action` from this cell.

        No bounds checking is done here; unknown action keys map to the
        cell's own coordinates.
        """
        move = self._MOVES.get(action)
        if move is None:
            return self.i, self.j
        return move(self.i, self.j)

In [3]:
def new_grid(n=10, m=10):
    """Build an n-by-m object array of `Cell`s set up as a grid world.

    Interior cells get the four moves 'r', 'l', 'd', 'u'; cells on an edge
    or in a corner get the subset that omits the moves pointing off that
    edge. Every move starts with reward 0. The top-right cell only has the
    stay-in-place action '.', and the two moves leading into it ('r' from
    the cell to its left, 'u' from the cell below it) carry reward 100.

    Returns the numpy array of cells.
    """
    grid = np.array([[Cell(r, c) for c in range(m)] for r in range(n)])
    inner = slice(1, -1)

    def assign(cells, keys):
        # A fresh dict per cell, so the reward edits below touch one cell only.
        # The order of `keys` is the dict order, and its first key becomes
        # the cell's initial policy.
        for cell in cells:
            cell.set_actions(dict.fromkeys(keys, 0))

    # Interior, then the four edges (corners excluded).
    assign(grid[inner, inner].flat, 'rldu')
    assign(grid[inner, 0], 'rdu')
    assign(grid[inner, -1], 'ldu')
    assign(grid[0, inner], 'rdl')
    assign(grid[-1, inner], 'rul')

    # Corners; the top-right one can only stay in that cell.
    assign([grid[0, 0]], 'rd')
    assign([grid[-1, 0]], 'ru')
    assign([grid[-1, -1]], 'lu')
    assign([grid[0, -1]], '.')

    # Reward the two moves that enter the top-right cell.
    for (r, c), key in (((0, -2), 'r'), ((1, -1), 'u')):
        grid[r, c].actions[key] = 100

    return grid

In [11]:
# Discount factor: multiplies the successor cell's value in every backup.
γ = 0.8
# Convergence threshold: a sweep loop stops once the largest value change Δ is below it.
θ = 1

In [12]:
# Policy iteration: alternate policy evaluation and greedy policy improvement
# until an improvement pass leaves every cell's policy unchanged.
grid = new_grid()

it = 0  # number of evaluation + improvement rounds

policy_stable = False
while not policy_stable:
    it += 1

    # Policy evaluation: in-place sweeps under the current policy until the
    # largest change seen in one sweep falls below θ.
    converged = False
    while not converged:
        Δ = 0
        for x in grid.flat:  # row-major order over all cells
            previous = x.value
            chosen = x.policy
            successor = grid[x.next_state(chosen)]
            x.value = x.actions[chosen] + γ * successor.value
            Δ = max(Δ, np.abs(previous - x.value))
        converged = Δ < θ

    # Policy improvement: give each cell the first action (in dict order)
    # with the highest one-step return.
    policy_stable = True
    for x in grid.flat:
        returns = {
            a: x.actions[a] + γ * grid[x.next_state(a)].value
            for a in x.actions
        }
        val = -1e10
        for a, q in returns.items():
            if q > val:
                val, pol = q, a
        if x.policy != pol:
            x.policy = pol
            policy_stable = False

it

18

In [13]:
grid[0]

array([ location: (0,0), value: 16.777216000000006, policy: r, actions: {'d': 0, 'r': 0},
       location: (0,1), value: 20.97152000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,2), value: 26.21440000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,3), value: 32.76800000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,4), value: 40.96000000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,5), value: 51.2, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,6), value: 64.0, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,7), value: 80.0, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,8), value: 100.0, policy: r, actions: {'l': 0, 'd': 0, 'r': 100},
       location: (0,9), value: 0.0, policy: ., actions: {'.': 0}], dtype=object)

In [9]:
# Value iteration: sweep the grid, raising each cell's value to the best
# one-step return found, until the largest change in a sweep is below θ.
grid = new_grid()

it = 0  # number of sweeps performed

while True:

    it += 1
    Δ = 0

    for row in grid:
        for x in row:
            val = x.value  # value before this cell's update, used for Δ
            for a in x.actions:
                tmp = x.actions[a] + γ * grid[x.next_state(a)].value
                # NOTE(review): a value is only ever raised here, never
                # lowered, so the update relies on values starting at 0 and
                # on the rewards set by new_grid being non-negative.
                if tmp > x.value:
                    x.value = tmp
                    x.policy = a

            Δ = max(Δ, np.abs(val - x.value))
    if Δ < θ:
        break

# Display the sweep counter. The original displayed `iter`, which on a fresh
# kernel is the Python builtin rather than the counter `it`.
it

10

In [10]:
grid[0]

array([ location: (0,0), value: 16.777216000000006, policy: r, actions: {'d': 0, 'r': 0},
       location: (0,1), value: 20.97152000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,2), value: 26.21440000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,3), value: 32.76800000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,4), value: 40.96000000000001, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,5), value: 51.2, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,6), value: 64.0, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,7), value: 80.0, policy: r, actions: {'l': 0, 'd': 0, 'r': 0},
       location: (0,8), value: 100.0, policy: r, actions: {'l': 0, 'd': 0, 'r': 100},
       location: (0,9), value: 0, policy: ., actions: {'.': 0}], dtype=object)