# Policy Evaluation (finding V(s) for every state s)
# (1) Fixed policy for GridWorld -> v(s) = r + γ·v(s')

## -Step 1: include grid_world 
>Grid Class含有的變數如下    
width、height：設定grid world長寬    
i、j：目前所在的位置，起始為start[0]和start[1]    
rewards、actions：dictionary型態，設定Grid world的格子狀態   

In [2]:
import grid_world
from grid_world import *

In [3]:
# "uniform random policy"使用的topology
# Build the 3x4 grid used by the "uniform random policy" example.
# Constructor arguments: width, height, start position.
grid = Grid(3, 4, (2, 0))

# Cells that hand out a reward when entered.
rewards = {
    (0, 3): 1,
    (1, 3): -1,
}

# Moves available from each cell (U/D/L/R); cells without an entry have none.
actions = {
    (0, 0): ('D', 'R'), (0, 1): ('L', 'R'), (0, 2): ('L', 'D', 'R'),
    (1, 0): ('U', 'D'),                     (1, 2): ('U', 'D', 'R'),
    (2, 0): ('U', 'R'), (2, 1): ('L', 'R'), (2, 2): ('L', 'R', 'U'), (2, 3): ('L', 'U'),
}

# Store the reward and action tables on the grid, then echo them back.
grid.set(rewards, actions)
print(f"Rewards= {grid.rewards}")
print(f"Actions= {grid.actions}")

Rewards= {(0, 3): 1, (1, 3): -1}
Actions= {(0, 0): ('D', 'R'), (0, 1): ('L', 'R'), (0, 2): ('L', 'D', 'R'), (1, 0): ('U', 'D'), (1, 2): ('U', 'D', 'R'), (2, 0): ('U', 'R'), (2, 1): ('L', 'R'), (2, 2): ('L', 'R', 'U'), (2, 3): ('L', 'U')}


In [4]:
# Randomly generate a fixed policy for the grid configured above.
# NOTE(review): list_random_action comes from grid_world; judging by the
# captured output it prints one chosen action per cell -- confirm in grid_world.
list_random_action(grid)

---------------------------
  D  |  R  |  D  |     |
---------------------------
  D  |     |  U  |     |
---------------------------
  R  |  L  |  L  |  U  |


### 製造出3x4的Grid World，有固定的移動方向    

In [5]:
# Layout of the 3x4 grid world:
#   x      -> cell that cannot be entered
#   s      -> start position
#   number -> reward collected at that cell
# .  .  .  1
# .  x  . -1
# s  .  .  .
grid = Grid(3, 4, (2, 0))            # width, height, start position
rewards = {(0, 3): 1, (1, 3): -1}    # reward cells

# Fixed policy: exactly one action (a plain string) per cell.
actions = {
    (0, 0): 'R', (0, 1): 'R', (0, 2): 'R',
    (1, 0): 'U',              (1, 2): 'R',
    (2, 0): 'U', (2, 1): 'R', (2, 2): 'U', (2, 3): 'U',
}

grid.set(rewards, actions)           # store rewards and actions on the grid
states = grid.all_states()

print(f"Rewards= {grid.rewards}")
print(f"Actions= {grid.actions}")
print(states)

# Draw the policy as a table; cells without an action are left blank.
for row in range(grid.width):
    print("---------------------------")
    cells = [
        f"  {grid.actions.get((row, col), ' ')}  |"
        for col in range(grid.height)
    ]
    print("".join(cells))

Rewards= {(0, 3): 1, (1, 3): -1}
Actions= {(0, 0): 'R', (0, 1): 'R', (0, 2): 'R', (1, 0): 'U', (1, 2): 'R', (2, 0): 'U', (2, 1): 'R', (2, 2): 'U', (2, 3): 'U'}
{(0, 1), (1, 2), (0, 0), (1, 3), (2, 1), (2, 0), (2, 3), (2, 2), (1, 0), (0, 2), (0, 3)}
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  U  |  U  |


## -Step 2: initialization

In [6]:
# Initialise the evaluation parameters and the value table.
gamma = 0.8             # discount factor
small_enough = 1e-3     # convergence threshold on the largest per-sweep change
v = {s: 0 for s in states}   # dict mapping every state to its value, starting at 0

## -Step 3: Main function

In [7]:
import numpy as np

# Iterative policy evaluation for the fixed policy: v(s) = r + gamma * v(s').
# Keep sweeping over all states until the largest update in a sweep drops
# below `small_enough`.
converged = False
while not converged:
    biggest_change = 0.0
    for s in states:
        previous = v[s]
        if s in grid.actions:
            grid.set_state(s)                       # place the agent on s
            reward = grid.move(grid.actions[s])     # take the policy's action; returns the reward
            new_v = reward + gamma * v[grid.current_state()]
        else:
            new_v = 0                               # no action defined for s: value stays 0
        v[s] = round(new_v, 5)
        biggest_change = max(biggest_change, np.abs(previous - new_v))
    converged = biggest_change < small_enough

print(v)
print("\nGrid World V[s]如下(gamma值為0.8)：")
print_values(v, grid)

{(0, 1): 0.8, (1, 2): -1.0, (0, 0): 0.64, (1, 3): 0, (2, 1): -0.64, (2, 0): 0.4096, (2, 3): -1.0, (2, 2): -0.8, (1, 0): 0.512, (0, 2): 1.0, (0, 3): 0}

Grid World V[s]如下(gamma值為0.8)：
---------------------------
 0.64| 0.80| 1.00| 0.00|
---------------------------
 0.51| 0.00|-1.00| 0.00|
---------------------------
 0.41|-0.64|-0.80|-1.00|


# (2) Uniformly random policy for GridWorld -> v(s) = Σ_a π(a|s) · {r + γ·v(s')}
## -Step 1:Set Grid world 

In [8]:
grid = Grid(3, 4, (2, 0))            # width, height, start position
rewards = {(0, 3): 1, (1, 3): -1}    # reward cells
actions = {                          # moves available from each cell
    (0, 0): ('D', 'R'),
    (0, 1): ('L', 'R'),
    (0, 2): ('L', 'D', 'R'),
    (1, 0): ('U', 'D'),
    (1, 2): ('U', 'D', 'R'),
    (2, 0): ('U', 'R'),
    (2, 1): ('L', 'R'),
    (2, 2): ('L', 'R', 'U'),
    (2, 3): ('L', 'U'),
}

# Probability of each available move under the uniform random policy.
# Derived from `actions` so the two tables cannot drift apart.
prob = {cell: 1 / len(moves) for cell, moves in actions.items()}

grid.set(rewards, actions)           # store rewards and actions on the grid
states = grid.all_states()

print('Rewards=',grid.rewards)
print('Actions=',grid.actions)
print(states)

# Draw the available moves as a table. Each cell is the comma-joined move list
# centred in 5 characters, so any number of moves keeps its column instead of
# being silently skipped; cells without actions are left blank.
for i in range(grid.width):
    print("---------------------------")
    for j in range(grid.height):
        a = grid.actions.get((i, j), ' ')
        print(','.join(a).center(5) + "|", end="")
    print("")

Rewards= {(0, 3): 1, (1, 3): -1}
Actions= {(0, 0): ('D', 'R'), (0, 1): ('L', 'R'), (0, 2): ('L', 'D', 'R'), (1, 0): ('U', 'D'), (1, 2): ('U', 'D', 'R'), (2, 0): ('U', 'R'), (2, 1): ('L', 'R'), (2, 2): ('L', 'R', 'U'), (2, 3): ('L', 'U')}
{(0, 1), (1, 2), (0, 0), (1, 3), (2, 1), (2, 0), (2, 3), (2, 2), (1, 0), (0, 2), (0, 3)}
---------------------------
 D,R | L,R |L,D,R|     |
---------------------------
 U,D |     |U,D,R|     |
---------------------------
 U,R | L,R |L,R,U| L,U |


## -Step 2:Calculate V[s] 

In [9]:
# Iterative policy evaluation for the uniformly random policy:
#   v(s) = sum over a of pi(a|s) * (r + gamma * v(s')),  pi(a|s) = 1 / (number of moves in s)
v = {s: 0 for s in states}   # value table, every state starts at 0
gamma = 0.9                  # discount factor
small_enough = 1e-3          # convergence threshold on the largest per-sweep change

while True:
    biggest_change = 0.0
    for s in states:
        old_v = v[s]
        new_v = 0                                  # states without actions keep value 0
        if s in grid.actions:
            moves = grid.actions[s]
            for a in moves:
                grid.set_state(s)                  # place the agent back on s before each move
                r = grid.move(a)                   # take the move; returns the reward
                # Each move carries equal weight 1 / len(moves).
                new_v += (r + gamma * v[grid.current_state()]) / len(moves)
        v[s] = round(new_v, 5)
        # Builtin abs() is enough for scalars and avoids depending on a
        # numpy import made in an earlier cell.
        biggest_change = max(biggest_change, abs(old_v - new_v))
    if biggest_change < small_enough:
        break

print(v)
# The label takes gamma from the variable so it cannot go stale.
print(f"\nGrid World V[s]如下(gamma值為{gamma})：")
print_values(v, grid)

{(0, 1): 0.14579, (1, 2): -0.36542, (0, 0): 0.05535, (1, 3): 0, (2, 1): -0.2167, (2, 0): -0.10777, (2, 3): -0.66863, (2, 2): -0.37522, (1, 0): -0.02359, (0, 2): 0.26744, (0, 3): 0}

Grid World V[s]如下(gamma值為0.9)：
---------------------------
 0.06| 0.15| 0.27| 0.00|
---------------------------
-0.02| 0.00|-0.37| 0.00|
---------------------------
-0.11|-0.22|-0.38|-0.67|


# (3) Change the topology to a 5x6 grid: fixed policy and uniformly random policy
## 1. Fixed policy

In [10]:
# 5x6 grid world with a fixed policy (one action string per cell).
grid = Grid(5, 6, (0, 0))            # width, height, start position
rewards = {(3, 4): 1, (2, 5): -1}    # reward cells

# Cells missing from this table have no action defined.
actions = {
    (0, 0): 'D', (0, 1): 'R', (0, 2): 'R', (0, 3): 'D', (0, 4): 'R', (0, 5): 'D',
    (1, 0): 'D',              (1, 2): 'R', (1, 3): 'R', (1, 4): 'D', (1, 5): 'D',
    (2, 0): 'R', (2, 1): 'R', (2, 2): 'U', (2, 3): 'U', (2, 4): 'D',
    (3, 0): 'D', (3, 1): 'L', (3, 2): 'U',                           (3, 5): 'U',
    (4, 0): 'R', (4, 1): 'R', (4, 2): 'R', (4, 3): 'R', (4, 4): 'R', (4, 5): 'U',
}

grid.set(rewards, actions)           # store rewards and actions on the grid
states = grid.all_states()

print(f"Rewards= {grid.rewards}")
print(f"Actions= {grid.actions}")
print(states)

# Draw the policy as a table; cells without an action are left blank.
for row in range(grid.width):
    print("------------------------------------")
    cells = [
        f"  {grid.actions.get((row, col), ' ')}  |"
        for col in range(grid.height)
    ]
    print("".join(cells))

Rewards= {(3, 4): 1, (2, 5): -1}
Actions= {(0, 0): 'D', (0, 1): 'R', (0, 2): 'R', (0, 3): 'D', (0, 4): 'R', (0, 5): 'D', (1, 0): 'D', (1, 2): 'R', (1, 3): 'R', (1, 4): 'D', (1, 5): 'D', (2, 0): 'R', (2, 1): 'R', (2, 2): 'U', (2, 3): 'U', (2, 4): 'D', (3, 0): 'D', (3, 1): 'L', (3, 2): 'U', (3, 5): 'U', (4, 0): 'R', (4, 1): 'R', (4, 2): 'R', (4, 3): 'R', (4, 4): 'R', (4, 5): 'U'}
{(3, 2), (1, 3), (0, 0), (3, 0), (4, 5), (2, 1), (1, 4), (0, 5), (2, 3), (4, 2), (0, 3), (1, 0), (2, 5), (2, 4), (4, 0), (1, 2), (3, 5), (0, 1), (4, 4), (3, 1), (1, 5), (2, 0), (0, 4), (4, 3), (2, 2), (3, 4), (4, 1), (0, 2)}
------------------------------------
  D  |  R  |  R  |  D  |  R  |  D  |
------------------------------------
  D  |     |  R  |  R  |  D  |  D  |
------------------------------------
  R  |  R  |  U  |  U  |  D  |     |
------------------------------------
  D  |  L  |  U  |     |     |  U  |
------------------------------------
  R  |  R  |  R  |  R  |  R  |  U  |


In [11]:
# Iterative policy evaluation for the fixed 5x6 policy: v(s) = r + gamma * v(s').
v = {s: 0 for s in states}   # value table, every state starts at 0
gamma = 0.8                  # discount factor
small_enough = 1e-3          # convergence threshold on the largest per-sweep change

while True:
    biggest_change = 0.0
    for s in states:                     # every state returned by grid.all_states()
        old_v = v[s]
        new_v = 0                        # states without an action keep value 0
        if s in grid.actions:
            a = grid.actions[s]          # the single move the policy prescribes for s
            grid.set_state(s)            # place the agent on s
            r = grid.move(a)             # take the move; returns the reward
            new_v = r + gamma * v[grid.current_state()]
        v[s] = round(new_v, 5)
        # Builtin abs() is enough for scalars and avoids depending on a
        # numpy import made in an earlier cell.
        biggest_change = max(biggest_change, abs(old_v - new_v))
    if biggest_change < small_enough:
        break

print(v)
# The label takes gamma from the variable so it cannot go stale.
print(f"\nGrid World V[s]如下(gamma值為{gamma})：")
print_values(v, grid)

{(3, 2): 0.32768, (1, 3): 0.64, (0, 0): 0.16777, (3, 0): -0.20971, (4, 5): -0.8, (2, 1): 0.32768, (1, 4): 0.8, (0, 5): -0.8, (2, 3): 0.512, (4, 2): -0.4096, (0, 3): 0.512, (1, 0): 0.20971, (2, 5): 0, (2, 4): 1.0, (4, 0): -0.26214, (1, 2): 0.512, (3, 5): -1.0, (0, 1): 0.32768, (4, 4): -0.64, (3, 1): -0.16777, (1, 5): -1.0, (2, 0): 0.26214, (0, 4): -0.64, (4, 3): -0.512, (2, 2): 0.4096, (3, 4): 0, (4, 1): -0.32768, (0, 2): 0.4096}

Grid World V[s]如下(gamma值為0.8)：
---------------------------
 0.17| 0.33| 0.41| 0.51|-0.64|-0.80|
---------------------------
 0.21| 0.00| 0.51| 0.64| 0.80|-1.00|
---------------------------
 0.26| 0.33| 0.41| 0.51| 1.00| 0.00|
---------------------------
-0.21|-0.17| 0.33| 0.00| 0.00|-1.00|
---------------------------
-0.26|-0.33|-0.41|-0.51|-0.64|-0.80|


## 2. Random policy

In [12]:
grid = Grid(5, 6, (0, 0))            # width, height, start position
rewards = {(3, 4): 1, (2, 5): -1}    # reward cells
# Moves available from each cell. (1, 1), (3, 3) and the two reward cells
# (3, 4) and (2, 5) have no entry, so no moves are defined for them.
actions = {
    (0, 0): ('D','R'),     (0, 1): ('R','L'),         (0, 2): ('D','R','L'),     (0, 3): ('D','R','L'),     (0, 4): ('D','R','L'),     (0, 5): ('D','L'),
    (1, 0): ('D','U'),                                (1, 2): ('D','R','U'),     (1, 3): ('D','R','L','U'), (1, 4): ('D','R','L','U'), (1, 5): ('D','L','U'),
    (2, 0): ('D','R','U'), (2, 1): ('D','R','L'),     (2, 2): ('D','R','L','U'), (2, 3): ('R','L','U'),     (2, 4): ('D','R','L','U'),
    (3, 0): ('D','R','U'), (3, 1): ('D','R','L','U'), (3, 2): ('D','L','U'),                                                           (3, 5): ('D','L','U'),
    (4, 0): ('R','U'),     (4, 1): ('R','L','U'),     (4, 2): ('R','L','U'),     (4, 3): ('R','L'),         (4, 4): ('R','L','U'),     (4, 5): ('L','U'),
}

grid.set(rewards, actions)           # store rewards and actions on the grid
states = grid.all_states()

print('Rewards=',grid.rewards)
print(states)

# Draw the available moves as a table. Each cell is the comma-joined move list
# centred in 7 characters, so any number of moves keeps its column instead of
# being silently skipped; cells without actions are left blank.
for i in range(grid.width):
    print("------------------------------------------------")
    for j in range(grid.height):
        a = grid.actions.get((i, j), ' ')
        print(','.join(a).center(7) + "|", end="")
    print("")

Rewards= {(3, 4): 1, (2, 5): -1}
{(3, 2), (1, 3), (0, 0), (3, 0), (4, 5), (2, 1), (1, 4), (0, 5), (2, 3), (4, 2), (0, 3), (1, 0), (2, 5), (2, 4), (4, 0), (1, 2), (3, 5), (0, 1), (4, 4), (3, 1), (1, 5), (2, 0), (0, 4), (4, 3), (2, 2), (3, 4), (4, 1), (0, 2)}
------------------------------------------------
  D,R  |  R,L  | D,R,L | D,R,L | D,R,L |  D,L  |
------------------------------------------------
  D,U  |       | D,R,U |D,R,L,U|D,R,L,U| D,L,U |
------------------------------------------------
 D,R,U | D,R,L |D,R,L,U| R,L,U |D,R,L,U|       |
------------------------------------------------
 D,R,U |D,R,L,U| D,L,U |       |       | D,L,U |
------------------------------------------------
  R,U  | R,L,U | R,L,U |  R,L  | R,L,U |  L,U  |


In [13]:
# Iterative policy evaluation for the uniformly random policy on the 5x6 grid:
#   v(s) = sum over a of pi(a|s) * (r + gamma * v(s')),  pi(a|s) = 1 / (number of moves in s)
v = {s: 0 for s in states}   # value table, every state starts at 0
gamma = 1.0                  # discount factor (undiscounted)
small_enough = 1e-3          # convergence threshold on the largest per-sweep change

while True:
    biggest_change = 0.0
    for s in states:                               # every state returned by grid.all_states()
        old_v = v[s]
        new_v = 0                                  # states without actions keep value 0
        if s in grid.actions:
            moves = grid.actions[s]
            for a in moves:
                grid.set_state(s)                  # place the agent back on s before each move
                r = grid.move(a)                   # take the move; returns the reward
                # Each move carries equal weight 1 / len(moves).
                new_v += (r + gamma * v[grid.current_state()]) / len(moves)
        v[s] = round(new_v, 5)
        # Builtin abs() is enough for scalars and avoids depending on a
        # numpy import made in an earlier cell.
        biggest_change = max(biggest_change, abs(old_v - new_v))
    if biggest_change < small_enough:
        break

print(v)
# The label takes gamma from the variable so it cannot go stale.
print(f"\nGrid World V[s]如下(gamma值為{gamma})：")
print_values(v, grid)

{(3, 2): 0.07434, (1, 3): -0.18667, (0, 0): -0.06428, (3, 0): 0.05633, (4, 5): 0.34616, (2, 1): 0.01619, (1, 4): -0.29878, (0, 5): -0.45358, (2, 3): -0.10746, (4, 2): 0.19101, (0, 3): -0.21973, (1, 0): -0.02363, (2, 5): 0, (2, 4): -0.10156, (4, 0): 0.08541, (1, 2): -0.1234, (3, 5): 0.11539, (0, 1): -0.10669, (4, 4): 0.57679, (3, 1): 0.06534, (1, 5): -0.58412, (2, 0): 0.0163, (0, 4): -0.32403, (4, 3): 0.3839, (2, 2): -0.03508, (3, 4): 0, (4, 1): 0.11392, (0, 2): -0.14994}

Grid World V[s]如下(gamma值為1.0)：
---------------------------
-0.06|-0.11|-0.15|-0.22|-0.32|-0.45|
---------------------------
-0.02| 0.00|-0.12|-0.19|-0.30|-0.58|
---------------------------
 0.02| 0.02|-0.04|-0.11|-0.10| 0.00|
---------------------------
 0.06| 0.07| 0.07| 0.00| 0.00| 0.12|
---------------------------
 0.09| 0.11| 0.19| 0.38| 0.58| 0.35|
