# Monte Carlo Prediction
## 1. First-visit MC

In [1]:
import grid_world
from grid_world import *
import numpy as np

In [9]:
# Notebook-wide constants and the environment instance shared by the cells below.
# NOTE(review): SMALL_ENOUGH is not referenced in the cells visible here —
# presumably a convergence threshold carried over from another notebook; confirm.
SMALL_ENOUGH = 1e-3
# Discount factor: play_game multiplies the accumulated return by this at every step.
GAMMA = 0.9
# NOTE(review): ALL_POSSIBLE_ACTIONS is not referenced in the cells visible here;
# the letters match the action codes used in `policy` ('U', 'R', ...).
ALL_POSSIBLE_ACTIONS = ('U', 'D', 'L', 'R')

# standard_grid is not defined in this notebook — presumably it comes from the
# `from grid_world import *` star-import; verify.
grid = standard_grid()

# Show the reward attached to each cell of the grid.
print("rewards:")
print_values(grid.rewards, grid)

rewards:
---------------------------
 0.00| 0.00| 0.00| 1.00|
---------------------------
 0.00| 0.00| 0.00|-1.00|
---------------------------
 0.00| 0.00| 0.00| 0.00|


In [10]:
# Fixed, deterministic policy to be evaluated: (row, col) state -> action letter.
# Cells without an entry (shown blank in the printed grid) have no action assigned.
policy = {
    (2, 0): 'U',
    (1, 0): 'U',
    (0, 0): 'R',
    (0, 1): 'R',
    (0, 2): 'R',
    (1, 2): 'R',
    (2, 1): 'R',
    (2, 2): 'R',
    (2, 3): 'U',
}

# Render the policy as a grid of action letters.
print("Policy:")
print_policy(policy, grid)

Policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


In [11]:
def play_game(grid, policy, gamma=None):
    """Play one episode under `policy` from a random start and compute returns.

    The start state is drawn uniformly from the keys of ``grid.actions`` so that
    every state with available actions can be the first state of an episode.
    The episode then follows `policy` until ``grid.game_over()`` is true.

    Args:
        grid: environment object exposing ``actions`` (dict keyed by state),
            ``set_state(s)``, ``current_state()``, ``game_over()`` and
            ``move(action)``, where ``move`` returns the reward of that step.
        policy: mapping from state to action; it must contain every state the
            episode passes through before the game ends (``KeyError`` otherwise).
        gamma: discount factor applied to future rewards. ``None`` (the default)
            uses the module-level ``GAMMA``, which is what callers that pass
            only ``grid`` and ``policy`` get.

    Returns:
        A list of ``(state, G)`` tuples in the order the states were visited,
        where ``G`` is the discounted return collected after leaving ``state``.
        The final state of the episode is not included.

    Note:
        The loop only ends when ``grid.game_over()`` becomes true, so a policy
        that cycles without ever ending the game will never return.
    """
    if gamma is None:
        gamma = GAMMA  # notebook-wide default discount factor

    # Random start: pick uniformly among the states that have actions defined.
    start_states = list(grid.actions.keys())
    start_idx = np.random.choice(len(start_states))
    grid.set_state(start_states[start_idx])

    # Roll the episode forward, recording each state with the reward received
    # on arriving there (the start state has no arrival reward, hence 0).
    s = grid.current_state()
    states_and_rewards = [(s, 0)]  # list of tuples of (state, reward)
    while not grid.game_over():
        a = policy[s]
        r = grid.move(a)
        s = grid.current_state()
        states_and_rewards.append((s, r))

    # The return of the state visited at step t depends on the rewards received
    # from step t+1 onwards, so pair every state except the last with the reward
    # of the *following* entry. The last state of the episode gets no return.
    visited = [state for state, _ in states_and_rewards[:-1]]
    rewards = [reward for _, reward in states_and_rewards[1:]]

    # Walk backwards so each return is built from the one after it:
    # G_t = r_{t+1} + gamma * G_{t+1}.
    G = 0
    states_and_returns = []
    for state, reward in zip(reversed(visited), reversed(rewards)):
        G = reward + gamma * G
        states_and_returns.append((state, G))
    states_and_returns.reverse()  # we want it to be in order of state visited
    return states_and_returns

In [23]:
# Set up the value table V and the per-state history of sampled returns.
V = {}
returns = {}  # state -> list of every return recorded for that state so far
states = grid.all_states()
for s in states:
    if s not in grid.actions:
        # No actions available here (terminal or otherwise unreachable state):
        # it is never evaluated, so its value is fixed at 0.
        V[s] = 0
        continue
    returns[s] = []

# Play a single episode from a random start, just to show what one looks like.
states_and_returns = play_game(grid, policy)
print("從隨機開始點，玩一次遊戲：")
print(states_and_returns)

# First-visit Monte Carlo policy evaluation over 100 sampled episodes.
for t in range(100):
    states_and_returns = play_game(grid, policy)
    seen_states = set()  # states already credited during this episode
    for s, G in states_and_returns:
        if s in seen_states:
            # Only the first occurrence of a state within an episode counts.
            continue
        returns[s].append(G)
        V[s] = np.mean(returns[s])  # estimate = average of all returns recorded so far
        seen_states.add(s)

print("\nvalues:")
print_values(V, grid)
print("\npolicy:")
print_policy(policy, grid)

從隨機開始點，玩一次遊戲：
[((0, 0), 0.81), ((0, 1), 0.9), ((0, 2), 1.0)]

values:
---------------------------
 0.81| 0.90| 1.00| 0.00|
---------------------------
 0.73| 0.00|-1.00| 0.00|
---------------------------
 0.66|-0.81|-0.90|-1.00|

policy:
---------------------------
  R  |  R  |  R  |     |
---------------------------
  U  |     |  R  |     |
---------------------------
  U  |  R  |  R  |  U  |


### 筆記：
蒙地卡羅預測是在沒有完整環境資訊的情況下使用。    
First-visit MC prediction 是在固定的policy之下，找到各個state的value。    
1.先選定隨機起始點，遊玩一次地圖，走到終點算是遊戲結束，此為一個episode。   
2.從終點回推各state會得到的G值，把對應state和return(G值)記錄下來。   
3.將這個episode中各state第一次出現時的return存起來。   
4.重複上述動作許多次，最後將每個state存起來的很多return值平均，就是value值。   