## First-Visit Monte Carlo Prediction: Estimating the State-Value Function

In [1]:
import sys
import os

sys.path.append(os.path.dirname(os.getcwd()))

from gridworld import GridWorld,plot
import numpy as np

In [13]:
# Sample a single episode with size=4 and display its return dictionary.
# NOTE(review): generate_episode and Pi are only defined in cells further down
# in this notebook, so this cell fails on a fresh "Restart & Run All" — confirm
# the intended cell order.
generate_episode(Pi,4)

{(1, 3): -31,
 (0, 3): -29,
 (0, 2): -28,
 (0, 1): -26,
 (1, 1): -24,
 (1, 2): -18,
 (2, 2): -11,
 (2, 1): -10,
 (3, 1): -9,
 (3, 2): -7,
 (3, 0): -4,
 (2, 0): -3,
 (1, 0): -2}

In [19]:

def generate_episode(Pi,size):
    '''
    Play one episode under policy Pi and return the first-visit returns.

    Parameters
    ----------
    Pi : dict
        Maps a state tuple to a sequence of action probabilities. The index
        of each probability is the action handed to ``env.step``.
    size : int
        Passed to ``GridWorld(size)`` to build a fresh environment.

    Returns
    -------
    dict
        Maps every state the agent was in during the episode to the
        undiscounted sum of the rewards received from the first time it was
        in that state until the episode ended. Keys are ordered by first
        visit.

    Raises
    ------
    KeyError
        If the agent is in a state that has no entry in ``Pi``.
    '''
    env = GridWorld(size)
    done = False
    G = {} # running return for every state visited so far

    while not done:
        # snapshot the state before acting; tuple() makes it hashable
        current_state = tuple(env.agent_position)

        # sample an action index from the policy's distribution for this state
        action_prob = Pi[current_state]
        action = np.random.choice(range(len(action_prob)),p = action_prob)
        # only the reward and the termination flag are used here
        _, reward, done, _ = env.step(action)

        # a state starts accumulating at its first visit; a repeat visit must
        # not reset the return it has collected so far
        G.setdefault(current_state, 0)
        # this reward counts towards every state seen so far, including the
        # current one (values are updated in place, keys are not changed)
        for state in G:
            G[state] += reward

    return G

def MC_prediction(Pi,size,iter_num):
    '''
    Perform first-visit Monte Carlo prediction of the state-value function.

    Parameters
    ----------
    Pi : dict
        Policy handed unchanged to ``generate_episode``.
    size : int
        Grid side length; values are tracked for every ``(i, j)`` with
        ``0 <= i, j < size``.
    iter_num : int
        Number of episodes to sample.

    Returns
    -------
    V : dict
        Maps each state to the mean of its collected returns, rounded to one
        decimal place. States that never appeared in an episode keep 0.
    Returns : dict
        Maps each state to the list of returns collected for it, one entry
        per episode in which the state appeared.
    '''
    states = [(i,j) for i in range(size) for j in range(size)]
    V = dict.fromkeys(states, 0) # values start at 0
    Returns = {state: [] for state in states} # no returns collected yet

    for _ in range(iter_num):
        # generate an episode and record its return for every state it visited
        G = generate_episode(Pi,size)
        for seen_state, episode_return in G.items():
            Returns[seen_state].append(episode_return)

    # V depends only on the final contents of Returns, so average once here
    # instead of re-summing every list after every episode
    for state, state_returns in Returns.items():
        if state_returns:
            V[state] = round(sum(state_returns)/len(state_returns),1)
    return V,Returns

In [22]:
size = 5
V = dict.fromkeys([(i,j) for i in range(size) for j in range(size)], 0) # values as 0
# Uniform random policy: probability 0.25 for each of the four actions.
# The corners (0,0) and (size-1,size-1) get no entry (they are drawn as X in
# the plot output below; presumably the terminal states — confirm in gridworld).
# A comprehension gives every state its own list; dict.fromkeys would make all
# states share one list object, so editing one state's probabilities in place
# would silently change every state.
Pi = {(i,j): [0.25]*4 for i in range(size) for j in range(size) if i+j != 0 and i+j != (size-1)*2}
V_final,Returns = MC_prediction(Pi,size,1000)

In [23]:
# Display the estimated state values and the policy using the plot helper
# imported from the local gridworld module.
plot(V_final,Pi,size)

0	|-21.8	|-33.8	|-38.5	|-39.6	|
-23.1	|-30.3	|-36.2	|-38.5	|-36.8	|
-33.9	|-35.8	|-37.0	|-34.0	|-31.7	|
-38.0	|-37.6	|-35.7	|-29.9	|-20.2	|
-39.4	|-38.3	|-33.0	|-20.7	|0	|
-------------------------------
X	|↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|
↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|
↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|
↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|
↑←↓→	|↑←↓→	|↑←↓→	|↑←↓→	|X	|
 
