In [2]:
import numpy as np

In [3]:
actions = ['up', 'down', 'left', 'right']
row = 3
column = 4
start_state = (2, 0)
win_state = (0, 3)
lose_state = (1, 3)
wall = (1, 1)
policy = np.random.choice(actions, (row, column))
v_s = np.zeros((row, column))
tetha = 1e-5
gamma = 0.9
epsilon = 0.1

In [4]:
def move(a, r, c):
    current_state = (r, c)
    if current_state == wall:
        return current_state
    new_r, new_c = r, c
    if a == 'up':
        new_r = max(r-1, 0)
    elif a == 'down':
        new_r = min(r+1, row-1)
    elif a == 'right':
        new_c = min(c+1, column-1)
    elif a == 'left':
        new_c = max(c-1, 0)
    new_state = (new_r, new_c)
    if new_state == wall:
        return current_state
    else:
        return new_state

In [5]:
def end_game(state):
    return state == win_state or state == lose_state

In [6]:
def reward(state):
    if state == win_state:
        return 1
    elif state == lose_state:
        return -1
    return -0.04

In [7]:
def monte_carlo_policy_eva():
    global v_s
    returns_sum = np.zeros((row, column))
    returns_count = np.zeros((row, column))
    
    for _ in range(5000):
        state = start_state
        episode_states = []
        episode_rewards = []
        steps = 0
        
        while not end_game(state) and steps < 100:
            action = policy[state[0], state[1]]
            next_state = move(action, state[0], state[1])
            r = reward(next_state)

            episode_states.append(state)
            episode_rewards.append(r)
            state = next_state
            steps += 1
        
        G = 0
        visited = set()
        for t in reversed(range(len(episode_states))):
            s = episode_states[t]
            G = episode_rewards[t] + gamma * G
            if s not in visited:
                visited.add(s)
                returns_sum[s[0], s[1]] += G
                returns_count[s[0], s[1]] += 1
    
    for r in range(row):
        for c in range(column):
            if returns_count[r, c] > 0:
                v_s[r, c] = returns_sum[r, c] / returns_count[r, c]


In [8]:
monte_carlo_policy_eva()

In [9]:
print("Value function:")
print(v_s)

Value function:
[[ 0.          0.          0.          0.        ]
 [-0.04        0.          0.          0.        ]
 [-0.39998938  0.          0.          0.        ]]


In [10]:
policy

array([['down', 'up', 'up', 'right'],
       ['left', 'left', 'up', 'left'],
       ['up', 'down', 'down', 'left']], dtype='<U5')

In [11]:
def random_init_state():
    while True:
        state = (np.random.randint(0, row), np.random.randint(0, column))
        if (state != wall) and (state != win_state) and (state != lose_state):
            return state

def random_action():
    return np.random.choice(actions)

In [12]:
def monte_carlo_policy_imp():
    global policy
    Q = np.zeros((row, column, len(actions)))
    returns = {((r, c), a): [] for r in range(row) for c in range(column) for a in actions}

    for _ in range(3000):
        state = random_init_state()
        action = random_action()
        episode = []
        steps = 0

        while not end_game(state) and steps < 100:
            next_state = move(action, state[0], state[1])
            r = reward(next_state)
            episode.append((state, action, r))
            state = next_state
            action = random_action()
            steps += 1

        G = 0
        visited = set()
        for t in reversed(range(len(episode))):
            state, action, r = episode[t]
            G = r + gamma * G
            state_action = (state, action)
            if state_action not in visited:
                visited.add(state_action)
                returns[state, action].append(G)
                Q[state[0], state[1], actions.index(action)] = np.mean(returns[state, action])

        for r in range(row):
            for c in range(column):
                if (r, c) == wall or end_game((r, c)):
                    continue
                best_action_index = np.argmax(Q[r, c])
                
                if np.random.rand() < epsilon:
                    policy[r, c] = np.random.choice(actions)
                else:
                    policy[r, c] = actions[best_action_index]



In [13]:
for i in range(10):
    monte_carlo_policy_imp()
    if i % 2 == 0:
        monte_carlo_policy_eva()

In [14]:
print("Final Policy:")
print(policy)
print("\nValue function:")
print(v_s)

Final Policy:
[['right' 'right' 'right' 'right']
 ['up' 'left' 'up' 'left']
 ['up' 'up' 'left' 'left']]

Value function:
[[0.734   0.86    1.      0.     ]
 [0.6206  0.      0.      0.     ]
 [0.51854 0.      0.      0.     ]]
