# 蒙特卡洛 Exploring Starts算法
## 1.游戏环境设置

In [51]:
# 构造一个网格环境(3x3)
def get_state(row, col):
    if row == 0 and col == 1:
        return 'trap'
    if row == 2 and col == 0:
        return 'trap'
    if row == 2 and col == 2:
        return 'terminal'
    return 'ground'


# 设定agent在每一个格子里的动作
def move(row, col, action):
    # 如果当前已经在陷阱或者终点，则不能执行任何动作，反馈都是0
    if get_state(row, col) in ['trap', 'terminal']:
        return row, col, 0

    # 执行动作
    if action == 0:  # ↑
        row -= 1
    elif action == 1:  # ↓
        row += 1
    elif action == 2:  # ←
        col -= 1
    elif action == 3:  # →
        col += 1

    # 初始化reward
    reward = -1  # 这样强迫了机器尽快结束游戏,因为每走一步都要扣一分

    # 不允许走到地图外面去，撞墙扣5分
    if row < 0 or row > 2 or col < 0 or col > 2:
        reward = -5
        row = max(0, min(row, 2))
        col = max(0, min(col, 2))

    # 是陷阱的话，奖励是-100
    # 结束最好是以走到终点的形式,避免被扣100分
    state = get_state(row, col)
    if state == 'trap':
        reward = -100
    elif state == 'terminal':
        reward = 100

    return row, col, reward

## 2.游戏环境初始化

In [52]:
import numpy as np
import random
# 初始化每个格子的价值
values = np.zeros([3, 3])
# 初始化策略
actions = [0, 1, 2, 3]
pi = np.zeros([3, 3])
for row in range(3):
    for col in range(3):
        pi[row, col] = random.choice(actions)
pi

array([[1., 3., 3.],
       [3., 3., 1.],
       [3., 2., 1.]])

## 3.算法设置

In [53]:
# 生成一个回合
def generate_episode(start_row, start_col, policy):
    episode = []
    row, col = start_row, start_col
    # 设置一个episode长度为30, 设置较长的episode保证尽量能够访问所有的状态
    for _ in range(30):
        # 检查是否到达终止状态
        if get_state(row, col) in ['trap', 'terminal']:
            break
        action = policy[row, col]
        next_row, next_col, reward = move(row, col, action)
        episode.append(((row, col), action, reward))
        row, col = next_row, next_col
    return episode

In [54]:
# MC Exploring Starts算法
def MonteCarlo_ExploringStarts(values, policy,actions, num_iterations, gamma=0.9):
    returns = {}  # 存储每个状态-动作对的回报列表
    for _ in range(num_iterations):

        # MC Exploring Starts算法的初始状态是随机的
        # 随机选择一个状态-动作对作为起点
        start_row = random.randint(0, 2)
        start_col = random.randint(0, 2)
        episode = generate_episode(start_row, start_col, policy)  # 在第一个随机动作之后使用提供的策略


        # 策略评估
        G = 0
        # 采用首次访问策略
        visited = set()  # 用于记录已访问的状态-动作对
        # 倒序计算，节省时间
        for step in reversed(range(len(episode))):
            state, act, reward = episode[step]
            G = gamma * G + reward
            # 仅当状态-动作对第一次访问时计算回报
            if (state, act) not in visited:
                visited.add((state, act))
                if (state,act) not in returns:
                    returns[(state, act)] = []
                returns[(state, act)].append(G)
                # 更新状态值（即从当前状态-动作对开始的平均值）
                values[state[0], state[1]] = np.mean(returns[(state, act)])

        # 策略提升
        for row in range(3):
            for col in range(3):
                action_values = [np.mean(returns.get(((row, col), a), [0])) for a in actions]
                best_action_index = np.argmax(action_values)
                best_action = actions[best_action_index]
                policy[row, col] = best_action
    return values, policy

## 4.训练

In [56]:
# 有时候训练无法得到结果
values, pi = MonteCarlo_ExploringStarts(values, pi, actions, 100)
pi

array([[1., 0., 1.],
       [3., 3., 1.],
       [0., 0., 0.]])

## 5.结果展示

In [45]:
# 打印游戏，方便测试
def show(row, col, action):
    graph = [
        '□', '□', '○', '□', '□', '□', '○', '□', '❤'
    ]

    action = {0: '↑', 1: '↓', 2: '←', 3: '→'}[action]

    graph[row * 3 + col] = action

    graph = ''.join(graph)

    for i in range(0, 3 * 3, 3):
        print(graph[i:i + 3])

In [46]:
from IPython import display
import time


def test():
    #起点在0,0
    row = 0
    col = 0

    #最多玩N步
    for _ in range(200):

        #选择一个动作
        action = pi[row, col]

        #打印这个动作
        display.clear_output(wait=True)
        time.sleep(0.5)
        show(row, col, action)

        #执行动作
        row, col, reward = move(row, col, action)

        #获取当前状态，如果状态是终点或者掉陷阱则终止
        if get_state(row, col) in ['trap', 'terminal']:
            break

In [47]:
test()

□□○
□□□
○→❤
