# 值迭代
## 1.游戏环境设置

In [2]:
# 游戏环境设置：
# 4x12 的网格，（3，11）是终点，最下面一行除了起点（3，0）和终点（3，11）以外全是陷阱
#获取一个格子的状态
def get_state(row, col):
    """Classify the grid cell at (row, col).

    Returns:
        'ground'   for every cell outside row 3, and for the cell (3, 0);
        'terminal' for the cell (3, 11);
        'trap'     for every other cell in row 3.
    """
    on_bottom_row = row == 3

    # Everything above the bottom row, plus the bottom-left corner, is walkable.
    if not on_bottom_row or col == 0:
        return 'ground'

    # What is left of the bottom row: the goal in the last column, traps elsewhere.
    return 'terminal' if col == 11 else 'trap'

In [3]:
# 设定agent在每一个格子里的动作
def move(row, col, action):
    """Apply `action` from the cell (row, col) and report where the agent ends up.

    Action codes: 0 = up, 1 = down, 2 = left, 3 = right. Any other code leaves
    the position unchanged.

    Returns:
        A tuple (row, col, reward):
          * 0 and the unchanged position when starting on a trap or the
            terminal cell (no action can be taken from there);
          * -100 when the move lands on a trap;
          * 100 when the move lands on the terminal cell;
          * -5 when the move would leave the 4x12 grid (the position is
            clamped back onto the grid);
          * -1 for any other step, so shorter episodes score higher.
    """
    # Traps and the terminal cell are absorbing: nothing happens, nothing is earned.
    if get_state(row, col) in ('trap', 'terminal'):
        return row, col, 0

    # Turn the action code into a row/column offset.
    row_step = int(action == 1) - int(action == 0)
    col_step = int(action == 3) - int(action == 2)
    row, col = row + row_step, col + col_step

    if 0 <= row <= 3 and 0 <= col <= 11:
        # Ordinary step: a small cost that pushes the agent to finish quickly.
        reward = -1
    else:
        # Bumped into the border: larger penalty, and stay on the grid.
        reward = -5
        row = min(max(row, 0), 3)
        col = min(max(col, 0), 11)

    # The kind of cell we landed on overrides the step/wall reward.
    landing = get_state(row, col)
    if landing == 'trap':
        return row, col, -100
    if landing == 'terminal':
        return row, col, 100
    return row, col, reward

## 2.算法设置

In [4]:
# 计算在一个状态下执行动作的分数
def get_qsa(values, row, col, action, gamma=0.9):
    """Return the one-step lookahead score Q(s, a) of taking `action` at (row, col).

    The score is the immediate reward reported by ``move`` plus the discounted
    value of the cell the agent lands in. Landing on a trap or on the terminal
    cell contributes no future value, so only the reward is returned.

    Args:
        values: table indexed as ``values[row, col]`` holding the current
            value estimate of every cell.
        row: row of the current cell.
        col: column of the current cell.
        action: action code forwarded to ``move`` (0 up, 1 down, 2 left, 3 right).
        gamma: discount factor applied to the next cell's value. Defaults to
            0.9, the value that used to be hard-coded here.

    Returns:
        ``reward`` if the next cell is a trap or the terminal cell, otherwise
        ``reward + gamma * values[next_row, next_col]``.
    """
    # Simulate the action to find the next cell and the immediate reward.
    next_row, next_col, reward = move(row, col, action)

    # Trap and terminal cells end the episode: no future return to add.
    if get_state(next_row, next_col) in ('trap', 'terminal'):
        return reward

    # Otherwise bootstrap from the current estimate of the next cell.
    return reward + gamma * values[next_row, next_col]

# 值迭代算法

In [13]:
def valueIteration(values, pi, threshold=0.000001):
    """Run value iteration on the 4x12 grid until the value table stops changing.

    Every sweep scores all four actions of every cell with ``get_qsa`` using
    the value table of the previous sweep, stores the best score as the cell's
    new value, and makes the policy greedy (probability 1 on the best action,
    0 on the others). Ties go to the lowest action index.

    Args:
        values: array indexed as ``values[row, col]`` with the initial value
            estimates. It is copied, never written to.
        pi: array indexed as ``pi[row, col, action]`` with the initial action
            probabilities. It is copied, never written to.
        threshold: absolute tolerance; iteration stops once no cell's value
            changed by more than this amount during a sweep.

    Returns:
        A tuple ``(values, pi)`` holding the value table and the greedy policy
        produced by the final sweep.
    """
    while True:
        # Work on copies so the whole sweep reads from the previous table.
        new_values = np.copy(values)
        new_pi = np.copy(pi)

        for row in range(4):
            for col in range(12):
                # Score each of the four actions from this cell.
                action_values = np.array(
                    [get_qsa(values, row, col, action) for action in range(4)],
                    dtype=float,
                )

                # np.argmax returns the first maximum, so ties resolve to the
                # lowest action index.
                best_action = int(np.argmax(action_values))

                # Policy update: deterministic greedy choice.
                new_pi[row, col, :] = 0
                new_pi[row, col, best_action] = 1

                # Value update: the cell is worth its best action.
                new_values[row, col] = action_values[best_action]

        # rtol=0 makes this a pure absolute test; allclose's default rtol=1e-5
        # would otherwise swamp `threshold` for values in the tens or hundreds.
        converged = np.allclose(new_values, values, rtol=0, atol=threshold)

        # Adopt the sweep's results *before* deciding to stop, so the caller
        # always receives the latest table and a policy that is actually greedy
        # (never the untouched input policy).
        values, pi = new_values, new_pi

        if converged:
            return values, pi


## 3.游戏初始化与训练

In [14]:
import numpy as np

# Every cell starts with a value estimate of zero.
values = np.zeros((4, 12))

# Start from a uniform policy: each of the 4 actions has probability 0.25 in every cell.
pi = 0.25 * np.ones((4, 12, 4))

# Run value iteration to get the converged value table and the matching policy.
values, pi = valueIteration(values, pi)
values

array([[ 17.96052411,  21.06724901,  24.51916557,  28.35462841,
         32.61625379,  37.3513931 ,  42.612659  ,  48.45851   ,
         54.9539    ,  62.171     ,  70.19      ,  79.1       ],
       [ 21.06724901,  24.51916557,  28.35462841,  32.61625379,
         37.3513931 ,  42.612659  ,  48.45851   ,  54.9539    ,
         62.171     ,  70.19      ,  79.1       ,  89.        ],
       [ 24.51916557,  28.35462841,  32.61625379,  37.3513931 ,
         42.612659  ,  48.45851   ,  54.9539    ,  62.171     ,
         70.19      ,  79.1       ,  89.        , 100.        ],
       [ 21.06724901,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ,   0.        ,   0.        ]])

## 4.训练结果可视化

In [7]:
#打印游戏，方便测试
def show(row, col, action):
    """Print the 4x12 grid with the arrow for `action` drawn at (row, col).

    Legend: '□' is ground, '○' is a trap, '❤' is the terminal cell. The arrow
    replaces whatever symbol was at (row, col). `action` must be one of
    0 (↑), 1 (↓), 2 (←), 3 (→); anything else raises KeyError.
    """
    arrows = {0: '↑', 1: '↓', 2: '←', 3: '→'}

    # Flat row-major layout: three ground rows plus the start cell (37 cells),
    # then ten traps, then the goal.
    cells = ['□'] * 37 + ['○'] * 10 + ['❤']

    # Overlay the agent's chosen action on its cell.
    cells[row * 12 + col] = arrows[action]

    # Emit the board one 12-character row at a time.
    board = ''.join(cells)
    for start in range(0, 48, 12):
        print(board[start:start + 12])


# Demo: draw the grid with an up arrow at row 1, column 1.
show(1, 1, 0)

□□□□□□□□□□□□
□↑□□□□□□□□□□
□□□□□□□□□□□□
□○○○○○○○○○○❤


In [8]:
from IPython import display
import time


def test():
    """Animate one episode that follows the policy stored in the global `pi`.

    The agent starts at (0, 0). On each step an action is sampled from
    ``pi[row, col]``, the board is redrawn with that action's arrow at the
    agent's cell, and the action is applied with ``move``. The episode stops
    when the agent lands on a trap or the terminal cell, or after 200 steps.
    """
    # The episode always begins in the top-left corner.
    row, col = 0, 0

    # Cap the number of steps so a policy that never finishes cannot loop forever.
    for _step in range(200):
        # Sample an action from this cell's action probabilities.
        action_probs = pi[row, col]
        action = np.random.choice(np.arange(4), size=1, p=action_probs)[0]

        # Redraw in place: wipe the previous frame, pause briefly, show the new one.
        display.clear_output(wait=True)
        time.sleep(0.3)
        show(row, col, action)

        # Advance the environment; the reward is not needed for playback.
        row, col, _reward = move(row, col, action)

        # Landing on a trap or on the goal ends the episode.
        if get_state(row, col) in ('trap', 'terminal'):
            return

In [9]:
# Play back one episode from the top-left cell using the policy in `pi`.
test()

KeyboardInterrupt: 