In [1]:
import numpy as np

# Grid-world definition
GRID_SIZE = 5  # side length of the square grid (5x5)
ACTIONS = ['up', 'down', 'left', 'right']  # available actions
ACTION_TO_DELTA = {
    'up': (-1, 0),    # move up: row - 1, column unchanged
    'down': (1, 0),   # move down: row + 1, column unchanged
    'left': (0, -1),  # move left: row unchanged, column - 1
    'right': (0, 1)   # move right: row unchanged, column + 1
}
# Mapping from action name to the arrow glyph used when displaying the policy
ACTION_TO_ARROW = {
    'up': '↑',
    'down': '↓',
    'left': '←',
    'right': '→'
}
DISCOUNT_FACTOR = 1.0  # discount factor applied to future rewards (1.0 = undiscounted)
THRESHOLD = 1e-4  # convergence threshold: iteration stops once the largest value change is below it
# Goal state as (row, column)
GOAL_STATE = (1, 3)

# Reward function: arriving in the goal state yields +10, any other state yields -1.
def reward_function(state, goal_state=GOAL_STATE):
    """Return the reward for arriving in ``state``.

    Args:
        state: the state being entered, compared to ``goal_state`` with ``==``.
        goal_state: the goal state; defaults to the module-level GOAL_STATE.

    Returns:
        10 when ``state`` equals ``goal_state``, otherwise -1.
    """
    return 10 if state == goal_state else -1

# Transition function: given a state and an action, return the next state.
def transition(state, action):
    """Return the state reached by taking ``action`` from ``state``.

    Only the next state is returned; no reward is computed here.

    Args:
        state: current ``(row, col)`` position.
        action: a key of ACTION_TO_DELTA ('up', 'down', 'left' or 'right').

    Returns:
        The ``(row, col)`` tuple after the move, or ``state`` itself when the
        move would leave the GRID_SIZE x GRID_SIZE grid.
    """
    d_row, d_col = ACTION_TO_DELTA[action]
    row, col = state[0] + d_row, state[1] + d_col
    # Guard clause: a move off the grid leaves the agent where it was.
    if not (0 <= row < GRID_SIZE and 0 <= col < GRID_SIZE):
        return state
    return (row, col)

# Value iteration
def value_iteration(goal_state=GOAL_STATE):
    """Compute the optimal state-value function by synchronous value iteration.

    Each sweep applies the backup
    ``V(s) = max_a [reward(s') + DISCOUNT_FACTOR * V(s')]`` to every non-goal
    cell, reading from the previous sweep's values. The goal cell is treated
    as terminal: it is never updated, so its value stays at 0.

    Args:
        goal_state: ``(row, col)`` of the goal cell. Any two-element sequence
            is accepted; it must identify a cell inside the grid.

    Returns:
        A ``(V, iteration_count)`` pair: the GRID_SIZE x GRID_SIZE value array
        and the number of sweeps performed, including the final sweep that
        detected convergence.

    Raises:
        ValueError: if ``goal_state`` is not a ``(row, col)`` pair that lies
            inside the grid.
    """
    # Normalize to a tuple so the ``state == goal_state`` comparisons below
    # (and in reward_function) also match when the caller passed a list.
    goal_state = tuple(goal_state)
    # If the goal matches no grid cell, every reward is -1 and, with the
    # undiscounted DISCOUNT_FACTOR of 1.0, every sweep lowers all values by
    # exactly 1: delta never falls below THRESHOLD and the loop never ends.
    if len(goal_state) != 2 or not all(c in range(GRID_SIZE) for c in goal_state):
        raise ValueError(
            f"goal_state must be a (row, col) cell inside the "
            f"{GRID_SIZE}x{GRID_SIZE} grid, got {goal_state!r}"
        )

    V = np.zeros((GRID_SIZE, GRID_SIZE))  # all state values start at 0
    iteration_count = 0
    while True:
        delta = 0  # largest value change seen during this sweep
        new_V = V.copy()  # synchronous update: read from V, write to new_V
        for i in range(GRID_SIZE):
            for j in range(GRID_SIZE):
                state = (i, j)
                if state == goal_state:  # terminal state is never updated
                    continue
                successors = (transition(state, action) for action in ACTIONS)
                # The state's value is the best one-step lookahead value.
                new_V[state] = max(
                    reward_function(next_state, goal_state)
                    + DISCOUNT_FACTOR * V[next_state]
                    for next_state in successors
                )
                delta = max(delta, abs(new_V[state] - V[state]))
        V = new_V
        iteration_count += 1
        # Converged once no state's value moved by THRESHOLD or more.
        if delta < THRESHOLD:
            break
    return V, iteration_count


# Extract the greedy policy from a value function
def extract_policy(V, goal_state=GOAL_STATE):
    """Build the greedy policy implied by the value function ``V``.

    For every non-goal cell, each action in ACTIONS is scored as
    ``reward(next_state) + DISCOUNT_FACTOR * V[next_state]``. Ties are kept:
    every action whose score equals the maximum contributes its arrow, in
    ACTIONS order.

    Args:
        V: value array indexed by ``(row, col)`` tuples.
        goal_state: the goal cell; defaults to the module-level GOAL_STATE.

    Returns:
        A GRID_SIZE x GRID_SIZE object array of strings: ``'G'`` at the goal
        cell, and the concatenated arrows of the best actions elsewhere.
    """
    # dtype=object so each cell can hold a string of one or more arrows.
    policy = np.empty((GRID_SIZE, GRID_SIZE), dtype=object)
    for state in np.ndindex(GRID_SIZE, GRID_SIZE):
        if state == goal_state:
            policy[state] = 'G'  # marks the goal cell
            continue

        # Score every action by its one-step lookahead value.
        q_values = {}
        for action in ACTIONS:
            successor = transition(state, action)
            immediate = reward_function(successor, goal_state)
            q_values[action] = immediate + DISCOUNT_FACTOR * V[successor]

        # Collect the arrows of all actions that achieve the best score.
        best = max(q_values.values())
        arrows = ''
        for action, q in q_values.items():
            if q == best:
                arrows += ACTION_TO_ARROW[action]
        policy[state] = arrows
    return policy


# Solve the grid world for the configured goal, then derive the greedy policy.
V, iteration_count = value_iteration(GOAL_STATE)
optimal_policy = extract_policy(V, GOAL_STATE)

# Report the sweep count and the value function rounded to two decimals.
print("Number of Iterations:", iteration_count)
print("Optimal Value Function:", np.round(V, 2), sep="\n")

# Show the policy one grid row per line.
print("\nOptimal Policy:")
for row in optimal_policy:
    print(row)


Number of Iterations: 7
Optimal Value Function:
[[ 7.  8.  9. 10.  9.]
 [ 8.  9. 10.  0. 10.]
 [ 7.  8.  9. 10.  9.]
 [ 6.  7.  8.  9.  8.]
 [ 5.  6.  7.  8.  7.]]

Optimal Policy:
['↓→' '↓→' '↓→' '↓' '↓←']
['→' '→' '→' 'G' '←']
['↑→' '↑→' '↑→' '↑' '↑←']
['↑→' '↑→' '↑→' '↑' '↑←']
['↑→' '↑→' '↑→' '↑' '↑←']
