In [1]:
import numpy as np
import random

class Gridworld:
    """Square grid environment with one goal cell and a set of trap cells.

    States are ``(row, col)`` tuples. Row 0 is printed first by ``render``,
    so "up" decreases the row index and "down" increases it. The goal and
    every trap are terminal states.
    """

    def __init__(self, grid_size=5, traps=((1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)), goal=(4, 2)):
        """Create the grid.

        Args:
            grid_size: Side length of the square grid (default 5, i.e. 5x5).
            traps: Iterable of ``(row, col)`` trap positions.
            goal: ``(row, col)`` position of the goal (default ``(4, 2)``).
        """
        self.grid_size = grid_size
        # Normalise to tuples so comparisons against tuple states always work,
        # even when the caller passes lists.
        self.goal = tuple(goal)
        # Copy into a fresh list: the default is never shared between
        # instances and later mutation does not leak back to the caller.
        self.traps = [tuple(trap) for trap in traps]
        # State-value table V(s), indexed as state_value[row, col].
        self.state_value = np.zeros((grid_size, grid_size))
        self.action_space = ["up", "down", "left", "right"]

    def reset(self):
        """Reset the state-value table to all zeros."""
        self.state_value = np.zeros((self.grid_size, self.grid_size))

    def generate_traps(self, trapNum, seed=43):
        """Replace the traps with ``trapNum`` randomly sampled cells.

        Every cell except the goal is a candidate. When ``seed`` is not
        ``None`` the module-level ``random`` generator is re-seeded first, so
        the same seed always yields the same layout (this also affects any
        later use of ``random``).

        Raises:
            ValueError: Propagated from ``random.sample`` when ``trapNum`` is
                negative or larger than the number of candidate cells.
        """
        trap_candidates = [
            (i, j)
            for i in range(self.grid_size)
            for j in range(self.grid_size)
            if (i, j) != self.goal  # the goal can never be a trap
        ]
        if seed is not None:
            random.seed(seed)
        self.traps = random.sample(trap_candidates, trapNum)

    def is_terminal(self, state):
        """Return True when ``state`` is the goal or one of the traps."""
        return state == self.goal or state in self.traps

    def get_next_state(self, state, action):
        """Return the state reached by taking ``action`` in ``state``.

        Moves that would leave the grid are clamped to the border, so the
        agent stays where it is. An unrecognised action leaves the state
        unchanged.
        """
        x, y = state
        if action == "up":
            x = max(x - 1, 0)
        elif action == "down":
            x = min(x + 1, self.grid_size - 1)
        elif action == "left":
            y = max(y - 1, 0)
        elif action == "right":
            y = min(y + 1, self.grid_size - 1)
        return (x, y)

    def get_reward(self, state):
        """Return the reward for entering ``state``: +10 goal, -10 trap, -1 otherwise."""
        if state == self.goal:
            return 10
        elif state in self.traps:
            return -10
        else:
            return -1

    def getStartState(self):
        """Return the fixed start state ``(0, 0)``."""
        return (0, 0)

    def render(self):
        """Print the grid: ``.`` for plain cells, ``T`` for traps, ``G`` for the goal."""
        grid = [["." for _ in range(self.grid_size)] for _ in range(self.grid_size)]
        for trap in self.traps:
            grid[trap[0]][trap[1]] = "T"
        # The goal is drawn last so it wins if it coincides with a trap.
        grid[self.goal[0]][self.goal[1]] = "G"
        for row in grid:
            print(" ".join(row))
        print()

In [2]:
import numpy as np
import random

def n_step_td_forward(gridworld, n, episodes, alpha=0.1, gamma=0.9):
    """Evaluate the uniform-random policy with n-step TD (forward view).

    Updates ``gridworld.state_value`` in place; nothing is returned.

    Args:
        gridworld: Environment exposing ``getStartState``, ``get_next_state``,
            ``get_reward``, ``is_terminal``, ``action_space`` and a
            ``state_value`` table indexable by state.
        n: Number of reward steps in each return; must be at least 1.
        episodes: Number of episodes to simulate.
        alpha: Learning rate.
        gamma: Discount factor.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    # Use the environment's own action set instead of a duplicated literal.
    actions = gridworld.action_space

    for _ in range(episodes):
        state = gridworld.getStartState()
        if gridworld.is_terminal(state):
            # Nothing to learn from an episode that is over before it starts.
            continue

        states = [state]   # states[k] is S_k
        rewards = []       # rewards[k] is R_{k+1}, received on entering S_{k+1}

        t = 0              # current time step
        T = float('inf')   # episode length, unknown until a terminal state is hit

        while True:
            if t < T:
                # Behaviour policy: uniformly random action.
                action = random.choice(actions)
                next_state = gridworld.get_next_state(state, action)
                rewards.append(gridworld.get_reward(next_state))
                states.append(next_state)

                if gridworld.is_terminal(next_state):
                    T = t + 1
                state = next_state

            # tau is the time step whose state estimate is updated now.
            tau = t - n + 1
            if tau >= 0:
                # Discounted sum of up to n rewards, truncated at episode end.
                G = sum(gamma**i * rewards[tau + i] for i in range(min(n, T - tau)))
                if tau + n < T:
                    # Episode still running n steps later: bootstrap from V.
                    G += gamma**n * gridworld.state_value[states[tau + n]]

                gridworld.state_value[states[tau]] += alpha * (G - gridworld.state_value[states[tau]])

            # The last non-terminal state has been updated: episode finished.
            if tau == T - 1:
                break

            t += 1

def extract_policy(gridworld, gamma=0.9):
    """Derive a greedy policy from the state-value table, keeping ties.

    For every non-terminal cell each of the four moves is scored with a
    one-step look-ahead: reward of the landing cell plus ``gamma`` times its
    stored value. All moves whose score equals the best score are kept.

    Args:
        gridworld: Environment providing ``grid_size``, ``is_terminal``,
            ``get_next_state``, ``get_reward`` and ``state_value``.
        gamma: Discount factor applied to the landing cell's value.

    Returns:
        Dict mapping each ``(row, col)`` to a list of best moves, or to
        ``None`` for terminal cells.
    """
    moves = ["up", "down", "left", "right"]

    def one_step_score(cell, move):
        # Reward for entering the landing cell plus its discounted value.
        landing = gridworld.get_next_state(cell, move)
        gain = gridworld.get_reward(landing)
        return gain + gamma * gridworld.state_value[landing]

    policy = {}
    for row in range(gridworld.grid_size):
        for col in range(gridworld.grid_size):
            cell = (row, col)
            if gridworld.is_terminal(cell):
                # Terminal cells have no action to take.
                policy[cell] = None
                continue

            scores = [one_step_score(cell, move) for move in moves]
            top = max(scores)
            # Exact equality keeps every move tied for the best score.
            policy[cell] = [move for move, score in zip(moves, scores) if score == top]

    return policy

def render_policy(gridworld, policy):
    """Print a policy as a grid of arrows, one text row per grid row.

    Cell legend: ``G`` goal, ``T`` trap, ``X`` a cell whose policy entry is
    missing or ``None``; otherwise the arrows of all listed moves, always in
    the order up, down, left, right.

    Args:
        gridworld: Environment providing ``grid_size``, ``goal`` and ``traps``.
        policy: Dict mapping ``(row, col)`` to a collection of move names.
    """
    arrow_of = (("up", "↑"), ("down", "↓"), ("left", "←"), ("right", "→"))
    size = gridworld.grid_size

    lines = []
    for row in range(size):
        cells = []
        for col in range(size):
            here = (row, col)
            if here == gridworld.goal:
                cells.append("G")
                continue
            if here in gridworld.traps:
                cells.append("T")
                continue

            chosen = policy.get(here)
            if chosen is None:
                # No usable entry for this cell (e.g. it was never visited).
                cells.append("X")
            else:
                # Several best moves are shown side by side in one cell.
                cells.append("".join(sym for name, sym in arrow_of if name in chosen))
        lines.append(" ".join(cells))

    for line in lines:
        print(line)
    print()

if __name__ == "__main__":
    # Build the Gridworld instance (5x5 grid, seven traps, goal at (4, 2)).
    grid = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))
    grid.render()  # Print the initial grid layout

    # Estimate the state-value function with the n-step TD forward algorithm.
    n_step_td_forward(grid, n=3, episodes=1000, alpha=0.1, gamma=0.9)

    # Print the learned state-value function, rounded to two decimals.
    print("更新后的状态价值函数：")
    print(np.round(grid.state_value, 2))
    
    # Derive the greedy policy from the learned values.
    final_policy = extract_policy(grid, gamma=0.9)

    # Print the resulting policy as arrows.
    print("最终策略：")
    render_policy(grid, final_policy)


. . . . .
. T T T .
. . . . .
. T . T .
T . G . T

更新后的状态价值函数：
[[-9.97 -9.99 -9.99 -9.81 -8.99]
 [-9.94  0.    0.    0.   -9.18]
 [-9.94 -8.8  -6.84 -6.89 -7.81]
 [-9.92  0.   -2.63  0.   -5.7 ]
 [ 0.    0.    0.    0.    0.  ]]
最终策略：
↓ ← → → ↑→
↓ T T T ↓
→ → ↓ ← ↓
← T ↓ T →
T → G ← T



In [3]:
def td_lambda_optimized(gridworld, episodes=500, alpha=0.1, gamma=0.9, lambd=0.8):
    """Evaluate the uniform-random policy with TD(λ) and sparse traces.

    Eligibility traces are accumulating and kept in a dict that only holds
    states whose trace is still at least 1e-6. ``gridworld.state_value`` is
    updated in place; nothing is returned.

    Args:
        gridworld: Environment providing ``getStartState``, ``is_terminal``,
            ``get_next_state``, ``get_reward`` and ``state_value``.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        lambd: Trace-decay parameter λ.
    """
    moves = ["up", "down", "left", "right"]
    fade = gamma * lambd  # per-step multiplier applied to every trace

    for _ in range(episodes):
        here = gridworld.getStartState()
        traces = {}  # state -> eligibility; absent means zero

        while not gridworld.is_terminal(here):
            # Behaviour policy: uniformly random move.
            move = random.choice(moves)
            there = gridworld.get_next_state(here, move)
            gain = gridworld.get_reward(there)

            # One-step TD error for the transition here -> there.
            delta = (
                gain
                + gamma * gridworld.state_value[there]
                - gridworld.state_value[here]
            )

            # Accumulating trace: bump the state just left.
            traces[here] = traces.get(here, 0) + 1

            step = alpha * delta
            # Iterate over a snapshot because entries may be deleted below.
            for seen, weight in list(traces.items()):
                gridworld.state_value[seen] += step * weight
                faded = weight * fade
                if faded < 1e-6:
                    # Negligible trace: drop it to keep the dict sparse.
                    del traces[seen]
                else:
                    traces[seen] = faded

            here = there

# Build the Gridworld instance (5x5 grid, seven traps, goal at (4, 2)).
grid = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))
grid.render()

# Estimate the state-value function with the TD(λ) algorithm.
td_lambda_optimized(grid, episodes=500, alpha=0.1, gamma=0.9, lambd=0.8)

# Print the learned state-value function, rounded to two decimals.
print("更新后的状态价值函数：")
print(np.round(grid.state_value, 2))
# Derive the greedy policy from the learned values.
final_policy = extract_policy(grid, gamma=0.9)

# Print the resulting policy as arrows.
print("最终策略：")
render_policy(grid, final_policy)

. . . . .
. T T T .
. . . . .
. T . T .
T . G . T

更新后的状态价值函数：
[[-9.99 -9.99 -9.97 -9.74 -9.43]
 [-9.99  0.    0.    0.   -8.38]
 [-9.94 -9.73 -8.93 -5.62 -4.85]
 [-9.93  0.   -6.26  0.   -0.7 ]
 [ 0.    0.    0.    0.    0.  ]]
最终策略：
↓ → → → ↓
↓ T T T ↓
→ → → → ↓
← T ↓ T →
T → G ← T



In [4]:
from collections import defaultdict

def sarsa_lambda(
    gridworld, episodes=500, alpha=0.1, gamma=0.9, lambd=0.8, epsilon=0.1
):
    """Learn action values with SARSA(λ) using accumulating traces.

    Actions are chosen by ``epsilon_greedy_policy`` on the current table.
    Traces are reset at the start of every episode.

    Args:
        gridworld: Environment providing ``getStartState``, ``is_terminal``,
            ``get_next_state`` and ``get_reward``.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        lambd: Trace-decay parameter λ.
        epsilon: Exploration probability of the ε-greedy policy.

    Returns:
        Nested defaultdict ``Q[state][action]`` of learned action values;
        entries that were never touched read as 0.
    """
    moves = ["up", "down", "left", "right"]
    fade = gamma * lambd  # per-step multiplier applied to every trace
    q_table = defaultdict(lambda: defaultdict(float))

    for _ in range(episodes):
        here = gridworld.getStartState()
        move = epsilon_greedy_policy(q_table, here, moves, epsilon)
        traces = defaultdict(lambda: defaultdict(float))

        while not gridworld.is_terminal(here):
            there = gridworld.get_next_state(here, move)
            gain = gridworld.get_reward(there)
            next_move = epsilon_greedy_policy(q_table, there, moves, epsilon)

            # On-policy target: bootstrap from the action actually chosen next.
            target = gain + gamma * q_table[there][next_move]
            delta = target - q_table[here][move]

            visited = (here, move)
            # Sweep every state known to the table; the lookups above make
            # sure the current state is among them.
            for known in q_table:
                trace_row = traces[known]
                value_row = q_table[known]
                for option in moves:
                    trace_row[option] *= fade
                    if (known, option) == visited:
                        trace_row[option] += 1
                    value_row[option] += alpha * delta * trace_row[option]

            here, move = there, next_move

    return q_table


def epsilon_greedy_policy(Q, state, actions, epsilon):
    """Pick an action ε-greedily with respect to ``Q[state]``.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action with the largest ``Q[state][action]`` is returned,
    the earliest one in ``actions`` winning ties.
    """
    roll = random.random()
    if roll < epsilon:
        # Explore.
        return random.choice(actions)

    # Exploit: max() keeps the first maximal element, which breaks ties.
    best_action = max(actions, key=lambda candidate: Q[state][candidate])
    return best_action

def get_policy_with_all_optimal_actions(Q):
    """Turn a Q table into a policy that keeps every tied-best action.

    Args:
        Q: Mapping ``Q[state][action] -> value``.

    Returns:
        Dict mapping each state in ``Q`` to the list of actions whose value
        equals that state's maximum, in the row's iteration order.
    """
    policy = {}
    for state in Q:
        row = Q[state]
        best = max(row.values())
        # Exact equality keeps all actions sharing the maximum value.
        policy[state] = [name for name in row if row[name] == best]
    return policy

# Build the Gridworld instance (5x5 grid, seven traps, goal at (4, 2)).
grid = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))
grid.render()

# Run the SARSA(λ) algorithm.
Q = sarsa_lambda(grid, episodes=500, alpha=0.1, gamma=0.9, lambd=0.8, epsilon=0.1)

# Derive the policy, keeping every tied-best action per state.
policy = get_policy_with_all_optimal_actions(Q)

# Print the policy; states absent from Q have no entry and are drawn as "X".
print("策略（显示所有最优动作）:")
render_policy(grid, policy)


. . . . .
. T T T .
. . . . .
. T . T .
T . G . T

策略（显示所有最优动作）:
↓ → → → ↓
↓ T T T ↓
→ → ↓ ← ←
↑ T ↓ T →
T X G X T



In [5]:
import numpy as np
from collections import defaultdict

def watkins_q_lambda(gridworld, episodes, alpha=0.1, gamma=0.99, lambd=0.9, epsilon=0.1):
    """Learn action values with Q-learning plus accumulating traces.

    The TD target bootstraps from the largest stored value of the next
    state. Traces are reset only at the start of an episode; this variant
    does not clear them after exploratory actions.

    Args:
        gridworld: Environment providing ``getStartState``, ``is_terminal``,
            ``get_next_state`` and ``get_reward``.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        lambd: Trace-decay parameter λ.
        epsilon: Exploration probability of the ε-greedy policy.

    Returns:
        Nested defaultdict ``Q[state][action]`` of learned action values.
    """
    moves = ["up", "down", "left", "right"]
    fade = gamma * lambd  # per-step multiplier applied to every trace
    q_table = defaultdict(lambda: defaultdict(float))

    for _ in range(episodes):
        here = gridworld.getStartState()
        move = epsilon_greedy_policy(q_table, here, moves, epsilon)
        traces = defaultdict(lambda: defaultdict(float))

        while not gridworld.is_terminal(here):
            there = gridworld.get_next_state(here, move)
            gain = gridworld.get_reward(there)
            next_move = epsilon_greedy_policy(q_table, there, moves, epsilon)

            # Greedy bootstrap; a row with no stored entries counts as 0.
            next_row = q_table[there]
            best_next = max(next_row.values()) if next_row else 0
            delta = gain + gamma * best_next - q_table[here][move]

            visited = (here, move)
            # Sweep every state known to the table; the lookups above make
            # sure the current state is among them.
            for known in q_table:
                trace_row = traces[known]
                value_row = q_table[known]
                for option in moves:
                    trace_row[option] *= fade
                    if (known, option) == visited:
                        trace_row[option] += 1
                    value_row[option] += alpha * delta * trace_row[option]

            here, move = there, next_move

    return q_table

# Build the Gridworld instance (5x5 grid, seven traps, goal at (4, 2)).
grid = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))
grid.render()

# Run the Q(λ) algorithm (gamma and lambd override the function defaults).
Q = watkins_q_lambda(grid, episodes=500, alpha=0.1, gamma=0.9, lambd=0.8, epsilon=0.1)

# Derive the policy, keeping every tied-best action per state.
policy = get_policy_with_all_optimal_actions(Q)

# Print the policy; states absent from Q have no entry and are drawn as "X".
print("策略（显示所有最优动作）:")
render_policy(grid, policy)

. . . . .
. T T T .
. . . . .
. T . T .
T . G . T

策略（显示所有最优动作）:
↓ → → → ↓
↓ T T T ↓
→ → ↓ ← ←
↑ T ↓ T ←→
T X G X T



In [8]:
import numpy as np
from collections import defaultdict

def watkins_q_lambda(gridworld, episodes, alpha=0.1, gamma=0.99, lambd=0.9, epsilon=0.1):
    """Learn action values with Watkins's Q(λ).

    This is Q-learning with accumulating eligibility traces. The traces are
    cleared whenever the action being executed is not greedy with respect to
    the current table, so credit only flows back along greedy action
    sequences.

    Args:
        gridworld: Environment providing ``action_space``, ``getStartState``,
            ``is_terminal``, ``get_next_state`` and ``get_reward``.
        episodes: Number of episodes to run.
        alpha: Learning rate.
        gamma: Discount factor.
        lambd: Trace-decay parameter λ.
        epsilon: Exploration probability of the ε-greedy policy.

    Returns:
        defaultdict ``Q[state][action]`` of learned action values; every row
        holds one entry per action in ``gridworld.action_space``.
    """
    actions = gridworld.action_space
    decay = gamma * lambd  # per-step multiplier applied to every trace

    def zero_row():
        # A fresh row with one zero entry per action.
        return {a: 0.0 for a in actions}

    Q = defaultdict(zero_row)
    for _ in range(episodes):
        state = gridworld.getStartState()
        action = epsilon_greedy_policy(Q, state, actions, epsilon)
        eligibility_trace = defaultdict(zero_row)

        while not gridworld.is_terminal(state):
            next_state = gridworld.get_next_state(state, action)
            reward = gridworld.get_reward(next_state)
            next_action = epsilon_greedy_policy(Q, next_state, actions, epsilon)

            # Off-policy target: bootstrap from the best next action.
            max_next_q = max(Q[next_state].values(), default=0)
            td_error = reward + gamma * max_next_q - Q[state][action]

            # Cut the traces only for a truly exploratory action. An action
            # that ties for the maximum is still greedy, so compare values
            # rather than identities (max() returns just the first arg-max).
            if Q[state][action] < max(Q[state].values()):
                eligibility_trace.clear()

            # Decay all traces, bump the visited pair, then apply the update.
            for s in Q:
                trace_row = eligibility_trace[s]
                q_row = Q[s]
                for a in actions:
                    trace_row[a] *= decay
                    if s == state and a == action:
                        trace_row[a] += 1
                    q_row[a] += alpha * td_error * trace_row[a]

            state = next_state
            action = next_action

    return Q

# Build the Gridworld instance (5x5 grid, seven traps, goal at (4, 2)).
grid = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))
grid.render()

# Run the Q(λ) algorithm (gamma and lambd override the function defaults).
Q = watkins_q_lambda(grid, episodes=500, alpha=0.1, gamma=0.9, lambd=0.8, epsilon=0.1)

# Derive the policy, keeping every tied-best action per state.
policy = get_policy_with_all_optimal_actions(Q)

# Print the policy; states absent from Q have no entry and are drawn as "X".
print("策略（显示所有最优动作）:")
render_policy(grid, policy)

. . . . .
. T T T .
. . . . .
. T . T .
T . G . T

策略（显示所有最优动作）:
↓ ← ← → ↑
↓ T T T ↑
→ → ↓ ← ←
↑ T ↓ T ←→
T X G X T

