In [None]:
import numpy as np
import random

class Gridworld:
    """Square grid world with one goal cell and a set of trap cells.

    States are ``(row, col)`` tuples; "down" increases ``row`` and "right"
    increases ``col``. Stepping onto the goal or onto a trap ends an episode.

    Args:
        grid_size: Side length of the square grid.
        traps: Iterable of ``(row, col)`` trap coordinates. The coordinates are
            copied into a fresh list, so the environment never aliases the
            caller's container or the shared default.
        goal: ``(row, col)`` coordinate of the goal cell.
    """

    def __init__(self, grid_size=5, traps=((1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)), goal=(4, 2)):
        self.grid_size = grid_size  # side length of the grid (5x5 by default)
        # Stored as a tuple so equality checks against (row, col) states work
        # even when the caller passes a list. Default goal is (4, 2).
        self.goal = tuple(goal)
        # Private list of tuples: a shared mutable default (or the caller's own
        # list) must not be modified through one instance and leak into others.
        self.traps = [tuple(trap) for trap in traps]
        # State-value table V(s), one entry per cell, initialised to zero.
        self.state_value = np.zeros((grid_size, grid_size))

    def generate_traps(self, trapNum, seed=43):
        """Replace the trap layout with ``trapNum`` randomly chosen cells.

        Candidates are all grid cells except the goal. ``state_value`` is left
        untouched.

        Args:
            trapNum: Number of traps to place.
            seed: Passed to ``random.seed`` before sampling so the layout is
                repeatable; this reseeds the global ``random`` generator.
                Pass ``None`` to sample from the generator's current state.

        Raises:
            ValueError: Propagated from ``random.sample`` when ``trapNum`` is
                negative or larger than the number of candidate cells.
        """
        trap_candidates = [
            (i, j) for i in range(self.grid_size) for j in range(self.grid_size)
            if (i, j) != self.goal  # the goal can never be a trap
        ]
        if seed is not None:
            random.seed(seed)  # fixed seed -> same layout on every call
        self.traps = random.sample(trap_candidates, trapNum)

    def is_terminal(self, state):
        """Return True when ``state`` is the goal or one of the traps."""
        return state == self.goal or state in self.traps

    def get_next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action``.

        Moves are clamped at the grid border, so walking into a wall leaves the
        position unchanged. An action other than "up", "down", "left" or
        "right" also returns the position unchanged.
        """
        x, y = state
        if action == "up":
            x = max(x - 1, 0)
        elif action == "down":
            x = min(x + 1, self.grid_size - 1)
        elif action == "left":
            y = max(y - 1, 0)
        elif action == "right":
            y = min(y + 1, self.grid_size - 1)
        return (x, y)

    def get_reward(self, state):
        """Return the reward for entering ``state``: +10 goal, -10 trap, -1 otherwise."""
        if state == self.goal:
            return 10
        elif state in self.traps:
            return -10
        else:
            return -1  # per-step cost on ordinary cells

    def getRandomStartState(self):
        """Pick a uniformly random cell that is neither the goal nor a trap.

        Raises:
            IndexError: Propagated from ``random.choice`` when every cell is
                terminal and no start cell exists.
        """
        possible_states = [
            (i, j) for i in range(self.grid_size) for j in range(self.grid_size)
            if (i, j) != self.goal and (i, j) not in self.traps
        ]
        return random.choice(possible_states)

    def render(self):
        """Print the grid: "." for ordinary cells, "T" for traps, "G" for the goal."""
        grid = [["." for _ in range(self.grid_size)] for _ in range(self.grid_size)]
        for trap in self.traps:
            grid[trap[0]][trap[1]] = "T"
        # The goal is drawn last, so it wins if a trap shares its cell.
        grid[self.goal[0]][self.goal[1]] = "G"
        for row in grid:
            print(" ".join(row))
        print()


def epsilon_greedy_action(env, state, epsilon, gamma=0.9):
    """Choose a move from ``state`` with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random move is returned.
    Otherwise every move is scored by a one-step lookahead,
    ``reward(next) + gamma * env.state_value[next]``, and the best-scoring
    move is returned. Ties go to the earliest of up, down, left, right.
    """
    moves = ["up", "down", "left", "right"]

    # Explore: ignore the value table entirely.
    if random.random() < epsilon:
        return random.choice(moves)

    # Exploit: immediate reward plus discounted value of the landing cell.
    def lookahead(move):
        landing = env.get_next_state(state, move)
        return env.get_reward(landing) + gamma * env.state_value[landing]

    return max(moves, key=lookahead)

# TD(0) training
def td_training_with_policy(env, episodes=500, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Run tabular TD(0) and update ``env.state_value`` in place.

    Each episode starts from ``env.getRandomStartState()`` and follows the
    epsilon-greedy behaviour policy until a terminal cell is reached. After
    every step the value of the cell just left is moved a fraction ``alpha``
    towards ``reward + gamma * V(next_state)``. Nothing is returned.

    NOTE: this calls ``epsilon_greedy_action(env, state, epsilon, gamma)``.
    A later cell of this notebook rebinds that name to a function with a
    different parameter list, so rerun the defining cell before calling this
    function if the later cells have already been executed.
    """
    for _ in range(episodes):
        current = env.getRandomStartState()
        while not env.is_terminal(current):
            chosen = epsilon_greedy_action(env, current, epsilon, gamma)
            successor = env.get_next_state(current, chosen)
            # Bootstrapped target: reward for entering the successor plus its
            # discounted current estimate.
            td_target = env.get_reward(successor) + gamma * env.state_value[successor]
            td_error = td_target - env.state_value[current]
            env.state_value[current] += alpha * td_error
            current = successor

def print_optimal_policy(env, gamma=0.9):
    """Print the greedy policy implied by ``env.state_value``.

    Non-terminal cells show the arrow(s) of every move whose one-step
    lookahead score ``reward(next) + gamma * V(next)`` equals the best score,
    so tied moves are all displayed. The goal prints as "G" and traps as "T".
    """
    arrow_for = {"up": "↑", "down": "↓", "left": "←", "right": "→"}
    lines = []

    for i in range(env.grid_size):
        cells = []
        for j in range(env.grid_size):
            cell = (i, j)
            if not env.is_terminal(cell):
                # Score each move in the fixed order up, down, left, right.
                scores = []
                for move in arrow_for:
                    landing = env.get_next_state(cell, move)
                    scores.append(env.get_reward(landing) + gamma * env.state_value[landing])
                top = max(scores)
                cells.append("".join(
                    arrow for arrow, score in zip(arrow_for.values(), scores) if score == top
                ))
            elif cell == env.goal:
                cells.append("G")
            elif cell in env.traps:
                cells.append("T")
            else:
                # Terminal but neither goal nor trap: keep the placeholder.
                cells.append(".")
        lines.append(" ".join(cells))

    print("\nOptimal Policy:")
    for line in lines:
        print(line)

# Demo: train TD(0) on the default layout and show what was learned
if __name__ == "__main__":
    # Start cells and exploratory moves come from the global `random`
    # generator; seed it so a fresh run prints the same values every time.
    random.seed(43)

    env = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))  # build the environment
    env.render()  # show the grid layout

    print("Starting TD(0) training with epsilon-greedy policy...\n")
    td_training_with_policy(env, episodes=100, alpha=0.1, gamma=0.9, epsilon=0.1)  # train

    print("\nLearned State Values:")
    print(env.state_value)

    print_optimal_policy(env)  # print the greedy policy for the learned values


. . . . .
. T T T .
. . . . .
. T . T .
T . G . T

Starting TD(0) training with epsilon-greedy policy...


Learned State Values:
[[ 0.00829893 -0.02243397 -0.07837071 -0.03839208  0.0017454 ]
 [ 0.12715385  0.          0.          0.          0.1303622 ]
 [ 0.36119756  0.3878087   0.62790981  0.40720198  0.31008431]
 [-0.02693164  0.          0.8112383   0.         -0.05218608]
 [ 0.          0.5217031   0.          0.468559    0.        ]]

Optimal Policy:
↓ ← ← → ↓
↓ T T T ↓
→ → ↓ ← ←
↑ T ↓ T ↑
T → G ← T


In [27]:
def epsilon_greedy_action(state, Q, actions, epsilon):
    """Choose an action for ``state`` from the Q-table with an epsilon-greedy rule.

    With probability ``epsilon`` a uniformly random action is returned.
    Otherwise one of the actions whose ``Q[state][action]`` equals the
    maximum is returned, with ties broken uniformly at random.
    """
    # Explore.
    if random.uniform(0, 1) < epsilon:
        return random.choice(actions)

    # Exploit: gather every action tied for the highest Q-value.
    scores = Q[state]
    ceiling = max(scores[move] for move in actions)
    tied = [move for move in actions if scores[move] == ceiling]
    return random.choice(tied)


def sarsa(env, episodes=500, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn a Q-table with on-policy SARSA and return it.

    The table maps every ``(row, col)`` cell to a dict of action -> value,
    all starting at 0. Each episode begins at ``env.getRandomStartState()``
    and runs until a terminal cell is reached. The update target uses the
    action actually selected for the next state by the same epsilon-greedy
    policy that drives behaviour.

    Returns:
        dict: ``Q[state][action]`` value estimates.
    """
    actions = ["up", "down", "left", "right"]

    # Zero-initialised Q-table covering the whole grid, terminal cells included.
    Q = {}
    for row in range(env.grid_size):
        for col in range(env.grid_size):
            Q[(row, col)] = dict.fromkeys(actions, 0)

    for _ in range(episodes):
        state = env.getRandomStartState()
        action = epsilon_greedy_action(state, Q, actions, epsilon)

        while not env.is_terminal(state):
            successor = env.get_next_state(state, action)
            reward = env.get_reward(successor)
            # Select the follow-up action before updating: SARSA bootstraps
            # from the move the policy will really take.
            successor_action = epsilon_greedy_action(successor, Q, actions, epsilon)

            td_target = reward + gamma * Q[successor][successor_action]
            Q[state][action] += alpha * (td_target - Q[state][action])

            state, action = successor, successor_action

    return Q


def print_optimal_policy(env, Q):
    """Print the greedy policy encoded in the Q-table ``Q``.

    Each non-terminal cell shows the arrow(s) of every action whose Q-value
    equals that cell's maximum. The goal prints as "G" and traps as "T".
    """
    arrows = (("up", "↑"), ("down", "↓"), ("left", "←"), ("right", "→"))

    def glyph(cell):
        # Text shown for a single grid cell.
        if env.is_terminal(cell):
            if cell == env.goal:
                return "G"
            return "T" if cell in env.traps else "."
        q_row = Q[cell]
        top = max(q_row[name] for name, _ in arrows)
        return "".join(symbol for name, symbol in arrows if q_row[name] == top)

    size = env.grid_size
    lines = [" ".join(glyph((i, j)) for j in range(size)) for i in range(size)]

    print("\nOptimal Policy:")
    for line in lines:
        print(line)


# Seed the global `random` generator: SARSA draws start cells, exploratory
# moves and tie-breaks from it, so an unseeded run is not reproducible.
random.seed(43)

# Build the Gridworld environment
env = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))
env.render()

# Train with SARSA
Q = sarsa(env, episodes=100, alpha=0.1, gamma=0.9, epsilon=0.1)

# Print the greedy policy for the learned Q-table
print_optimal_policy(env, Q)

. . . . .
. T T T .
. . . . .
. T . T .
T . G . T


Optimal Policy:
↓ ← ← ← ↓
↓ T T T ↓
→ → ↓ ← ←
↑ T ↓ T ↑
T → G ← T


In [36]:
def q_learning(env, episodes=500, alpha=0.1, gamma=0.9, epsilon=0.1):
    """Learn a Q-table with off-policy Q-learning and return it.

    The table maps every ``(row, col)`` cell to a dict of action -> value,
    all starting at 0. Behaviour is epsilon-greedy, but the update target
    bootstraps from the best Q-value of the next state regardless of which
    action is taken there. Action selection uses
    ``epsilon_greedy_action(state, Q, actions, epsilon)``, which this
    notebook defines in its SARSA cell.

    Returns:
        dict: ``Q[state][action]`` value estimates.
    """
    actions = ["up", "down", "left", "right"]

    # Zero-initialised Q-table covering the whole grid, terminal cells included.
    Q = {}
    for row in range(env.grid_size):
        for col in range(env.grid_size):
            Q[(row, col)] = {move: 0 for move in actions}

    for _ in range(episodes):
        state = env.getRandomStartState()

        while not env.is_terminal(state):
            action = epsilon_greedy_action(state, Q, actions, epsilon)
            successor = env.get_next_state(state, action)
            reward = env.get_reward(successor)

            # Greedy bootstrap: best value reachable from the successor.
            best_future = max(Q[successor][move] for move in actions)
            td_target = reward + gamma * best_future
            Q[state][action] += alpha * (td_target - Q[state][action])

            state = successor

    return Q


def print_optimal_policy(env, Q):
    """Print, cell by cell, the action(s) with the highest Q-value.

    Terminal cells are labelled "G" (goal) or "T" (trap). Every other cell
    shows one arrow per action tied for the maximum of ``Q[cell]``.
    """
    names = ["up", "down", "left", "right"]
    symbols = ["↑", "↓", "←", "→"]
    side = env.grid_size
    board = []

    for i in range(side):
        row_cells = []
        for j in range(side):
            cell = (i, j)
            text = "."  # placeholder kept if a terminal cell is neither goal nor trap
            if not env.is_terminal(cell):
                peak = max(Q[cell][name] for name in names)
                text = "".join(
                    symbol for name, symbol in zip(names, symbols) if Q[cell][name] == peak
                )
            elif cell == env.goal:
                text = "G"
            elif cell in env.traps:
                text = "T"
            row_cells.append(text)
        board.append(row_cells)

    print("\nOptimal Policy:")
    for row_cells in board:
        print(" ".join(row_cells))


# Seed the global `random` generator: Q-learning draws start cells,
# exploratory moves and tie-breaks from it, so an unseeded run is not
# reproducible.
random.seed(43)

# Build the Gridworld environment
env = Gridworld(grid_size=5, traps=[(1, 1), (1, 2), (1, 3), (3, 1), (3, 3), (4, 0), (4, 4)], goal=(4, 2))
env.render()

# Train with Q-learning
Q = q_learning(env, episodes=200, alpha=0.1, gamma=0.9, epsilon=0.1)

# Print the greedy policy for the learned Q-table
print_optimal_policy(env, Q)

. . . . .
. T T T .
. . . . .
. T . T .
T . G . T


Optimal Policy:
↓ ← ← ← ↓
↓ T T T ↓
→ → ↓ ← ←
↑ T ↓ T ↑
T → G ← T
