# 行为策略和改进策略都是均匀随机策略，所以属于在线策略
# 因为returns用列表保存了所有的累计回报，最后再计算平均值，所以不是增量式更新

In [1]:
import numpy as np
from collections import defaultdict

# 定义迷宫环境
class MazeEnv:
    def __init__(self):
        # 初始化迷宫布局
        self.grid = np.array([
            ['S', '-', '-'],
            ['-', '#', '-'],
            ['-', '-', 'G']
        ])
        # 设置起始状态和目标状态
        self.start_state = (0, 0)
        self.goal_state = (2, 2)
        # 定义可能的动作
        self.actions = ['up', 'down', 'left', 'right']
    
    def is_terminal(self, state):
        # 判断当前状态是否为目标状态
        return state == self.goal_state
    
    def step(self, state, action):
        # 根据当前状态和动作，计算下一个状态和奖励
        x, y = state
        if action == 'up': x = max(0, x - 1)
        if action == 'down': x = min(2, x + 1)
        if action == 'left': y = max(0, y - 1)
        if action == 'right': y = min(2, y + 1)
        
        if self.grid[x, y] == '#':  # 遇到障碍物
            return state, -1
        return (x, y), -1

# 蒙特卡罗控制
def monte_carlo_control(env, episodes=1000, gamma=1.0):
    """
    使用蒙特卡罗控制方法学习迷宫的最优策略。

    参数:
    env (MazeEnv): 迷宫环境对象。
    episodes (int): 训练的episode数量，默认为1000。
    gamma (float): 折扣因子，默认为1.0。

    返回:
    policy (defaultdict): 学习到的最优策略。
    V (defaultdict): 每个状态的值函数。
    """
    # 初始化值函数和回报列表
    V = defaultdict(float)
    returns = defaultdict(list)
    # 初始化策略，每个状态随机选择一个动作
    policy = defaultdict(lambda: np.random.choice(env.actions))

    for _ in range(episodes):
        # 初始化当前状态为起始状态
        state = env.start_state
        # 初始化轨迹
        episode = []
        
        # 生成轨迹
        while not env.is_terminal(state):
            # 在当前状态下随机选择一个动作
            while True:
                action = np.random.choice(env.actions)
                # 执行动作，获取下一个状态和奖励
                next_state, reward = env.step(state, action)
                # 如果下一个状态不等于当前状态，则更新策略并退出循环
                if next_state != state:
                    policy[state] = action
                    break
            # 将当前状态、动作和奖励添加到轨迹中
            episode.append((state, action, reward))
            # 更新当前状态为下一个状态
            state = next_state
        
        # 计算每个状态的累计回报
        G = 0
        visited = set()
        for state, _, reward in reversed(episode):
            G = reward + gamma * G
            if state not in visited:
                visited.add(state)
                returns[state].append(G)
                V[state] = np.mean(returns[state])
        
        # 策略改进
        for state, _, _ in episode:
            policy[state] = max(env.actions, key=lambda a: expected_return(env, state, a, V, gamma))
    
    return policy, V

def expected_return(env, state, action, V, gamma):
    """
    计算给定状态和动作的期望回报。

    参数:
    env (MazeEnv): 迷宫环境对象。
    state (tuple): 当前状态。
    action (str): 当前动作。
    V (defaultdict): 每个状态的值函数。
    gamma (float): 折扣因子。

    返回:
    float: 期望回报。
    """
    # 计算期望回报
    next_state, reward = env.step(state, action)
    return reward + gamma * V[next_state]

# 运行
env = MazeEnv()
policy, value_function = monte_carlo_control(env)
print("Learned Policy:", dict(policy))
print("State Values:", dict(value_function))


Learned Policy: {(0, 0): 'right', (0, 1): 'right', (0, 2): 'down', (1, 2): 'down', (1, 0): 'down', (2, 0): 'right', (2, 1): 'right'}
State Values: {(1, 2): -2.5563258232235704, (0, 2): -4.710059171597633, (0, 1): -5.653610771113831, (0, 0): -5.988, (1, 0): -5.780246913580247, (2, 2): 0.0, (2, 1): -3.4190140845070425, (2, 0): -5.005952380952381}


# 行为策略是均匀随机策略，改进策略为贪心策略，所以属于离线策略
# 值函数采用增量式更新

In [2]:
import numpy as np
from collections import defaultdict

# 行为策略（随机策略）和目标策略（贪婪策略）
def behavior_policy(state):
    return np.random.choice(env.actions)

def target_policy(state, V, gamma=1.0):
    best_action = max(env.actions, key=lambda a: expected_return(env, state, a, V, gamma))
    return best_action

# 计算期望回报（假设确定性）
def expected_return(env, state, action, V, gamma):
    next_state, reward = env.step(state, action)
    return reward + gamma * V[next_state]

# 离线策略蒙特卡罗评估（加权重要性采样）
def off_policy_monte_carlo_control(env, episodes=1000, gamma=1.0):
    Q = defaultdict(float)
    C = defaultdict(float)
    policy = defaultdict(lambda: np.random.choice(env.actions))  # 初始化目标策略为随机
    
    for _ in range(episodes):
        # 生成轨迹
        state = env.start_state
        episode = []
        
        while not env.is_terminal(state):
            action = behavior_policy(state)
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state
        
        # 计算每个状态的加权重要性回报
        G = 0
        W = 1  # 累积重要性权重
        for state, action, reward in reversed(episode):
            G = reward + gamma * G  # 累计回报
            
            # 更新累积加权重要性
            C[(state, action)] += W
            Q[(state, action)] += (W / C[(state, action)]) * (G - Q[(state, action)])
            
            # 更新策略为贪婪策略
            policy[state] = max(env.actions, key=lambda a: Q.get((state, a), float('-inf')))

            
            # 计算重要性采样权重
            if action != policy[state]:
                break  # 如果动作偏离目标策略，停止累积权重
            W *= 1.0 / 0.25  # 假设行为策略是随机的，行为策略的概率1/4
    
    return policy, Q

# 运行环境和离线策略控制
env = MazeEnv()
policy, Q = off_policy_monte_carlo_control(env)

print("Learned Policy:")
for state in policy:
    print(f"State {state}: {policy[state]}")

print("\nAction-Value Function:")
for state_action, value in Q.items():
    print(f"State {state_action}: {value:.2f}")


Learned Policy:
State (2, 1): right
State (2, 0): right
State (1, 2): down
State (0, 2): down
State (0, 1): right
State (1, 0): down
State (0, 0): down

Action-Value Function:
State ((2, 1), 'right'): -1.00
State ((2, 0), 'right'): -2.00
State ((2, 0), 'left'): -3.00
State ((1, 2), 'down'): -1.00
State ((1, 2), 'left'): -2.00
State ((2, 1), 'down'): -2.00
State ((1, 2), 'right'): -2.00
State ((2, 0), 'down'): -3.00
State ((0, 2), 'down'): -2.00
State ((0, 1), 'right'): -3.00
State ((0, 1), 'down'): -4.00
State ((0, 2), 'left'): -4.00
State ((0, 2), 'right'): -3.00
State ((1, 0), 'down'): -3.00
State ((0, 0), 'down'): -4.00
State ((0, 1), 'left'): -5.00
State ((2, 1), 'up'): -2.00
State ((1, 2), 'up'): -3.00
State ((0, 0), 'right'): -4.00
State ((0, 2), 'up'): -3.00
State ((2, 0), 'up'): -4.00
State ((1, 0), 'left'): -4.00
State ((0, 0), 'left'): -5.00
State ((2, 1), 'left'): -3.00
State ((0, 1), 'up'): -4.00
State ((1, 0), 'right'): -4.00
State ((0, 0), 'up'): -5.00
State ((1, 0), 'up'

# 行为策略是ε-贪心策略，策略评估用的是求平均值，所以目标策略和行为策略一样，所以属于在线策略
# 策略评估用的是求平均值，但并没有存储所有的累积回报，而是做了累加，所以应该也属于增量式更新

In [3]:
import numpy as np
from collections import defaultdict

# ε-贪婪策略
def epsilon_greedy_policy(state, Q, epsilon=0.1):
    if np.random.rand() < epsilon:
        return np.random.choice(env.actions)  # 随机动作
    else:
        return max(env.actions, key=lambda a: Q[(state, a)])  # 贪婪选择

# 在线策略蒙特卡罗控制
def on_policy_monte_carlo_control(env, episodes=1000, gamma=1.0, epsilon=0.1):
    Q = defaultdict(float)  # 状态-动作值函数
    returns_sum = defaultdict(float)  # 累计回报总和
    returns_count = defaultdict(int)  # 回报计数
    
    policy = defaultdict(lambda: np.random.choice(env.actions))  # 初始化随机策略
    
    for _ in range(episodes):
        # 生成轨迹
        state = env.start_state
        episode = []
        
        while not env.is_terminal(state):
            action = epsilon_greedy_policy(state, Q, epsilon)
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state
        
        # 计算回合中的每个状态-动作对的回报
        G = 0
        visited = set()  # 记录首次访问
        for state, action, reward in reversed(episode):
            G = reward + gamma * G  # 累计回报
            
            if (state, action) not in visited:
                returns_sum[(state, action)] += G
                returns_count[(state, action)] += 1
                Q[(state, action)] = returns_sum[(state, action)] / returns_count[(state, action)]
                visited.add((state, action))
        
        # 更新策略（ε-贪婪）
        for state, action, _ in episode:
            policy[state] = max(env.actions, key=lambda a: Q[(state, a)])
    
    return policy, Q

# 运行环境和在线策略控制
env = MazeEnv()
policy, Q = on_policy_monte_carlo_control(env, episodes=5000, epsilon=0.9)

print("Learned Policy:")
for state in sorted(policy):
    print(f"State {state}: {policy[state]}")

print("\nAction-Value Function:")
for state_action, value in Q.items():
    print(f"State {state_action}: {value:.2f}")


Learned Policy:
State (0, 0): down
State (0, 1): right
State (0, 2): down
State (1, 0): down
State (1, 2): down
State (2, 0): right
State (2, 1): right

Action-Value Function:
State ((0, 2), 'up'): -10.33
State ((0, 2), 'down'): -4.51
State ((0, 2), 'left'): -13.44
State ((0, 2), 'right'): -9.74
State ((1, 2), 'down'): -1.00
State ((0, 1), 'right'): -8.99
State ((0, 1), 'down'): -13.83
State ((0, 1), 'up'): -14.26
State ((0, 0), 'right'): -13.22
State ((0, 1), 'left'): -15.37
State ((1, 0), 'up'): -13.91
State ((0, 0), 'down'): -12.01
State ((0, 0), 'up'): -15.37
State ((0, 0), 'left'): -15.32
State ((1, 0), 'down'): -8.06
State ((1, 0), 'left'): -12.99
State ((1, 0), 'right'): -13.05
State ((1, 2), 'up'): -10.57
State ((1, 2), 'left'): -6.16
State ((1, 2), 'right'): -6.12
State ((2, 1), 'up'): -5.76
State ((2, 1), 'down'): -5.99
State ((2, 1), 'left'): -9.80
State ((2, 1), 'right'): -1.00
State ((2, 0), 'right'): -4.09
State ((2, 0), 'up'): -12.22
State ((2, 0), 'down'): -9.57
State (

# 行为策略和目标策略一样，都是ε-贪婪，属于在线策略，但策略改进只保留概率最大的行为
# 策略评估属于增量式更新
# 动态调整ε：随着训练的进行，逐渐线性减小ε的值，以减少探索，增加利用

In [4]:
# 在线策略蒙特卡罗控制，动态调整 ε
def on_policy_monte_carlo_control(env, episodes=1000, gamma=1.0, epsilon_start=1.0, epsilon_min=0.1, decay_rate=0.001):
    Q = defaultdict(float)
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    
    policy = defaultdict(lambda: np.random.choice(env.actions))
    
    for episode_num in range(episodes):
        # 动态调整 ε
        epsilon = max(epsilon_min, epsilon_start - decay_rate * episode_num)
        
        # 生成轨迹
        state = env.start_state
        episode = []
        
        while not env.is_terminal(state):
            action = epsilon_greedy_policy(state, Q, epsilon)
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state
        
        # 计算每个状态-动作对的回报
        G = 0
        visited = set()
        for state, action, reward in reversed(episode):
            G = reward + gamma * G
            if (state, action) not in visited:
                returns_sum[(state, action)] += G
                returns_count[(state, action)] += 1
                Q[(state, action)] = returns_sum[(state, action)] / returns_count[(state, action)]
                visited.add((state, action))
        
        # 更新策略
        for state, _, _ in episode:
            policy[state] = max(env.actions, key=lambda a: Q[(state, a)])
    
    return policy, Q

# 运行环境和在线策略控制
env = MazeEnv()
policy, Q = on_policy_monte_carlo_control(env, episodes=1000, epsilon_start=1.0, epsilon_min=0.1, decay_rate=0.001)

print("Learned Policy:")
for state in sorted(policy):
    print(f"State {state}: {policy[state]}")

print("\nAction-Value Function:")
for state_action, value in Q.items():
    print(f"State {state_action}: {value:.2f}")

Learned Policy:
State (0, 0): down
State (0, 1): right
State (0, 2): down
State (1, 0): down
State (1, 2): down
State (2, 0): right
State (2, 1): right

Action-Value Function:
State ((2, 1), 'right'): -1.00
State ((2, 1), 'up'): -4.88
State ((2, 0), 'right'): -2.54
State ((2, 0), 'down'): -7.53
State ((2, 1), 'left'): -6.84
State ((1, 0), 'down'): -4.51
State ((2, 0), 'up'): -8.80
State ((2, 0), 'left'): -6.59
State ((1, 0), 'left'): -10.33
State ((0, 0), 'down'): -6.70
State ((0, 0), 'left'): -11.35
State ((0, 0), 'up'): -11.94
State ((0, 1), 'left'): -13.06
State ((0, 0), 'right'): -10.73
State ((1, 0), 'up'): -10.55
State ((1, 0), 'right'): -10.15
State ((0, 1), 'up'): -13.51
State ((0, 1), 'down'): -13.10
State ((0, 2), 'left'): -12.47
State ((0, 2), 'up'): -9.64
State ((0, 2), 'right'): -10.86
State ((0, 1), 'right'): -7.44
State ((0, 2), 'down'): -3.62
State ((2, 1), 'down'): -4.61
State ((1, 2), 'down'): -1.00
State ((1, 2), 'left'): -3.32
State ((1, 2), 'up'): -8.59
State ((1, 

# 行为策略和目标策略一样，都是ε-贪婪，属于在线策略
# 策略评估属于增量式更新
# 动态调整ε：随着训练的进行，逐渐线性减小ε的值，以减少探索，增加利用

In [5]:
def epsilon_greedy_policy(state, Q, epsilon, actions):
    if np.random.rand() < epsilon:
        return np.random.choice(actions)
    else:
        return max(actions, key=lambda a: Q[(state, a)])

def on_policy_monte_carlo_control(env, episodes=1000, gamma=1.0, epsilon_start=1.0, epsilon_min=0.1, decay_rate=0.001):
    Q = defaultdict(float)
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    
    policy = defaultdict(lambda: env.actions[0])  # 初始化为固定动作
    
    for episode_num in range(episodes):
        # 动态调整 ε
        epsilon = max(epsilon_min, epsilon_start - decay_rate * episode_num)
        
        # 生成一条轨迹
        state = env.start_state
        episode = []
        
        while not env.is_terminal(state):
            action = epsilon_greedy_policy(state, Q, epsilon, env.actions)
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state
        
        # 计算每个状态-动作对的回报
        G = 0
        visited = set()
        for state, action, reward in reversed(episode):
            G = reward + gamma * G
            if (state, action) not in visited:
                returns_sum[(state, action)] += G
                returns_count[(state, action)] += 1
                Q[(state, action)] = returns_sum[(state, action)] / returns_count[(state, action)]
                visited.add((state, action))
        
        # 策略改进: 使用 epsilon-greedy 概率更新策略
        for state in set(s for s, _, _ in episode):
            best_action = max(env.actions, key=lambda a: Q[(state, a)])
            policy[state] = {}
            for action in env.actions:
                if action == best_action:
                    policy[state][action] = 1 - epsilon + epsilon / len(env.actions)
                else:
                    policy[state][action] = epsilon / len(env.actions)
    
    # 返回最终策略及 Q 函数
    return policy, Q

env = MazeEnv()
policy, Q = on_policy_monte_carlo_control(env, episodes=1000, epsilon_start=1.0, epsilon_min=0.1, decay_rate=0.001)

print("Learned Policy (Probabilities):")
for state in sorted(policy):
    print(f"State {state}: {policy[state]}")

print("\nAction-Value Function:")
for state_action, value in Q.items():
    print(f"State {state_action}: {value:.2f}")

Learned Policy (Probabilities):
State (0, 0): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 1): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (1, 0): {'up': 0.036250000000000004, 'down': 0.89125, 'left': 0.036250000000000004, 'right': 0.036250000000000004}
State (1, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (2, 0): {'up': 0.036250000000000004, 'down': 0.036250000000000004, 'left': 0.036250000000000004, 'right': 0.89125}
State (2, 1): {'up': 0.036250000000000004, 'down': 0.036250000000000004, 'left': 0.036250000000000004, 'right': 0.89125}

Action-Value Function:
State ((2, 1), 'right'): -1.00
State ((2, 1), 'up'): -5.52
State ((2, 1), 'down'): -7.10
State ((2, 0), 'right'): -3.87
State ((2, 0), 'down'): -8.93
State ((2, 0), 'left'): -10.53
State ((1, 0), 'down'): -8.08
State ((0, 0), 'down'): -11.05
State ((0, 1), 'left'): -10.53
State ((0, 1

# 行为策略和目标策略一样，都是ε-贪婪，属于在线策略
# 策略评估属于增量式更新

In [11]:
def epsilon_greedy_policy(state, Q, epsilon, actions):
    if np.random.rand() < epsilon:
        return np.random.choice(actions)
    else:
        return max(actions, key=lambda a: Q[(state, a)])

def on_policy_monte_carlo_control(env, episodes=1000, gamma=1.0, epsilon=0.1):
    Q = defaultdict(float)
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)
    
    policy = defaultdict(lambda: env.actions[0])  # 初始化为固定动作
    
    for _ in range(episodes):
        # 生成一条轨迹
        state = env.start_state
        episode = []
        
        while not env.is_terminal(state):
            action = epsilon_greedy_policy(state, Q, epsilon, env.actions)
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state
        
        # 计算每个状态-动作对的回报
        G = 0
        visited = set()
        for state, action, reward in reversed(episode):
            G = reward + gamma * G
            if (state, action) not in visited:
                returns_sum[(state, action)] += G
                returns_count[(state, action)] += 1
                Q[(state, action)] = returns_sum[(state, action)] / returns_count[(state, action)]
                visited.add((state, action))
        
        # 策略改进: 使用 epsilon-greedy 概率更新策略
        for state in set(s for s, _, _ in episode):
            best_action = max(env.actions, key=lambda a: Q[(state, a)])
            policy[state] = {}
            for action in env.actions:
                if action == best_action:
                    policy[state][action] = 1 - epsilon + epsilon / len(env.actions)
                else:
                    policy[state][action] = epsilon / len(env.actions)
    
    # 返回最终策略及 Q 函数
    return policy, Q

env = MazeEnv()
policy, Q = on_policy_monte_carlo_control(env, episodes=1000, epsilon=0.1)

print("Learned Policy (Probabilities):")
for state in sorted(policy):
    print(f"State {state}: {policy[state]}")

print("\nAction-Value Function:")
for state_action, value in Q.items():
    print(f"State {state_action}: {value:.2f}")

Learned Policy (Probabilities):
State (0, 0): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 1): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (1, 0): {'up': 0.025, 'down': 0.025, 'left': 0.925, 'right': 0.025}
State (1, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (2, 0): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (2, 1): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}

Action-Value Function:
State ((0, 0), 'up'): -22.05
State ((0, 0), 'down'): -48.00
State ((0, 0), 'left'): -24.29
State ((0, 0), 'right'): -7.02
State ((1, 0), 'up'): -26.87
State ((1, 0), 'down'): -17.73
State ((1, 0), 'left'): -6.69
State ((1, 0), 'right'): -31.72
State ((0, 1), 'up'): -16.12
State ((0, 1), 'down'): -55.04
State ((0, 1), 'left'): -26.70
State ((0, 1), 'right'): -4.32
State ((0, 2), 'up'): -3.09
State ((0, 2), 'down'): -2.11
St