In [1]:
import numpy as np
from collections import defaultdict

# 定义迷宫环境
class MazeEnv:
    """Deterministic grid-world maze.

    Each cell holds a one-character marker: ``'S'`` (start), ``'G'`` (goal),
    ``'#'`` (obstacle); every other marker is treated as free space.
    States are ``(row, col)`` tuples and every step yields a reward of -1.
    """

    # (row, col) offset applied by each named action.
    _MOVES = {
        'up': (-1, 0),
        'down': (1, 0),
        'left': (0, -1),
        'right': (0, 1),
    }

    def __init__(self, grid=None):
        """Build the maze.

        Args:
            grid: optional 2-D nested sequence of one-character markers.
                When omitted, the original 3x3 layout is used.

        Raises:
            ValueError: if the grid is not 2-D or lacks an 'S' or 'G' cell.
        """
        if grid is None:
            grid = [
                ['S', '-', '-'],
                ['-', '#', '-'],
                ['-', '-', 'G'],
            ]
        self.grid = np.array(grid)
        if self.grid.ndim != 2:
            raise ValueError("grid must be two-dimensional")
        # Start and goal are read from the layout so they cannot drift out of
        # sync with it; for the default grid they are (0, 0) and (2, 2).
        self.start_state = self._find_cell('S')
        self.goal_state = self._find_cell('G')
        # Order matters: greedy tie-breaking elsewhere picks the first maximum.
        self.actions = ['up', 'down', 'left', 'right']

    def _find_cell(self, marker):
        """Return the (row, col) of the first cell equal to ``marker``."""
        hits = np.argwhere(self.grid == marker)
        if len(hits) == 0:
            raise ValueError(f"grid has no {marker!r} cell")
        row, col = hits[0]
        # Plain ints keep states comparable/hashable as ordinary tuples.
        return int(row), int(col)

    def is_terminal(self, state):
        """Return True when ``state`` is the goal state."""
        return state == self.goal_state

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

        Moves that would leave the grid are clamped to its border, and moves
        into an obstacle leave the state unchanged. An unrecognised action
        does not move the agent. The reward is always -1.
        """
        row, col = state
        d_row, d_col = self._MOVES.get(action, (0, 0))
        n_rows, n_cols = self.grid.shape
        # Clamp against the actual grid shape instead of a hard-coded size.
        row = min(max(row + d_row, 0), n_rows - 1)
        col = min(max(col + d_col, 0), n_cols - 1)

        if self.grid[row, col] == '#':  # blocked: stay where we were
            return state, -1
        return (row, col), -1

# 行为策略是均匀随机策略，目标策略（被改进的策略）是贪心策略，二者不同，所以属于离线策略（off-policy）
# 动作值函数 Q 采用加权重要性采样的增量式更新

In [2]:
import numpy as np
from collections import defaultdict

# 行为策略（随机策略）和目标策略（贪婪策略）
def behavior_policy(state):
    """Pick an action uniformly at random; ``state`` is not consulted.

    The candidate actions come from the module-level ``env`` object.
    """
    candidate_actions = env.actions
    return np.random.choice(candidate_actions)

def target_policy(state, V, gamma=1.0):
    """Return the action with the highest one-step lookahead value.

    Each action of the module-level ``env`` is scored with
    ``expected_return``; on ties the earliest action in ``env.actions`` wins.
    """
    def lookahead_value(candidate):
        return expected_return(env, state, candidate, V, gamma)

    return max(env.actions, key=lookahead_value)

# 计算期望回报（假设确定性）
def expected_return(env, state, action, V, gamma):
    """One-step lookahead: immediate reward plus discounted successor value.

    ``env.step`` is called once and its result is used as the only possible
    outcome of taking ``action`` in ``state``; ``V`` is indexed by the
    successor state it returns.
    """
    successor, immediate_reward = env.step(state, action)
    discounted_future = gamma * V[successor]
    return immediate_reward + discounted_future

# 离线策略蒙特卡罗评估（加权重要性采样）
def off_policy_monte_carlo_control(env, episodes=1000, gamma=1.0):
    """Off-policy Monte Carlo control with weighted importance sampling.

    Episodes are generated by a uniform-random behaviour policy over
    ``env.actions``; the target policy is greedy with respect to ``Q``.

    Args:
        env: object exposing ``actions``, ``start_state``,
            ``is_terminal(state)`` and ``step(state, action)`` returning
            ``(next_state, reward)``.
        episodes: number of episodes to generate.
        gamma: discount factor.

    Returns:
        ``(policy, Q)`` where ``policy`` maps state -> greedy action and
        ``Q`` maps ``(state, action)`` -> estimated action value.

    Raises:
        ValueError: if ``env.actions`` is empty.
    """
    actions = list(env.actions)
    n_actions = len(actions)
    if n_actions == 0:
        raise ValueError("env.actions must not be empty")

    def sample_action():
        # Sample from the *passed-in* env (not a module-level one); indexing
        # returns the original action object rather than a NumPy scalar.
        return actions[np.random.randint(n_actions)]

    Q = defaultdict(float)
    C = defaultdict(float)  # cumulative importance weights per (state, action)
    policy = defaultdict(sample_action)  # unseen states default to a random action

    for _ in range(episodes):
        # Generate one episode with the uniform behaviour policy.
        state = env.start_state
        episode = []

        while not env.is_terminal(state):
            action = sample_action()
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state

        # Sweep backwards, accumulating the return and the importance weight.
        G = 0.0
        W = 1.0
        for state, action, reward in reversed(episode):
            G = reward + gamma * G

            # Weighted-importance-sampling incremental update.
            C[(state, action)] += W
            Q[(state, action)] += (W / C[(state, action)]) * (G - Q[(state, action)])

            # Greedy improvement; never-updated actions rank last (-inf).
            policy[state] = max(actions, key=lambda a: Q.get((state, a), float('-inf')))

            # Once the behaviour action departs from the greedy target action,
            # the importance ratio of all earlier steps is zero.
            if action != policy[state]:
                break
            # pi(a|s) / b(a|s) = 1 / (1 / n_actions) = n_actions
            W *= n_actions

    return policy, Q

# 运行环境和离线策略控制
# Build the maze and learn a greedy policy from random-behaviour episodes.
env = MazeEnv()
policy, Q = off_policy_monte_carlo_control(env)

# One line per state for which a greedy action was recorded.
print("Learned Policy:")
for state in policy.keys():
    print(f"State {state}: {policy[state]}")

# One line per (state, action) pair that received at least one update.
print("\nAction-Value Function:")
for state_action in Q:
    value = Q[state_action]
    print(f"State {state_action}: {value:.2f}")


Learned Policy:
State (2, 1): right
State (1, 2): down
State (2, 0): right
State (0, 2): down
State (1, 0): down
State (0, 0): down
State (0, 1): right

Action-Value Function:
State ((2, 1), 'right'): -1.00
State ((2, 1), 'down'): -2.00
State ((1, 2), 'down'): -1.00
State ((1, 2), 'left'): -2.00
State ((2, 0), 'right'): -2.00
State ((2, 0), 'down'): -3.00
State ((0, 2), 'down'): -2.00
State ((0, 2), 'up'): -3.00
State ((0, 2), 'right'): -3.00
State ((1, 0), 'down'): -3.00
State ((1, 0), 'left'): -4.00
State ((2, 1), 'up'): -2.00
State ((0, 0), 'down'): -4.00
State ((1, 0), 'up'): -5.00
State ((1, 2), 'right'): -2.00
State ((0, 1), 'right'): -3.00
State ((0, 1), 'up'): -4.00
State ((0, 0), 'right'): -4.00
State ((2, 0), 'left'): -3.00
State ((0, 2), 'left'): -4.00
State ((0, 0), 'up'): -5.00
State ((0, 1), 'down'): -4.00
State ((1, 0), 'right'): -4.00
State ((1, 2), 'up'): -3.00
State ((2, 1), 'left'): -3.00
State ((0, 0), 'left'): -5.00
State ((2, 0), 'up'): -4.00
State ((0, 1), 'left'

In [3]:
# ε-贪婪策略
def epsilon_greedy_policy(state, Q, epsilon=0.1, actions=None):
    """Choose an action epsilon-greedily with respect to ``Q``.

    Args:
        state: current state; used together with an action as the key into ``Q``.
        Q: mapping ``(state, action) -> value``.
        epsilon: probability of taking a uniformly random action.
        actions: candidate actions. When omitted, the module-level
            ``env.actions`` is used, which keeps the three-argument call
            working while also accepting the four-argument call form used
            by the later cells.

    Returns:
        The selected action.
    """
    if actions is None:
        actions = env.actions
    if np.random.rand() < epsilon:
        return np.random.choice(actions)  # explore
    return max(actions, key=lambda a: Q[(state, a)])  # exploit

# 行为策略和目标策略一样，都是ε-贪婪，属于在线策略
# 策略评估属于增量式更新
# 动态调整ε：随着训练的进行，逐渐线性减小ε的值，以减少探索，增加利用

In [4]:
def epsilon_greedy_policy(state, Q, epsilon, actions):
    """Epsilon-greedy action selection over ``actions``.

    With probability ``epsilon`` a uniformly random action is returned;
    otherwise the action with the largest ``Q[(state, action)]`` is returned
    (the earliest one on ties).
    """
    explore = np.random.rand() < epsilon
    if not explore:
        return max(actions, key=lambda candidate: Q[(state, candidate)])
    return np.random.choice(actions)

def on_policy_monte_carlo_control(env, episodes=1000, gamma=1.0, epsilon_start=1.0, epsilon_min=0.1, decay_rate=0.001):
    """On-policy first-visit Monte Carlo control with linearly decaying epsilon.

    Args:
        env: object exposing ``actions``, ``start_state``,
            ``is_terminal(state)`` and ``step(state, action)`` returning
            ``(next_state, reward)``.
        episodes: number of episodes to generate.
        gamma: discount factor.
        epsilon_start: exploration rate used for the first episode.
        epsilon_min: lower bound on the exploration rate.
        decay_rate: amount subtracted from epsilon per episode.

    Returns:
        ``(policy, Q)``. ``policy`` maps each visited state to a dict of
        action probabilities; ``Q`` maps ``(state, action)`` to the average
        first-visit return.
    """
    Q = defaultdict(float)
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)

    # NOTE: states never visited fall back to a bare action (not a probability
    # dict); kept as-is for compatibility with existing callers.
    policy = defaultdict(lambda: env.actions[0])

    for episode_num in range(episodes):
        # Linear epsilon decay, floored at epsilon_min.
        epsilon = max(epsilon_min, epsilon_start - decay_rate * episode_num)

        # Generate one episode with the current epsilon-greedy policy.
        state = env.start_state
        episode = []

        while not env.is_terminal(state):
            action = epsilon_greedy_policy(state, Q, epsilon, env.actions)
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state

        # Index of the first time step at which each (state, action) occurs.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode):
            first_visit.setdefault((s, a), t)

        # Backward sweep: G is the return from step t onward. Only the return
        # following the FIRST occurrence is recorded; a "seen" set filled
        # during a backward sweep would record the last occurrence instead.
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = reward + gamma * G
            if first_visit[(state, action)] == t:
                returns_sum[(state, action)] += G
                returns_count[(state, action)] += 1
                Q[(state, action)] = returns_sum[(state, action)] / returns_count[(state, action)]

        # Policy improvement: epsilon-greedy probabilities for visited states.
        for state in set(s for s, _, _ in episode):
            best_action = max(env.actions, key=lambda a: Q[(state, a)])
            policy[state] = {}
            for action in env.actions:
                if action == best_action:
                    policy[state][action] = 1 - epsilon + epsilon / len(env.actions)
                else:
                    policy[state][action] = epsilon / len(env.actions)

    # Final policy and action-value estimates.
    return policy, Q

# Train with epsilon decaying linearly from 1.0 down to 0.1.
env = MazeEnv()
policy, Q = on_policy_monte_carlo_control(
    env,
    episodes=1000,
    epsilon_start=1.0,
    epsilon_min=0.1,
    decay_rate=0.001,
)

# Action probabilities per visited state, in sorted state order.
print("Learned Policy (Probabilities):")
for state in sorted(policy.keys()):
    print(f"State {state}: {policy[state]}")

# Estimated value of every (state, action) pair present in Q.
print("\nAction-Value Function:")
for state_action in Q:
    value = Q[state_action]
    print(f"State {state_action}: {value:.2f}")

Learned Policy (Probabilities):
State (0, 0): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 1): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (1, 0): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (1, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (2, 0): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (2, 1): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}

Action-Value Function:
State ((2, 1), 'right'): -1.00
State ((2, 1), 'down'): -8.30
State ((2, 1), 'up'): -7.54
State ((2, 0), 'right'): -4.50
State ((2, 0), 'down'): -11.06
State ((1, 0), 'down'): -7.52
State ((1, 0), 'left'): -13.29
State ((2, 0), 'up'): -13.73
State ((0, 0), 'down'): -10.81
State ((0, 0), 'left'): -13.26
State ((0, 0), 'up'): -12.52
State ((1, 0), 'up'): -13.81
State ((1, 0), 'right'): -14.15
State ((2, 0), 'left'): -10.02
St

# 行为策略和目标策略一样，都是ε-贪婪，属于在线策略
# 策略评估属于增量式更新

In [5]:
def epsilon_greedy_policy(state, Q, epsilon, actions):
    """Return a random action with probability ``epsilon``, else the greedy one.

    The greedy action maximises ``Q[(state, action)]`` over ``actions``;
    ties resolve to the earliest action in the sequence.
    """
    def action_value(candidate):
        return Q[(state, candidate)]

    return (
        np.random.choice(actions)
        if np.random.rand() < epsilon
        else max(actions, key=action_value)
    )

def on_policy_monte_carlo_control(env, episodes=1000, gamma=1.0, epsilon=0.1):
    """On-policy first-visit Monte Carlo control with a constant epsilon.

    Args:
        env: object exposing ``actions``, ``start_state``,
            ``is_terminal(state)`` and ``step(state, action)`` returning
            ``(next_state, reward)``.
        episodes: number of episodes to generate.
        gamma: discount factor.
        epsilon: exploration rate of the epsilon-greedy policy.

    Returns:
        ``(policy, Q)``. ``policy`` maps each visited state to a dict of
        action probabilities; ``Q`` maps ``(state, action)`` to the average
        first-visit return.
    """
    Q = defaultdict(float)
    returns_sum = defaultdict(float)
    returns_count = defaultdict(int)

    # NOTE: states never visited fall back to a bare action (not a probability
    # dict); kept as-is for compatibility with existing callers.
    policy = defaultdict(lambda: env.actions[0])

    for _ in range(episodes):
        # Generate one episode with the current epsilon-greedy policy.
        state = env.start_state
        episode = []

        while not env.is_terminal(state):
            action = epsilon_greedy_policy(state, Q, epsilon, env.actions)
            next_state, reward = env.step(state, action)
            episode.append((state, action, reward))
            state = next_state

        # Index of the first time step at which each (state, action) occurs.
        first_visit = {}
        for t, (s, a, _) in enumerate(episode):
            first_visit.setdefault((s, a), t)

        # Backward sweep: G is the return from step t onward. Only the return
        # following the FIRST occurrence is recorded; a "seen" set filled
        # during a backward sweep would record the last occurrence instead.
        G = 0
        for t in range(len(episode) - 1, -1, -1):
            state, action, reward = episode[t]
            G = reward + gamma * G
            if first_visit[(state, action)] == t:
                returns_sum[(state, action)] += G
                returns_count[(state, action)] += 1
                Q[(state, action)] = returns_sum[(state, action)] / returns_count[(state, action)]

        # Policy improvement: epsilon-greedy probabilities for visited states.
        for state in set(s for s, _, _ in episode):
            best_action = max(env.actions, key=lambda a: Q[(state, a)])
            policy[state] = {}
            for action in env.actions:
                if action == best_action:
                    policy[state][action] = 1 - epsilon + epsilon / len(env.actions)
                else:
                    policy[state][action] = epsilon / len(env.actions)

    # Final policy and action-value estimates.
    return policy, Q

# Train with a fixed exploration rate of 0.1.
env = MazeEnv()
policy, Q = on_policy_monte_carlo_control(
    env,
    episodes=1000,
    epsilon=0.1,
)

# Action probabilities per visited state, in sorted state order.
print("Learned Policy (Probabilities):")
for state in sorted(policy.keys()):
    print(f"State {state}: {policy[state]}")

# Estimated value of every (state, action) pair present in Q.
print("\nAction-Value Function:")
for state_action in Q:
    value = Q[state_action]
    print(f"State {state_action}: {value:.2f}")

Learned Policy (Probabilities):
State (0, 0): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 1): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (0, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (1, 0): {'up': 0.025, 'down': 0.025, 'left': 0.925, 'right': 0.025}
State (1, 2): {'up': 0.025, 'down': 0.925, 'left': 0.025, 'right': 0.025}
State (2, 0): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}
State (2, 1): {'up': 0.025, 'down': 0.025, 'left': 0.025, 'right': 0.925}

Action-Value Function:
State ((0, 0), 'up'): -29.43
State ((0, 0), 'down'): -41.14
State ((0, 0), 'left'): -19.53
State ((0, 0), 'right'): -5.54
State ((1, 0), 'up'): -37.03
State ((1, 0), 'down'): -40.67
State ((1, 0), 'left'): -15.34
State ((1, 0), 'right'): -25.54
State ((0, 1), 'up'): -6.12
State ((0, 1), 'down'): -9.86
State ((0, 1), 'left'): -11.50
State ((0, 1), 'right'): -3.74
State ((2, 0), 'up'): -410.50
State ((2, 0), 'down'): -44.00
