In [50]:
import gymnasium as gym 
from gymnasium import spaces
import numpy as np
import torch as th
import os
import matplotlib.pyplot as plt
from scipy.interpolate import make_interp_spline
from stable_baselines3 import SAC
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.callbacks import BaseCallback
import pickle

In [46]:
class MAB(gym.Env):
    """Bernoulli multi-armed bandit driven by a UCB rule with a tunable alpha.

    The agent does not pick an arm directly. At every step it supplies the
    exploration factor ``alpha``; the environment computes a UCB index for each
    arm with that ``alpha``, pulls the arm with the largest index and returns
    the resulting 0/1 reward.

    Observation (length ``3 * k + 1``, float32):
    ``[pull counts (k), mean reward per arm (k), reward variance per arm (k), step]``.
    """

    def __init__(self, probs, T):
        """
        Args:
            probs: success probability of each arm.
            T: number of pulls per episode.
        """
        super().__init__()
        self.k = len(probs)  # number of arms
        self.probs = probs  # success probability of each arm
        self.T = T  # episode length

        # Action: alpha in [1, 100], the exploration factor of the UCB index.
        self.action_space = spaces.Box(low=1, high=100, shape=(1,), dtype=np.float32)
        # dtype is spelled out so it visibly matches what reset()/step() return.
        self.observation_space = spaces.Box(
            low=0, high=np.inf, shape=(3 * self.k + 1,), dtype=np.float32
        )
        self.reset()

    def _get_obs(self):
        """Assemble the observation vector in the dtype of ``observation_space``."""
        return np.concatenate(
            [self.n, self.average_rewards, self.var, [self.steps]]
        ).astype(np.float32)

    def reset(self, seed=None, options=None):
        """Reset all per-episode statistics.

        Args:
            seed: optional seed forwarded to ``gym.Env.reset``; it seeds
                ``self.np_random``, which is the generator used to draw rewards.
            options: unused.

        Returns:
            (observation, info): an all-zero observation and an empty dict.
        """
        super().reset(seed=seed)
        self.steps = 0
        self.n = np.zeros(self.k)  # pull count per arm
        self.average_rewards = np.zeros(self.k)  # mean reward per arm
        self.rewards = [[] for _ in range(self.k)]  # reward history per arm
        self.var = np.zeros(self.k)  # reward variance per arm
        self.alpha = 1.0  # last alpha received
        observation = self._get_obs()
        info = {}
        return observation, info

    def step(self, action):
        """Pull the arm selected by the UCB rule for the given alpha.

        Args:
            action: array-like whose first element is alpha. Batched shapes
                such as ``(1, 1)`` are accepted; only the first element is used.
                Values are expected to lie inside ``action_space``.

        Returns:
            (next_state, reward, terminated, truncated, info) where
            ``terminated`` becomes True once ``T`` pulls have been made and
            ``truncated`` is always False.
        """
        # Reduce to a plain float so batched actions do not leak array shapes
        # into self.alpha or the UCB computation.
        self.alpha = float(np.asarray(action, dtype=np.float64).reshape(-1)[0])

        # On the first step log(1) == 0, so every index equals its (zero) mean and
        # argmax picks arm 0. Afterwards, unpulled arms have n == 0 and the 1e-10
        # guard gives them a very large bonus, so they are tried first.
        ucb_values = self.average_rewards + np.sqrt(
            self.alpha * np.log(self.steps + 1) / (2 * (self.n + 1e-10))
        )
        chosen_arm = int(np.argmax(ucb_values))
        # Draw from the environment's own generator so reset(seed=...) is honoured.
        reward = int(self.np_random.binomial(1, self.probs[chosen_arm]))

        self.steps += 1
        self.n[chosen_arm] += 1
        self.rewards[chosen_arm].append(reward)
        self.average_rewards[chosen_arm] = np.mean(self.rewards[chosen_arm])
        self.var[chosen_arm] = np.var(self.rewards[chosen_arm])

        # The step counter is part of the observation, so hitting T is reported
        # as termination rather than as truncation.
        terminated = self.steps >= self.T
        truncated = False
        next_state = self._get_obs()
        info = {}

        return next_state, float(reward), terminated, truncated, info


In [47]:

def _rollout_cumulative(model, env, T, deterministic):
    """Run one T-step episode; return (cumulative reward per step, alpha per step)."""
    step_rewards = np.zeros(T)
    alphas = np.zeros(T)
    state, _ = env.reset()
    for t in range(T):
        # predict() is given a batched float32 ndarray of shape (1, obs_dim).
        obs_batch = np.asarray(state, dtype=np.float32).reshape(1, -1)
        action, _ = model.predict(obs_batch, deterministic=deterministic)
        # Drop the batch dimension before stepping and before recording alpha.
        flat_action = np.asarray(action).reshape(-1)
        state, reward, _terminated, _truncated, _info = env.step(flat_action)
        step_rewards[t] = reward
        alphas[t] = float(flat_action[0])
    return np.cumsum(step_rewards), alphas


def get_reward_episode(model_name, env, T, n_episodes,is_print=False):
    """Average the policy's cumulative reward and chosen alpha over many episodes.

    For each of ``n_episodes`` iterations the policy is rolled out twice for
    ``T`` steps: first deterministically, then stochastically. The environment
    is reset before each rollout.

    Args:
        model_name: object exposing ``predict(obs, deterministic=...)`` that
            returns ``(action, state)``.
        env: environment exposing ``reset()`` and ``step(action)``.
        T: number of steps per rollout.
        n_episodes: number of rollout pairs to average over; must be positive.
        is_print: if truthy, print the episode number after each pair.

    Returns:
        Four arrays of length ``T``: mean cumulative reward (deterministic),
        mean cumulative reward (stochastic), mean alpha (deterministic),
        mean alpha (stochastic).

    Raises:
        ValueError: if ``n_episodes`` is not positive.
    """
    if n_episodes <= 0:
        raise ValueError("n_episodes must be a positive integer")

    deterministic_rewards_sum = np.zeros(T)
    stochastic_rewards_sum = np.zeros(T)
    deterministic_alphas_sum = np.zeros(T)
    stochastic_alphas_sum = np.zeros(T)

    for episode_index in range(n_episodes):
        # Deterministic rollout first, then stochastic, matching the original
        # order in which random numbers are consumed.
        cumulative, alphas = _rollout_cumulative(model_name, env, T, deterministic=True)
        deterministic_rewards_sum += cumulative
        deterministic_alphas_sum += alphas

        cumulative, alphas = _rollout_cumulative(model_name, env, T, deterministic=False)
        stochastic_rewards_sum += cumulative
        stochastic_alphas_sum += alphas

        if is_print:
            print('Episode:', episode_index + 1)

    deterministic_rewards_mean = deterministic_rewards_sum / n_episodes
    stochastic_rewards_mean = stochastic_rewards_sum / n_episodes
    deterministic_alphas_mean = deterministic_alphas_sum / n_episodes
    stochastic_alphas_mean = stochastic_alphas_sum / n_episodes
    return deterministic_rewards_mean, stochastic_rewards_mean, deterministic_alphas_mean, stochastic_alphas_mean

进行多轮训练

按一定轮次的频率保存训练所得的策略

后续读取训练结果，绘出平均最终收益随训练轮次的变化情况


重点关注参数：

| 参数名 | 描述 |
| ----------- | ----------- |
| probs | 各臂的成功概率（多臂老虎机设定） |
| T | 每个 episode 的拉臂次数 |
| total_timesteps | 训练的总时间步数 |
| check_freq | 保存模型检查点的间隔（步数） |

In [None]:
# Build the bandit environment; the arm probabilities are drawn from a
# seeded global NumPy generator.
np.random.seed(1)
probs = np.random.rand(5)  # success probability of each of the 5 arms
formatted_probs = ["{:.4f}".format(p) for p in probs]
print("伯努利多臂老虎机的概率为：", formatted_probs)
T = 50  # pulls per episode
env = MAB(probs=probs, T=T)
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """Checkpoint the model every ``check_freq`` callback calls.

    Despite the class name, no reward is inspected: a checkpoint named
    ``model_<n_calls>.zip`` is written unconditionally each time ``n_calls``
    is a multiple of ``check_freq``.
    """

    def __init__(self, check_freq: int, save_path: str, verbose: int = 1):
        """
        Args:
            check_freq: number of callback calls between two checkpoints.
            save_path: directory that receives the checkpoint files.
            verbose: print a message for every checkpoint when greater than 0.
        """
        super().__init__(verbose)
        self.check_freq = check_freq
        self.save_path = save_path

    def _init_callback(self) -> None:
        """Create the checkpoint directory if a path was given."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Save a checkpoint when due; always return True so training continues."""
        if self.n_calls % self.check_freq != 0:
            return True

        checkpoint_name = f'model_{self.n_calls}.zip'
        self.model.save(os.path.join(self.save_path, checkpoint_name))
        if self.verbose > 0:
            print(f"Saving model checkpoint to {checkpoint_name}")
            print(f"Model {checkpoint_name} has been saved.")
        return True
    
# Checkpoint directory and the number of steps between two checkpoints.
save_path = '/Users/fengyilong/Git/MAB_SAC/UCB_record'
check_freq = 1000

# Callback that writes the periodic checkpoints during training.
callback = SaveOnBestTrainingRewardCallback(check_freq, save_path)

# SAC agent on the bandit environment; gamma=1 means rewards are not discounted.
model = SAC("MlpPolicy", env, gamma=1, verbose=2)

total_timesteps = 100_000

# Train with checkpointing enabled, then store the final model as "UCB".
model.learn(total_timesteps=total_timesteps, callback=callback)

model.save("UCB")


与环境交互一次的示例

In [None]:
# Single-episode demo: roll the saved policy out once stochastically and once
# deterministically, then compare the alpha chosen at every step. The two
# rollouts are independent episodes, so their state trajectories may differ.
model_name = SAC.load("UCB")
k = len(probs)


def _run_demo_episode(model, env, T, deterministic):
    """Roll out one T-step episode and record the policy's action statistics.

    Returns:
        (final_state, means, stds, actions): the last observation, plus arrays
        with one entry per step holding the ``mean_actions`` and
        ``exp(log_std)`` reported by the actor and the action sent to the
        environment.
    """
    state, _ = env.reset()
    means, stds, actions = [], [], []
    for _ in range(T):
        # Batched float32 observation of shape (1, obs_dim).
        obs_batch = np.asarray(state, dtype=np.float32).reshape(1, -1)
        action, _ = model.predict(obs_batch, deterministic=deterministic)

        # Query the actor on the model's own device and bring the results back
        # to the CPU, so this also works when the model was loaded on a GPU.
        with th.no_grad():
            state_tensor = th.as_tensor(obs_batch, device=model.device)
            mean_actions, log_std, _ = model.policy.actor.get_action_dist_params(state_tensor)
        means.append(mean_actions.cpu().numpy().flatten())
        stds.append(th.exp(log_std).cpu().numpy().flatten())
        actions.append(action)

        state, reward, terminated, truncated, info = env.step(action)
    return state, np.array(means), np.array(stds), np.array(actions)


def _print_episode_summary(label, final_state):
    """Print per-arm statistics and the total reward encoded in ``final_state``."""
    pulls = final_state[:k]
    arm_means = final_state[k:2 * k]
    print("伯努利多臂老虎机的概率为：", [f"{prob:.4f}" for prob in probs])
    # All k pull counts are shown, including arm 0.
    print(f"{label}拉臂次数:", pulls)
    print(f"{label}臂收益均值:", np.round(arm_means, 4))
    print(f"{label}臂收益方差:", np.round(final_state[2 * k:3 * k], 4))
    # pulls * mean reward recovers the summed reward of each arm.
    print(f"{label}总收益:", np.sum(pulls * arm_means))


# Stochastic rollout (actions sampled from the policy).
state, means1, stds1, actions1 = _run_demo_episode(model_name, env, T, deterministic=False)
_print_episode_summary("随机策略", state)

# Deterministic rollout.
state, means2, stds2, actions2 = _run_demo_episode(model_name, env, T, deterministic=True)
time_steps = np.arange(T)

# Plot the alpha chosen by both policies over the episode on a single axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(time_steps, actions1.flatten(), '-o', label='stochastic actions')
ax.plot(time_steps, actions2.flatten(), '-o', label='deterministic actions')
ax.set_xlabel('time')
ax.set_ylabel('Alpha')
ax.set_title('Stochastic and deterministic actions over time steps in an episode')
ax.legend()
fig.tight_layout()
plt.show()

_print_episode_summary("确定策略", state)

In [None]:
# Evaluate every saved checkpoint and track the mean final cumulative reward.
deterministic_rewards = []
stochastic_rewards = []
timesteps = []
n_episodes = 10000
for step in range(check_freq, total_timesteps + 1, check_freq):
    print("解压:", step)
    model_path = os.path.join(save_path, f'model_{step}.zip')

    # Load the checkpoint written after `step` callback calls.
    model = SAC.load(model_path)

    # Only the two reward curves are needed here; the alpha curves are ignored.
    deterministic_temp_reward, stochastic_temp_reward, *_ = get_reward_episode(model, env, T, n_episodes)

    # The last entry of each curve is the cumulative reward after T steps.
    timesteps.append(step)
    deterministic_rewards.append(deterministic_temp_reward[-1])
    stochastic_rewards.append(stochastic_temp_reward[-1])

# Plot the mean final cumulative reward against the amount of training.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(timesteps, deterministic_rewards, label='Deterministic Rewards')
ax.plot(timesteps, stochastic_rewards, label='Stochastic Rewards')
ax.set_xlabel('Training Timesteps')
ax.set_ylabel('Average Cumulative Reward')
ax.set_title('Average Cumulative Reward vs Training Timesteps')
ax.legend()
ax.grid()
plt.show()

In [None]:
# Evaluate the final trained model and persist the per-step averages.
n_episodes = 1000
model_name = SAC.load("UCB")

# Run the evaluation with per-episode progress printing enabled.
(
    deterministic_rewards_mean,
    stochastic_rewards_mean,
    deterministic_alphas_mean,
    stochastic_alphas_mean,
) = get_reward_episode(model_name, env, T, n_episodes, is_print=1)

# Bundle the four curves under the keys that the loading cell reads back.
data = dict(
    deterministic_rewards_mean=deterministic_rewards_mean,
    stochastic_rewards_mean=stochastic_rewards_mean,
    deterministic_alphas_mean=deterministic_alphas_mean,
    stochastic_alphas_mean=stochastic_alphas_mean,
)

# Write the bundle to disk.
with open('/Users/fengyilong/Git/MAB_SAC/UCB_record/data.pkl', 'wb') as f:
    pickle.dump(data, f)

print("数据已保存到 /Users/fengyilong/Git/MAB_SAC/UCB_record/data.pkl")


In [None]:
# Load the saved evaluation curves and plot them.
data_path = '/Users/fengyilong/Git/MAB_SAC/UCB_record/data.pkl'

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# files that were written by this notebook.
with open(data_path, 'rb') as f:
    loaded_data = pickle.load(f)

deterministic_rewards_mean = loaded_data['deterministic_rewards_mean']
stochastic_rewards_mean = loaded_data['stochastic_rewards_mean']
deterministic_alphas_mean = loaded_data['deterministic_alphas_mean']
stochastic_alphas_mean = loaded_data['stochastic_alphas_mean']

# Report the path that was actually read.
print(f"数据已从 {data_path} 加载")


def _plot_policy_pair(deterministic_curve, stochastic_curve, quantity, ylabel, title):
    """Plot a deterministic and a stochastic per-step curve on one figure."""
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(deterministic_curve, label=f'Deterministic {quantity}')
    ax.plot(stochastic_curve, label=f'Stochastic {quantity}')
    ax.set_xlabel('Time Steps')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid()
    plt.show()


# Mean cumulative reward at every step for both policies.
_plot_policy_pair(
    deterministic_rewards_mean,
    stochastic_rewards_mean,
    'Rewards',
    'Average Cumulative Reward',
    'Average Cumulative Reward vs Time Steps',
)

# Mean alpha chosen at every step for both policies.
_plot_policy_pair(
    deterministic_alphas_mean,
    stochastic_alphas_mean,
    'Alpha',
    'Average Alpha',
    'Average Alpha vs Time Steps',
)