In [1]:
import numpy as np
# Seed NumPy's global RNG so the np.random.choice sampling in this notebook is repeatable.
np.random.seed(0)
import gym

In [2]:
# Build the 4x4 FrozenLake environment used by every cell below.
env = gym.make('FrozenLake-v1', map_name="4x4")
# Seed the environment; as the cell's last expression, the value returned by seed() is displayed.
env.seed(0)

[0]

In [3]:
def play_policy(env, policy, render=False):
    """Play a single episode by sampling actions from `policy`.

    Args:
        env: Environment exposing `reset()`, `step(action)` (returning a
            4-tuple whose third item is the done flag), `render()` and
            `action_space.n`.
        policy: Table indexed by observation; each row holds the action
            probabilities passed to `np.random.choice`.
        render: When true, `env.render()` is called before every action.

    Returns:
        The sum of the rewards collected during the episode (a float).
    """
    state = env.reset()
    n_actions = env.action_space.n
    episode_return = 0.
    finished = False
    while not finished:
        if render:
            env.render()  # draw the current state before acting
        chosen = np.random.choice(n_actions, p=policy[state])
        state, step_reward, finished, _ = env.step(chosen)
        episode_return += step_reward  # accumulate the episode reward
    return episode_return

In [4]:
def monte_carlo_with_soft(env, episode_num=500000, epsilon=0.1):
    """On-policy every-visit Monte Carlo control with an epsilon-soft policy.

    The table sizes are taken from the environment's discrete spaces instead
    of being fixed at 16 states x 4 actions, so the routine works for any
    environment exposing `observation_space.n` and `action_space.n`.

    Args:
        env: Environment with `reset()`, `step(action)` (4-tuple, third item
            is the done flag), `observation_space.n` and `action_space.n`.
        episode_num: Number of episodes to sample.
        epsilon: Exploration mass spread uniformly over all actions.

    Returns:
        (policy, q): the epsilon-soft policy and the action-value estimates,
        both arrays of shape (n_states, n_actions).

    Note:
        The return `g` used for every visited pair is the reward of the
        final step only; intermediate rewards are not accumulated and no
        discounting is applied.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.full((n_states, n_actions), 1. / n_actions)  # start uniform
    q = np.zeros_like(policy)  # action-value estimates
    c = np.zeros_like(policy)  # visit counts per (state, action)
    for _ in range(episode_num):
        # Play one episode with the current soft policy.
        state_actions = []
        observation = env.reset()
        while True:
            action = np.random.choice(n_actions, p=policy[observation])
            state_actions.append((observation, action))
            observation, reward, done, _ = env.step(action)
            if done:
                break  # episode finished
        g = reward  # return credited to every visited pair
        for state, action in state_actions:
            c[state][action] += 1.
            # Incremental mean of the observed returns.
            q[state][action] += (g - q[state][action]) / c[state][action]
            # Re-derive the epsilon-soft policy for this state.
            a = q[state].argmax()
            policy[state] = epsilon / n_actions
            policy[state][a] += (1. - epsilon)
    return policy, q

In [5]:
policy, q = monte_carlo_with_soft(env)
print('柔性策略蒙特卡洛方法策略')
print(policy)
# State value v(s) = sum_a policy(a|s) * q(s, a). It gets its own name so that
# `q` keeps holding the action values instead of being overwritten.
v_soft = (policy * q).sum(axis=-1)
print('柔性策略蒙特卡洛方法状态价值：')
print(v_soft.reshape(4,4))

柔性策略蒙特卡洛方法策略
[[0.025 0.025 0.925 0.025]
 [0.025 0.025 0.025 0.925]
 [0.025 0.025 0.925 0.025]
 [0.025 0.025 0.025 0.925]
 [0.025 0.025 0.925 0.025]
 [0.25  0.25  0.25  0.25 ]
 [0.025 0.025 0.925 0.025]
 [0.25  0.25  0.25  0.25 ]
 [0.025 0.025 0.025 0.925]
 [0.025 0.925 0.025 0.025]
 [0.925 0.025 0.025 0.025]
 [0.25  0.25  0.25  0.25 ]
 [0.25  0.25  0.25  0.25 ]
 [0.025 0.025 0.925 0.025]
 [0.025 0.925 0.025 0.025]
 [0.25  0.25  0.25  0.25 ]]
柔性策略蒙特卡洛方法状态价值：
[[0.13862515 0.15262134 0.19550543 0.17339912]
 [0.12483717 0.         0.20703831 0.        ]
 [0.26985431 0.41478295 0.45345973 0.        ]
 [0.         0.57049746 0.7707072  0.        ]]


In [6]:
# Most probable action of each state, laid out as the 4x4 grid.
print(policy.argmax(axis=1).reshape(4, 4))

[[2 3 2 3]
 [2 0 2 0]
 [3 1 0 0]
 [0 2 1 0]]


In [7]:
# Average the total reward of 100 episodes played with the learned soft policy.
episode_rewards = [play_policy(env, policy) for _ in range(100)]
# The label names the method that produced `policy` (soft-policy Monte Carlo,
# not value iteration).
print("柔性策略蒙特卡洛 平均奖励：{}".format(np.mean(episode_rewards)))

价值迭代 平均奖励：0.2


In [8]:
def monte_carlo_importance_sample(env, episode_num=500000):
    """Off-policy Monte Carlo control using weighted importance sampling.

    Episodes are generated by a uniform behavior policy while a greedy
    target policy is improved. The table sizes are taken from the
    environment's discrete spaces instead of being fixed at 16 x 4.

    Args:
        env: Environment with `reset()`, `step(action)` (4-tuple, third item
            is the done flag), `observation_space.n` and `action_space.n`.
        episode_num: Number of episodes to sample.

    Returns:
        (policy, q): the deterministic (one-hot) target policy and the
        action-value estimates, both of shape (n_states, n_actions).

    Note:
        The return `g` is the reward of the final step only; intermediate
        rewards are not accumulated and no discounting is applied.
    """
    n_states = env.observation_space.n
    n_actions = env.action_space.n
    policy = np.zeros((n_states, n_actions))
    policy[:, 0] = 1  # target policy starts by always picking action 0
    behavior_policy = np.full_like(policy, 1. / n_actions)  # uniform soft policy
    q = np.zeros_like(policy)  # action-value estimates
    c = np.zeros_like(policy)  # cumulative importance weights
    for _ in range(episode_num):
        # Play one episode with the behavior policy.
        state_actions = []
        observation = env.reset()
        while True:
            action = np.random.choice(n_actions,
                    p=behavior_policy[observation])
            state_actions.append((observation, action))
            observation, reward, done, _ = env.step(action)
            if done:
                break
        g = reward  # return
        rho = 1.  # importance-sampling ratio
        # Walk the episode backwards so rho covers the steps after each pair.
        for state, action in reversed(state_actions):
            c[state][action] += rho
            q[state][action] += (rho / c[state][action] * (g - q[state][action]))
            # Policy improvement: make the target policy greedy in this state.
            a = q[state].argmax()
            policy[state] = 0.
            policy[state][a] = 1.
            if a != action:
                # The target policy would not take this action, so the ratio
                # for all earlier steps is zero: stop early.
                break
            # The target probability is 1 here, so the ratio grows by 1 / b(a|s).
            rho /= behavior_policy[state][action]
    return policy, q

In [9]:
policy, q = monte_carlo_importance_sample(env)
print('重要性采样蒙特卡洛方法策略')
print(policy)
# State value v(s) = sum_a policy(a|s) * q(s, a). It gets its own name so that
# `q` keeps holding the action values instead of being overwritten.
v_importance = (policy * q).sum(axis=-1)
print('重要性采样蒙特卡洛方法状态价值：')
print(v_importance.reshape(4,4))

重要性采样蒙特卡洛方法策略
[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]]
重要性采样蒙特卡洛方法状态价值：
[[0.81916044 0.81268583 0.65838859 0.        ]
 [0.26980756 0.         0.27221172 0.        ]
 [0.03187957 0.94680621 0.99121605 0.        ]
 [0.         0.2922619  0.99831371 0.        ]]


In [10]:
# Action selected by the target policy in each state, laid out as the 4x4 grid.
print(policy.argmax(axis=1).reshape(4, 4))

[[0 3 3 0]
 [2 0 2 0]
 [3 3 1 0]
 [0 2 3 0]]


In [11]:
# Average the total reward of 100 episodes played with the learned target policy.
episode_rewards = [play_policy(env, policy) for _ in range(100)]
# The label names the method that produced `policy` (importance-sampling
# Monte Carlo, not value iteration).
print("重要性采样蒙特卡洛 平均奖励：{}".format(np.mean(episode_rewards)))


价值迭代 平均奖励：0.07
