In [1]:
%matplotlib inline

import sys
import logging
import itertools

import numpy as np
np.random.seed(0)
import pandas as pd
import gym
import matplotlib.pyplot as plt
import torch
torch.manual_seed(0)
import torch.nn as nn
import torch.optim as optim
import torch.distributions as distributions

# Send timestamped DEBUG-and-above records to stdout so they show up in the cell output.
logging.basicConfig(
    level=logging.DEBUG,
    stream=sys.stdout,
    format='%(asctime)s [%(levelname)s] %(message)s',
    datefmt='%Y/%m/%d %H:%M:%S',
)

# Create and seed the environment, then log every attribute of the
# environment object followed by every attribute of its spec.
env = gym.make('CartPole-v0')
env.seed(0)
for attributes in (vars(env), vars(env.spec)):
    for key, value in attributes.items():
        logging.info('%s: %s', key, value)

2021/08/18 14:27:09 [INFO] env: <CartPoleEnv<CartPole-v0>>
2021/08/18 14:27:09 [INFO] action_space: Discrete(2)
2021/08/18 14:27:09 [INFO] observation_space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
2021/08/18 14:27:09 [INFO] reward_range: (-inf, inf)
2021/08/18 14:27:09 [INFO] metadata: {'render.modes': ['human', 'rgb_array'], 'video.frames_per_second': 50}
2021/08/18 14:27:09 [INFO] _max_episode_steps: 200
2021/08/18 14:27:09 [INFO] _elapsed_steps: None
2021/08/18 14:27:09 [INFO] id: CartPole-v0
2021/08/18 14:27:09 [INFO] entry_point: gym.envs.classic_control:CartPoleEnv
2021/08/18 14:27:09 [INFO] reward_threshold: 195.0
2021/08/18 14:27:09 [INFO] nondeterministic: False
2021/08/18 14:27:09 [INFO] max_episode_steps: 200
2021/08/18 14:27:09 [INFO] _kwargs: {}
2021/08/18 14:27:09 [INFO] _env_name: CartPole


In [5]:
# VPG

class VPGAgent:
    """Vanilla policy gradient (VPG) agent driven by a softmax policy network.

    While in ``'train'`` mode every call to :meth:`step` is appended to a flat
    trajectory list laid out as ``[observation, reward, done, action, ...]``,
    and :meth:`close` performs one policy update from that recorded episode.
    """

    def __init__(self, env) -> None:
        """Build the policy network and its Adam optimizer from ``env``'s spaces."""
        self.action_n = env.action_space.n
        self.gamma = 0.99
        observation_dim = env.observation_space.shape[0]
        self.policy_net = self.build_net(
            input_size=observation_dim,
            hidden_sizes=[],
            output_size=self.action_n,
            output_activator=nn.Softmax(1),
        )
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.005)

    def build_net(self, input_size, hidden_sizes, output_size, output_activator=None, use_bias=False):
        """Return an ``nn.Sequential`` of linear layers separated by ReLU.

        ``hidden_sizes`` is a list of hidden widths (may be empty). No ReLU
        follows the last linear layer; ``output_activator`` is appended instead
        when it is truthy. The assembled model is printed before being returned.
        """
        widths = [input_size] + hidden_sizes + [output_size]
        modules = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position:
                # ReLU only sits *between* consecutive linear layers.
                modules.append(nn.ReLU())
            modules.append(nn.Linear(fan_in, fan_out, bias=use_bias))
        if output_activator:
            modules.append(output_activator)
        model = nn.Sequential(*modules)

        print(model)
        return model

    def reset(self, mode=None):
        """Store ``mode``; in ``'train'`` mode also start an empty trajectory."""
        self.mode = mode
        if mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, done):
        """Sample an action for ``observation`` from the current policy.

        In ``'train'`` mode the tuple (observation, reward, done, action) is
        appended to the trajectory. Returns the sampled action index.
        """
        observation_batch = torch.as_tensor(observation, dtype=torch.float).unsqueeze(0)
        action_probs = self.policy_net(observation_batch)
        # Categorical draws an integer index in 0..K-1 according to the K probabilities.
        sampler = distributions.Categorical(action_probs)
        action = sampler.sample().numpy()[0]
        if self.mode == 'train':
            self.trajectory.extend((observation, reward, done, action))
        return action

    def close(self):
        """Finish the episode; in ``'train'`` mode run one learning update."""
        if self.mode == 'train':
            self.learn()

    def learn(self):
        """Run one policy-gradient step on the recorded trajectory.

        The loss is the mean over time steps of
        ``-(sum_{k>=t} gamma^k * reward_k) * log pi(action_t | state_t)``.
        """
        observations, rewards, actions = (self.trajectory[offset::4] for offset in (0, 1, 3))
        states = torch.as_tensor(observations, dtype=torch.float)
        reward_values = torch.as_tensor(rewards, dtype=torch.float)
        action_indices = torch.as_tensor(actions, dtype=torch.long)

        time_steps = torch.arange(states.shape[0], dtype=torch.float)  # 0, 1, 2, ...
        discounts = self.gamma ** time_steps
        # Reverse cumulative sum: entry t holds sum_{k>=t} gamma^k * reward_k.
        discounted_returns = (discounts * reward_values).flip(0).cumsum(0).flip(0)

        all_probs = self.policy_net(states)
        chosen_probs = all_probs.gather(1, action_indices.unsqueeze(1)).squeeze(1)
        # Clamp into [1e-6, 1] so the logarithm stays finite.
        log_chosen_probs = chosen_probs.clamp(1e-6, 1.).log()
        loss = -(discounted_returns * log_chosen_probs).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

# Bind the global `agent` (read by the train/test cell) to a fresh VPG agent built from `env`.
agent = VPGAgent(env)

Sequential(
  (0): Linear(in_features=4, out_features=2, bias=False)
  (1): Softmax(dim=1)
)


In [3]:
# VPG with Baseline

class VPGwBaselineAgent:
    """Vanilla policy gradient (VPG) agent with a learned state-dependent baseline.

    A softmax policy network selects actions and a separate linear baseline
    network predicts the return of each state. While in ``'train'`` mode every
    call to :meth:`step` is appended to a flat trajectory list laid out as
    ``[observation, reward, done, action, ...]``; :meth:`close` then updates
    both networks from that recorded episode.
    """

    def __init__(self, env) -> None:
        """Build policy/baseline networks and their Adam optimizers from ``env``'s spaces."""
        self.action_n = env.action_space.n
        self.gamma = 0.99
        self.policy_net = self.build_net(input_size=env.observation_space.shape[0], hidden_sizes=[], output_size=self.action_n, output_activator=nn.Softmax(1))
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.005)
        self.baseline_net = self.build_net(input_size=env.observation_space.shape[0], hidden_sizes=[])
        self.baseline_optimizer = optim.Adam(self.baseline_net.parameters(), lr=0.01)
        self.baseline_loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size=1, output_activator=None, use_bias=False):
        """Return an ``nn.Sequential`` of linear layers separated by ReLU.

        Args:
            input_size: width of the network input.
            hidden_sizes: list of hidden-layer widths (may be empty).
            output_size: width of the final linear layer.
            output_activator: optional module appended after the last linear layer.
            use_bias: whether the linear layers carry a bias term.

        The assembled model is printed before being returned.
        """
        layers = []
        # Distinct loop names so the parameters are not shadowed while iterating.
        for in_features, out_features in zip([input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
            layers.append(nn.Linear(in_features, out_features, bias=use_bias))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no ReLU after the final linear layer
        if output_activator:
            layers.append(output_activator)
        model = nn.Sequential(*layers)

        print(model)
        return model

    def reset(self, mode=None):
        """Store ``mode``; in ``'train'`` mode also start an empty trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, done):
        """Sample an action for ``observation`` from the current policy.

        In ``'train'`` mode (observation, reward, done, action) is appended to
        the trajectory. Returns the sampled action index.
        """
        state_tensor = torch.as_tensor(observation, dtype=torch.float).unsqueeze(0)
        # Acting needs no autograd graph; gradients are recomputed in learn().
        with torch.no_grad():
            prob_tensor = self.policy_net(state_tensor)
        # Categorical draws an integer index in 0..K-1 according to the K probabilities.
        action_tensor = distributions.Categorical(prob_tensor).sample()
        action = action_tensor.numpy()[0]
        if self.mode == 'train':
            self.trajectory += [observation, reward, done, action]
        return action

    def close(self):
        """Finish the episode; in ``'train'`` mode run one learning update."""
        if self.mode == 'train':
            self.learn()

    def learn(self):
        """Update the baseline and then the policy from the recorded trajectory.

        With ``D_t = sum_{k>=t} gamma^k * reward_k`` and baseline prediction
        ``B(s_t)``, the baseline is regressed (MSE) onto ``D_t / gamma^t`` and
        the policy loss is ``mean_t( -(D_t - gamma^t * B(s_t)) * log pi(a_t|s_t) )``.
        """
        state_tensor = torch.as_tensor(self.trajectory[0::4], dtype=torch.float)
        reward_tensor = torch.as_tensor(self.trajectory[1::4], dtype=torch.float)
        action_tensor = torch.as_tensor(self.trajectory[3::4], dtype=torch.long)
        arange_tensor = torch.arange(state_tensor.shape[0], dtype=torch.float)  # 0, 1, ..., T-1

        # train baseline
        discount_tensor = self.gamma ** arange_tensor
        discounted_reward_tensor = discount_tensor * reward_tensor
        # Reverse cumulative sum: entry t holds sum_{k>=t} gamma^k * reward_k.
        discounted_return_tensor = discounted_reward_tensor.flip(0).cumsum(0).flip(0)
        return_tensor = discounted_return_tensor / discount_tensor

        # Squeeze the (T, 1) network output to (T,). Without this, the
        # subtraction below broadcasts (T,) against (T, 1) into a (T, T) matrix
        # and every time step ends up using the episode-mean baseline.
        pred_tensor = self.baseline_net(state_tensor).squeeze(1)
        psi_tensor = (discounted_return_tensor - discount_tensor * pred_tensor).detach()  # gamma^t * (G_t - B(S_t))
        base_loss_tensor = self.baseline_loss(pred_tensor, return_tensor)
        self.baseline_optimizer.zero_grad()
        base_loss_tensor.backward()
        self.baseline_optimizer.step()

        # train policy
        all_pi_tensor = self.policy_net(state_tensor)  # (T, action_n)
        # Pick the probability of the action that was actually taken at each step.
        pi_tensor = torch.gather(all_pi_tensor, 1, action_tensor.unsqueeze(1)).squeeze(1)
        log_pi_tensor = torch.log(torch.clamp(pi_tensor, 1e-6, 1.))  # clamp keeps log finite
        policy_loss_tensor = -(psi_tensor * log_pi_tensor).mean()

        self.optimizer.zero_grad()
        policy_loss_tensor.backward()
        self.optimizer.step()

# Bind the global `agent` (read by the train/test cell) to a fresh VPG-with-baseline agent built from `env`.
agent = VPGwBaselineAgent(env)

Sequential(
  (0): Linear(in_features=4, out_features=2, bias=False)
  (1): Softmax(dim=1)
)
Sequential(
  (0): Linear(in_features=4, out_features=1, bias=False)
)


In [4]:
# Off-Policy VPG

class OffPolicyVPGAgent:
    """Off-policy vanilla policy gradient agent trained from uniformly random behavior.

    In ``'train'`` mode actions are drawn uniformly at random and recorded in a
    flat trajectory list laid out as ``[observation, reward, done, action, ...]``;
    in any other mode actions are sampled from the softmax policy network.
    """

    def __init__(self, env,):
        """Build the policy network and its Adam optimizer from ``env``'s spaces."""
        self.action_n = env.action_space.n
        self.gamma = 0.99

        observation_dim = env.observation_space.shape[0]
        self.policy_net = self.build_net(
            input_size=observation_dim,
            hidden_sizes=[],
            output_size=self.action_n,
            output_activator=nn.Softmax(1),
        )
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=0.06)

    def build_net(self, input_size, hidden_sizes, output_size,
            output_activator=None, use_bias=False):
        """Return an ``nn.Sequential`` of linear layers separated by ReLU.

        No ReLU follows the last linear layer; ``output_activator`` is appended
        instead when it is truthy.
        """
        widths = [input_size] + hidden_sizes + [output_size]
        modules = []
        for position, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            if position:
                # ReLU only sits *between* consecutive linear layers.
                modules.append(nn.ReLU())
            modules.append(nn.Linear(fan_in, fan_out, bias=use_bias))
        if output_activator:
            modules.append(output_activator)
        return nn.Sequential(*modules)

    def reset(self, mode=None):
        """Store ``mode``; in ``'train'`` mode also start an empty trajectory."""
        self.mode = mode
        if mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, done):
        """Choose an action: uniformly random when training, policy-sampled otherwise."""
        if self.mode != 'train':
            observation_batch = torch.as_tensor(observation, dtype=torch.float).unsqueeze(0)
            action_probs = self.policy_net(observation_batch)
            sampled = distributions.Categorical(action_probs).sample()
            return sampled.numpy()[0]

        # Behavior policy: uniform over the available actions.
        action = np.random.choice(self.action_n)
        self.trajectory.extend((observation, reward, done, action))
        return action

    def close(self):
        """Finish the episode; in ``'train'`` mode run one learning update."""
        if self.mode == 'train':
            self.learn()

    def learn(self):
        """Run one importance-weighted policy-gradient step on the trajectory.

        The loss is the mean over time steps of
        ``-(sum_{k>=t} gamma^k * reward_k) / b * pi(action_t | state_t)``
        where ``b = 1 / action_n`` is the uniform behavior probability.
        """
        observations, rewards, actions = (self.trajectory[offset::4] for offset in (0, 1, 3))
        states = torch.as_tensor(observations, dtype=torch.float)
        reward_values = torch.as_tensor(rewards, dtype=torch.float)
        action_indices = torch.as_tensor(actions, dtype=torch.long)

        time_steps = torch.arange(states.shape[0], dtype=torch.float)
        discounts = self.gamma ** time_steps
        # Reverse cumulative sum: entry t holds sum_{k>=t} gamma^k * reward_k.
        discounted_returns = (discounts * reward_values).flip(0).cumsum(0).flip(0)

        all_probs = self.policy_net(states)
        chosen_probs = all_probs.gather(1, action_indices.unsqueeze(1)).squeeze(1)
        behavior_prob = 1. / self.action_n  # uniform behavior policy b
        loss = -(discounted_returns / behavior_prob * chosen_probs).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()


# Bind the global `agent` (read by the train/test cell) to a fresh off-policy VPG agent built from `env`.
agent = OffPolicyVPGAgent(env)

In [3]:
# Off-Policy VPG with Baseline

class OffPolicyVPGwBaselineAgent:
    """Off-policy vanilla policy gradient agent with a learned state-dependent baseline.

    In ``'train'`` mode actions are drawn uniformly at random and recorded in a
    flat trajectory list laid out as ``[observation, reward, done, action, ...]``;
    in any other mode actions are sampled from the softmax policy network.
    :meth:`close` updates the baseline network and then the policy network.
    """

    def __init__(self, env,):
        """Build policy/baseline networks and their Adam optimizers from ``env``'s spaces."""
        self.action_n = env.action_space.n
        self.gamma = 0.99

        self.policy_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[],
                output_size=self.action_n, output_activator=nn.Softmax(1))
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=0.06)
        self.baseline_net = self.build_net(
                input_size=env.observation_space.shape[0],
                hidden_sizes=[])
        # The baseline optimizer must own the *baseline* parameters. Binding it
        # to the policy parameters would leave the baseline network untrained.
        self.baseline_optimizer = optim.Adam(self.baseline_net.parameters(), lr=0.1)
        self.baseline_loss = nn.MSELoss()

    def build_net(self, input_size, hidden_sizes, output_size=1,
            output_activator=None, use_bias=False):
        """Return an ``nn.Sequential`` of linear layers separated by ReLU.

        Args:
            input_size: width of the network input.
            hidden_sizes: list of hidden-layer widths (may be empty).
            output_size: width of the final linear layer.
            output_activator: optional module appended after the last linear layer.
            use_bias: whether the linear layers carry a bias term.
        """
        layers = []
        # Distinct loop names so the parameters are not shadowed while iterating.
        for in_features, out_features in zip(
                [input_size,] + hidden_sizes, hidden_sizes + [output_size,]):
            layers.append(nn.Linear(in_features, out_features, bias=use_bias))
            layers.append(nn.ReLU())
        layers = layers[:-1]  # no ReLU after the final linear layer
        if output_activator:
            layers.append(output_activator)
        model = nn.Sequential(*layers)
        return model

    def reset(self, mode=None):
        """Store ``mode``; in ``'train'`` mode also start an empty trajectory."""
        self.mode = mode
        if self.mode == 'train':
            self.trajectory = []

    def step(self, observation, reward, done):
        """Choose an action: uniformly random when training, policy-sampled otherwise."""
        if self.mode == 'train':
            action = np.random.choice(self.action_n)  # uniform random behavior policy
            self.trajectory += [observation, reward, done, action]
        else:
            state_tensor = torch.as_tensor(observation,
                    dtype=torch.float).unsqueeze(0)
            # Acting needs no autograd graph; gradients are recomputed in learn().
            with torch.no_grad():
                prob_tensor = self.policy_net(state_tensor)
            action_tensor = distributions.Categorical(prob_tensor).sample()
            action = action_tensor.numpy()[0]
        return action

    def close(self):
        """Finish the episode; in ``'train'`` mode run one learning update."""
        if self.mode == 'train':
            self.learn()

    def learn(self):
        """Update the baseline and then the policy from the recorded trajectory.

        With ``D_t = sum_{k>=t} gamma^k * reward_k``, baseline prediction
        ``B(s_t)`` and uniform behavior probability ``b = 1 / action_n``, the
        baseline is regressed (MSE) onto ``D_t / gamma^t`` and the policy loss
        is ``mean_t( -(D_t - gamma^t * B(s_t)) / b * pi(a_t|s_t) )``.
        """
        state_tensor = torch.as_tensor(self.trajectory[0::4], dtype=torch.float)
        reward_tensor = torch.as_tensor(self.trajectory[1::4], dtype=torch.float)
        action_tensor = torch.as_tensor(self.trajectory[3::4], dtype=torch.long)
        arange_tensor = torch.arange(state_tensor.shape[0], dtype=torch.float)

        # train baseline
        discount_tensor = self.gamma ** arange_tensor
        discounted_reward_tensor = discount_tensor * reward_tensor
        # Reverse cumulative sum: entry t holds sum_{k>=t} gamma^k * reward_k.
        discounted_return_tensor = discounted_reward_tensor.flip(
                0).cumsum(0).flip(0)
        return_tensor = discounted_return_tensor / discount_tensor
        # Squeeze the (T, 1) network output to (T,). Without this, the
        # subtraction below broadcasts (T,) against (T, 1) into a (T, T) matrix
        # and every time step ends up using the episode-mean baseline.
        pred_tensor = self.baseline_net(state_tensor).squeeze(1)
        psi_tensor = (discounted_return_tensor -
                discount_tensor * pred_tensor).detach()
        baseline_loss_tensor = self.baseline_loss(pred_tensor, return_tensor)
        self.baseline_optimizer.zero_grad()
        baseline_loss_tensor.backward()
        self.baseline_optimizer.step()

        # train policy
        all_pi_tensor = self.policy_net(state_tensor)
        # Pick the probability of the action that was actually taken at each step.
        pi_tensor = torch.gather(all_pi_tensor, 1,
                action_tensor.unsqueeze(1)).squeeze(1)
        behavior_prob = 1. / self.action_n  # uniform behavior policy b
        policy_loss_tensor = -(psi_tensor / behavior_prob * pi_tensor).mean()
        self.policy_optimizer.zero_grad()
        policy_loss_tensor.backward()
        self.policy_optimizer.step()


# Bind the global `agent` (read by the train/test cell) to a fresh off-policy VPG-with-baseline agent built from `env`.
agent = OffPolicyVPGwBaselineAgent(env)

In [4]:
def play_episode(env, agent, max_episode_steps=None, mode=None, render=False):
    """Run one episode of ``agent`` in ``env`` and return ``(total_reward, steps)``.

    The agent is reset with ``mode`` and queried once per loop iteration with
    the latest ``(observation, reward, done)``, including once more with
    ``done=True`` after the environment terminates. When ``max_episode_steps``
    is truthy, the episode is cut off after that many environment steps (the
    agent is not queried again in that case). ``agent.close()`` is always
    called before returning.
    """
    observation = env.reset()
    reward, done = 0., False
    agent.reset(mode=mode)

    total_reward = 0.
    step_count = 0
    out_of_steps = False
    while not out_of_steps:
        action = agent.step(observation, reward, done)
        if render:
            env.render()
        if done:
            # The terminal observation has been shown to the agent; stop here.
            break
        observation, reward, done, _ = env.step(action)
        total_reward += reward
        step_count += 1
        out_of_steps = bool(max_episode_steps) and step_count >= max_episode_steps

    agent.close()
    return total_reward, step_count


logging.info('==== train ====')
episode_rewards = []
episode = 0
while True:
    # The unwrapped environment is used and the step cap is enforced by
    # play_episode itself (presumably to bypass the wrapper's own limit).
    episode_reward, elapsed_steps = play_episode(
        env.unwrapped,
        agent,
        max_episode_steps=env._max_episode_steps,
        mode='train',
    )
    episode_rewards.append(episode_reward)
    logging.debug('train episode %d: reward = %.2f, steps = %d',
                  episode, episode_reward, elapsed_steps)
    # Stop once the mean reward over the most recent 20 episodes exceeds 199.
    if np.mean(episode_rewards[-20:]) > 199:
        break
    episode += 1
plt.plot(episode_rewards)


logging.info('==== test ====')
test_episode_rewards = []
# Evaluate for 100 episodes with the default mode, so the agent does not train.
for episode in range(100):
    episode_reward, elapsed_steps = play_episode(env, agent)
    test_episode_rewards.append(episode_reward)
    logging.debug(
        'test episode %d: reward = %.2f, steps = %d',
        episode, episode_reward, elapsed_steps,
    )
mean_test_reward = np.mean(test_episode_rewards)
std_test_reward = np.std(test_episode_rewards)
logging.info('average episode reward = %.2f ± %.2f', mean_test_reward, std_test_reward)

2021/08/18 13:59:46 [INFO] ==== train ====
arange_tensor:  tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17., 18., 19., 20., 21.])
2021/08/18 13:59:46 [DEBUG] train episode 0: reward = 21.00, steps = 21
arange_tensor:  tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
        28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39., 40., 41.,
        42., 43., 44., 45.])
2021/08/18 13:59:46 [DEBUG] train episode 1: reward = 45.00, steps = 45
arange_tensor:  tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17.])
2021/08/18 13:59:46 [DEBUG] train episode 2: reward = 17.00, steps = 17
arange_tensor:  tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
        14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26., 27.,
        28., 29., 30., 3