## PG-based CEM method to solve CartPole

$\text{Policy Gradient}: \min\limits_{\pi} \mathcal{L} := -\mathbb{E}_{\tau}[Q(s,a)\log \pi(a|s)], \tau=\{(s_i,a_i,r_i)\}$

$\text{CEM: a kind of PG method where } Q(s,a) = 1 \text{ (good episodes) or } 0 \text{ (filtered episodes)}$

In [1]:
$\text{Policy Gradient}: \min\limits_{\pi} \mathcal{L} := -\mathbb{E}_{\tau}[Q(s,a)\log \pi(a|s)], \tau=\{(s_i,a_i,r_i)\}$

$\text{CEM: a kind of PG method where } Q(s,a) = 1 \text{ (good episodes) or } 0 \text{ (filtered episodes)}$

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import gym
from torch.utils.tensorboard import SummaryWriter

### step1. define the environment

In [3]:
# Create the CartPole-v1 environment used for training.
env = gym.make('CartPole-v1')
# Length of the observation vector (first dimension of the observation space shape).
obs_size = env.observation_space.shape[0]
# Number of discrete actions exposed by the action space.
n_actions = env.action_space.n

In [4]:
# An observation of CartPole-v1 is an array of four floats; per the author's
# note they are the cart x-position, cart velocity, pole angle and pole
# angular velocity.
# NOTE(review): sample() draws from the declared space, so the velocity
# entries can be huge (see the output below) -- presumably not a reachable state.
env.observation_space.sample() 

array([ 1.5679501e+00, -2.8996554e+38, -1.3380405e-01,  1.9239598e+38],
      dtype=float32)

In [5]:
# An action of CartPole-v1 is a discrete number: 0 (left) or 1 (right).
env.action_space.sample()

0

### step2. create the policy network

In [6]:
class Net(nn.Module):
    """Small MLP policy network: observation -> action logits.

    Args:
        obs_size: number of features in one observation.
        n_actions: number of discrete actions (size of the logit vector).
        hidden_size: width of the single hidden layer.
        device: stored as ``self.device`` for callers that read it. Sampling
            in ``policy`` follows the device the parameters currently live
            on, so the network also works before/without ``net.to(device)``.
    """

    def __init__(self, obs_size, n_actions, hidden_size=128, device="cuda:0"):
        super(Net, self).__init__()
        self.hidden_layer = nn.Sequential(
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions)
        )
        self.sm = nn.Softmax(dim=1)  # turns logits into action probabilities
        self.device = device

    def forward(self, x):
        """Return raw action logits (no softmax) for a batch of observations."""
        return self.hidden_layer(x)

    def policy(self, obs):
        """Sample one action index for a single observation.

        Args:
            obs: one observation (array-like of length ``obs_size``).

        Returns:
            The sampled action index, drawn from the softmax of the logits.
        """
        # Follow the weights rather than the constructor-time device string,
        # otherwise the input and the parameters can end up on different devices.
        param_device = next(self.parameters()).device
        # Action sampling needs no autograd graph.
        with torch.no_grad():
            obs_v = torch.as_tensor(
                obs, dtype=torch.float32, device=param_device
            ).unsqueeze(dim=0)  # shape (1, obs_size)
            act_prob_v = self.sm(self(obs_v))  # shape (1, n_actions)
        act_probs = act_prob_v.cpu().numpy()[0]  # 1-D probability array
        # Draw the action index according to the policy distribution.
        action = np.random.choice(len(act_probs), p=act_probs)
        return action

In [7]:
# Build the policy network with the default hidden size and device arguments.
net = Net(obs_size, n_actions)

In [8]:
# Show the module's repr (its layer structure) as the cell output.
net

Net(
  (hidden_layer): Sequential(
    (0): Linear(in_features=4, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=2, bias=True)
  )
  (sm): Softmax(dim=1)
)

### step3. collect the episode batch to iterate

In [9]:
def iterate_batch(env, net, batch_size=16):
    """Endlessly play episodes with ``net.policy`` and yield them in batches.

    Each yielded batch is a list of ``batch_size`` dicts with keys
    ``"epi_reward"`` (sum of the step rewards of the episode) and
    ``"epi_steps"`` (list of ``(observation, action)`` tuples, where the
    observation is the one the action was chosen from).

    The generator never terminates on its own; the consumer decides when to
    stop iterating.
    """
    episodes = []        # finished episodes waiting to be yielded
    total_reward = 0.0   # running reward of the episode in progress
    trajectory = []      # (obs, action) pairs of the episode in progress
    obs, _ = env.reset()

    while True:
        # Ask the policy for an action and advance the environment.
        action = net.policy(obs)
        next_obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        trajectory.append((obs, action))

        if not (terminated or truncated):
            # Episode continues: just move on to the next observation.
            obs = next_obs
            continue

        # Episode finished: store it and hand the batch out once it is full.
        episodes.append({
            "epi_reward": total_reward,
            "epi_steps": trajectory
        })
        if len(episodes) == batch_size:
            yield episodes
            episodes = []

        # Start a fresh episode.
        total_reward, trajectory = 0.0, []
        obs, _ = env.reset()

In [10]:
def filter_batch(batch, percent=70):
    """Keep the best episodes of a batch and flatten them into training tensors.

    Args:
        batch: list of dicts with keys ``'epi_reward'`` and ``'epi_steps'``
            (a list of ``(observation, action)`` tuples).
        percent: percentile of the episode rewards used as the cut-off;
            episodes whose reward is strictly below it are dropped.

    Returns:
        ``(train_obs_v, train_act_v, reward_bound, reward_mean)`` where the
        first two are float32 tensors holding the observations and actions of
        every step of the kept episodes, ``reward_bound`` is the percentile
        cut-off and ``reward_mean`` is the mean reward of the whole batch.
    """
    rewards = [episode['epi_reward'] for episode in batch]
    reward_bound = np.percentile(rewards, percent)  # reward at the given percentile
    reward_mean = float(np.mean(rewards))

    train_obs, train_act = [], []
    for episode in batch:
        # Skip episodes that performed worse than the boundary.
        if episode['epi_reward'] < reward_bound:
            continue
        steps = episode['epi_steps']
        train_obs.extend(step[0] for step in steps)
        train_act.extend(step[1] for step in steps)

    # Build the tensors once, after the loop, so they are always defined and
    # not rebuilt per episode. Stacking into a single ndarray first avoids the
    # slow list-of-ndarrays tensor construction path.
    train_obs_v = torch.as_tensor(np.asarray(train_obs), dtype=torch.float32)
    train_act_v = torch.as_tensor(np.asarray(train_act), dtype=torch.float32)

    return train_obs_v, train_act_v, reward_bound, reward_mean

### step4: train

In [11]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Move the network first so the optimizer is built on the final parameters.
net = net.to(device)
# Keep the device attribute stored on the network in sync with its weights.
net.device = device
# Cross-entropy between the policy logits and the actions of the kept episodes.
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=net.parameters(), lr=0.01)
# TensorBoard writer for the training curves.
writer = SummaryWriter()

In [12]:
def train(env, net, loss_func, optimizer, writer, max_reward=199):
    """Train ``net`` with the cross-entropy method until it is good enough.

    For every batch from ``iterate_batch`` the episodes are filtered with
    ``filter_batch`` and the network is fitted, with ``loss_func``, to
    reproduce the actions taken in the kept episodes.

    Args:
        env: environment passed on to ``iterate_batch``.
        net: policy network (an ``nn.Module`` exposing ``policy``).
        loss_func: loss applied to ``(logits, action indices)``.
        optimizer: optimizer over ``net``'s parameters.
        writer: object with ``add_scalar`` and ``close``; it is closed when
            training ends, including when an exception is raised.
        max_reward: training stops once the batch mean reward exceeds this.
    """
    # Train on whatever device the network lives on, not on a module global.
    net_device = next(net.parameters()).device
    try:
        for iter_idx, batch in enumerate(iterate_batch(env, net)):
            # Build the training set from the best episodes of the batch.
            train_obs_v, train_act_v, reward_bound, reward_mean = filter_batch(batch)
            train_obs_v = train_obs_v.to(net_device)
            # The loss is given integer action indices as targets.
            train_act_v = train_act_v.to(net_device).long()

            # One optimisation step on the kept (observation, action) pairs.
            optimizer.zero_grad()
            action_logits_v = net(train_obs_v)
            loss_v = loss_func(action_logits_v, train_act_v)
            loss_v.backward()
            optimizer.step()

            # Log the training progress.
            loss = loss_v.item()
            print("{}: loss={:.3f}, reward_mean={:.1f}, reward_bound={:.1f}".format(
                iter_idx, loss, reward_mean, reward_bound
            ))
            writer.add_scalar("loss", loss, iter_idx)
            writer.add_scalar("reward_bound", reward_bound, iter_idx)
            writer.add_scalar("reward_mean", reward_mean, iter_idx)

            if reward_mean > max_reward:  # the policy is good enough
                print("Solved!")
                break
    finally:
        # Close the writer even if an iteration fails.
        writer.close()

In [13]:
# Run training with the default stop threshold (batch mean reward > 199).
train(env, net, loss_func, optimizer, writer)

  train_obs_v, train_act_v = torch.FloatTensor(train_obs), torch.FloatTensor(train_act)


0: loss=0.694, reward_mean=21.1, reward_bound=22.5
1: loss=0.684, reward_mean=29.2, reward_bound=31.0
2: loss=0.668, reward_mean=37.9, reward_bound=41.0
3: loss=0.665, reward_mean=34.1, reward_bound=37.0
4: loss=0.650, reward_mean=34.1, reward_bound=34.5
5: loss=0.644, reward_mean=30.4, reward_bound=31.5
6: loss=0.651, reward_mean=31.9, reward_bound=34.0
7: loss=0.630, reward_mean=45.1, reward_bound=56.5
8: loss=0.637, reward_mean=48.5, reward_bound=43.5
9: loss=0.633, reward_mean=47.6, reward_bound=60.5
10: loss=0.616, reward_mean=66.1, reward_bound=78.0
11: loss=0.614, reward_mean=69.9, reward_bound=83.5
12: loss=0.607, reward_mean=50.6, reward_bound=62.0
13: loss=0.609, reward_mean=63.2, reward_bound=76.5
14: loss=0.594, reward_mean=77.2, reward_bound=92.0
15: loss=0.598, reward_mean=62.4, reward_bound=78.0
16: loss=0.609, reward_mean=70.2, reward_bound=82.0
17: loss=0.608, reward_mean=80.5, reward_bound=92.5
18: loss=0.598, reward_mean=80.7, reward_bound=91.5
19: loss=0.590, reward

### step5: test

In [18]:
def test(net, time_limit=1000, random_policy=False):
    """Play one rendered CartPole-v1 episode and print how long it lasted.

    Args:
        net: policy network; ``net.policy(obs)`` picks the action unless
            ``random_policy`` is set.
        time_limit: maximum number of environment steps to play.
        random_policy: if True, actions come from ``env.action_space.sample()``
            instead of the network.

    The step count printed on "Game over" is the number of steps completed
    before the step that ended the episode.
    """
    steps = 0
    env = gym.make('CartPole-v1', render_mode="human")
    try:
        obs, info = env.reset()
        while steps < time_limit:
            if random_policy:
                action = env.action_space.sample()
            else:
                action = net.policy(obs)
            obs, reward, terminated, truncated, info = env.step(action)
            if terminated or truncated:
                print("Game over with {} steps".format(steps))
                return
            steps += 1
    finally:
        # Always close the rendering environment, including on errors.
        env.close()
    print("Time reaches limits as {} steps".format(steps))

In [21]:
# Roll out one rendered episode using the trained policy network.
test(net)

Game over with 227 steps


In [20]:
# Baseline for comparison: same rollout with actions from env.action_space.sample().
test(net, random_policy=True)

Game over with 22 steps


### step6. save the checkpoint

In [23]:
import os

save_path = "./ckpt/cem_policy_net.pt"
# torch.save does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
# Store only the weights (state dict), not the whole module object.
torch.save(net.state_dict(), save_path)