In [0]:
import torch
import gym
from collections import deque
import numpy as np
import pandas as pd
import seaborn as sb
import random
from copy import deepcopy
%matplotlib inline

# Solving Pendulum using the cross-entropy method (CEM)

In [0]:
def generate_session(agent, env, t_max=200):
  """
    Play a single episode of `env` with `agent` and record the trajectory.

    The environment is reset, then the agent's exploration noise process is
    reset, and the agent acts for at most `t_max` steps (stopping early as
    soon as the environment reports `done`).

    Returns a tuple `(states, actions, total_reward)` where `states[k]` is
    the observation the agent saw *before* taking `actions[k]`, and
    `total_reward` is the plain (undiscounted) sum of step rewards.
  """
  observation = env.reset()
  visited_states, taken_actions = [], []
  agent.random_process.reset_states()
  episode_return = 0

  steps_done = 0
  while steps_done < t_max:
    steps_done += 1
    chosen_action = agent.get_action(observation)
    # env.step is expected to return the 4-tuple (obs, reward, done, info)
    next_observation, reward, finished, _info = env.step(chosen_action)

    visited_states.append(observation)
    taken_actions.append(chosen_action)
    episode_return += reward

    observation = next_observation
    if finished:
      break

  return visited_states, taken_actions, episode_return

def select_elites(states, actions, rewards, percentile=50):
  """
    Select the sessions whose total reward reaches the given percentile.

    states, actions: per-session sequences, indexed in parallel with
      `rewards` (entry i belongs to session i).
    rewards: total reward of each session.
    percentile: value in [0, 100] passed to `np.percentile`.

    Returns `(elite_states, elite_actions)`, two lists holding the entries
    of `states` / `actions` for every session with
    `reward >= np.percentile(rewards, percentile)`.

    The comparison is inclusive on purpose: with a strict `>` a batch in
    which every session earns the same reward (or `percentile=100`) would
    yield no elites at all and leave nothing to train on.
  """
  # np.percentile is not meaningful on an empty sample; nothing to select.
  if len(rewards) == 0:
    return [], []

  reward_threshold = np.percentile(rewards, percentile)
  elite_states, elite_actions = [], []
  for i, session_reward in enumerate(rewards):
    if session_reward >= reward_threshold:
      elite_states.append(states[i])
      elite_actions.append(actions[i])
  return elite_states, elite_actions

def show_progress():
  """
    Placeholder for per-epoch progress reporting.
    Currently does nothing and returns None.
  """
  return None

In [0]:
class Network(torch.nn.Module):
  """
    Three-layer fully connected policy network.

    Maps a state vector of size `state_shape[0]` to a single scalar through
    hidden layers of 400 and 300 ReLU units. The output passes through tanh
    and is multiplied by 2, so it always lies in [-2, 2].
  """

  def __init__(self, state_shape):
    super().__init__()
    n_inputs = state_shape[0]
    # Layers are created in input -> output order.
    self.linear_1 = torch.nn.Linear(n_inputs, 400)
    self.linear_2 = torch.nn.Linear(400, 300)
    self.linear_3 = torch.nn.Linear(300, 1)
    self.relu = torch.nn.ReLU()
    self.tang = torch.nn.Tanh()

  def forward(self, state):
    """Return the scaled action prediction for `state` (last dim = state size)."""
    hidden_1 = self.relu(self.linear_1(state))
    hidden_2 = self.relu(self.linear_2(hidden_1))
    squashed = self.tang(self.linear_3(hidden_2))
    # tanh is in [-1, 1]; scale to the [-2, 2] action range.
    return squashed * 2

In [0]:
class CEM_agent(torch.nn.Module):
  """
    Cross-entropy-method style agent with a neural-network policy.

    The policy network is trained by regression (MSE) towards the actions of
    elite sessions. At acting time Ornstein-Uhlenbeck noise, scaled by
    `noise_threshold`, is added to the network output and the result is
    clipped to [-2, 2]. `noise_threshold` starts at 1 and is reduced by
    1/500 after every training batch, never going below
    `noise_threshold_min`.

    Parameters
    ----------
    state_shape : shape of an observation; passed to `Network`.
    action_shape : shape of an action; `action_shape[0]` is the size of the
      noise vector.
    epsilon, gamma : stored on the instance; not used by this class itself.
    tau : blending factor of the soft update applied after each optimizer
      step (see `update_weights`).
    batch_size : maximum number of elite entries sampled per training batch.
    learning_rate : SGD learning rate.
    n_batches : number of batches (optimizer steps) per `fit` call.
  """

  def __init__(self, state_shape, action_shape, epsilon=0.2, gamma=0.99,
               tau=1e-3, batch_size=128, learning_rate=1e-2, n_batches=16):
    super().__init__()
    self.batch_size = batch_size
    self.epsilon = epsilon
    self.gamma = gamma
    self.tau = tau
    # Exploration-noise scale and its linear decay schedule.
    self.noise_threshold = 1
    self.noise_threshold_decrease = 1.0 / 500
    self.noise_threshold_min = 0.001
    # Honour the constructor argument (it used to be hard-coded to 16).
    self.n_batches = n_batches
    self.loss = torch.nn.MSELoss()
    self.network = Network(state_shape)
    self.optimizer = torch.optim.SGD(self.network.parameters(), lr=learning_rate)
    self.random_process = OrnsteinUhlenbeckProcess(size=action_shape[0], theta=0.15, mu=0, sigma=0.2)

  def get_batch(self, elite_states, elite_actions):
    """
      Sample one training batch from the elite data.

      Pairs `elite_states[i]` with `elite_actions[i]`, draws up to
      `batch_size` pairs without replacement and returns them as two
      float32 tensors `(states, actions)`.

      Raises ValueError if there are no elite pairs to sample from.
    """
    pairs = list(zip(elite_states, elite_actions))
    if not pairs:
      raise ValueError("cannot build a batch: no elite samples were provided")
    batch = random.sample(pairs, min(len(pairs), self.batch_size))
    states, actions = map(np.array, zip(*batch))
    states = torch.tensor(states, dtype=torch.float32)
    actions = torch.tensor(actions, dtype=torch.float32)
    return states, actions

  def update_weights(self, old_model):
    """
      Soft-update the network in place:
      `new = (1 - tau) * new + tau * old` for every parameter, where `old`
      comes from `old_model` (same architecture, pre-step weights).
    """
    # Pure weight arithmetic: no autograd graph is needed for the blend.
    with torch.no_grad():
      for new_parameter, old_parameter in zip(self.network.parameters(),
                                              old_model.parameters()):
        blended = (1 - self.tau) * new_parameter + self.tau * old_parameter
        new_parameter.data.copy_(blended)

  def fit(self, elite_states, elite_actions):
    """
      Run `n_batches` training steps on the elite data.

      Each step samples a batch, takes one SGD step on the MSE between the
      predicted and the elite actions, blends the result with the pre-step
      weights (`update_weights`) and decays the exploration-noise scale.
    """
    for _ in range(self.n_batches):
      self.optimizer.zero_grad()
      states, actions = self.get_batch(elite_states, elite_actions)
      predicted = self.network(states)
      loss = self.loss(predicted, actions)
      # Snapshot of the weights before the optimizer step, for the soft update.
      old_model = deepcopy(self.network)
      loss.backward()
      self.optimizer.step()
      self.update_weights(old_model)
      if self.noise_threshold > self.noise_threshold_min:
        # Clamp so the last decrement cannot push the scale below the
        # minimum (or make it negative).
        self.noise_threshold = max(
            self.noise_threshold_min,
            self.noise_threshold - self.noise_threshold_decrease)

  def get_action(self, state):
    """
      Return the noisy action for a single `state` as a numpy array,
      clipped to [-2, 2].
    """
    state = torch.tensor(np.array([state]), dtype=torch.float)
    action = self.network(state).detach().numpy()[0]
    noise = self.noise_threshold * self.random_process.sample()
    return np.clip(action + noise, -2, 2)

class RandomProcess(object):
    """Base class for exploration-noise processes."""

    def reset_states(self):
        """Reset internal state; the base implementation has none, so this is a no-op."""
        return None

class AnnealedGaussianProcess(RandomProcess):
    """
    Noise process whose standard deviation is annealed linearly.

    The effective sigma is `max(sigma_min, m * n_steps + c)`: it starts at
    `sigma` and decreases linearly so that it would reach `sigma_min` after
    `n_steps_annealing` steps. When `sigma_min` is None the slope is zero and
    sigma stays constant. `n_steps` starts at 0 and is not advanced by this
    class itself.
    """

    def __init__(self, mu, sigma, sigma_min, n_steps_annealing):
        self.mu = mu
        self.sigma = sigma
        self.n_steps = 0

        # Intercept of the linear schedule: always the initial sigma.
        self.c = sigma
        if sigma_min is None:
            # No annealing requested: flat schedule pinned at sigma.
            self.m = 0.
            self.sigma_min = sigma
        else:
            # Negative slope taking sigma down to sigma_min.
            self.m = -float(sigma - sigma_min) / float(n_steps_annealing)
            self.sigma_min = sigma_min

    @property
    def current_sigma(self):
        """Sigma for the current `n_steps`, floored at `sigma_min`."""
        annealed = self.m * float(self.n_steps) + self.c
        return max(self.sigma_min, annealed)

class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess):
    """
    Discretised Ornstein-Uhlenbeck noise process.

    Each sample is
    `x_prev + theta * (mu - x_prev) * dt + current_sigma * sqrt(dt) * N(0, 1)`,
    i.e. the previous value pulled towards `mu` plus Gaussian noise. The
    process starts at `x0`, or at zeros of shape `size` when `x0` is None.
    """

    def __init__(self, theta, mu=0., sigma=1., dt=1e-2, x0=None, size=1, sigma_min=None, n_steps_annealing=1000):
        super().__init__(mu=mu, sigma=sigma, sigma_min=sigma_min,
                         n_steps_annealing=n_steps_annealing)
        self.theta, self.mu, self.dt = theta, mu, dt
        self.x0, self.size = x0, size
        self.reset_states()

    def sample(self):
        """Advance the process by one step and return the new value."""
        pull_to_mean = self.theta * (self.mu - self.x_prev) * self.dt
        noise_scale = self.current_sigma * np.sqrt(self.dt)
        next_x = self.x_prev + pull_to_mean + noise_scale * np.random.normal(size=self.size)
        self.x_prev = next_x
        # Counts samples drawn; drives the sigma annealing schedule.
        self.n_steps += 1
        return next_x

    def reset_states(self):
        """Put the process back at its starting value (`x0`, or zeros)."""
        if self.x0 is None:
            self.x_prev = np.zeros(self.size)
        else:
            self.x_prev = self.x0

# --- Environment and agent ---------------------------------------------------
env = gym.make('Pendulum-v0')
space_shape = env.observation_space.shape
action_shape = env.action_space.shape
agent = CEM_agent(space_shape, action_shape)

# --- Training hyper-parameters -----------------------------------------------
# n_epochs: outer training iterations; n_sessions: episodes played per epoch;
# percentile: reward percentile a session must reach to count as elite.
n_epochs, n_sessions, percentile = 100, 400, 70

# --- Training loop -----------------------------------------------------------
for epoch in range(n_epochs):
  # 1. Play a fresh set of sessions with the current (noisy) policy.
  generated_sessions = [generate_session(agent, env) for _ in range(n_sessions)]
  # 2. Transpose [(states, actions, reward), ...] into three parallel arrays.
  states, actions, rewards = (np.array(column) for column in zip(*generated_sessions))
  # 3. Keep the best sessions and regress the policy onto their actions.
  elite_states, elite_actions = select_elites(states, actions, rewards, percentile)
  agent.fit(elite_states, elite_actions)
  # 4. Report the mean reward over all sessions of this epoch (not only elites).
  print(f'Epoch: {epoch}, mean reward: {np.mean(rewards)}')
  show_progress()
  

Epoch: 0, mean reward: -1308.807692035833
Epoch: 1, mean reward: -1258.3305361469697
Epoch: 2, mean reward: -1270.7012948255524
Epoch: 3, mean reward: -1267.210231846825
Epoch: 4, mean reward: -1251.6842664658864
Epoch: 5, mean reward: -1247.588368669621
Epoch: 6, mean reward: -1220.5119605020839
Epoch: 7, mean reward: -1226.50562374322
Epoch: 8, mean reward: -1221.4749116676792
Epoch: 9, mean reward: -1222.4508127687423
Epoch: 10, mean reward: -1224.878925007851
Epoch: 11, mean reward: -1249.772585980991
Epoch: 12, mean reward: -1239.6019319089546
Epoch: 13, mean reward: -1271.5884652449872
Epoch: 14, mean reward: -1212.9109154556636
Epoch: 15, mean reward: -1234.3006045788827
Epoch: 16, mean reward: -1222.5127462456912
Epoch: 17, mean reward: -1241.4015490972859
Epoch: 18, mean reward: -1210.940672721099
Epoch: 19, mean reward: -1224.9003755325323
Epoch: 20, mean reward: -1224.1365279530337
Epoch: 21, mean reward: -1237.7062830298676
Epoch: 22, mean reward: -1209.0056234603364
Epoch:

KeyboardInterrupt: ignored