# CEM
Unlike steepest ascent, CEM (the cross-entropy method) is much less prone to getting stuck in local maxima. Instead of simply moving in the direction of steepest ascent, we keep the top performers and continue the search from them, which gives us a better chance of finding a better direction of improvement. Also, unlike last time, I plan to use a neural network to handle the environment. We will be using the Pendulum-v0 environment this time.

In [1]:
import tensorflow as tf
import gym
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
class HillClimber:
    """Hill-climbing agent with adaptive noise over a linear softmax policy.

    The policy is a weight matrix of shape ``(n_observations, n_actions)``.
    An action is chosen greedily as the argmax of ``softmax(state @ policy)``.
    After every episode :meth:`learn` either keeps the evaluated policy as
    the new best (and shrinks the search noise) or falls back to the best
    policy seen so far and widens the noise.

    The constructor reads ``env.action_space.n``, so the environment must
    expose a discrete action space.
    """

    def __init__(self, env):
        """Read the environment dimensions and set up the search state.

        Args:
            env: Environment exposing ``action_space.n`` and
                ``observation_space.shape``.
        """
        self.n_actions = env.action_space.n
        self.n_observations = env.observation_space.shape[0]
        self.network = self._make_model()
        # -inf rather than -1 so the first episode always becomes the
        # baseline, even when every return is below -1.
        self.best_reward = -np.inf
        self.noise = 0.5
        self.noise_max = 2
        self.noise_min = 0.001
        self.gamma = 0.98
        # Small random initial weights; without this get_action() and
        # learn() would fail on the missing attributes.
        self.policy = 1e-4 * np.random.rand(self.n_observations,
                                            self.n_actions)
        self.best_policy = self.policy.copy()

    def _make_model(self):
        """Build, compile and return a small two-layer Keras network.

        The network is stored on ``self.network`` by ``__init__``; none of
        the other methods in this class use it.
        """
        model = tf.keras.models.Sequential([
            tf.keras.layers.Dense(24,
                                  input_dim=self.n_observations,
                                  activation='relu'),
            tf.keras.layers.Dense(self.n_actions,
                                  activation='softmax')
        ])
        model.compile(optimizer='adam',
                      loss='mse')
        return model

    def get_action(self, state, policy=None):
        """Return the greedy action index for ``state``.

        Args:
            state: Observation array that can be multiplied with the policy
                matrix (``try_policy`` passes shape ``(1, n_observations)``).
            policy: Optional weight matrix to use instead of ``self.policy``.

        Returns:
            Index of the largest softmax probability (``np.argmax`` over the
            flattened probabilities).
        """
        weights = self.policy if policy is None else policy
        logits = np.dot(state, weights)
        # Shift by the maximum before exponentiating: the softmax is
        # unchanged, but large logits no longer overflow to inf/nan, which
        # would corrupt the argmax.
        logits = logits - np.max(logits)
        probabilities = np.exp(logits)
        probabilities = probabilities / np.sum(probabilities)
        return np.argmax(probabilities)

    def _sample_noise(self):
        """Draw one perturbation matrix shaped like the policy."""
        # NOTE(review): the distribution has a non-zero mean and a scale of
        # (noise - noise_min); kept exactly as originally written.
        return np.random.normal(
            loc=(self.noise_min + self.noise) / 2,
            scale=(self.noise - self.noise_min),
            size=(self.n_observations, self.n_actions)
        )

    def _noise_adder(self, better):
        """Perturb the policy with freshly sampled noise.

        Args:
            better: If true, perturb the current policy; otherwise restart
                from ``self.best_policy`` and perturb that.
        """
        base = self.policy if better else self.best_policy
        # Build a new array instead of using ``+=`` so that no other
        # reference to the base array (e.g. ``best_policy``) is mutated.
        self.policy = base + self._sample_noise()

    def learn(self, discounted_reward):
        """Update the search state from the return of the last episode.

        Args:
            discounted_reward: Return obtained by the current ``self.policy``.
        """
        if discounted_reward > self.best_reward:
            self.best_reward = discounted_reward
            # Snapshot, not alias: later perturbations of ``self.policy``
            # must not change the recorded best policy.
            self.best_policy = np.array(self.policy, copy=True)
            # At the noise floor the improved policy is kept unperturbed.
            if self.noise > self.noise_min:
                self._noise_adder(better=True)
                self.noise = max(self.noise_min, self.noise / 2)
        else:
            # Always restart from the best policy, even once the noise has
            # reached noise_max; otherwise the search would stay frozen on
            # a policy that is known to be worse than the best one.
            self._noise_adder(better=False)
            self.noise = min(self.noise_max, self.noise * 1.1)

    def try_policy(self, env, evaluate=False):
        """Run one episode with the current policy and return its reward.

        Args:
            env: Environment whose ``reset()`` returns an observation and
                whose ``step(action)`` returns
                ``(next_state, reward, done, info)``.
            evaluate: If true, sum the raw rewards and call ``env.render()``
                every step; otherwise accumulate the discounted reward.

        Returns:
            The discounted return, or the undiscounted return when
            ``evaluate`` is true.
        """
        discounted_reward = 0
        state = np.reshape(env.reset(), (1, self.n_observations))
        done = False
        time_steps = 0
        while not done:
            # The counter is incremented before use, so the first reward is
            # weighted by gamma ** 1.
            time_steps += 1
            action = self.get_action(state)
            next_state, reward, done, _ = env.step(action)
            if not evaluate:
                discounted_reward += (self.gamma ** time_steps) * reward
            else:
                discounted_reward += reward
                env.render()
            state = np.reshape(next_state, (1, self.n_observations))
        return discounted_reward