# Cross Entropy Method for RL

The cross-entropy method is model free, policy based and on-policy.  This means that

* It does not build any model of the environment.  It just maps state to action.
* It approximates the policy (using a neural network)
* It gets its training data sequentially from the environment, by acting with the current policy

In [1]:
#pytorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import numpy as np

#gym 
import gym

from collections import namedtuple

import time

torch.__version__

'1.3.1'

## Constants

In [10]:
# Number of units in the hidden layer of the default policy network.
HIDDEN_SIZE = 128
# Number of episodes played per training batch.
BATCH_SIZE = 16
# Percentile of the batch rewards an episode must reach to be used for training.
PERCENTILE = 70
# Mean batch reward at which training stops.
TARGET = 200

## Neural network architecture

The model used is a neural network with one fully connected hidden layer that maps an input vector of size `input_size` to `hidden_size` units, followed by a ReLU activation and an output layer of size `n_actions`.

In [2]:
class SingleLayerNetwork(nn.Module):
    '''
    Feed-forward network with a single hidden layer.

    Parameters:
    ------
    input_size: int
        number of input features

    hidden_size: int
        number of neurons in the hidden layer

    n_actions: int
        number of outputs (one score per action)
    '''
    def __init__(self, input_size, hidden_size, n_actions):
        super().__init__()

        #hidden layer: input features -> hidden_size neurons (with bias)
        self.fc1 = nn.Linear(input_size, hidden_size)

        #output layer: hidden_size neurons -> n_actions scores (with bias)
        self.out = nn.Linear(hidden_size, n_actions)

    def forward(self, x):
        '''
        Return the raw (un-normalised) output scores for input ``x``.
        '''
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        return self.out(activated)

## Training

In [3]:
def play_episode(env, model, max_steps=200):
    '''
    Play one episode, sampling every action from the model's policy.

    Parameters:
    ------
    env: gym environment
        Used through ``reset()``, which must return the initial
        observation, and ``step(action)``, which must return the 4-tuple
        ``(next_obs, reward, terminal, info)``.

    model: nn.module
        neural network model. Called with a batch containing a single
        observation; its output is treated as one score per action.

    max_steps: int, optional (default=200)
        Maximum number of steps played before the episode is cut short.

    Returns:
    ------
    dict
        'reward': total reward collected in the episode
        'states': the observation each action was chosen from
        'actions': the actions taken (same length as 'states')
    '''
    episode_reward = 0.0 # total reward for current episode

    #tracking states encountered and actions taken...
    obs_history = []
    action_history = []

    #reset gym environment and get initial observation
    obs = env.reset()

    #the rollout only collects data, so no autograd graph is needed
    with torch.no_grad():
        for _ in range(max_steps):

            #track state/observation
            obs_history.append(obs)

            #pytorch requires state as tensor: batch holding one observation
            obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)

            #model scores normalised with softmax so they sum to 1.0.
            #kept as float32: np.random.choice relaxes its "sums to 1"
            #tolerance to match the dtype of p.
            action_probs = torch.softmax(model(obs_v), dim=1).numpy()[0]

            #sample an action from the policy distribution
            action = np.random.choice(len(action_probs), p=action_probs)
            next_obs, reward, terminal, _ = env.step(action)

            #track cumulative reward and action taken
            episode_reward += reward
            action_history.append(action)

            #if end of episode
            if terminal:
                break

            obs = next_obs

    #return episode as a dict
    return {'reward':episode_reward,
            'states':obs_history,
            'actions':action_history}

In [14]:
class CrossEntropyAgent(object):
    '''
    Deep Reinforcement Agent based on Cross Entropy Method.

    The cross-entropy method is model free, policy based and on-policy.

    Each training iteration plays a batch of episodes with the current
    network, keeps the episodes whose total reward reaches the configured
    percentile of the batch, and takes one optimiser step that fits the
    network to the actions taken in those episodes.
    '''
    def __init__(self, environment, target, batch_size, percentile=70,
                 neural_network=None, lr=0.01, show_every=2, max_steps=200):
        '''
        Params:
        -----
        environment: gym environment
            Used through ``reset()`` (returns the initial observation) and
            ``step(action)`` (returns ``(next_obs, reward, terminal, info)``).

        target: float
            Training stops once the mean reward of a batch reaches this value.

        batch_size: int
            Number of episodes played per training iteration.

        percentile: float, optional (default=70)
            Percentile of the batch rewards an episode must reach to be
            used for training.

        neural_network: nn.module, optional (default=None)
            Policy network. If None a SingleLayerNetwork with HIDDEN_SIZE
            hidden units is built from the environment's observation and
            action spaces.

        lr: float, optional (default=0.01)
            Learning rate of the Adam optimiser.

        show_every: int, optional (default=2)
            Print progress every ``show_every`` batches. 0 disables the
            progress output.

        max_steps: int, optional (default=200)
            Maximum number of steps played per episode.
        '''
        self.env = environment
        self.target = target
        self.batch_size = batch_size
        self.percentile = percentile
        self.show = show_every
        self.max_steps = max_steps

        #default model
        if neural_network is None:
            #observation size and no. actions
            obs_size = self.env.observation_space.shape[0]
            n_actions = self.env.action_space.n
            self.model = SingleLayerNetwork(obs_size, HIDDEN_SIZE, n_actions)
        else:
            self.model = neural_network

        self.objective = nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(params=self.model.parameters(),
                                          lr=lr)

    def solve(self, max_batches=None):
        '''
        Train until the mean batch reward reaches the target.

        Params:
        -----
        max_batches: int, optional (default=None)
            Upper bound on the number of training batches. None keeps
            training until the target is reached.
        '''
        batch_count = 0

        #learn until target is reached (or the optional cap is hit).
        while max_batches is None or batch_count < max_batches:

            # Play episodes
            batch_count += 1
            batch = [self.play_episode() for _ in range(self.batch_size)]

            #filter elite episodes
            obs_v, acts_v, reward_b, reward_m = self.elite_episodes(batch)

            #train on elite episodes
            self.optimizer.zero_grad()
            action_scores_v = self.model(obs_v)
            loss = self.objective(action_scores_v, acts_v)
            loss.backward()
            self.optimizer.step()

            #self.show == 0 means "never print" (and avoids modulo by zero)
            if self.show and batch_count % self.show == 0:
                print(f'{batch_count}: loss={loss.item():.3f},'
                      f'\t\tavg reward={reward_m:.1f},\treward_bound={reward_b:.1f}')

            if reward_m >= self.target:
                print(f'solved at iter {batch_count}. mean_reward={reward_m}')
                return

        print(f'stopped after {batch_count} batches: target {self.target} not reached')

    def play_episode(self):
        '''
        Play one episode, sampling every action from the current network.

        Returns:
        ------
        dict
            'reward': total reward collected in the episode
            'states': the observation each action was chosen from
            'actions': the actions taken (same length as 'states')
        '''
        episode_reward = 0.0 # total reward for current episode

        #tracking states encountered and actions taken...
        obs_history = []
        action_history = []

        #reset gym environment and get initial observation
        obs = self.env.reset()

        #the rollout only collects data, so no autograd graph is needed
        with torch.no_grad():
            for _ in range(self.max_steps):

                #track state/observation
                obs_history.append(obs)

                #pytorch requires state as tensor: batch holding one observation
                obs_v = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)

                #network scores normalised with softmax so they sum to 1.0.
                #kept as float32: np.random.choice relaxes its "sums to 1"
                #tolerance to match the dtype of p.
                action_probs = torch.softmax(self.model(obs_v), dim=1).numpy()[0]

                #sample an action from the policy distribution
                action = np.random.choice(len(action_probs), p=action_probs)
                next_obs, reward, terminal, _ = self.env.step(action)

                #track cumulative reward and action taken
                episode_reward += reward
                action_history.append(action)

                #if end of episode
                if terminal:
                    break

                obs = next_obs

        #return episode as a dict
        return {'reward':episode_reward,
                'states':obs_history,
                'actions':action_history}

    def elite_episodes(self, batch):
        '''
        Select the best episodes of a batch and stack them for training.

        Params:
        -----
        batch: list of dict
            Episodes as returned by play_episode.

        Returns:
        ------
        train_obs: torch float32 tensor
            Observations of all elite episodes, one row per step.

        train_act: torch int64 tensor
            Action taken at each of those observations.

        reward_bound: float
            The ``self.percentile`` percentile of the batch rewards;
            episodes with at least this reward are elite.

        reward_mean: float
            Mean reward over the whole batch.
        '''
        rewards = np.array([episode['reward'] for episode in batch])
        reward_bound = np.percentile(rewards, self.percentile)
        reward_mean = float(np.mean(rewards))

        train_obs = []
        train_act = []

        for episode in batch:
            if episode['reward'] >= reward_bound:
                train_obs.extend(episode['states'])
                train_act.extend(episode['actions'])

        #stack into one numpy array first: much cheaper than building a
        #tensor from a python list of per-step arrays
        train_obs = torch.as_tensor(np.asarray(train_obs, dtype=np.float32))
        train_act = torch.as_tensor(np.asarray(train_act, dtype=np.int64))

        return train_obs, train_act, reward_bound, reward_mean
        

In [None]:
#environment the agent is trained on
env = gym.make('CartPole-v1')

#agent configured from the constants defined earlier in the notebook
agent = CrossEntropyAgent(
    environment=env,
    target=TARGET,
    batch_size=BATCH_SIZE,
    percentile=PERCENTILE,
)

#train until the mean batch reward reaches TARGET
agent.solve()

2: loss=0.684,		avg reward=19.9,	reward_bound=23.0
4: loss=0.671,		avg reward=18.7,	reward_bound=21.0
6: loss=0.663,		avg reward=24.8,	reward_bound=27.0
8: loss=0.668,		avg reward=42.8,	reward_bound=49.5
10: loss=0.640,		avg reward=71.9,	reward_bound=86.5
12: loss=0.627,		avg reward=50.6,	reward_bound=60.0
14: loss=0.616,		avg reward=76.9,	reward_bound=89.5
16: loss=0.592,		avg reward=93.0,	reward_bound=103.5
18: loss=0.597,		avg reward=90.4,	reward_bound=100.5
20: loss=0.561,		avg reward=91.2,	reward_bound=102.5
22: loss=0.561,		avg reward=108.9,	reward_bound=133.5
24: loss=0.552,		avg reward=119.1,	reward_bound=135.5
26: loss=0.561,		avg reward=153.6,	reward_bound=161.5
28: loss=0.541,		avg reward=145.9,	reward_bound=175.5
