In [1]:
from collections import deque

import gym
import numpy as np
import pandas as pd
import torch as tp
import torch.nn as nn
from torch.autograd import Variable

```
1. Play N episodes using our current model and environment.
2. Calculate the total reward for every episode and decide on a reward
boundary. Usually, we use some percentile of all rewards, such as 50th or
70th.
3. Throw away all episodes with a reward below the boundary.
4. Train on the remaining "elite" episodes using observations as the input and
issued actions as the desired output.
5. Repeat from step 1 until we become satisfied with the result.
```

In [2]:
class NeuralNet(nn.Module):
    """Two-layer perceptron with a sigmoid output, trained with binary cross-entropy.

    The network owns its loss function (``nn.BCELoss``) and its optimizer
    (``Adam``), so a single call to :meth:`fit` performs one full gradient step.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output units (each squashed to ``[0, 1]``).
        lr: Learning rate handed to the Adam optimizer.
    """

    def __init__(self, input_size, hidden_size, num_classes, lr):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

        self.criterion = nn.BCELoss()
        self.optimizer = tp.optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        """Return sigmoid activations of the output layer for input ``x``."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return tp.sigmoid(out)

    def fit(self, episodes, labels):
        """Run one optimisation step and return the loss as a Python float.

        Args:
            episodes: Input tensor fed to :meth:`forward`.
            labels: Target tensor with the same shape as the network output.
        """
        # Use `self`, not a notebook-global `model`, so the step always updates
        # the instance the method was called on.
        outputs = self(episodes)
        loss = self.criterion(outputs, labels)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss.item()

    def train(self, episodes=True, labels=None, mode=None):
        """Either run a training step or toggle training mode.

        This name shadows ``nn.Module.train(mode)``, which ``nn.Module.eval()``
        relies on. To keep both usages working:

        * ``train(inputs, labels)`` performs one optimisation step (see
          :meth:`fit`) and returns the loss value.
        * ``train()``, ``train(True/False)`` or ``train(mode=...)`` behaves
          like ``nn.Module.train`` and returns ``self``.
        """
        if labels is not None:
            return self.fit(episodes, labels)
        return super().train(episodes if mode is None else mode)
        

In [3]:
def get_discounted_rewards(rewards, gamma):
    """Return the discounted sum of ``rewards``.

    The reward at position ``i`` is weighted by ``gamma ** i`` and all weighted
    rewards are summed.

    Args:
        rewards: Sequence of per-step rewards.
        gamma: Discount factor applied per step.
    """
    weights = np.array([gamma ** step for step, _ in enumerate(rewards)])
    weighted = weights * np.array(rewards)
    return weighted.sum()

In [4]:
def train_for_episodes(episodes, model):
    """Run one training step of ``model`` on the steps of ``episodes``.

    Observations of all episodes are stacked into one input batch and the
    actions taken are stacked into the matching label batch; both are handed to
    ``model.train(inputs, labels)``.

    Args:
        episodes: Sequence of objects exposing ``observations`` and ``actions``
            arrays with one row per step.
        model: Object providing ``parameters()`` and ``train(inputs, labels)``.

    Returns:
        The same ``model`` object (unchanged when ``episodes`` is empty).
    """
    if len(episodes) == 0:
        # Nothing to learn from; skip the optimizer step entirely.
        return model

    # Concatenate once instead of growing arrays with repeated np.append, and
    # do not hard-code the observation width.
    input_x = np.concatenate([ep.observations for ep in episodes], axis=0)
    labels_y = np.concatenate([ep.actions for ep in episodes], axis=0)

    # Match the model's dtype/device so this also works off the CPU; fall back
    # to float64 on the default device for a parameter-less model.
    reference = next(iter(model.parameters()), None)
    dtype = reference.dtype if reference is not None else tp.float64
    device = reference.device if reference is not None else None

    inputs = tp.as_tensor(input_x, dtype=dtype, device=device)
    labels = tp.as_tensor(labels_y, dtype=dtype, device=device)

    model.train(inputs, labels)
    return model

In [27]:
class Episode:
    """Record of one environment roll-out: actions, observations and rewards.

    Args:
        action_shape: Shape of the initial ``actions`` array. The notebook
            passes ``(0, 1)``; a non-zero leading dimension would start the
            record with uninitialised rows because ``np.empty`` is used.
        observation_shape: Shape of the initial ``observations`` array
            (``(0, 4)`` in the notebook); same caveat as above.
    """

    def __init__(self, action_shape, observation_shape):
        self.total_reward = 0
        self.actions = np.empty(action_shape)
        self.observations = np.empty(observation_shape)
        self.rewards = np.empty(0)

    def add_action(self, action):
        """Append ``action`` as a new row of ``self.actions``."""
        self.actions = np.append(self.actions, [action], axis=0)

    def add_reward(self, reward):
        """Append the scalar ``reward`` to ``self.rewards``."""
        self.rewards = np.append(self.rewards, [reward], axis=0)

    def add_observation(self, observation):
        """Append ``observation`` as a new row of ``self.observations``."""
        self.observations = np.append(self.observations, [observation], axis=0)

    def calculate_total_reward(self, discount=None):
        """Compute, store and return the discounted sum of the recorded rewards.

        Args:
            discount: Discount factor. When omitted, the notebook-level
                ``gamma`` variable is used, which must already be defined.

        Returns:
            The value also stored in ``self.total_reward``.
        """
        if discount is None:
            # Backward-compatible default: read the global set by the training cell.
            discount = gamma
        self.total_reward = get_discounted_rewards(self.rewards, discount)
        return self.total_reward

In [28]:
# Build the CartPole-v0 environment and reset it once so it can be inspected.
env = gym.make("CartPole-v0")
observation = env.reset()
# Show the action and observation spaces; the captured output below reports
# Discrete(2) and Box(4,), which is where the 4 inputs / 2 actions used
# elsewhere in the notebook come from.
print("action space: ", env.action_space)
print("state space: ", env.observation_space)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
action space:  Discrete(2)
state space:  Box(4,)




In [46]:
def get_elite_episodes(episodes, percentile=75):
    """Return the episodes whose total reward reaches the given percentile.

    Args:
        episodes: List of objects exposing a numeric ``total_reward``. The list
            itself is not modified.
        percentile: Percentile (0-100) of the total rewards used as the cut-off.

    Returns:
        A new list, ordered by descending ``total_reward``, containing every
        episode with ``total_reward >= boundary``. Empty when ``episodes`` is empty.
    """
    if not episodes:
        # np.percentile has no meaningful answer for an empty sample.
        return []

    # sorted() keeps the caller's list untouched (the in-place sort was a
    # hidden side effect on the argument).
    ranked = sorted(episodes, key=lambda ep: ep.total_reward, reverse=True)
    boundary = np.percentile([ep.total_reward for ep in ranked], percentile)

    return [ep for ep in ranked if ep.total_reward >= boundary]

In [54]:
# Pick the GPU when one is available, then build the policy network
# (4 inputs, 32 hidden units, 1 output, learning rate 0.15) on that device
# with float64 parameters.
device = tp.device("cuda" if tp.cuda.is_available() else "cpu")
model = NeuralNet(4, 32, 1, 0.15).to(device).double()

In [55]:
# Training loop: play episodes with the current policy, and after every batch
# keep the best ones ("elite" episodes) and fit the policy on their actions.

SOLVED_SCORE = 195      # average score at which training stops
SCORE_WINDOW = 100      # number of most recent episodes the average is taken over
MAX_STEPS = 1000        # hard cap on steps per episode
BATCH_SIZE = 100        # episodes collected between two training steps
ELITE_PERCENTILE = 75   # reward percentile an episode must reach to be trained on

env = gym.make("CartPole-v0")
scores = deque([], maxlen=SCORE_WINDOW)
counter = 0
gamma = 1
episodes = []   # must exist before the first append (NameError on a fresh kernel otherwise)
solved = False

# Observations are converted to the policy's own dtype/device before inference.
policy_param = next(model.parameters())

while not solved:
    counter += 1

    observation = env.reset()
    episode = Episode((0, 1), (0, 4))

    for _ in range(MAX_STEPS):
        # Inference only: no autograd graph is needed to sample an action.
        with tp.no_grad():
            state = tp.as_tensor(observation, dtype=policy_param.dtype, device=policy_param.device)
            p = model(state).item()   # probability of choosing action 1
        action = np.random.choice(2, 1, p=[1 - p, p])
        episode.add_observation(observation)
        episode.add_action(action)
        observation, reward, done, info = env.step(int(action[0]))
        episode.add_reward(reward)

        if done:
            episode.calculate_total_reward()
            episodes.append(episode)
            # NOTE: this is the discounted total; it equals the raw episode
            # return only while gamma == 1.
            scores.append(episode.total_reward)
            # Only judge the average once the window is full, so a few lucky
            # early episodes cannot end training prematurely (and np.mean is
            # never taken over an empty deque).
            solved = bool(len(scores) == scores.maxlen and np.mean(scores) >= SOLVED_SCORE)
            if solved:
                print("Problem solved")

            if counter % BATCH_SIZE == 0:
                print("Episode: {}, total reward: {}".format(counter, episode.total_reward))
            break

    if counter % BATCH_SIZE == 0:
        print("Average Score: ", np.mean(scores))
        episodes = get_elite_episodes(episodes, percentile=ELITE_PERCENTILE)
        model = train_for_episodes(episodes, model)
        episodes = []


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
Episode: 100, total reward: 10.0
Average Score:  20.03
Episode: 200, total reward: 11.0
Average Score:  27.58
Episode: 300, total reward: 33.0
Average Score:  51.87
Episode: 400, total reward: 89.0
Average Score:  62.94
Episode: 500, total reward: 69.0
Average Score:  110.11
Episode: 600, total reward: 142.0
Average Score:  124.5
Episode: 700, total reward: 153.0
Average Score:  136.42
Episode: 800, total reward: 200.0
Average Score:  175.45
Episode: 900, total reward: 200.0
Average Score:  184.76
Problem solved
