In [33]:
import gym
import numpy as np
import torch
from collections import deque
from collections import OrderedDict
import pylab as plt
np.random.seed(100)
torch.manual_seed(100)

<torch._C.Generator at 0x7fabe41cdd70>

In [34]:
def getNoise(policyBase, sigma):
    params = policyBase.state_dict()
    noise = OrderedDict()
    for p in params:
        n = torch.from_numpy(sigma * np.random.randn(*params[p].shape)).float()
        noise[p] = n
    return noise

def getPerturbedPolicy(baseParams, policyClass, policyBase, noise):
    newPolicy = policyClass(*baseParams)
    params = policyBase.state_dict()
    for p in params:
        params[p] += noise[p]
    newPolicy.load_state_dict(params)
    return newPolicy

def updateParams(params, noises, rewards, population, sigma, lr):
    # print(rewards)
    rewards = (np.array(rewards) - np.mean(rewards)) / (np.std(rewards) + 1)
    # print(rewards)
    for i in range(len(rewards)):
        for p in params:
            params[p] += lr * noises[i][p] * rewards[i]/(population * sigma)
    return params

def showPolicy(env, policy, max_steps):
    episode_reward = 0
    state = env.reset()
    for s in range(max_steps):
        env.render()
        action = torch.argmax(policy.forward(torch.FloatTensor(state)))
        state, reward, done, _ = env.step(int(action))
        episode_reward += reward
        if done:
            break
    env.close()
    return episode_reward

def getReward(env, policy_network, max_steps):
    episode_reward = 0
    state = env.reset()
    for s in range(max_steps):
        action = torch.argmax(policy_network.forward(torch.FloatTensor(state)))
        state, reward, done, _ = env.step(int(action))
        episode_reward += reward
        if done:
            break
    return episode_reward

def getAvgPerformance(env, policy, episodes, max_steps):
    rewards = []
    
    for i in range(episodes):
        state = env.reset()
        episode_reward = 0
        for s in range(max_steps):
            action = torch.argmax(policy.forward(torch.FloatTensor(state)))
            state, reward, done, _ = env.step(int(action))
            episode_reward += reward
            if done:
                break
        rewards.append(episode_reward)
    return np.mean(episode_reward)
        

In [35]:
def es(env, policyClass, params, sigma, lr, population, iterations, max_steps, threshold, sigma_decay):
    currentPolicy = policyClass(*params)
    best_params = currentPolicy.state_dict()
    best_reward = -10000
    avgRewards = []
    shortAvgRewards = deque(maxlen=5)
    for _ in range(iterations):
        updatedParams = currentPolicy.state_dict()
        rewards = []
        noises = []
        for __ in range(population):
            noise = getNoise(currentPolicy, sigma)
            noises.append(noise)
            individual = getPerturbedPolicy(params, policyClass, currentPolicy, noise)
            reward = getReward(env, individual, max_steps)
            rewards.append(reward)
        updatedParams = updateParams(updatedParams, noises, rewards, population, sigma, lr)
        avgReward = np.mean(rewards)
        if(avgReward >= best_reward):
            print(_, avgReward, best_reward)
            best_reward = avgReward
            best_params = currentPolicy.state_dict()
        currentPolicy.load_state_dict(updatedParams)
        
        
        
        avgRewards.append(avgReward)
        shortAvgRewards.append(avgReward)
        if(np.mean(shortAvgRewards) > threshold):
            print(str(_) + " total iterations")
            break

        print(avgReward, end=", ")
        sigma **= sigma_decay
        
        
    return best_params, avgRewards
        

In [36]:
class Net(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(Net,self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.layer1 = torch.nn.Linear(input_size, hidden_size)
        self.layer2 = torch.nn.Linear(hidden_size, output_size)
    
    def forward(self,x):
        x = self.layer1(x)
        x = torch.relu(x)
        x = self.layer2(x)
        return x

In [None]:
policy_params = [4, 20, 2]
env = gym.make('CartPole-v0')
env.seed(100)
best_params, rewards = es(env, Net, policy_params, .01, .00001, 20, 4000, 200, 195, 1)

0 9.3 -10000
9.3, 1 9.3 9.3
9.3, 9.15, 3 9.5 9.3
9.5, 9.05, 8.95, 9.3, 9.15, 9.15, 9 9.95 9.5
9.95, 10 10.0 9.95
10.0, 11 10.25 10.0
10.25, 9.55, 9.3, 9.45, 9.15, 9.3, 9.55, 9.35, 9.45, 9.45, 9.8, 9.45, 9.4, 9.2, 9.35, 9.15, 9.4, 9.4, 9.3, 9.45, 9.5, 9.2, 9.5, 9.45, 9.2, 9.35, 9.35, 9.65, 8.8, 9.35, 9.5, 9.6, 9.25, 9.35, 9.4, 8.95, 9.4, 9.5, 9.5, 9.0, 9.05, 9.45, 9.05, 9.5, 9.15, 9.65, 9.05, 9.45, 9.3, 9.55, 9.65, 9.3, 9.65, 9.45, 9.4, 9.6, 9.3, 9.15, 9.3, 9.6, 9.35, 9.45, 9.3, 9.45, 9.3, 9.6, 9.5, 9.25, 9.5, 9.4, 9.2, 9.55, 9.55, 9.45, 9.5, 9.35, 9.35, 9.45, 9.5, 9.5, 9.45, 9.4, 9.25, 9.45, 9.4, 9.25, 9.35, 9.35, 9.55, 9.2, 9.3, 9.15, 9.45, 9.5, 9.1, 9.35, 9.55, 9.45, 9.35, 9.5, 9.45, 9.45, 9.5, 9.4, 9.45, 9.1, 9.4, 9.05, 9.65, 9.5, 9.3, 9.6, 9.3, 9.15, 9.2, 9.45, 9.35, 9.55, 9.45, 9.35, 9.8, 9.05, 9.2, 9.25, 9.15, 9.2, 9.4, 9.2, 9.35, 9.55, 9.15, 9.35, 9.4, 9.45, 9.45, 9.3, 9.0, 9.4, 9.45, 9.2, 9.55, 9.0, 9.3, 9.6, 9.8, 9.2, 9.4, 9.45, 9.35, 9.35, 9.6, 9.4, 9.3, 9.05, 9.05, 9.15, 9.4

In [None]:
plt.plot(rewards)

In [None]:
bestPolicy = Net(*policy_params)
bestPolicy.load_state_dict(best_params)

In [None]:
showPolicy(env, bestPolicy, 200)

In [None]:
getAvgPerformance(env, bestPolicy, 200, 200)

In [None]:
class Net(torch.nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size):
        super(Net,self).__init__()
        self.input_size = input_size
        self.hidden_size1 = hidden_size1
        self.hidden_size2 = hidden_size2
        self.output_size = output_size
        self.layer1 = torch.nn.Linear(input_size, hidden_size1)
        self.layer2 = torch.nn.Linear(hidden_size1, hidden_size2)
        self.layer3 = torch.nn.Linear(hidden_size2, output_size)
    
    def forward(self,x):
        x = self.layer1(x)
        x = torch.tanh(x)
        x = self.layer2(x)
        x = torch.tanh(x)
        x = self.layer3(x)
        x = torch.nn.Softmax(dim=0)(x)
        return x

In [None]:
# env = gym.make("LunarLander-v2")
# env.action_space.count()

policy_params = [8, 256, 256, 4]
env = gym.make("LunarLander-v2")
env.seed(100)
best_params, rewards = es(env, Net, policy_params, .01, .0001, 10, 5000, 1000, 200, .999)

In [None]:
plt.plot(rewards)

In [None]:
bestPolicy = Net(*policy_params)
bestPolicy.load_state_dict(best_params)
showPolicy(env, bestPolicy, 200)

In [None]:
.9999**5000
