In [314]:
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time

In [315]:
from gym.wrappers import Monitor

In [316]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [317]:
import math
import copy
from torch.distributions import Categorical
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [318]:
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []
    
    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]

In [319]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, n_latent_var):
        super(ActorCritic, self).__init__()

        # actor
        self.action_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, action_dim),
                nn.Softmax(dim=-1)
                )
        
        # critic
        self.value_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, 1)
                )
    ##Changed this part    
    def forward(self, inputs):
            x = self.action_layer(inputs)
            return x
    
    def act(self, state, memory):
        state = torch.from_numpy(state).float().to(device) 
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        
        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        
        return action.item()
    
    def evaluate(self, state, action):
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        
        state_value = self.value_layer(state)
        
        return action_logprobs, torch.squeeze(state_value), dist_entropy

In [320]:
class PPO:
    def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        
        self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.MseLoss = nn.MSELoss()
    
    def update(self, memory):   
        # Monte Carlo estimate of state rewards:
        rewards = []
        surr_loss = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)
        
        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        #print(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        #print("Rewardsd Mean: ") 
        #print(rewards.mean())
        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()
        
        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            
            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())
            print("State Values : ", state_values.detach())
            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            print("Advantages : ", advantages)
            
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
            surr_loss.append(loss)
            print("L Clip : ", loss.mean())
            
            # take gradient step
            #self.optimizer.zero_grad()
            #loss.mean().backward()
            #self.optimizer.step()
            
                  
        # Copy new weights into old policy:       
        self.policy_old.load_state_dict(self.policy.state_dict())
        #print("Surrogate Loss Mean: ",surr1.mean())
        return surr1.mean()

In [321]:
solved_reward = 30         # stop training if avg_reward > solved_reward
log_interval = 20           # print avg reward in the interval
max_episodes = 500        # max training episodes
max_timesteps = 30         # max timesteps in one episode
n_latent_var = 64           # number of variables in hidden layer
update_timestep = 2000      # update policy every n timesteps
lr = 0.002
betas = (0.9, 0.999)
gamma = 0.99                # discount factor
K_epochs = 1                # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
random_seed = None
render = False

In [322]:
env = gym.make('LunarLander-v2')

In [323]:
state_dim = env.observation_space.shape[0]
print(state_dim)

8


In [324]:
print(env.action_space.n)

4


In [325]:
#state_dim = 4
action_dim = 4

In [326]:
def return_random_agents(num_agents):
    
    agents = []
    for _ in range(num_agents):
        ##Change this part
        agent = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        
        for param in agent.parameters():
            param.requires_grad = False
         ##Commented this part   
        #init_weights(agent)
        agents.append(agent)
        
        
    return agents

#### I tried to change this function

In [327]:
def run_agents(agents):
    
    reward_agents = []
    total_loss = []
    env_name = "LunarLander-v2"
    env = gym.make('LunarLander-v2')
    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    #print(lr,betas)
    
    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    surri=0
    s1 = []
    s2 = 0
    s_temp = []
    surr_actual = []
    
    # training loop
    #for agent in agents:
    for i_episode in range(1, max_episodes+1):
        #agent.eval()
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1
            
            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            
            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            
            # update if its time
            if timestep % update_timestep == 0:
                surri=ppo.update(memory)
                print("returned Surrogate Loss ", surri)
                #memory.clear_memory()
                timestep = 0
            
            running_reward += reward
            if render:
                env.render()
            if done:
                break
                
        avg_length += t
        
        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval*solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            break
            
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))
            print(' Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
        s1.append(surri)
        #surr_mean = torch.mean(torch.stack(surr_actual))    
    s1 = [i for i in s1 if i != 0]
    #print(torch.mean(torch.stack(s1)))
    #print("\n")
            
    #print(total_loss)
    return torch.mean(torch.stack(s1))

In [328]:
def return_average_score(agent, runs):
    #score = 0.
    #for i in range(runs):
    #print(run_agents([agent]))
    score = run_agents([agent])
    return score#/runs

#### <--Till Now Only worked -->

In [329]:
def run_agents_n_times(agents, runs):
    avg_score = []
    for agent in agents:
        avg_score.append(return_average_score(agent,runs))
    return avg_score

In [330]:
def mutate(agent):

    child_agent = copy.deepcopy(agent)
    
    mutation_power = 0.02 #hyper-parameter, set from https://arxiv.org/pdf/1712.06567.pdf
            
    for param in child_agent.parameters():
    
        if(len(param.shape)==4): #weights of Conv2D

            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    for i2 in range(param.shape[2]):
                        for i3 in range(param.shape[3]):
                            
                            param[i0][i1][i2][i3]+= mutation_power * np.random.randn()
                                
                                    

        elif(len(param.shape)==2): #weights of linear layer
            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    
                    param[i0][i1]+= mutation_power * np.random.randn()
                        

        elif(len(param.shape)==1): #biases of linear layer or conv layer
            for i0 in range(param.shape[0]):
                
                param[i0]+=mutation_power * np.random.randn()

    return child_agent

In [331]:
def return_children(agents, sorted_parent_indexes, elite_index):
    
    children_agents = []
    
    #first take selected parents from sorted_parent_indexes and generate N-1 children
    for i in range(len(agents)-1):
        
        selected_agent_index = sorted_parent_indexes[np.random.randint(len(sorted_parent_indexes))]
        children_agents.append(mutate(agents[selected_agent_index]))

    #now add one elite
    elite_child = add_elite(agents, sorted_parent_indexes, elite_index)
    children_agents.append(elite_child)
    elite_index=len(children_agents)-1 #it is the last one
    
    return children_agents, elite_index

In [332]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    
    candidate_elite_index = sorted_parent_indexes[:only_consider_top_n]
    
    if(elite_index is not None):
        candidate_elite_index = np.append(candidate_elite_index,[elite_index])
        
    top_score = None
    top_elite_index = None
    
    for i in candidate_elite_index:
        score = return_average_score(agents[i],runs=5)
        print("Score for elite i ", i, " is ", score)
        
        if(top_score is None):
            top_score = score
            top_elite_index = i
        elif(score > top_score):
            top_score = score
            top_elite_index = i
            
    print("Elite selected with index ",top_elite_index, " and score", top_score)
    
    child_agent = copy.deepcopy(agents[top_elite_index])
    return child_agent
    

In [333]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [334]:
game_actions = 2 #2 actions possible: left or right

#disable gradients as we will not use them
#torch.set_grad_enabled(False)

# initialize N number of agents
num_agents = 4
agents = return_random_agents(num_agents)

# How many top agents to consider as parents
top_limit = 2

# run evolution until X generations
generations = 3

elite_index = None

for generation in range(generations):

    # return rewards of agents
    rewards = run_agents_n_times(agents, 1) #return average of 3 runs

    # sort by rewards
    sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit]#reverses and gives top values (argsort sorts by ascending by default) https://stackoverflow.com/questions/16486252/is-it-possible-to-use-argsort-in-descending-order
    print("Sorting Parent Indexes: ",sorted_parent_indexes)
    print(" Data Type: ", type(sorted_parent_indexes))
    print("Sorting Completed")
    print("Selecting Top Parents")
    
    top_rewards = []
    for best_parent in sorted_parent_indexes:
        top_rewards.append(rewards[best_parent])
    
    print("Generation ", generation, " | Mean rewards: ", np.mean(rewards), " | Mean of top 5: ",np.mean(top_rewards[:5]))
    #print(rewards)
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)
    
    # setup an empty list for containing children agents
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    # kill all agents, and replace them with their children
    agents = children_agents

 Episode 20 	 avg length: 29 	 reward: -22
 Episode 40 	 avg length: 29 	 reward: -24
 Episode 60 	 avg length: 29 	 reward: -19
State Values :  tensor([0.0563, 0.0542, 0.0534,  ..., 0.0355, 0.0355, 0.0347])
Advantages :  tensor([0.4568, 0.4432, 0.4988,  ..., 3.0398, 3.0652, 3.1355],
       dtype=torch.float64)
L Clip :  tensor(0.5244, dtype=torch.float64)
returned Surrogate Loss  tensor(-0.0375, dtype=torch.float64)
 Episode 80 	 avg length: 29 	 reward: -22
 Episode 100 	 avg length: 29 	 reward: -26
 Episode 120 	 avg length: 29 	 reward: -25
State Values :  tensor([0.0563, 0.0542, 0.0534,  ..., 0.0476, 0.0473, 0.0479])
Advantages :  tensor([0.7584, 0.7426, 0.8059,  ..., 3.3939, 3.5631, 3.5565],
       dtype=torch.float64)
L Clip :  tensor(0.5247, dtype=torch.float64)
returned Surrogate Loss  tensor(-0.0375, dtype=torch.float64)
 Episode 140 	 avg length: 29 	 reward: -24
 Episode 160 	 avg length: 29 	 reward: -22
 Episode 180 	 avg length: 29 	 reward: -20
State Values :  tensor([

State Values :  tensor([0.1974, 0.1973, 0.1952,  ..., 0.1792, 0.1794, 0.1786])
Advantages :  tensor([-0.9867, -1.0004, -1.0521,  ...,  2.9809,  3.0881,  3.1719],
       dtype=torch.float64)
L Clip :  tensor(0.6957, dtype=torch.float64)
returned Surrogate Loss  tensor(-0.1911, dtype=torch.float64)
 Episode 340 	 avg length: 29 	 reward: -25
 Episode 360 	 avg length: 29 	 reward: -31
 Episode 380 	 avg length: 29 	 reward: -25
State Values :  tensor([0.1974, 0.1973, 0.1952,  ..., 0.1932, 0.1939, 0.1919])
Advantages :  tensor([-0.7975, -0.8101, -0.8573,  ...,  3.0551,  2.9398,  2.9525],
       dtype=torch.float64)
L Clip :  tensor(0.6950, dtype=torch.float64)
returned Surrogate Loss  tensor(-0.1909, dtype=torch.float64)
 Episode 400 	 avg length: 29 	 reward: -24
 Episode 420 	 avg length: 29 	 reward: -23
 Episode 440 	 avg length: 29 	 reward: -30
 Episode 460 	 avg length: 29 	 reward: -24
State Values :  tensor([0.1974, 0.1973, 0.1952,  ..., 0.1902, 0.1896, 0.1910])
Advantages :  ten

 Episode 80 	 avg length: 29 	 reward: -27
 Episode 100 	 avg length: 29 	 reward: -21
 Episode 120 	 avg length: 29 	 reward: -24
State Values :  tensor([-0.1553, -0.1535, -0.1518,  ..., -0.1796, -0.1776, -0.1762])
Advantages :  tensor([1.3954, 1.3645, 1.3551,  ..., 3.5991, 3.5544, 3.6007],
       dtype=torch.float64)
L Clip :  tensor(0.3548, dtype=torch.float64)
returned Surrogate Loss  tensor(0.1438, dtype=torch.float64)
 Episode 140 	 avg length: 29 	 reward: -29
 Episode 160 	 avg length: 29 	 reward: -24
 Episode 180 	 avg length: 29 	 reward: -35
State Values :  tensor([-0.1553, -0.1535, -0.1518,  ..., -0.1423, -0.1412, -0.1396])
Advantages :  tensor([1.4421, 1.4134, 1.4045,  ..., 3.4587, 3.4823, 3.4718],
       dtype=torch.float64)
L Clip :  tensor(0.3540, dtype=torch.float64)
returned Surrogate Loss  tensor(0.1441, dtype=torch.float64)
 Episode 200 	 avg length: 29 	 reward: -21
 Episode 220 	 avg length: 29 	 reward: -28
 Episode 240 	 avg length: 29 	 reward: -23
 Episode 26

 Episode 360 	 avg length: 29 	 reward: -24
 Episode 380 	 avg length: 29 	 reward: -21
State Values :  tensor([-0.1018, -0.1005, -0.1007,  ..., -0.0977, -0.0958, -0.0931])
Advantages :  tensor([1.8316, 1.7363, 1.6691,  ..., 2.8480, 2.9389, 3.0852],
       dtype=torch.float64)
L Clip :  tensor(0.4133, dtype=torch.float64)
returned Surrogate Loss  tensor(0.0745, dtype=torch.float64)
 Episode 400 	 avg length: 29 	 reward: -22
 Episode 420 	 avg length: 29 	 reward: -27
 Episode 440 	 avg length: 29 	 reward: -25
 Episode 460 	 avg length: 29 	 reward: -21
State Values :  tensor([-0.1018, -0.1005, -0.1007,  ..., -0.1295, -0.1286, -0.1270])
Advantages :  tensor([1.8677, 1.7738, 1.7075,  ..., 3.2696, 3.2494, 3.2225],
       dtype=torch.float64)
L Clip :  tensor(0.4112, dtype=torch.float64)
returned Surrogate Loss  tensor(0.0754, dtype=torch.float64)
 Episode 480 	 avg length: 29 	 reward: -14
 Episode 500 	 avg length: 29 	 reward: -24
 Episode 20 	 avg length: 29 	 reward: -18
 Episode 40

 Episode 140 	 avg length: 29 	 reward: -18
 Episode 160 	 avg length: 29 	 reward: -17
 Episode 180 	 avg length: 29 	 reward: -23
State Values :  tensor([-0.2981, -0.2999, -0.2997,  ..., -0.3071, -0.3086, -0.3067])
Advantages :  tensor([1.0140, 1.0571, 1.1405,  ..., 3.5210, 3.6379, 3.5556],
       dtype=torch.float64)
L Clip :  tensor(0.2478, dtype=torch.float64)
returned Surrogate Loss  tensor(0.2788, dtype=torch.float64)
 Episode 200 	 avg length: 29 	 reward: -20
 Episode 220 	 avg length: 29 	 reward: -23
 Episode 240 	 avg length: 29 	 reward: -18
 Episode 260 	 avg length: 29 	 reward: -22
State Values :  tensor([-0.2981, -0.2999, -0.2997,  ..., -0.2527, -0.2579, -0.2580])
Advantages :  tensor([0.9855, 1.0270, 1.1069,  ..., 3.5066, 3.4390, 3.4448],
       dtype=torch.float64)
L Clip :  tensor(0.2470, dtype=torch.float64)
returned Surrogate Loss  tensor(0.2796, dtype=torch.float64)
 Episode 280 	 avg length: 29 	 reward: -18
 Episode 300 	 avg length: 29 	 reward: -17
 Episode 3

 Episode 420 	 avg length: 29 	 reward: -25
 Episode 440 	 avg length: 29 	 reward: -21
 Episode 460 	 avg length: 29 	 reward: -24
State Values :  tensor([0.1609, 0.1606, 0.1640,  ..., 0.1766, 0.1800, 0.1782])
Advantages :  tensor([0.0450, 0.1969, 0.1154,  ..., 3.4920, 3.4674, 3.4293],
       dtype=torch.float64)
L Clip :  tensor(0.6632, dtype=torch.float64)
returned Surrogate Loss  tensor(-0.1635, dtype=torch.float64)
 Episode 480 	 avg length: 29 	 reward: -25
 Episode 500 	 avg length: 29 	 reward: -21
Score for elite i  3  is  tensor(-0.1624, dtype=torch.float64)
Elite selected with index  2  and score tensor(0.2797, dtype=torch.float64)
 Episode 20 	 avg length: 29 	 reward: -19
 Episode 40 	 avg length: 29 	 reward: -27
 Episode 60 	 avg length: 29 	 reward: -27
State Values :  tensor([0.2658, 0.2651, 0.2666,  ..., 0.2733, 0.2750, 0.2734])
Advantages :  tensor([-0.4221, -0.3903, -0.3342,  ...,  2.5576,  2.5721,  2.5487],
       dtype=torch.float64)
L Clip :  tensor(0.7839, dtype

 Episode 220 	 avg length: 29 	 reward: -22
 Episode 240 	 avg length: 29 	 reward: -27
 Episode 260 	 avg length: 29 	 reward: -33
State Values :  tensor([-0.0520, -0.0508, -0.0517,  ..., -0.0854, -0.0867, -0.0881])
Advantages :  tensor([2.4910, 2.4059, 2.3228,  ..., 3.4497, 3.5343, 3.6172],
       dtype=torch.float64)
L Clip :  tensor(0.4059, dtype=torch.float64)
returned Surrogate Loss  tensor(0.0844, dtype=torch.float64)
 Episode 280 	 avg length: 29 	 reward: -19
 Episode 300 	 avg length: 29 	 reward: -25
 Episode 320 	 avg length: 29 	 reward: -21
State Values :  tensor([-0.0520, -0.0508, -0.0517,  ..., -0.0534, -0.0523, -0.0543])
Advantages :  tensor([2.6640, 2.5743, 2.4867,  ..., 3.4480, 3.6385, 3.6707],
       dtype=torch.float64)
L Clip :  tensor(0.4077, dtype=torch.float64)
returned Surrogate Loss  tensor(0.0833, dtype=torch.float64)
 Episode 340 	 avg length: 29 	 reward: -34
 Episode 360 	 avg length: 29 	 reward: -23
 Episode 380 	 avg length: 29 	 reward: -22
State Valu

 Episode 480 	 avg length: 29 	 reward: -25
 Episode 500 	 avg length: 29 	 reward: -27
Score for elite i  2  is  tensor(-0.0336, dtype=torch.float64)
 Episode 20 	 avg length: 29 	 reward: -30
 Episode 40 	 avg length: 29 	 reward: -29
 Episode 60 	 avg length: 29 	 reward: -25
State Values :  tensor([0.0696, 0.0622, 0.0605,  ..., 0.0096, 0.0122, 0.0146])
Advantages :  tensor([-1.0240, -1.0718, -1.0750,  ...,  3.6210,  3.6364,  3.6471],
       dtype=torch.float64)
L Clip :  tensor(0.5349, dtype=torch.float64)
returned Surrogate Loss  tensor(-0.0466, dtype=torch.float64)
 Episode 80 	 avg length: 29 	 reward: -23
 Episode 100 	 avg length: 29 	 reward: -24
 Episode 120 	 avg length: 29 	 reward: -21
State Values :  tensor([0.0696, 0.0622, 0.0605,  ..., 0.0600, 0.0549, 0.0572])
Advantages :  tensor([-1.1525, -1.2038, -1.2073,  ...,  3.8936,  3.9353,  3.8856],
       dtype=torch.float64)
L Clip :  tensor(0.5347, dtype=torch.float64)
returned Surrogate Loss  tensor(-0.0452, dtype=torch.fl

In [335]:
def play_agent(agent):
        env = gym.make("LunarLander-v2")
        
        env_record = Monitor(env, './video', force=True)
        observation = env_record.reset()
        last_observation = observation
        r=0
        for _ in range(250):
            env_record.render()
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env_record.step(action)
            r=r+reward
            observation = new_observation

            if(done):
                break

        env_record.close()
        print("Rewards: ",r)      

In [336]:
play_agent(agents[1])

ValueError: 'a' and 'p' must have same size