In [2]:
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time

In [3]:
from gym.wrappers import Monitor

In [4]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [5]:
import math
import copy
from torch.distributions import Categorical
# Use the first CUDA GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
# Global hyperparameters read by the PPO class and by run_agents below.
solved_reward = 30          # training stops once running_reward > log_interval * solved_reward
log_interval = 20           # number of episodes between progress print-outs
max_episodes = 500          # max training episodes
max_timesteps = 300         # max timesteps in one episode
n_latent_var = 64           # width of each hidden layer in the actor and critic networks
update_timestep = 2000      # call PPO.update every n environment timesteps
lr = 0.002                  # learning rate handed to the Adam optimizer
betas = (0.9, 0.999)        # beta coefficients handed to the Adam optimizer
gamma = 0.99                # discount factor
K_epochs = 1                # number of passes over the collected batch per update
eps_clip = 0.2              # clip parameter for PPO
random_seed = None          # when set, run_agents seeds torch and the environment with it
render = False              # when True, run_agents renders the environment at every step
epsilon = 0.2               # placeholder used in the surrogate penalty term; TODO: replace with max(advantage)
d_kl=1                      # placeholder used in the surrogate penalty term; TODO: replace with the KL divergence between old and new policies

In [7]:
class Memory:
    """Rollout buffer: five parallel lists that grow by one entry per environment step."""

    # Names of the list attributes that together make up the buffer.
    _FIELDS = ("actions", "states", "logprobs", "rewards", "is_terminals")

    def __init__(self):
        self.actions, self.states, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear_memory(self):
        """Empty every list in place, so existing references to the lists stay valid."""
        for field in self._FIELDS:
            getattr(self, field).clear()

In [8]:
class ActorCritic(nn.Module):
    """Two independent MLPs: an actor producing action probabilities and a critic producing a value.

    Args:
        state_dim: size of the observation vector fed to both networks.
        action_dim: number of discrete actions (size of the actor's softmax output).
        n_latent_var: width of the two hidden layers in each network.
    """

    def __init__(self, state_dim, action_dim, n_latent_var):
        super(ActorCritic, self).__init__()

        # actor: state -> probability of each action (softmax over the last axis)
        self.action_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, action_dim),
                nn.Softmax(dim=-1)
                )

        # critic: state -> single scalar value estimate
        self.value_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, 1)
                )

    def forward(self, inputs):
        """Return the actor's action probabilities for `inputs`."""
        return self.action_layer(inputs)

    def act(self, state, memory):
        """Sample an action for one observation and record the step in `memory`.

        Args:
            state: NumPy observation; converted to a float tensor on `device`.
            memory: buffer exposing `states`, `actions` and `logprobs` lists.

        Returns:
            The sampled action as a Python number.
        """
        state = torch.from_numpy(state).float().to(device)
        # Acting never needs gradients: PPO.update detaches everything it reads
        # from memory, so skipping graph construction only saves memory.
        with torch.no_grad():
            action_probs = self.action_layer(state)
            dist = Categorical(action_probs)
            action = dist.sample()
            action_logprob = dist.log_prob(action)

        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(action_logprob)

        return action.item()

    def evaluate(self, state, action):
        """Score previously taken actions under the current parameters.

        Args:
            state: tensor of observations whose last axis has size `state_dim`.
            action: tensor of action indices matching the leading axes of `state`.

        Returns:
            Tuple `(action_logprobs, state_values, dist_entropy)`. `state_values`
            has the critic's trailing size-1 axis removed, so it keeps the same
            leading shape as the other two tensors even for a batch of one.
        """
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)

        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()

        state_value = self.value_layer(state)

        # squeeze(-1) rather than a bare squeeze(): a bare squeeze would also drop
        # a batch axis of length 1 and hand back a 0-d tensor.
        return action_logprobs, state_value.squeeze(-1), dist_entropy

In [9]:
class PPO:
    """Holds a current and an "old" ActorCritic and computes a PPO-style surrogate on a batch.

    Args:
        state_dim, action_dim, n_latent_var: forwarded to ActorCritic.
        lr, betas: forwarded to the Adam optimizer over the current policy.
        gamma: discount factor used for the returns and for the penalty term.
        K_epochs: number of passes `update` makes over the batch.
        eps_clip: half-width of the clipping interval applied to the ratios.
    """

    def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        # policy_old starts as an exact copy of policy.
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def update(self, memory):
        """Evaluate the surrogate objective on everything stored in `memory`.

        Args:
            memory: buffer with `states`, `actions`, `logprobs`, `rewards` and
                `is_terminals` lists of equal length.

        Returns:
            Mean of the penalised surrogate `ratios * returns - penalty` from the
            last epoch (a 0-d tensor that still carries its autograd graph).

        Raises:
            ValueError: if `K_epochs` is less than 1, since no surrogate would be computed.
        """
        if self.K_epochs < 1:
            raise ValueError("K_epochs must be at least 1 to compute a surrogate")

        # Monte Carlo estimate of the discounted return of every step. Walk the
        # episode data backwards, restarting the running sum at episode ends.
        returns = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            returns.append(discounted_reward)
        # Appending and reversing once gives the same order as repeated insert(0, ...)
        # without the quadratic cost.
        returns.reverse()

        # Normalizing the returns. The variable keeps the name `rewards` used
        # throughout the rest of the computation.
        rewards = torch.tensor(returns).to(device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()

        # Constant subtracted from the surrogate. Uses this instance's discount
        # factor so it always agrees with the returns computed above.
        # TODO: `epsilon` should become max(advantage) and `d_kl` the KL divergence
        # between behavioural and target policy; both are module-level placeholders.
        penalty = float(4*epsilon*self.gamma*d_kl/np.square(1-self.gamma))

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)

            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs)
            print("State Values : ", state_values.detach())
            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            print("Advantages : ", advantages)

            # NOTE(review): surr1 weights the ratios by the normalised returns, while
            # surr2 uses the advantages — confirm this asymmetry is intended.
            surr1 = ratios * rewards
            surr1 -= penalty
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
            print("L Clip : ", loss.mean())

            # NOTE(review): the gradient step is disabled, so `policy` never changes
            # and this method only measures the surrogate. Re-enable to train:
            #self.optimizer.zero_grad()
            #loss.mean().backward()
            #self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
        return surr1.mean()

In [10]:
# Module-level environment instance; later cells read its observation and action spaces.
env = gym.make('LunarLander-v2')

In [11]:
# Length of the first axis of the observation space; passed to ActorCritic as its input width.
state_dim = env.observation_space.shape[0]
print(state_dim)

8


In [12]:
# Number of actions reported by the environment's action space (`.n`).
print(env.action_space.n)

4


In [13]:
# Take the action count from the environment instead of hard-coding it, so the
# networks stay in sync if the environment is swapped. int() normalises any
# NumPy integer to a plain Python int before it is used as a layer size.
action_dim = int(env.action_space.n)

In [14]:
def return_random_agents(num_agents):
    """Create `num_agents` freshly initialised ActorCritic networks with gradients switched off.

    Each network is built from the module-level `state_dim`, `action_dim` and
    `n_latent_var`, moved to `device`, and has `requires_grad` set to False on
    every parameter.

    Returns:
        A list of `num_agents` networks (empty when `num_agents` is 0).
    """
    def _new_frozen_agent():
        # One randomly initialised network whose parameters never track gradients.
        net = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        for weight in net.parameters():
            weight.requires_grad = False
        return net

    return [_new_frozen_agent() for _ in range(num_agents)]

#### Note: the function in the next cell (`run_agents`) is the one I tried to change

In [15]:
def run_agents(agents):
    """Roll out a fresh PPO policy on LunarLander-v2 and return its mean surrogate value.

    For up to `max_episodes` episodes the old policy of a newly built `PPO`
    object acts in the environment; every `update_timestep` steps
    `PPO.update` is called on the collected memory and its result is kept as
    the latest surrogate. The latest surrogate is recorded once per completed
    episode, and the mean of those records is returned.

    NOTE(review): the `agents` argument is never read — the score comes from a
    newly initialised PPO policy, not from the agents passed in. Confirm
    whether the agents were meant to be evaluated here.

    Args:
        agents: list of agents (currently unused, see note above).

    Returns:
        0-d tensor holding the mean of the per-episode surrogate values,
        detached from the autograd graph.

    Raises:
        RuntimeError: if the run ends before any surrogate was recorded, i.e.
            before the first episode that follows a PPO update has completed.
    """
    env_name = "LunarLander-v2"
    env = gym.make(env_name)
    # Compare against None so that a seed of 0 is honoured as well.
    if random_seed is not None:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    latest_surrogate = None      # result of the most recent PPO.update, if any
    episode_surrogates = []      # one entry per completed episode once an update has happened

    try:
        # training loop
        for i_episode in range(1, max_episodes+1):
            state = env.reset()
            for t in range(max_timesteps):
                timestep += 1

                # Running policy_old:
                action = ppo.policy_old.act(state, memory)
                state, reward, done, _ = env.step(action)

                # Saving reward and is_terminal:
                memory.rewards.append(reward)
                memory.is_terminals.append(done)

                # update if its time
                if timestep % update_timestep == 0:
                    surri = ppo.update(memory)
                    print("Surrogate Loss(L_pi) ", surri)
                    # Keep only the value: holding the graph wastes memory and
                    # blocks the later conversion of the scores to NumPy.
                    latest_surrogate = surri.detach()
                    # NOTE(review): memory is deliberately not cleared here, so every
                    # update sees all steps collected so far and the buffer keeps growing.
                    #memory.clear_memory()
                    timestep = 0

                running_reward += reward
                if render:
                    env.render()
                if done:
                    break

            avg_length += t

            # stop training if avg_reward > solved_reward
            if running_reward > (log_interval*solved_reward):
                print("########## Solved! ##########")
                torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
                break

            # logging
            if i_episode % log_interval == 0:
                avg_length = int(avg_length/log_interval)
                running_reward = int((running_reward/log_interval))
                print(' Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
                running_reward = 0
                avg_length = 0

            # Episodes finished before the first update have nothing to record.
            if latest_surrogate is not None:
                episode_surrogates.append(latest_surrogate)
    finally:
        # Release the environment even when the run is interrupted.
        env.close()

    if not episode_surrogates:
        raise RuntimeError(
            "run_agents finished without recording any surrogate value; "
            "no episode completed after a PPO update"
        )
    return torch.mean(torch.stack(episode_surrogates))

In [16]:
def return_average_score(agent, runs):
    """Score a single agent by handing it to `run_agents` as a one-element list.

    Args:
        agent: the agent to score.
        runs: accepted for interface compatibility; no averaging over runs is
            performed, so the value is not used.

    Returns:
        Whatever `run_agents` returns for the one-agent list.
    """
    single_agent_population = [agent]
    return run_agents(single_agent_population)

#### Note: only the cells above this point have been verified to work

In [17]:
def run_agents_n_times(agents, runs):
    """Return one score per agent, in input order, obtained from `return_average_score`.

    Args:
        agents: iterable of agents to score.
        runs: forwarded unchanged to `return_average_score` for every agent.
    """
    return [return_average_score(candidate, runs) for candidate in agents]

In [18]:
def mutate(agent, mutation_power=0.02):
    """Return a deep copy of `agent` with Gaussian noise added to every parameter.

    The input agent is left untouched. Noise is drawn from NumPy's global
    random state (`np.random.randn`), scaled by `mutation_power`, and added to
    each parameter tensor in one vectorized operation. Parameters of any rank
    are perturbed.

    Args:
        agent: an `nn.Module` (anything exposing `.parameters()`).
        mutation_power: standard deviation of the added noise. The default of
            0.02 is the hyper-parameter taken from https://arxiv.org/pdf/1712.06567.pdf

    Returns:
        The mutated copy.
    """
    child_agent = copy.deepcopy(agent)

    # no_grad: autograd rejects in-place edits of leaf parameters that require
    # grad, and the mutation is not something to differentiate through anyway.
    with torch.no_grad():
        for param in child_agent.parameters():
            noise = torch.as_tensor(
                np.random.randn(*param.shape),
                dtype=param.dtype,
                device=param.device,
            )
            param.add_(mutation_power * noise)

    return child_agent

In [19]:
def return_children(agents, sorted_parent_indexes, elite_index):
    """Build the next generation: mutated children of random parents plus one elite.

    `len(agents) - 1` children are produced, each by mutating an agent whose
    index is drawn uniformly (via `np.random.randint`) from
    `sorted_parent_indexes`. The agent returned by `add_elite` is appended last.

    Returns:
        Tuple `(children_agents, elite_index)` where `elite_index` is the
        position of the elite in the new list, i.e. its last index.
    """
    parent_pool_size = len(sorted_parent_indexes)

    # N-1 mutated offspring of randomly picked parents
    offspring = [
        mutate(agents[sorted_parent_indexes[np.random.randint(parent_pool_size)]])
        for _ in range(len(agents) - 1)
    ]

    # the elite always goes in the final slot
    offspring.append(add_elite(agents, sorted_parent_indexes, elite_index))

    return offspring, len(offspring) - 1

In [20]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    """Re-score the leading candidates and return a deep copy of the best one.

    Candidates are the first `only_consider_top_n` entries of
    `sorted_parent_indexes`, plus `elite_index` when it is given. Every distinct
    candidate is scored once with `return_average_score(..., runs=5)`; the first
    candidate with the strictly highest score wins.

    Args:
        agents: sequence of agents indexed by the candidate indexes.
        sorted_parent_indexes: agent indexes ordered best-first.
        elite_index: index of the previous elite inside `agents`, or None.
        only_consider_top_n: how many leading parents to consider.

    Returns:
        A deep copy of the winning agent.

    Raises:
        ValueError: if there are no candidates to choose from.
    """
    raw_candidates = [int(i) for i in sorted_parent_indexes[:only_consider_top_n]]
    if elite_index is not None:
        raw_candidates.append(int(elite_index))

    # Drop repeats (keeping first occurrence order): the previous elite is often
    # already among the top parents, and each scoring is a full, expensive run.
    candidate_elite_index = []
    seen = set()
    for i in raw_candidates:
        if i not in seen:
            seen.add(i)
            candidate_elite_index.append(i)

    if not candidate_elite_index:
        raise ValueError("add_elite needs at least one candidate index")

    top_score = None
    top_elite_index = None

    for i in candidate_elite_index:
        score = return_average_score(agents[i],runs=5)
        print("Score for elite i ", i, " is ", score)

        if top_score is None or score > top_score:
            top_score = score
            top_elite_index = i

    print("Elite selected with index ",top_elite_index, " and score", top_score)

    return copy.deepcopy(agents[top_elite_index])
    

In [21]:
def softmax(x):
    """Compute softmax values for each set of scores in x, normalising along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps `np.exp` from overflowing to
    inf (and the division from producing NaN) for large scores.

    Args:
        x: array-like of scores; each column (axis 0) is normalised on its own.

    Returns:
        NumPy array of the same shape as `x` whose entries along axis 0 sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; a max over an empty axis would raise.
        return np.exp(x)
    shifted = np.exp(x - np.max(x, axis=0))
    return shifted / np.sum(shifted, axis=0)

In [22]:
# Evolution driver: score a population, keep the best agents as parents, and
# replace the population with mutated children plus one elite, generation after generation.
game_actions = 4  # NOTE(review): not referenced by any cell visible here

# initialize N number of agents
num_agents = 4
agents = return_random_agents(num_agents)

# How many top agents to consider as parents
top_limit = 2

# run evolution until X generations
generations = 3

# Index of the previous generation's elite inside `agents`; None before the first generation.
elite_index = None

for generation in range(generations):

    # return rewards of agents
    rewards = run_agents_n_times(agents, 1) #return average of 1 run
    print("REWARDS::::::",rewards)

    # The scores are 0-d torch tensors. Convert them to plain floats before
    # sorting: NumPy cannot convert tensors that still require grad or that
    # live on a CUDA device.
    reward_values = np.array([float(score) for score in rewards])

    # argsort is ascending, so reverse it and keep the first `top_limit` indexes
    # https://stackoverflow.com/questions/16486252/is-it-possible-to-use-argsort-in-descending-order
    sorted_parent_indexes = np.argsort(reward_values)[::-1][:top_limit]
    print("Sorting Parent Indexes: ",sorted_parent_indexes)
    print(" Data Type: ", type(sorted_parent_indexes))
    print("Sorting Completed")
    print("Selecting Top Parents")

    top_rewards = [rewards[best_parent] for best_parent in sorted_parent_indexes]

    print("Generation ", generation, " | Mean rewards: ", torch.mean(torch.stack(rewards)), " | Mean of top 5: ",torch.mean(torch.stack(top_rewards[:5])))
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)

    # build the next generation and remember where its elite sits
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    # kill all agents, and replace them with their children
    agents = children_agents

 Episode 20 	 avg length: 97 	 reward: -176
State Values :  tensor([-0.0582, -0.0589, -0.0598,  ..., -0.0189, -0.0192, -0.0196])
Advantages :  tensor([-1.5494, -1.5387, -1.5792,  ...,  1.7200,  1.7828,  1.7938],
       dtype=torch.float64)
L Clip :  tensor(7920.4680, dtype=torch.float64, grad_fn=<MeanBackward0>)
Surrogate Loss(L_pi)  tensor(-7920.0000, dtype=torch.float64, grad_fn=<MeanBackward0>)
State Values :  tensor([-0.0582, -0.0589, -0.0598,  ..., -0.0772, -0.0779, -0.0788])
Advantages :  tensor([-1.3349, -1.3252, -1.3613,  ...,  1.6400,  1.6744,  1.6969],
       dtype=torch.float64)
L Clip :  tensor(7920.4818, dtype=torch.float64, grad_fn=<MeanBackward0>)
Surrogate Loss(L_pi)  tensor(-7920.0000, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 40 	 avg length: 101 	 reward: -178
 Episode 60 	 avg length: 95 	 reward: -211
State Values :  tensor([-0.0582, -0.0589, -0.0598,  ..., -0.0469, -0.0478, -0.0488])
Advantages :  tensor([-1.1296, -1.1205, -1.1543,  ...,  1.8032,  1.7

 Episode 440 	 avg length: 99 	 reward: -185
State Values :  tensor([-0.0582, -0.0589, -0.0598,  ...,  0.0155,  0.0158,  0.0163])
Advantages :  tensor([-0.9234, -0.9149, -0.9460,  ...,  1.5362,  1.5813,  1.6270],
       dtype=torch.float64)
L Clip :  tensor(7920.4887, dtype=torch.float64, grad_fn=<MeanBackward0>)
Surrogate Loss(L_pi)  tensor(-7920.0000, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 460 	 avg length: 100 	 reward: -208
State Values :  tensor([-0.0582, -0.0589, -0.0598,  ...,  0.0152,  0.0147,  0.0155])
Advantages :  tensor([-0.9203, -0.9118, -0.9428,  ...,  1.5958,  1.6302,  1.6320],
       dtype=torch.float64)
L Clip :  tensor(7920.4887, dtype=torch.float64, grad_fn=<MeanBackward0>)
Surrogate Loss(L_pi)  tensor(-7920.0000, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 480 	 avg length: 100 	 reward: -194
State Values :  tensor([-0.0582, -0.0589, -0.0598,  ..., -0.1702, -0.1708, -0.1718])
Advantages :  tensor([-0.9351, -0.9265, -0.9579,  ...,  1.7425, 

KeyboardInterrupt: 

In [23]:
def play_agent(agent):
    """Play up to 250 steps of LunarLander-v2 with `agent` and print the total reward.

    The environment is wrapped in `Monitor` pointed at './video'
    (presumably to record the episode — confirm against the gym version in use)
    and rendered at every step. The loop ends early when the environment
    reports `done`.

    Args:
        agent: an ActorCritic-like object exposing `act(observation, memory)`.
            Its actions drive the episode.
    """
    env = gym.make("LunarLander-v2")
    env_record = Monitor(env, './video', force=True)
    # `act` records every step into a memory object; the contents are not used here.
    memory = Memory()
    r = 0
    try:
        observation = env_record.reset()
        for _ in range(250):
            env_record.render()
            # Act with the agent that was passed in rather than a newly
            # initialised policy, so the episode reflects that agent.
            action = agent.act(observation, memory)
            observation, reward, done, _ = env_record.step(action)
            r = r + reward

            if done:
                break
    finally:
        # Close the recorder even if a step raises.
        env_record.close()
    print("Rewards: ",r)

In [24]:
# Run play_agent on the agent stored at index 1 of the current `agents` list.
play_agent(agents[1])

Rewards:  -93.14878062769856
