In [1]:
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time

In [2]:
from gym.wrappers import Monitor

In [3]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [4]:
import math
import copy
from torch.distributions import Categorical
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []
    
    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]

In [6]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, n_latent_var):
        super(ActorCritic, self).__init__()

        # actor
        self.action_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, action_dim),
                nn.Softmax(dim=-1)
                )
        
        # critic
        self.value_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, 1)
                )
    ##Changed this part    
    def forward(self, inputs):
            x = self.action_layer(inputs)
            return x
    
    def act(self, state, memory):
        state = torch.from_numpy(state).float().to(device) 
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        
        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        
        return action.item()
    
    def evaluate(self, state, action):
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        
        state_value = self.value_layer(state)
        
        return action_logprobs, torch.squeeze(state_value), dist_entropy

In [7]:
class PPO:
    def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        
        self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.MseLoss = nn.MSELoss()
    
    def update(self, memory):   
        # Monte Carlo estimate of state rewards:
        rewards = []
        surr_loss = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)
        
        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        #print(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        #print("Rewardsd Mean: ") 
        #print(rewards.mean())
        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()
        
        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            
            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())
            print("State Values : ", state_values.detach())
            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            print("Advantages : ", advantages)
            
            surr1 = ratios * advantages
            #surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            #loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
            #surr_loss.append(loss)
            print("Surrogate Loss: ", surr1)
            print("Surrogate Loss Mean : ", surr1.mean())
            print("Surrogate Loss Absolute Value : ", torch.abs(surr1.mean()))
            # take gradient step
            #self.optimizer.zero_grad()
            #surr1.mean().backward()
            #self.optimizer.step()
            
                  
        # Copy new weights into old policy:       
        self.policy_old.load_state_dict(self.policy.state_dict())
        #print("Surrogate Loss Mean: ",surr1.mean())
        return torch.abs(surr1.mean())

In [8]:
solved_reward = 30         # stop training if avg_reward > solved_reward
log_interval = 100           # print avg reward in the interval
max_episodes = 1000        # max training episodes
max_timesteps = 30         # max timesteps in one episode
n_latent_var = 64           # number of variables in hidden layer
update_timestep = 2000      # update policy every n timesteps
lr = 0.002
betas = (0.9, 0.999)
gamma = 0.99                # discount factor
K_epochs = 1                # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
random_seed = None
render = False

In [10]:
env = gym.make('LunarLander-v2')

In [11]:
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
print("State Dimension ", state_dim, "\t Action Dimension ", action_dim)

State Dimension  8 	 Action Dimension  4


In [12]:
def return_random_agents(num_agents):
    
    agents = []
    for _ in range(num_agents):
        ##Change this part
        agent = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        
        for param in agent.parameters():
            param.requires_grad = False
         ##Commented this part   
        #init_weights(agent)
        agents.append(agent)
        
        
    return agents

#### I tried to change this function

In [13]:
def run_agents(agents):
    
    reward_agents = []
    total_loss = []
    env_name = "LunarLander-v2"
    env = gym.make('LunarLander-v2')
    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr,betas)
    
    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    surri=0
    s1 = []
    s2 = 0
    s_temp = []
    surr_actual = []
    
    # training loop
    #for agent in agents:
    for i_episode in range(1, max_episodes+1):
        #agent.eval()
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1
            
            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            
            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            
            # update if its time
            if timestep % update_timestep == 0:
                surri=ppo.update(memory)
                #print("Returned Surrogate Loss ", surri)
                memory.clear_memory()
                timestep = 0
            
            running_reward += reward
            if render:
                env.render()
            if done:
                break
                
        avg_length += t
        
        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval*solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            break
            
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))
            print(' Episode {} '.format(i_episode))
            running_reward = 0
            avg_length = 0
        s1.append(surri)
        #surr_mean = torch.mean(torch.stack(surr_actual))    
    s1 = [i for i in s1 if i != 0]
    #print(torch.mean(torch.stack(s1)))
    #print("\n")
            
    #print(total_loss)
    return torch.mean(torch.stack(s1))

In [14]:
def return_average_score(agent, runs):
    #score = 0.
    #for i in range(runs):
    #print(run_agents([agent]))
    score = run_agents([agent])
    return score#/runs

#### <--Till Now Only worked -->

In [15]:
def run_agents_n_times(agents, runs):
    avg_score = []
    for agent in agents:
        avg_score.append(return_average_score(agent,runs))
    return avg_score

In [16]:
def mutate(agent):

    child_agent = copy.deepcopy(agent)
    
    mutation_power = 0.02 #hyper-parameter, set from https://arxiv.org/pdf/1712.06567.pdf
            
    for param in child_agent.parameters():
    
        if(len(param.shape)==4): #weights of Conv2D

            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    for i2 in range(param.shape[2]):
                        for i3 in range(param.shape[3]):
                            
                            param[i0][i1][i2][i3]+= mutation_power * np.random.randn()
                                
                                    

        elif(len(param.shape)==2): #weights of linear layer
            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    
                    param[i0][i1]+= mutation_power * np.random.randn()
                        

        elif(len(param.shape)==1): #biases of linear layer or conv layer
            for i0 in range(param.shape[0]):
                
                param[i0]+=mutation_power * np.random.randn()

    return child_agent

In [17]:
def return_children(agents, sorted_parent_indexes, elite_index):
    
    children_agents = []
    
    #first take selected parents from sorted_parent_indexes and generate N-1 children
    for i in range(len(agents)-1):
        
        selected_agent_index = sorted_parent_indexes[np.random.randint(len(sorted_parent_indexes))]
        children_agents.append(mutate(agents[selected_agent_index]))

    #now add one elite
    elite_child = add_elite(agents, sorted_parent_indexes, elite_index)
    children_agents.append(elite_child)
    elite_index=len(children_agents)-1 #it is the last one
    
    return children_agents, elite_index

In [18]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    
    candidate_elite_index = sorted_parent_indexes[:only_consider_top_n]
    
    if(elite_index is not None):
        candidate_elite_index = np.append(candidate_elite_index,[elite_index])
        
    top_score = None
    top_elite_index = None
    
    for i in candidate_elite_index:
        score = return_average_score(agents[i],runs=5)
        print("Score for elite i ", i, " is ", score)
        
        if(top_score is None):
            top_score = score
            top_elite_index = i
        elif(score > top_score):
            top_score = score
            top_elite_index = i
            
    print("Elite selected with index ",top_elite_index, " and score", top_score)
    
    child_agent = copy.deepcopy(agents[top_elite_index])
    return child_agent
    

In [19]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [20]:
game_actions = 2 #2 actions possible: left or right

#disable gradients as we will not use them
#torch.set_grad_enabled(False)

# initialize N number of agents
num_agents = 10
agents = return_random_agents(num_agents)

# How many top agents to consider as parents
top_limit = 3

# run evolution until X generations
generations = 5

elite_index = None

for generation in range(generations):

    # return rewards of agents
    losses = run_agents_n_times(agents, 1) #return average of 3 runs

    # sort by rewards
    sorted_parent_indexes = np.argsort(losses)[::-1][:top_limit]#reverses and gives top values (argsort sorts by ascending by default) https://stackoverflow.com/questions/16486252/is-it-possible-to-use-argsort-in-descending-order
    print("\n Sorting Parent Indexes: ",sorted_parent_indexes)
    print(" Data Type: ", type(sorted_parent_indexes))
    print("Sorting Completed")
    print("Selecting Top Parents")
    
    top_losses = []
    for best_parent in sorted_parent_indexes:
        top_losses.append(losses[best_parent])
   
    print("\n \n Generation ", generation, " | Mean Losses: ", torch.mean(torch.stack(losses)), " | Mean of top 5: ",torch.mean(torch.stack(top_losses[:5])))
    
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_losses)
    
    # setup an empty list for containing children agents
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    # kill all agents, and replace them with their children
    agents = children_agents

0.002 (0.9, 0.999)
State Values :  tensor([-0.0111, -0.0111, -0.0115,  ..., -0.0194, -0.0214, -0.0215])
Advantages :  tensor([0.0808, 0.0804, 0.0756,  ..., 2.3533, 2.3395, 2.3178])
Surrogate Loss:  tensor([0.0808, 0.0804, 0.0756,  ..., 2.3533, 2.3395, 2.3178],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0135, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0135, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([-0.0233, -0.0257, -0.0230,  ..., -0.0164, -0.0129, -0.0123])
Advantages :  tensor([0.4348, 0.3742, 0.4090,  ..., 2.7026, 2.6727, 2.7633])
Surrogate Loss:  tensor([0.4348, 0.3742, 0.4090,  ..., 2.7026, 2.6727, 2.7633],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0129, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0129, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0111, -0.0130, -0.0118,  ..., -0.0213, -0.0210, -0.0228])
Advantages :  tensor([-0.5446, -0.5164, -0.3781,  ...,  2.3196,  2.3

State Values :  tensor([-0.0667, -0.0678, -0.0643,  ..., -0.0931, -0.0958, -0.0948])
Advantages :  tensor([0.1460, 0.1795, 0.1123,  ..., 2.6977, 2.8119, 2.8366])
Surrogate Loss:  tensor([0.1460, 0.1795, 0.1123,  ..., 2.6977, 2.8119, 2.8366],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0497, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0497, grad_fn=<AbsBackward>)
 Episode 400 
State Values :  tensor([-0.0876, -0.0868, -0.0839,  ..., -0.0468, -0.0438, -0.0421])
Advantages :  tensor([-1.1879, -1.1632, -1.2350,  ...,  2.6477,  2.7025,  2.8264])
Surrogate Loss:  tensor([-1.1879, -1.1632, -1.2350,  ...,  2.6477,  2.7025,  2.8264],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0434, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0434, grad_fn=<AbsBackward>)
 Episode 500 
State Values :  tensor([-0.0405, -0.0391, -0.0368,  ..., -0.0162, -0.0174, -0.0176])
Advantages :  tensor([-0.3254, -0.2207, -0.1510,  ...,  3.268

 Episode 700 
State Values :  tensor([0.2635, 0.2660, 0.2655,  ..., 0.3650, 0.3680, 0.3661])
Advantages :  tensor([0.5159, 0.5286, 0.5683,  ..., 3.4035, 3.4809, 3.5304])
Surrogate Loss:  tensor([0.5159, 0.5286, 0.5683,  ..., 3.4035, 3.4809, 3.5304],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.3059, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.3059, grad_fn=<AbsBackward>)
State Values :  tensor([0.3648, 0.3651, 0.3655,  ..., 0.3065, 0.3055, 0.3064])
Advantages :  tensor([-2.0009, -2.0288, -2.0536,  ...,  2.5731,  2.6869,  2.6705])
Surrogate Loss:  tensor([-2.0009, -2.0288, -2.0536,  ...,  2.5731,  2.6869,  2.6705],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.3085, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.3085, grad_fn=<AbsBackward>)
 Episode 800 
State Values :  tensor([0.2151, 0.2126, 0.2089,  ..., 0.3035, 0.3059, 0.3057])
Advantages :  tensor([-2.5013, -2.5682, -2.6220,  ...,  2.5054,  2.6014,  2.6

State Values :  tensor([0.0618, 0.0619, 0.0655,  ..., 0.1205, 0.1187, 0.1200])
Advantages :  tensor([-0.4341, -0.3304, -0.3991,  ...,  2.6977,  2.6380,  2.6698])
Surrogate Loss:  tensor([-0.4341, -0.3304, -0.3991,  ...,  2.6977,  2.6380,  2.6698],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1276, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1276, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([0.1200, 0.1185, 0.1169,  ..., 0.0957, 0.0958, 0.0962])
Advantages :  tensor([-0.7817, -0.7398, -0.6903,  ...,  2.4585,  2.4748,  2.4938])
Surrogate Loss:  tensor([-0.7817, -0.7398, -0.6903,  ...,  2.4585,  2.4748,  2.4938],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1314, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1314, grad_fn=<AbsBackward>)
State Values :  tensor([0.0944, 0.0924, 0.0924,  ..., 0.1751, 0.1741, 0.1751])
Advantages :  tensor([-0.9317, -0.7696, -0.7673,  ...,  2.3232,  2.4028,  2.448

State Values :  tensor([-0.0109, -0.0133, -0.0172,  ..., -0.0955, -0.0956, -0.0927])
Advantages :  tensor([-1.1471, -1.1849, -1.0843,  ...,  3.7521,  3.8743,  3.9608])
Surrogate Loss:  tensor([-1.1471, -1.1849, -1.0843,  ...,  3.7521,  3.8743,  3.9608],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0422, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0422, grad_fn=<AbsBackward>)
 Episode 400 
State Values :  tensor([0.0226, 0.0244, 0.0299,  ..., 0.0085, 0.0144, 0.0202])
Advantages :  tensor([-1.4250, -1.4245, -1.3699,  ...,  3.8098,  3.8730,  3.9460])
Surrogate Loss:  tensor([-1.4250, -1.4245, -1.3699,  ...,  3.8098,  3.8730,  3.9460],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0267, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0267, grad_fn=<AbsBackward>)
 Episode 500 
State Values :  tensor([ 0.0224,  0.0191,  0.0252,  ..., -0.0693, -0.0707, -0.0680])
Advantages :  tensor([0.4856, 0.2541, 0.3030,  ..., 4.2

 Episode 700 
State Values :  tensor([-0.0351, -0.0351, -0.0321,  ..., -0.0570, -0.0614, -0.0659])
Advantages :  tensor([ 0.1417,  0.0753, -0.0927,  ...,  2.4106,  2.4611,  2.5122])
Surrogate Loss:  tensor([ 0.1417,  0.0753, -0.0927,  ...,  2.4106,  2.4611,  2.5122],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0006, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0006, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0703, -0.0747, -0.0788,  ..., -0.0856, -0.0833, -0.0845])
Advantages :  tensor([0.6462, 0.6264, 0.6475,  ..., 3.4536, 3.3269, 3.2795])
Surrogate Loss:  tensor([0.6462, 0.6264, 0.6475,  ..., 3.4536, 3.3269, 3.2795],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0159, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0159, grad_fn=<AbsBackward>)
 Episode 800 
State Values :  tensor([ 0.0620,  0.0601,  0.0595,  ..., -0.0316, -0.0328, -0.0311])
Advantages :  tensor([-1.1965, -1.3039, -1.3285,  ...,  3.79

State Values :  tensor([0.0142, 0.0177, 0.0193,  ..., 0.0337, 0.0353, 0.0371])
Advantages :  tensor([1.3155, 1.3220, 1.3565,  ..., 3.0401, 3.0531, 3.0623])
Surrogate Loss:  tensor([1.3155, 1.3220, 1.3565,  ..., 3.0401, 3.0531, 3.0623],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0103, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0103, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([0.0405, 0.0414, 0.0432,  ..., 0.0401, 0.0370, 0.0373])
Advantages :  tensor([-1.7185, -1.7578, -1.8061,  ...,  3.5707,  3.3814,  3.4342])
Surrogate Loss:  tensor([-1.7185, -1.7578, -1.8061,  ...,  3.5707,  3.3814,  3.4342],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0034, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0034, grad_fn=<AbsBackward>)
State Values :  tensor([ 0.0429,  0.0460,  0.0468,  ..., -0.0093, -0.0070, -0.0023])
Advantages :  tensor([0.2869, 0.3304, 0.3504,  ..., 2.1431, 2.1853, 2.2561])
Surrogate 

State Values :  tensor([-0.1767, -0.1769, -0.1769,  ..., -0.1979, -0.1965, -0.1951])
Advantages :  tensor([-0.8500, -0.8140, -0.7325,  ...,  2.6073,  2.6632,  2.7176])
Surrogate Loss:  tensor([-0.8500, -0.8140, -0.7325,  ...,  2.6073,  2.6632,  2.7176],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1883, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1883, grad_fn=<AbsBackward>)
 Episode 400 
State Values :  tensor([-0.1578, -0.1588, -0.1599,  ..., -0.1245, -0.1229, -0.1213])
Advantages :  tensor([-1.2065, -1.2843, -1.3688,  ...,  2.7695,  2.7796,  2.8520])
Surrogate Loss:  tensor([-1.2065, -1.2843, -1.3688,  ...,  2.7695,  2.7796,  2.8520],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1832, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1832, grad_fn=<AbsBackward>)
 Episode 500 
State Values :  tensor([-0.1195, -0.1176, -0.1138,  ..., -0.1797, -0.1785, -0.1776])
Advantages :  tensor([-0.1206, -0.1431, -0.1532, 

State Values :  tensor([-0.0132, -0.0119, -0.0129,  ..., -0.0282, -0.0284, -0.0282])
Advantages :  tensor([ 0.0286,  0.0713, -0.0067,  ...,  2.4324,  2.3871,  2.3780])
Surrogate Loss:  tensor([ 0.0286,  0.0713, -0.0067,  ...,  2.4324,  2.3871,  2.3780],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0257, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0257, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([-0.0281, -0.0269, -0.0254,  ..., -0.0665, -0.0660, -0.0662])
Advantages :  tensor([-0.3772, -0.2989, -0.2145,  ...,  3.1441,  3.2124,  3.3525])
Surrogate Loss:  tensor([-0.3772, -0.2989, -0.2145,  ...,  3.1441,  3.2124,  3.3525],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0290, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0290, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0654, -0.0655, -0.0634,  ...,  0.0070,  0.0083,  0.0109])
Advantages :  tensor([-0.1480, -0.0290, -0.1272,  ...,  3.8777,

State Values :  tensor([-0.1218, -0.1219, -0.1228,  ..., -0.1350, -0.1346, -0.1346])
Advantages :  tensor([-0.6011, -0.5609, -0.5384,  ...,  2.6855,  2.6335,  2.6568])
Surrogate Loss:  tensor([-0.6011, -0.5609, -0.5384,  ...,  2.6855,  2.6335,  2.6568],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1391, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1391, grad_fn=<AbsBackward>)
 Episode 1000 
Score for elite i  7  is  tensor(0.1404, grad_fn=<MeanBackward0>)
0.002 (0.9, 0.999)
State Values :  tensor([-0.0007,  0.0005,  0.0026,  ..., -0.0401, -0.0383, -0.0420])
Advantages :  tensor([-1.0937, -1.0443, -1.2112,  ...,  4.1588,  4.2617,  4.3321])
Surrogate Loss:  tensor([-1.0937, -1.0443, -1.2112,  ...,  4.1588,  4.2617,  4.3321],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0216, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0216, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([-0.0499, -0.0480, -0.051

State Values :  tensor([0.2262, 0.2275, 0.2248,  ..., 0.1723, 0.1666, 0.1665])
Advantages :  tensor([ 0.0111, -0.0197, -0.0630,  ...,  2.0863,  2.0443,  2.1272])
Surrogate Loss:  tensor([ 0.0111, -0.0197, -0.0630,  ...,  2.0863,  2.0443,  2.1272],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2148, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2148, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([0.1662, 0.1584, 0.1589,  ..., 0.2365, 0.2385, 0.2357])
Advantages :  tensor([-0.2741, -0.2565, -0.2579,  ...,  2.8525,  2.8970,  2.8609])
Surrogate Loss:  tensor([-0.2741, -0.2565, -0.2579,  ...,  2.8525,  2.8970,  2.8609],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2177, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2177, grad_fn=<AbsBackward>)
State Values :  tensor([0.2377, 0.2402, 0.2416,  ..., 0.2544, 0.2555, 0.2570])
Advantages :  tensor([-1.8421, -1.7974, -1.8352,  ...,  2.2693,  2.3149,  2.376

State Values :  tensor([-0.0068, -0.0071, -0.0065,  ..., -0.0221, -0.0218, -0.0204])
Advantages :  tensor([1.4632, 1.3437, 1.4926,  ..., 3.3375, 3.1847, 3.2846])
Surrogate Loss:  tensor([1.4632, 1.3437, 1.4926,  ..., 3.3375, 3.1847, 3.2846],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0186, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0186, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([-0.0244, -0.0274, -0.0290,  ..., -0.0115, -0.0122, -0.0113])
Advantages :  tensor([-1.8910, -1.8481, -1.9352,  ...,  3.5694,  3.6693,  3.7597])
Surrogate Loss:  tensor([-1.8910, -1.8481, -1.9352,  ...,  3.5694,  3.6693,  3.7597],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0242, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0242, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([-0.0114, -0.0124, -0.0111,  ..., -0.0207, -0.0231, -0.0225])
Advantages :  tensor([-0.6765, -0.7304, -0.6899,  ...,  3.239

 Episode 900 
State Values :  tensor([-0.2079, -0.2064, -0.2051,  ..., -0.1699, -0.1670, -0.1668])
Advantages :  tensor([0.3767, 0.4465, 0.4628,  ..., 2.5442, 2.6297, 2.6283])
Surrogate Loss:  tensor([0.3767, 0.4465, 0.4628,  ..., 2.5442, 2.6297, 2.6283],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2175, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2175, grad_fn=<AbsBackward>)
State Values :  tensor([-0.1673, -0.1642, -0.1639,  ..., -0.1791, -0.1779, -0.1767])
Advantages :  tensor([0.2410, 0.3420, 0.3489,  ..., 3.0844, 3.1460, 3.2091])
Surrogate Loss:  tensor([0.2410, 0.3420, 0.3489,  ..., 3.0844, 3.1460, 3.2091],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2155, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2155, grad_fn=<AbsBackward>)
 Episode 1000 
0.002 (0.9, 0.999)
State Values :  tensor([-0.1010, -0.1015, -0.0984,  ..., -0.1006, -0.0954, -0.0914])
Advantages :  tensor([-0.4306, -0.4725, -0.3887,  ...

State Values :  tensor([-0.1075, -0.1036, -0.1065,  ..., -0.0790, -0.0791, -0.0821])
Advantages :  tensor([0.6692, 0.8201, 0.7473,  ..., 3.4878, 3.5399, 3.4560])
Surrogate Loss:  tensor([0.6692, 0.8201, 0.7473,  ..., 3.4878, 3.5399, 3.4560],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1292, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1292, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([-0.0938, -0.0886, -0.0997,  ..., -0.0844, -0.0962, -0.0917])
Advantages :  tensor([1.0285, 0.9157, 0.8146,  ..., 3.2561, 3.3248, 3.2363])
Surrogate Loss:  tensor([1.0285, 0.9157, 0.8146,  ..., 3.2561, 3.3248, 3.2363],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1226, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1226, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0945, -0.0878, -0.0821,  ..., -0.1754, -0.1784, -0.1909])
Advantages :  tensor([-1.0690, -0.9807, -0.8848,  ...,  3.4398,  3.4642,  3.5694])
Surr

State Values :  tensor([-0.0379, -0.0398, -0.0384,  ..., -0.1021, -0.0965, -0.0911])
Advantages :  tensor([-2.3827e-03, -1.1057e-01, -4.7830e-02,  ...,  4.3203e+00,
         4.3208e+00,  4.3077e+00])
Surrogate Loss:  tensor([-2.3827e-03, -1.1057e-01, -4.7830e-02,  ...,  4.3203e+00,
         4.3208e+00,  4.3077e+00], grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0268, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0268, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([-0.0579, -0.0536, -0.0562,  ..., -0.0817, -0.0799, -0.0747])
Advantages :  tensor([0.3885, 0.3399, 0.2571,  ..., 2.6094, 2.6283, 2.6087])
Surrogate Loss:  tensor([0.3885, 0.3399, 0.2571,  ..., 2.6094, 2.6283, 2.6087],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0266, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0266, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([-0.0758, -0.0740, -0.0745,  ..., -0.0054, -0.0067, -0.0076])
Adva

 Episode 900 
State Values :  tensor([0.2271, 0.2278, 0.2259,  ..., 0.2300, 0.2275, 0.2256])
Advantages :  tensor([0.5006, 0.4113, 0.3552,  ..., 2.6826, 2.5570, 2.6161])
Surrogate Loss:  tensor([0.5006, 0.4113, 0.3552,  ..., 2.6826, 2.5570, 2.6161],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2468, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2468, grad_fn=<AbsBackward>)
State Values :  tensor([0.2236, 0.2195, 0.2174,  ..., 0.1628, 0.1617, 0.1587])
Advantages :  tensor([0.0449, 0.0639, 0.0848,  ..., 2.4767, 2.4495, 2.4047])
Surrogate Loss:  tensor([0.0449, 0.0639, 0.0848,  ..., 2.4767, 2.4495, 2.4047],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2438, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2438, grad_fn=<AbsBackward>)
 Episode 1000 
0.002 (0.9, 0.999)
State Values :  tensor([0.1481, 0.1482, 0.1472,  ..., 0.1499, 0.1507, 0.1506])
Advantages :  tensor([0.1100, 0.0687, 0.1221,  ..., 2.6622, 2.5804, 2

State Values :  tensor([0.1733, 0.1739, 0.1754,  ..., 0.2027, 0.1997, 0.2011])
Advantages :  tensor([0.3164, 0.3229, 0.2889,  ..., 3.6772, 3.4441, 3.4723])
Surrogate Loss:  tensor([0.3164, 0.3229, 0.2889,  ..., 3.6772, 3.4441, 3.4723],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2150, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2150, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([0.2025, 0.2019, 0.2019,  ..., 0.2434, 0.2429, 0.2447])
Advantages :  tensor([-0.1676, -0.1253, -0.1019,  ...,  2.6368,  2.6637,  2.7465])
Surrogate Loss:  tensor([-0.1676, -0.1253, -0.1019,  ...,  2.6368,  2.6637,  2.7465],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2182, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2182, grad_fn=<AbsBackward>)
State Values :  tensor([0.2445, 0.2462, 0.2469,  ..., 0.1627, 0.1621, 0.1623])
Advantages :  tensor([1.3981, 1.4741, 1.5018,  ..., 3.2968, 3.4039, 3.4706])
Surrogate Loss

State Values :  tensor([0.3307, 0.3338, 0.3365,  ..., 0.2336, 0.2348, 0.2370])
Advantages :  tensor([0.1096, 0.1357, 0.1628,  ..., 3.3744, 3.4172, 3.4281])
Surrogate Loss:  tensor([0.1096, 0.1357, 0.1628,  ..., 3.3744, 3.4172, 3.4281],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2777, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2777, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([0.2917, 0.2926, 0.2937,  ..., 0.2536, 0.2526, 0.2550])
Advantages :  tensor([-0.2834, -0.3230, -0.3616,  ...,  2.9107,  2.8996,  2.8430])
Surrogate Loss:  tensor([-0.2834, -0.3230, -0.3616,  ...,  2.9107,  2.8996,  2.8430],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.2740, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2740, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([0.2536, 0.2547, 0.2570,  ..., 0.3126, 0.3155, 0.3184])
Advantages :  tensor([0.7995, 0.7706, 0.7097,  ..., 4.0058, 4.1325, 4.2689])


State Values :  tensor([0.1091, 0.1115, 0.1132,  ..., 0.1189, 0.1223, 0.1248])
Advantages :  tensor([-1.6358, -1.6577, -1.6353,  ...,  2.6546,  2.6068,  2.6244])
Surrogate Loss:  tensor([-1.6358, -1.6577, -1.6353,  ...,  2.6546,  2.6068,  2.6244],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1532, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1532, grad_fn=<AbsBackward>)
 Episode 900 
State Values :  tensor([0.1265, 0.1289, 0.1313,  ..., 0.0872, 0.0894, 0.0905])
Advantages :  tensor([0.3577, 0.3873, 0.4245,  ..., 2.8416, 2.7980, 2.7899])
Surrogate Loss:  tensor([0.3577, 0.3873, 0.4245,  ..., 2.8416, 2.7980, 2.7899],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1454, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1454, grad_fn=<AbsBackward>)
State Values :  tensor([0.0927, 0.0939, 0.0953,  ..., 0.1984, 0.2008, 0.2032])
Advantages :  tensor([-1.0798, -1.1156, -1.1426,  ...,  3.0933,  3.1278,  3.1631])
Surrogat

State Values :  tensor([-0.1162, -0.1139, -0.1145,  ..., -0.0664, -0.0632, -0.0638])
Advantages :  tensor([1.1937, 1.1724, 1.1776,  ..., 2.9884, 3.0822, 3.1484])
Surrogate Loss:  tensor([1.1937, 1.1724, 1.1776,  ..., 2.9884, 3.0822, 3.1484],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0857, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0857, grad_fn=<AbsBackward>)
 Episode 200 
State Values :  tensor([-0.1434, -0.1437, -0.1467,  ..., -0.1136, -0.1145, -0.1146])
Advantages :  tensor([-0.1660, -0.1590, -0.1140,  ...,  2.3757,  2.3727,  2.4359])
Surrogate Loss:  tensor([-0.1660, -0.1590, -0.1140,  ...,  2.3757,  2.3727,  2.4359],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0942, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0942, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([-0.1155, -0.1149, -0.1150,  ..., -0.0402, -0.0394, -0.0405])
Advantages :  tensor([-0.2634, -0.2061, -0.1840,  ...,  2.227

 Episode 500 
State Values :  tensor([0.0191, 0.0196, 0.0222,  ..., 0.0007, 0.0042, 0.0079])
Advantages :  tensor([0.2084, 0.2074, 0.2386,  ..., 3.0548, 3.0235, 3.0032])
Surrogate Loss:  tensor([0.2084, 0.2074, 0.2386,  ..., 3.0548, 3.0235, 3.0032],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0149, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0149, grad_fn=<AbsBackward>)
State Values :  tensor([0.0073, 0.0059, 0.0071,  ..., 0.0115, 0.0121, 0.0127])
Advantages :  tensor([0.6214, 0.5328, 0.4581,  ..., 2.3300, 2.4038, 2.4773])
Surrogate Loss:  tensor([0.6214, 0.5328, 0.4581,  ..., 2.3300, 2.4038, 2.4773],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0152, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0152, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([0.0098, 0.0091, 0.0091,  ..., 0.0144, 0.0144, 0.0166])
Advantages :  tensor([-0.3409, -0.4224, -0.4179,  ...,  3.0040,  2.9722,  3.0172])
Surrog

State Values :  tensor([ 0.0363,  0.0359,  0.0330,  ..., -0.0637, -0.0670, -0.0655])
Advantages :  tensor([1.2689, 1.2299, 1.1591,  ..., 2.6182, 2.6473, 2.6067])
Surrogate Loss:  tensor([1.2689, 1.2299, 1.1591,  ..., 2.6182, 2.6473, 2.6067],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0193, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0193, grad_fn=<AbsBackward>)
 Episode 900 
State Values :  tensor([-0.0661, -0.0639, -0.0631,  ..., -0.0825, -0.0853, -0.0886])
Advantages :  tensor([0.3880, 0.3166, 0.2855,  ..., 2.4648, 2.5259, 2.5971])
Surrogate Loss:  tensor([0.3880, 0.3166, 0.2855,  ..., 2.4648, 2.5259, 2.5971],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0064, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0064, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0892, -0.0922, -0.0954,  ..., -0.1115, -0.1102, -0.1086])
Advantages :  tensor([-0.9023, -0.8473, -0.7842,  ...,  3.6525,  3.6161,  3.5691])
Surr

State Values :  tensor([0.0611, 0.0629, 0.0647,  ..., 0.0849, 0.0835, 0.0864])
Advantages :  tensor([-0.5302, -0.5086, -0.4689,  ...,  3.0257,  2.9580,  3.0056])
Surrogate Loss:  tensor([-0.5302, -0.5086, -0.4689,  ...,  3.0257,  2.9580,  3.0056],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0504, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0504, grad_fn=<AbsBackward>)
 Episode 200 
State Values :  tensor([0.0640, 0.0630, 0.0640,  ..., 0.0393, 0.0417, 0.0450])
Advantages :  tensor([-0.0482, -0.2798, -0.2214,  ...,  3.2184,  3.3164,  3.3706])
Surrogate Loss:  tensor([-0.0482, -0.2798, -0.2214,  ...,  3.2184,  3.3164,  3.3706],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0556, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0556, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([0.0474, 0.0506, 0.0538,  ..., 0.0488, 0.0509, 0.0523])
Advantages :  tensor([-2.1797, -2.1623, -2.1529,  ...,  4.0537,  

 Episode 500 
State Values :  tensor([-0.0456, -0.0424, -0.0432,  ..., -0.1000, -0.1060, -0.1037])
Advantages :  tensor([0.5498, 0.4292, 0.3645,  ..., 2.9770, 3.0748, 3.1240])
Surrogate Loss:  tensor([0.5498, 0.4292, 0.3645,  ..., 2.9770, 3.0748, 3.1240],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0859, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0859, grad_fn=<AbsBackward>)
State Values :  tensor([-0.1055, -0.1072, -0.1121,  ..., -0.1160, -0.1178, -0.1161])
Advantages :  tensor([-1.4315, -1.3967, -1.3415,  ...,  2.7531,  2.8544,  2.7624])
Surrogate Loss:  tensor([-1.4315, -1.3967, -1.3415,  ...,  2.7531,  2.8544,  2.7624],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0866, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0866, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([-0.0653, -0.0637, -0.0615,  ..., -0.0573, -0.0627, -0.0616])
Advantages :  tensor([0.2702, 0.3226, 0.1616,  ..., 3.6718, 3

State Values :  tensor([0.0627, 0.0594, 0.0592,  ..., 0.0690, 0.0677, 0.0678])
Advantages :  tensor([1.4609, 1.4255, 1.4175,  ..., 2.8123, 2.8243, 2.8842])
Surrogate Loss:  tensor([1.4609, 1.4255, 1.4175,  ..., 2.8123, 2.8243, 2.8842],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0771, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0771, grad_fn=<AbsBackward>)
 Episode 900 
State Values :  tensor([0.0639, 0.0609, 0.0626,  ..., 0.0863, 0.0869, 0.0833])
Advantages :  tensor([0.6997, 0.7117, 0.6058,  ..., 2.5854, 2.6309, 2.7257])
Surrogate Loss:  tensor([0.6997, 0.7117, 0.6058,  ..., 2.5854, 2.6309, 2.7257],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0778, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0778, grad_fn=<AbsBackward>)
State Values :  tensor([0.0828, 0.0835, 0.0831,  ..., 0.0478, 0.0440, 0.0399])
Advantages :  tensor([0.3098, 0.3424, 0.2489,  ..., 2.0057, 2.0738, 2.1623])
Surrogate Loss:  tensor([0

State Values :  tensor([-0.2236, -0.2232, -0.2253,  ..., -0.1663, -0.1589, -0.1652])
Advantages :  tensor([1.1001, 0.9927, 1.0143,  ..., 3.4673, 3.5399, 3.3895])
Surrogate Loss:  tensor([1.1001, 0.9927, 1.0143,  ..., 3.4673, 3.5399, 3.3895],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1688, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1688, grad_fn=<AbsBackward>)
 Episode 200 
State Values :  tensor([-0.2899, -0.2910, -0.2838,  ..., -0.2820, -0.2740, -0.2700])
Advantages :  tensor([-2.4235, -2.4006, -2.4863,  ...,  3.4303,  3.4245,  3.4774])
Surrogate Loss:  tensor([-2.4235, -2.4006, -2.4863,  ...,  3.4303,  3.4245,  3.4774],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1797, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1797, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([-0.2688, -0.2683, -0.2607,  ..., -0.2634, -0.2595, -0.2614])
Advantages :  tensor([-0.4674, -0.3841, -0.4064,  ...,  2.974

 Episode 500 
State Values :  tensor([ 0.1015,  0.1055,  0.1072,  ..., -0.0253, -0.0270, -0.0288])
Advantages :  tensor([-0.7317, -0.7472, -0.7781,  ...,  3.5819,  3.5976,  3.6009])
Surrogate Loss:  tensor([-0.7317, -0.7472, -0.7781,  ...,  3.5819,  3.5976,  3.6009],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0081, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0081, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0289, -0.0294, -0.0296,  ...,  0.0424,  0.0459,  0.0494])
Advantages :  tensor([ 0.1185, -0.0868, -0.0877,  ...,  3.7863,  3.8562,  3.9126])
Surrogate Loss:  tensor([ 0.1185, -0.0868, -0.0877,  ...,  3.7863,  3.8562,  3.9126],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0092, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0092, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([0.0477, 0.0480, 0.0487,  ..., 0.0050, 0.0046, 0.0023])
Advantages :  tensor([-1.5420, -1.4999, -1.4519,  ...

State Values :  tensor([-0.1771, -0.1764, -0.1739,  ..., -0.1811, -0.1780, -0.1748])
Advantages :  tensor([ 0.0158, -0.0816, -0.1235,  ...,  3.5430,  3.5399,  3.5257])
Surrogate Loss:  tensor([ 0.0158, -0.0816, -0.1235,  ...,  3.5430,  3.5399,  3.5257],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1188, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1188, grad_fn=<AbsBackward>)
 Episode 900 
State Values :  tensor([-0.1762, -0.1782, -0.1791,  ..., -0.0563, -0.0539, -0.0538])
Advantages :  tensor([-0.2950, -0.2189, -0.1586,  ...,  4.1307,  4.2287,  4.2690])
Surrogate Loss:  tensor([-0.2950, -0.2189, -0.1586,  ...,  4.1307,  4.2287,  4.2690],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1256, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1256, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0533, -0.0531, -0.0504,  ..., -0.0650, -0.0643, -0.0641])
Advantages :  tensor([-1.7626, -1.7806, -1.7494,  ...,  2.6040,

 Episode 100 
State Values :  tensor([0.2276, 0.2232, 0.2194,  ..., 0.1491, 0.1495, 0.1461])
Advantages :  tensor([0.1233, 0.0753, 0.0256,  ..., 3.1813, 3.2626, 3.2716])
Surrogate Loss:  tensor([0.1233, 0.0753, 0.0256,  ..., 3.1813, 3.2626, 3.2716],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1880, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1880, grad_fn=<AbsBackward>)
State Values :  tensor([0.1514, 0.1572, 0.1581,  ..., 0.1698, 0.1645, 0.1640])
Advantages :  tensor([-0.0855, -0.1647, -0.2182,  ...,  1.9925,  2.1019,  2.1760])
Surrogate Loss:  tensor([-0.0855, -0.1647, -0.2182,  ...,  1.9925,  2.1019,  2.1760],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1882, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1882, grad_fn=<AbsBackward>)
 Episode 200 
State Values :  tensor([0.2433, 0.2388, 0.2391,  ..., 0.1898, 0.1971, 0.1982])
Advantages :  tensor([-1.6971, -1.7374, -1.7486,  ...,  2.1253,  2.0816,  2.0

State Values :  tensor([-0.0565, -0.0541, -0.0545,  ..., -0.0719, -0.0734, -0.0745])
Advantages :  tensor([-0.4909, -0.4074, -0.4470,  ...,  2.3404,  2.3370,  2.3595])
Surrogate Loss:  tensor([-0.4909, -0.4074, -0.4470,  ...,  2.3404,  2.3370,  2.3595],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0838, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0838, grad_fn=<AbsBackward>)
 Episode 500 
State Values :  tensor([-0.0757, -0.0768, -0.0780,  ..., -0.0426, -0.0454, -0.0445])
Advantages :  tensor([-0.3349, -0.3250, -0.3098,  ...,  2.4200,  2.4579,  2.4815])
Surrogate Loss:  tensor([-0.3349, -0.3250, -0.3098,  ...,  2.4200,  2.4579,  2.4815],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0836, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0836, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0456, -0.0467, -0.0485,  ..., -0.1122, -0.1136, -0.1106])
Advantages :  tensor([0.2403, 0.1708, 0.1353,  ..., 2.5590, 2.5

State Values :  tensor([-0.1867, -0.1858, -0.1834,  ..., -0.0528, -0.0496, -0.0451])
Advantages :  tensor([0.5054, 0.4891, 0.4122,  ..., 2.6071, 2.6699, 2.7088])
Surrogate Loss:  tensor([0.5054, 0.4891, 0.4122,  ..., 2.6071, 2.6699, 2.7088],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1437, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1437, grad_fn=<AbsBackward>)
 Episode 800 
State Values :  tensor([-0.1770, -0.1759, -0.1752,  ..., -0.1555, -0.1570, -0.1523])
Advantages :  tensor([-0.3874, -0.4780, -0.5245,  ...,  3.8656,  3.9807,  4.0513])
Surrogate Loss:  tensor([-0.3874, -0.4780, -0.5245,  ...,  3.8656,  3.9807,  4.0513],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1619, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1619, grad_fn=<AbsBackward>)
 Episode 900 
State Values :  tensor([-0.1470, -0.1455, -0.1397,  ..., -0.1629, -0.1600, -0.1589])
Advantages :  tensor([-2.2565, -2.2643, -2.1937,  ...,  2.402

State Values :  tensor([-0.0305, -0.0270, -0.0259,  ..., -0.0030,  0.0007, -0.0012])
Advantages :  tensor([-0.4528, -0.4823, -0.4751,  ...,  2.3130,  2.3235,  2.3783])
Surrogate Loss:  tensor([-0.4528, -0.4823, -0.4751,  ...,  2.3130,  2.3235,  2.3783],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0043, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0043, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([-0.0007, -0.0011,  0.0028,  ..., -0.0301, -0.0264, -0.0249])
Advantages :  tensor([0.2889, 0.0702, 0.0474,  ..., 3.0747, 3.0725, 3.1191])
Surrogate Loss:  tensor([0.2889, 0.0702, 0.0474,  ..., 3.0747, 3.0725, 3.1191],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0016, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0016, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0244, -0.0232, -0.0199,  ...,  0.0130,  0.0127,  0.0146])
Advantages :  tensor([-1.2650, -1.1847, -1.2360,  ...,  2.7676,  2.8570,  2

State Values :  tensor([0.0828, 0.0945, 0.0902,  ..., 0.0771, 0.0727, 0.0809])
Advantages :  tensor([-0.5123, -0.4598, -0.4653,  ...,  2.6484,  2.5503,  2.5941])
Surrogate Loss:  tensor([-0.5123, -0.4598, -0.4653,  ...,  2.6484,  2.5503,  2.5941],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0694, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0694, grad_fn=<AbsBackward>)
 Episode 400 
State Values :  tensor([0.0150, 0.0164, 0.0149,  ..., 0.0282, 0.0230, 0.0141])
Advantages :  tensor([-1.1296, -1.0912, -1.1494,  ...,  2.9413,  2.9542,  2.9341])
Surrogate Loss:  tensor([-1.1296, -1.0912, -1.1494,  ...,  2.9413,  2.9542,  2.9341],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0723, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0723, grad_fn=<AbsBackward>)
 Episode 500 
State Values :  tensor([0.0164, 0.0186, 0.0209,  ..., 0.0439, 0.0461, 0.0482])
Advantages :  tensor([-0.2895, -0.2395, -0.1875,  ...,  2.6401,  

 Episode 700 
State Values :  tensor([-0.1935, -0.1918, -0.1939,  ..., -0.1012, -0.0992, -0.0970])
Advantages :  tensor([-0.3209, -0.3248, -0.2647,  ...,  2.7781,  2.8201,  2.8625])
Surrogate Loss:  tensor([-0.3209, -0.3248, -0.2647,  ...,  2.7781,  2.8201,  2.8625],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1507, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1507, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0984, -0.0963, -0.0957,  ..., -0.0521, -0.0511, -0.0501])
Advantages :  tensor([1.0426, 1.0722, 1.0387,  ..., 3.1370, 3.1575, 3.1692])
Surrogate Loss:  tensor([1.0426, 1.0722, 1.0387,  ..., 3.1370, 3.1575, 3.1692],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1407, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1407, grad_fn=<AbsBackward>)
 Episode 800 
State Values :  tensor([-0.1761, -0.1769, -0.1785,  ..., -0.2049, -0.2035, -0.2033])
Advantages :  tensor([0.1475, 0.0462, 0.1582,  ..., 2.8884, 2

State Values :  tensor([0.0899, 0.0893, 0.0886,  ..., 0.0722, 0.0722, 0.0713])
Advantages :  tensor([-0.4454, -0.4828, -0.5165,  ...,  2.7743,  2.8708,  2.8847])
Surrogate Loss:  tensor([-0.4454, -0.4828, -0.5165,  ...,  2.7743,  2.8708,  2.8847],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1252, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1252, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([0.0664, 0.0639, 0.0628,  ..., 0.1871, 0.1869, 0.1885])
Advantages :  tensor([-0.5968, -0.6073, -0.6461,  ...,  2.5628,  2.6205,  2.5994])
Surrogate Loss:  tensor([-0.5968, -0.6073, -0.6461,  ...,  2.5628,  2.6205,  2.5994],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1219, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1219, grad_fn=<AbsBackward>)
State Values :  tensor([0.1881, 0.1895, 0.1875,  ..., 0.1647, 0.1628, 0.1624])
Advantages :  tensor([-0.6447, -0.7007, -0.8440,  ...,  3.1974,  3.0165,  3.114

State Values :  tensor([-0.1893, -0.1864, -0.1833,  ..., -0.1724, -0.1713, -0.1736])
Advantages :  tensor([-1.3573, -1.3990, -1.4260,  ...,  3.5440,  3.6055,  3.7444])
Surrogate Loss:  tensor([-1.3573, -1.3990, -1.4260,  ...,  3.5440,  3.6055,  3.7444],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1632, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1632, grad_fn=<AbsBackward>)
 Episode 400 
State Values :  tensor([-0.1493, -0.1459, -0.1420,  ..., -0.2265, -0.2224, -0.2209])
Advantages :  tensor([-0.4342, -0.5010, -0.5777,  ...,  3.3521,  3.4443,  3.4881])
Surrogate Loss:  tensor([-0.4342, -0.5010, -0.5777,  ...,  3.3521,  3.4443,  3.4881],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1752, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1752, grad_fn=<AbsBackward>)
 Episode 500 
State Values :  tensor([-0.2193, -0.2212, -0.2202,  ..., -0.2030, -0.2023, -0.2016])
Advantages :  tensor([-0.5414, -0.4927, -0.4961, 

 Episode 700 
State Values :  tensor([0.0149, 0.0181, 0.0152,  ..., 0.0029, 0.0017, 0.0070])
Advantages :  tensor([-0.5469, -0.4952, -0.5485,  ...,  2.8953,  3.0111,  3.0439])
Surrogate Loss:  tensor([-0.5469, -0.4952, -0.5485,  ...,  2.8953,  3.0111,  3.0439],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0088, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0088, grad_fn=<AbsBackward>)
State Values :  tensor([0.0120, 0.0174, 0.0193,  ..., 0.0357, 0.0400, 0.0396])
Advantages :  tensor([0.3306, 0.3192, 0.3473,  ..., 2.5931, 2.5716, 2.5376])
Surrogate Loss:  tensor([0.3306, 0.3192, 0.3473,  ..., 2.5931, 2.5716, 2.5376],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0083, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0083, grad_fn=<AbsBackward>)
 Episode 800 
State Values :  tensor([-0.0425, -0.0426, -0.0363,  ...,  0.0385,  0.0382,  0.0414])
Advantages :  tensor([-1.2146, -1.2166, -1.3125,  ...,  4.0212,  4.0812

State Values :  tensor([-0.0007, -0.0004, -0.0015,  ..., -0.0220, -0.0203, -0.0222])
Advantages :  tensor([-1.9583, -1.9833, -2.0276,  ...,  2.4666,  2.3921,  2.4559])
Surrogate Loss:  tensor([-1.9583, -1.9833, -2.0276,  ...,  2.4666,  2.3921,  2.4559],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0397, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0397, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([-0.0241, -0.0273, -0.0293,  ..., -0.0629, -0.0644, -0.0659])
Advantages :  tensor([-1.6102, -1.5979, -1.5666,  ...,  3.9079,  3.9779,  3.9298])
Surrogate Loss:  tensor([-1.6102, -1.5979, -1.5666,  ...,  3.9079,  3.9779,  3.9298],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0365, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0365, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0675, -0.0707, -0.0725,  ..., -0.0562, -0.0600, -0.0627])
Advantages :  tensor([-0.5576, -0.5316, -0.4966,  ...,  2.2863,

State Values :  tensor([-0.0281, -0.0297, -0.0363,  ..., -0.0317, -0.0330, -0.0401])
Advantages :  tensor([0.2699, 0.2564, 0.2987,  ..., 3.1664, 3.1602, 3.2358])
Surrogate Loss:  tensor([0.2699, 0.2564, 0.2987,  ..., 3.1664, 3.1602, 3.2358],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0085, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0085, grad_fn=<AbsBackward>)
 Episode 400 
State Values :  tensor([-0.0346, -0.0358, -0.0340,  ...,  0.0083,  0.0125,  0.0154])
Advantages :  tensor([1.1336, 1.1499, 1.1323,  ..., 3.4877, 3.5753, 3.5098])
Surrogate Loss:  tensor([1.1336, 1.1499, 1.1323,  ..., 3.4877, 3.5753, 3.5098],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0198, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0198, grad_fn=<AbsBackward>)
 Episode 500 
State Values :  tensor([ 0.0190,  0.0194,  0.0144,  ..., -0.0098, -0.0077, -0.0081])
Advantages :  tensor([-3.9437e-03,  2.7987e-02,  1.0841e-03,  ...,  3.909

State Values :  tensor([-0.0895, -0.0915, -0.0940,  ..., -0.0920, -0.0914, -0.0910])
Advantages :  tensor([0.1046, 0.1332, 0.1796,  ..., 2.4923, 2.5590, 2.4583])
Surrogate Loss:  tensor([0.1046, 0.1332, 0.1796,  ..., 2.4923, 2.5590, 2.4583],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0783, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0783, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([-0.0922, -0.0921, -0.0923,  ..., -0.0485, -0.0495, -0.0531])
Advantages :  tensor([-0.6885, -0.8170, -0.8984,  ...,  2.6883,  2.7204,  2.7139])
Surrogate Loss:  tensor([-0.6885, -0.8170, -0.8984,  ...,  2.6883,  2.7204,  2.7139],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0655, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0655, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0570, -0.0555, -0.0538,  ..., -0.1001, -0.1006, -0.0994])
Advantages :  tensor([1.0577, 1.1208, 1.1984,  ..., 2.6934, 2.6653, 2.7250]

State Values :  tensor([ 1.9576e-01,  2.0340e-01,  2.0168e-01,  ..., -3.7989e-03,
         3.5541e-03,  2.7612e-05])
Advantages :  tensor([-0.4740, -0.4017, -0.3744,  ...,  3.5594,  3.5309,  3.6078])
Surrogate Loss:  tensor([-0.4740, -0.4017, -0.3744,  ...,  3.5594,  3.5309,  3.6078],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0897, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0897, grad_fn=<AbsBackward>)
 Episode 1000 
Score for elite i  2  is  tensor(0.0818, grad_fn=<MeanBackward0>)
0.002 (0.9, 0.999)
State Values :  tensor([-0.1506, -0.1512, -0.1520,  ..., -0.1724, -0.1696, -0.1677])
Advantages :  tensor([0.3438, 0.2616, 0.1744,  ..., 3.4430, 3.4836, 3.4058])
Surrogate Loss:  tensor([0.3438, 0.2616, 0.1744,  ..., 3.4430, 3.4836, 3.4058],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.1740, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1740, grad_fn=<AbsBackward>)
 Episode 100 
State Values :  tensor([-0.

State Values :  tensor([-0.1154, -0.1170, -0.1131,  ..., -0.0505, -0.0512, -0.0531])
Advantages :  tensor([-0.7351, -0.7566, -0.8585,  ...,  2.9980,  2.9537,  3.0531])
Surrogate Loss:  tensor([-0.7351, -0.7566, -0.8585,  ...,  2.9980,  2.9537,  3.0531],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0750, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0750, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([-0.0531, -0.0523, -0.0514,  ..., -0.1015, -0.1013, -0.1018])
Advantages :  tensor([0.0342, 0.1028, 0.1696,  ..., 3.3051, 3.3594, 3.4387])
Surrogate Loss:  tensor([0.0342, 0.1028, 0.1696,  ..., 3.3051, 3.3594, 3.4387],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0759, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0759, grad_fn=<AbsBackward>)
State Values :  tensor([-0.1006, -0.1015, -0.1002,  ..., -0.0857, -0.0878, -0.0864])
Advantages :  tensor([-0.1060, -0.1661, -0.1577,  ...,  3.6688,  3.5957,  3

State Values :  tensor([ 0.0083,  0.0103,  0.0124,  ..., -0.0391, -0.0413, -0.0443])
Advantages :  tensor([-1.3340, -1.3175, -1.2890,  ...,  2.7844,  2.7462,  2.7199])
Surrogate Loss:  tensor([-1.3340, -1.3175, -1.2890,  ...,  2.7844,  2.7462,  2.7199],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0213, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0213, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([-0.0665, -0.0639, -0.0640,  ...,  0.0685,  0.0687,  0.0705])
Advantages :  tensor([0.1347, 0.1020, 0.0483,  ..., 2.7960, 2.7559, 2.7389])
Surrogate Loss:  tensor([0.1347, 0.1020, 0.0483,  ..., 2.7960, 2.7559, 2.7389],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0194, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0194, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([0.0720, 0.0734, 0.0748,  ..., 0.0199, 0.0227, 0.0259])
Advantages :  tensor([1.8543, 1.8168, 1.8492,  ..., 3.6464, 3.6883,

 Episode 900 
State Values :  tensor([-0.2160, -0.2155, -0.2189,  ..., -0.2058, -0.2093, -0.2110])
Advantages :  tensor([-1.0610, -1.0169, -0.9542,  ...,  3.0725,  3.1424,  3.1759])
Surrogate Loss:  tensor([-1.0610, -1.0169, -0.9542,  ...,  3.0725,  3.1424,  3.1759],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2102, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2102, grad_fn=<AbsBackward>)
State Values :  tensor([-0.2126, -0.2159, -0.2175,  ..., -0.2104, -0.2141, -0.2158])
Advantages :  tensor([-0.4899, -0.4402, -0.4291,  ...,  3.1810,  3.2625,  3.3006])
Surrogate Loss:  tensor([-0.4899, -0.4402, -0.4291,  ...,  3.1810,  3.2625,  3.3006],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2191, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2191, grad_fn=<AbsBackward>)
 Episode 1000 
0.002 (0.9, 0.999)
State Values :  tensor([0.1256, 0.1229, 0.1232,  ..., 0.1943, 0.1949, 0.1956])
Advantages :  tensor([-0.9102, -0.8

State Values :  tensor([-0.0112, -0.0111, -0.0126,  ...,  0.0529,  0.0516,  0.0518])
Advantages :  tensor([ 0.0348, -0.0945, -0.0442,  ...,  2.6468,  2.6678,  2.6460])
Surrogate Loss:  tensor([ 0.0348, -0.0945, -0.0442,  ...,  2.6468,  2.6678,  2.6460],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0062, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0062, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([ 0.0534,  0.0505,  0.0505,  ..., -0.0936, -0.0960, -0.1010])
Advantages :  tensor([-2.3456, -2.3337, -2.4078,  ...,  2.7008,  2.8076,  2.8371])
Surrogate Loss:  tensor([-2.3456, -2.3337, -2.4078,  ...,  2.7008,  2.8076,  2.8371],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0092, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0092, grad_fn=<AbsBackward>)
State Values :  tensor([-0.1060, -0.1102, -0.1131,  ..., -0.0613, -0.0643, -0.0618])
Advantages :  tensor([1.0406, 1.0376, 1.0530,  ..., 3.1006, 3.1

State Values :  tensor([0.1623, 0.1598, 0.1618,  ..., 0.1880, 0.1859, 0.1851])
Advantages :  tensor([-0.1414, -0.0789, -0.0483,  ...,  2.8885,  2.9139,  2.9758])
Surrogate Loss:  tensor([-0.1414, -0.0789, -0.0483,  ...,  2.8885,  2.9139,  2.9758],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1632, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1632, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([0.1665, 0.1662, 0.1637,  ..., 0.1871, 0.1873, 0.1863])
Advantages :  tensor([-1.5224, -1.4573, -1.3838,  ...,  3.0258,  3.1839,  3.2184])
Surrogate Loss:  tensor([-1.5224, -1.4573, -1.3838,  ...,  3.0258,  3.1839,  3.2184],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1636, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1636, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([0.1873, 0.1863, 0.1870,  ..., 0.1719, 0.1694, 0.1665])
Advantages :  tensor([-1.5493, -1.5508, -1.5008,  ...,  2.5745,  

 Episode 900 
State Values :  tensor([-0.2594, -0.2606, -0.2635,  ..., -0.1452, -0.1480, -0.1512])
Advantages :  tensor([-0.8212, -0.7774, -0.8001,  ...,  2.9905,  2.9823,  3.0684])
Surrogate Loss:  tensor([-0.8212, -0.7774, -0.8001,  ...,  2.9905,  2.9823,  3.0684],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2142, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2142, grad_fn=<AbsBackward>)
State Values :  tensor([-0.1501, -0.1531, -0.1559,  ..., -0.2768, -0.2786, -0.2799])
Advantages :  tensor([-1.5005, -1.4635, -1.4626,  ...,  3.1632,  3.1707,  3.1793])
Surrogate Loss:  tensor([-1.5005, -1.4635, -1.4626,  ...,  3.1632,  3.1707,  3.1793],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2113, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2113, grad_fn=<AbsBackward>)
 Episode 1000 
0.002 (0.9, 0.999)
State Values :  tensor([0.1452, 0.1445, 0.1455,  ..., 0.0657, 0.0593, 0.0522])
Advantages :  tensor([-0.8377, -0.8

State Values :  tensor([0.1572, 0.1576, 0.1574,  ..., 0.1030, 0.1037, 0.1044])
Advantages :  tensor([0.2623, 0.2834, 0.2893,  ..., 3.3956, 3.3992, 3.4045])
Surrogate Loss:  tensor([0.2623, 0.2834, 0.2893,  ..., 3.3956, 3.3992, 3.4045],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1757, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1757, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([0.1040, 0.1056, 0.1073,  ..., 0.1492, 0.1503, 0.1502])
Advantages :  tensor([-1.9086, -1.9020, -1.8862,  ...,  2.9343,  3.0374,  3.0759])
Surrogate Loss:  tensor([-1.9086, -1.9020, -1.8862,  ...,  2.9343,  3.0374,  3.0759],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.1780, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.1780, grad_fn=<AbsBackward>)
State Values :  tensor([0.1499, 0.1509, 0.1487,  ..., 0.2395, 0.2421, 0.2413])
Advantages :  tensor([1.5774, 1.6538, 1.5352,  ..., 2.5614, 2.6656, 2.7342])
Surrogate Loss

State Values :  tensor([0.0682, 0.0704, 0.0703,  ..., 0.0475, 0.0471, 0.0483])
Advantages :  tensor([-1.8662, -1.8792, -1.9116,  ...,  2.3371,  2.4187,  2.2792])
Surrogate Loss:  tensor([-1.8662, -1.8792, -1.9116,  ...,  2.3371,  2.4187,  2.2792],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0557, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0557, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([0.0534, 0.0535, 0.0519,  ..., 0.0274, 0.0252, 0.0227])
Advantages :  tensor([0.6979, 0.6661, 0.6789,  ..., 3.6443, 3.7589, 3.8538])
Surrogate Loss:  tensor([0.6979, 0.6661, 0.6789,  ..., 3.6443, 3.7589, 3.8538],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0553, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0553, grad_fn=<AbsBackward>)
 Episode 700 
State Values :  tensor([0.0203, 0.0173, 0.0147,  ..., 0.0747, 0.0738, 0.0722])
Advantages :  tensor([-0.1081, -0.0636, -0.0044,  ...,  3.6177,  3.5722,  3.4

 Episode 900 
State Values :  tensor([0.0257, 0.0274, 0.0252,  ..., 0.0278, 0.0293, 0.0344])
Advantages :  tensor([-0.6005, -0.5709, -0.5107,  ...,  2.6391,  2.7249,  2.8184])
Surrogate Loss:  tensor([-0.6005, -0.5709, -0.5107,  ...,  2.6391,  2.7249,  2.8184],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0225, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0225, grad_fn=<AbsBackward>)
State Values :  tensor([0.0360, 0.0411, 0.0428,  ..., 0.0570, 0.0611, 0.0592])
Advantages :  tensor([0.0570, 0.1592, 0.2524,  ..., 2.6415, 2.7894, 2.9359])
Surrogate Loss:  tensor([0.0570, 0.1592, 0.2524,  ..., 2.6415, 2.7894, 2.9359],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(-0.0221, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0221, grad_fn=<AbsBackward>)
 Episode 1000 

 Sorting Parent Indexes:  [5 1 7]
 Data Type:  <class 'numpy.ndarray'>
Sorting Completed
Selecting Top Parents

 
 Generation  4  | Mean Losses:  tensor(0

State Values :  tensor([-0.0261, -0.0235, -0.0232,  ..., -0.0264, -0.0267, -0.0275])
Advantages :  tensor([1.0333, 1.1300, 1.2462,  ..., 3.2885, 3.4335, 3.6633])
Surrogate Loss:  tensor([1.0333, 1.1300, 1.2462,  ..., 3.2885, 3.4335, 3.6633],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0253, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0253, grad_fn=<AbsBackward>)
 Episode 200 
State Values :  tensor([-0.0461, -0.0479, -0.0486,  ..., -0.0180, -0.0179, -0.0179])
Advantages :  tensor([-0.2421, -0.2792, -0.1587,  ...,  2.6310,  2.7346,  2.8514])
Surrogate Loss:  tensor([-0.2421, -0.2792, -0.1587,  ...,  2.6310,  2.7346,  2.8514],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0239, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0239, grad_fn=<AbsBackward>)
 Episode 300 
State Values :  tensor([-0.0157, -0.0157, -0.0161,  ...,  0.0022,  0.0027,  0.0031])
Advantages :  tensor([-2.5520, -2.4959, -2.4480,  ...,  1.659

 Episode 500 
State Values :  tensor([-0.2545, -0.2536, -0.2527,  ..., -0.2507, -0.2497, -0.2488])
Advantages :  tensor([-0.6963, -0.6686, -0.6201,  ...,  4.1717,  4.0250,  3.8140])
Surrogate Loss:  tensor([-0.6963, -0.6686, -0.6201,  ...,  4.1717,  4.0250,  3.8140],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2464, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2464, grad_fn=<AbsBackward>)
State Values :  tensor([-0.2483, -0.2475, -0.2469,  ..., -0.2066, -0.2047, -0.2028])
Advantages :  tensor([0.4366, 0.4590, 0.2726,  ..., 2.9746, 3.0485, 3.1032])
Surrogate Loss:  tensor([0.4366, 0.4590, 0.2726,  ..., 2.9746, 3.0485, 3.1032],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.2471, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.2471, grad_fn=<AbsBackward>)
 Episode 600 
State Values :  tensor([-0.2474, -0.2465, -0.2457,  ..., -0.2132, -0.2115, -0.2095])
Advantages :  tensor([0.5514, 0.6054, 0.6342,  ..., 3.2362, 3

State Values :  tensor([-0.0335, -0.0316, -0.0323,  ...,  0.0031,  0.0029,  0.0046])
Advantages :  tensor([0.2752, 0.1651, 0.2049,  ..., 2.8553, 2.7118, 2.8004])
Surrogate Loss:  tensor([0.2752, 0.1651, 0.2049,  ..., 2.8553, 2.7118, 2.8004],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0161, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0161, grad_fn=<AbsBackward>)
 Episode 900 
State Values :  tensor([ 0.0063,  0.0079,  0.0089,  ..., -0.0033, -0.0036, -0.0035])
Advantages :  tensor([-1.0151, -0.9535, -1.0442,  ...,  3.6519,  3.5529,  3.6160])
Surrogate Loss:  tensor([-1.0151, -0.9535, -1.0442,  ...,  3.6519,  3.5529,  3.6160],
       grad_fn=<MulBackward0>)
Surrogate Loss Mean :  tensor(0.0139, grad_fn=<MeanBackward0>)
Surrogate Loss Absolute Value :  tensor(0.0139, grad_fn=<AbsBackward>)
State Values :  tensor([-0.0030, -0.0029, -0.0021,  ..., -0.0456, -0.0436, -0.0455])
Advantages :  tensor([1.4815, 1.5096, 1.5535,  ..., 2.9079, 2.9302, 3.2122]

In [None]:
def play_agent(agent):
    env = gym.make("LunarLander-v2")
    env_record = Monitor(env, './video', force=True)
    observation = env_record.reset()
    last_observation = observation
    r=0
    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    for _ in range(250):
        env_record.render()
        action = ppo.policy_old.act(observation, memory)
        new_observation, reward, done, _ = env_record.step(action)
        r=r+reward
        observation = new_observation

        if(done):
            break
    env_record.close()
    print("Rewards: ",r)      

In [None]:
play_agent(agents[1])