In [43]:
import gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import time

In [44]:
from gym.wrappers import Monitor

In [45]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [46]:
import math
import copy
from torch.distributions import Categorical
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [47]:
class Memory:
    def __init__(self):
        self.actions = []
        self.states = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []
    
    def clear_memory(self):
        del self.actions[:]
        del self.states[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]

In [48]:
class ActorCritic(nn.Module):
    def __init__(self, state_dim, action_dim, n_latent_var):
        super(ActorCritic, self).__init__()

        # actor
        self.action_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, action_dim),
                nn.Softmax(dim=-1)
                )
        
        # critic
        self.value_layer = nn.Sequential(
                nn.Linear(state_dim, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, n_latent_var),
                nn.Tanh(),
                nn.Linear(n_latent_var, 1)
                )
    ##Changed this part    
    def forward(self, inputs):
            x = self.action_layer(inputs)
            return x
    
    def act(self, state, memory):
        state = torch.from_numpy(state).float().to(device) 
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        action = dist.sample()
        
        memory.states.append(state)
        memory.actions.append(action)
        memory.logprobs.append(dist.log_prob(action))
        
        return action.item()
    
    def evaluate(self, state, action):
        action_probs = self.action_layer(state)
        dist = Categorical(action_probs)
        
        action_logprobs = dist.log_prob(action)
        dist_entropy = dist.entropy()
        
        state_value = self.value_layer(state)
        
        return action_logprobs, torch.squeeze(state_value), dist_entropy

In [49]:
class PPO:
    def __init__(self, state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip):
        self.lr = lr
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs
        
        self.policy = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr, betas=betas)
        self.policy_old = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())
        
        self.MseLoss = nn.MSELoss()
    
    def update(self, memory):   
        # Monte Carlo estimate of state rewards:
        rewards = []
        surr_loss = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards), reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)
        
        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(device)
        #print(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)
        #print("Rewardsd Mean: ") 
        #print(rewards.mean())
        # convert list to tensor
        old_states = torch.stack(memory.states).to(device).detach()
        old_actions = torch.stack(memory.actions).to(device).detach()
        old_logprobs = torch.stack(memory.logprobs).to(device).detach()
        
        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            logprobs, state_values, dist_entropy = self.policy.evaluate(old_states, old_actions)
            
            # Finding the ratio (pi_theta / pi_theta__old):
            ratios = torch.exp(logprobs - old_logprobs.detach())
            print("State Values : ", state_values.detach())
            # Finding Surrogate Loss:
            advantages = rewards - state_values.detach()
            print("Advantages : ", advantages)
            
            surr1 = ratios * advantages
            surr2 = torch.clamp(ratios, 1-self.eps_clip, 1+self.eps_clip) * advantages
            loss = -torch.min(surr1, surr2) + 0.5*self.MseLoss(state_values, rewards) - 0.01*dist_entropy
            surr_loss.append(loss)
            print("L Clip : ", loss.mean())
            
            # take gradient step
            #self.optimizer.zero_grad()
            #loss.mean().backward()
            #self.optimizer.step()
            
                  
        # Copy new weights into old policy:       
        self.policy_old.load_state_dict(self.policy.state_dict())
        #print("Surrogate Loss Mean: ",surr1.mean())
        return surr1.mean()

In [50]:
solved_reward = 30         # stop training if avg_reward > solved_reward
log_interval = 20           # print avg reward in the interval
max_episodes = 500        # max training episodes
max_timesteps = 30         # max timesteps in one episode
n_latent_var = 64           # number of variables in hidden layer
update_timestep = 2000      # update policy every n timesteps
lr = 0.002
betas = (0.9, 0.999)
gamma = 0.99                # discount factor
K_epochs = 1                # update policy for K epochs
eps_clip = 0.2              # clip parameter for PPO
random_seed = None
render = False

In [51]:
env = gym.make('LunarLander-v2')

In [52]:
state_dim = env.observation_space.shape[0]
print(state_dim)

8


In [53]:
print(env.action_space.n)

4


In [54]:
#state_dim = 4
action_dim = 4

In [55]:
def return_random_agents(num_agents):
    
    agents = []
    for _ in range(num_agents):
        ##Change this part
        agent = ActorCritic(state_dim, action_dim, n_latent_var).to(device)
        
        for param in agent.parameters():
            param.requires_grad = False
         ##Commented this part   
        #init_weights(agent)
        agents.append(agent)
        
        
    return agents

#### I tried to change this function

In [56]:
def run_agents(agents):
    
    reward_agents = []
    total_loss = []
    env_name = "LunarLander-v2"
    env = gym.make('LunarLander-v2')
    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)
    
    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    #print(lr,betas)
    
    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    surri=0
    s1 = []
    s2 = 0
    s_temp = []
    surr_actual = []
    
    # training loop
    #for agent in agents:
    for i_episode in range(1, max_episodes+1):
        #agent.eval()
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1
            
            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            
            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)
            
            # update if its time
            if timestep % update_timestep == 0:
                surri=ppo.update(memory)
                print("returned Surrogate Loss ", surri)
                #memory.clear_memory()
                timestep = 0
            
            running_reward += reward
            if render:
                env.render()
            if done:
                break
                
        avg_length += t
        
        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval*solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            break
            
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))
            print(' Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
        s1.append(surri)
        #surr_mean = torch.mean(torch.stack(surr_actual))    
    s1 = [i for i in s1 if i != 0]
    #print(torch.mean(torch.stack(s1)))
    #print("\n")
            
    #print(total_loss)
    return torch.mean(torch.stack(s1))

In [57]:
def return_average_score(agent, runs):
    #score = 0.
    #for i in range(runs):
    #print(run_agents([agent]))
    score = run_agents([agent])
    return score#/runs

#### <--Till Now Only worked -->

In [58]:
def run_agents_n_times(agents, runs):
    avg_score = []
    for agent in agents:
        avg_score.append(return_average_score(agent,runs))
    return avg_score

In [59]:
def mutate(agent):

    child_agent = copy.deepcopy(agent)
    
    mutation_power = 0.02 #hyper-parameter, set from https://arxiv.org/pdf/1712.06567.pdf
            
    for param in child_agent.parameters():
    
        if(len(param.shape)==4): #weights of Conv2D

            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    for i2 in range(param.shape[2]):
                        for i3 in range(param.shape[3]):
                            
                            param[i0][i1][i2][i3]+= mutation_power * np.random.randn()
                                
                                    

        elif(len(param.shape)==2): #weights of linear layer
            for i0 in range(param.shape[0]):
                for i1 in range(param.shape[1]):
                    
                    param[i0][i1]+= mutation_power * np.random.randn()
                        

        elif(len(param.shape)==1): #biases of linear layer or conv layer
            for i0 in range(param.shape[0]):
                
                param[i0]+=mutation_power * np.random.randn()

    return child_agent

In [60]:
def return_children(agents, sorted_parent_indexes, elite_index):
    
    children_agents = []
    
    #first take selected parents from sorted_parent_indexes and generate N-1 children
    for i in range(len(agents)-1):
        
        selected_agent_index = sorted_parent_indexes[np.random.randint(len(sorted_parent_indexes))]
        children_agents.append(mutate(agents[selected_agent_index]))

    #now add one elite
    elite_child = add_elite(agents, sorted_parent_indexes, elite_index)
    children_agents.append(elite_child)
    elite_index=len(children_agents)-1 #it is the last one
    
    return children_agents, elite_index

In [61]:
def add_elite(agents, sorted_parent_indexes, elite_index=None, only_consider_top_n=10):
    
    candidate_elite_index = sorted_parent_indexes[:only_consider_top_n]
    
    if(elite_index is not None):
        candidate_elite_index = np.append(candidate_elite_index,[elite_index])
        
    top_score = None
    top_elite_index = None
    
    for i in candidate_elite_index:
        score = return_average_score(agents[i],runs=5)
        print("Score for elite i ", i, " is ", score)
        
        if(top_score is None):
            top_score = score
            top_elite_index = i
        elif(score > top_score):
            top_score = score
            top_elite_index = i
            
    print("Elite selected with index ",top_elite_index, " and score", top_score)
    
    child_agent = copy.deepcopy(agents[top_elite_index])
    return child_agent
    

In [62]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

In [63]:
game_actions = 2 #2 actions possible: left or right

#disable gradients as we will not use them
#torch.set_grad_enabled(False)

# initialize N number of agents
num_agents = 4
agents = return_random_agents(num_agents)

# How many top agents to consider as parents
top_limit = 2

# run evolution until X generations
generations = 3

elite_index = None

for generation in range(generations):

    # return rewards of agents
    rewards = run_agents_n_times(agents, 1) #return average of 3 runs
    print("REWARDS::::::",rewards)
    # sort by rewards
    sorted_parent_indexes = np.argsort(rewards)[::-1][:top_limit]#reverses and gives top values (argsort sorts by ascending by default) https://stackoverflow.com/questions/16486252/is-it-possible-to-use-argsort-in-descending-order
    print("Sorting Parent Indexes: ",sorted_parent_indexes)
    print(" Data Type: ", type(sorted_parent_indexes))
    print("Sorting Completed")
    print("Selecting Top Parents")
    
    top_rewards = []
    for best_parent in sorted_parent_indexes:
        top_rewards.append(rewards[best_parent])
    
    print("Generation ", generation, " | Mean rewards: ", torch.mean(torch.stack(rewards)), " | Mean of top 5: ",torch.mean(torch.stack(top_rewards[:5])))
    #print(rewards)
    print("Top ",top_limit," scores", sorted_parent_indexes)
    print("Rewards for top: ",top_rewards)
    
    # setup an empty list for containing children agents
    children_agents, elite_index = return_children(agents, sorted_parent_indexes, elite_index)

    # kill all agents, and replace them with their children
    agents = children_agents

 Episode 20 	 avg length: 29 	 reward: -23
 Episode 40 	 avg length: 29 	 reward: -17
 Episode 60 	 avg length: 29 	 reward: -20
State Values :  tensor([0.0507, 0.0542, 0.0543,  ..., 0.0867, 0.0867, 0.0842])
Advantages :  tensor([-0.1833, -0.3635, -0.3604,  ...,  2.7754,  2.7542,  2.7764],
       dtype=torch.float64)
L Clip :  tensor(0.5846, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.0928, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 80 	 avg length: 29 	 reward: -26
 Episode 100 	 avg length: 29 	 reward: -24
 Episode 120 	 avg length: 29 	 reward: -20
State Values :  tensor([0.0507, 0.0542, 0.0543,  ..., 0.0651, 0.0660, 0.0631])
Advantages :  tensor([0.2647, 0.0899, 0.0928,  ..., 3.0370, 3.0281, 3.1138],
       dtype=torch.float64)
L Clip :  tensor(0.5860, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.0914, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 140 	 avg length: 29 	 reward: -28
 Episo

 Episode 220 	 avg length: 29 	 reward: -24
 Episode 240 	 avg length: 29 	 reward: -23
 Episode 260 	 avg length: 29 	 reward: -29
State Values :  tensor([0.1413, 0.1441, 0.1438,  ..., 0.0794, 0.0819, 0.0757])
Advantages :  tensor([0.3890, 0.4710, 0.4380,  ..., 3.1137, 3.1878, 3.2302],
       dtype=torch.float64)
L Clip :  tensor(0.6014, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.1079, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 280 	 avg length: 29 	 reward: -29
 Episode 300 	 avg length: 29 	 reward: -28
 Episode 320 	 avg length: 29 	 reward: -27
State Values :  tensor([0.1413, 0.1441, 0.1438,  ..., 0.0809, 0.0840, 0.0784])
Advantages :  tensor([0.4771, 0.5580, 0.5254,  ..., 3.1549, 3.2310, 3.2533],
       dtype=torch.float64)
L Clip :  tensor(0.6007, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.1077, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 340 	 avg length: 29 	 reward: -28
 Episode

 Episode 280 	 avg length: 29 	 reward: -21
 Episode 300 	 avg length: 29 	 reward: -31
 Episode 320 	 avg length: 29 	 reward: -27
State Values :  tensor([-0.2446, -0.2462, -0.2469,  ..., -0.2886, -0.2910, -0.2939])
Advantages :  tensor([0.1626, 0.2374, 0.3108,  ..., 3.5827, 3.5736, 3.5775],
       dtype=torch.float64)
L Clip :  tensor(0.2678, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.2492, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 340 	 avg length: 29 	 reward: -20
 Episode 360 	 avg length: 29 	 reward: -20
 Episode 380 	 avg length: 29 	 reward: -23
State Values :  tensor([-0.2446, -0.2462, -0.2469,  ..., -0.2503, -0.2506, -0.2463])
Advantages :  tensor([0.1261, 0.2029, 0.2784,  ..., 3.5230, 3.5022, 3.6207],
       dtype=torch.float64)
L Clip :  tensor(0.2672, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.2492, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 400 	 avg length: 29 	 reward: -2

 Episode 420 	 avg length: 29 	 reward: -21
 Episode 440 	 avg length: 29 	 reward: -19
 Episode 460 	 avg length: 29 	 reward: -22
State Values :  tensor([0.0506, 0.0509, 0.0492,  ..., 0.0184, 0.0175, 0.0188])
Advantages :  tensor([ 0.1856,  0.0528, -0.0729,  ...,  3.2083,  3.3792,  3.4184],
       dtype=torch.float64)
L Clip :  tensor(0.5408, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.0544, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 480 	 avg length: 29 	 reward: -27
 Episode 500 	 avg length: 29 	 reward: -26
 Episode 20 	 avg length: 29 	 reward: -26
 Episode 40 	 avg length: 29 	 reward: -23
 Episode 60 	 avg length: 29 	 reward: -22
State Values :  tensor([-0.0843, -0.0776, -0.0813,  ..., -0.0265, -0.0210, -0.0230])
Advantages :  tensor([-0.2809, -0.2180, -0.4024,  ...,  2.7669,  2.8840,  2.9451],
       dtype=torch.float64)
L Clip :  tensor(0.3899, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.10

 Episode 120 	 avg length: 29 	 reward: -20
State Values :  tensor([-0.2040, -0.2087, -0.2094,  ..., -0.2411, -0.2380, -0.2404])
Advantages :  tensor([-0.1817, -0.2503, -0.2919,  ...,  2.9619,  2.9429,  3.0095],
       dtype=torch.float64)
L Clip :  tensor(0.2984, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.2128, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 140 	 avg length: 29 	 reward: -19
 Episode 160 	 avg length: 29 	 reward: -25
 Episode 180 	 avg length: 29 	 reward: -13
State Values :  tensor([-0.2040, -0.2087, -0.2094,  ..., -0.2307, -0.2309, -0.2308])
Advantages :  tensor([-0.1862, -0.2599, -0.3043,  ...,  3.2743,  3.2714,  3.2807],
       dtype=torch.float64)
L Clip :  tensor(0.2992, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.2128, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 200 	 avg length: 29 	 reward: -27
 Episode 220 	 avg length: 29 	 reward: -20
 Episode 240 	 avg length: 29 

 Episode 160 	 avg length: 29 	 reward: -20
 Episode 180 	 avg length: 29 	 reward: -25
State Values :  tensor([0.1316, 0.1322, 0.1327,  ..., 0.1216, 0.1215, 0.1209])
Advantages :  tensor([-0.6465, -0.6555, -0.6587,  ...,  2.6780,  2.7313,  2.6577],
       dtype=torch.float64)
L Clip :  tensor(0.6297, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.1346, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 200 	 avg length: 29 	 reward: -20
 Episode 220 	 avg length: 29 	 reward: -17
 Episode 240 	 avg length: 29 	 reward: -25
 Episode 260 	 avg length: 29 	 reward: -30
State Values :  tensor([0.1316, 0.1322, 0.1327,  ..., 0.1367, 0.1362, 0.1358])
Advantages :  tensor([-0.5536, -0.5626, -0.5658,  ...,  2.5404,  2.6015,  2.7110],
       dtype=torch.float64)
L Clip :  tensor(0.6295, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.1345, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 280 	 avg length: 29 	 reward: 

 Episode 280 	 avg length: 29 	 reward: -23
 Episode 300 	 avg length: 29 	 reward: -25
 Episode 320 	 avg length: 29 	 reward: -21
State Values :  tensor([-0.0287, -0.0346, -0.0318,  ..., -0.1141, -0.1201, -0.1168])
Advantages :  tensor([-0.1045, -0.2055, -0.1408,  ...,  3.6680,  3.7615,  3.5958],
       dtype=torch.float64)
L Clip :  tensor(0.4085, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.0802, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 340 	 avg length: 29 	 reward: -24
 Episode 360 	 avg length: 29 	 reward: -22
 Episode 380 	 avg length: 29 	 reward: -19
State Values :  tensor([-0.0287, -0.0346, -0.0318,  ..., -0.1030, -0.0980, -0.0959])
Advantages :  tensor([-0.1341, -0.2386, -0.1716,  ...,  4.0505,  3.8676,  3.9298],
       dtype=torch.float64)
L Clip :  tensor(0.4102, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.0790, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 400 	 avg length: 29 

 Episode 460 	 avg length: 29 	 reward: -19
State Values :  tensor([-0.2175, -0.2140, -0.2156,  ..., -0.2099, -0.2118, -0.2104])
Advantages :  tensor([1.0147, 0.8894, 0.8930,  ..., 3.4518, 3.4304, 3.3743],
       dtype=torch.float64)
L Clip :  tensor(0.3075, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.1964, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 480 	 avg length: 29 	 reward: -22
 Episode 500 	 avg length: 29 	 reward: -22
 Episode 20 	 avg length: 29 	 reward: -28
 Episode 40 	 avg length: 29 	 reward: -25
 Episode 60 	 avg length: 29 	 reward: -25
State Values :  tensor([0.2008, 0.1973, 0.1994,  ..., 0.1850, 0.1832, 0.1844])
Advantages :  tensor([-0.2955, -0.2624, -0.2254,  ...,  3.5380,  3.6496,  3.6605],
       dtype=torch.float64)
L Clip :  tensor(0.7263, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(-0.2079, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 80 	 avg length: 29 	 reward: -27
 

Score for elite i  2  is  tensor(-0.0270, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 20 	 avg length: 29 	 reward: -17
 Episode 40 	 avg length: 29 	 reward: -22
 Episode 60 	 avg length: 29 	 reward: -19
State Values :  tensor([-0.0187, -0.0232, -0.0272,  ..., -0.0530, -0.0520, -0.0520])
Advantages :  tensor([0.8107, 0.8481, 0.8963,  ..., 3.1529, 3.1141, 3.1122],
       dtype=torch.float64)
L Clip :  tensor(0.4762, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.0111, dtype=torch.float64, grad_fn=<MeanBackward0>)
 Episode 80 	 avg length: 29 	 reward: -20
 Episode 100 	 avg length: 29 	 reward: -25
 Episode 120 	 avg length: 29 	 reward: -21
State Values :  tensor([-0.0187, -0.0232, -0.0272,  ...,  0.0195,  0.0162,  0.0163])
Advantages :  tensor([1.2524, 1.2903, 1.3392,  ..., 3.3058, 3.4212, 3.4941],
       dtype=torch.float64)
L Clip :  tensor(0.4756, dtype=torch.float64, grad_fn=<MeanBackward0>)
returned Surrogate Loss  tensor(0.0097, dtyp

In [64]:
def play_agent(agent):
        env = gym.make("LunarLander-v2")
        
        env_record = Monitor(env, './video', force=True)
        observation = env_record.reset()
        last_observation = observation
        r=0
        for _ in range(250):
            env_record.render()
            inp = torch.tensor(observation).type('torch.FloatTensor').view(1,-1)
            output_probabilities = agent(inp).detach().numpy()[0]
            action = np.random.choice(range(game_actions), 1, p=output_probabilities).item()
            new_observation, reward, done, info = env_record.step(action)
            r=r+reward
            observation = new_observation

            if(done):
                break

        env_record.close()
        print("Rewards: ",r)      

In [65]:
play_agent(agents[1])

ValueError: 'a' and 'p' must have same size