# PPO in Gym Lunar Lander

This notebook implements PPO (Proximal Policy Optimization) on a Gym environment, so that the same implementation can later be applied to the Transformer problem.

In [1]:
import os, sys
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
import math
import random as rd
import matplotlib.pyplot as plt
from scipy import integrate
from mpl_toolkits.mplot3d import Axes3D
import pandas as pd
from tensorflow import keras
from tensorflow.keras.layers import *
from tensorflow.keras import activations
from collections import deque
from tqdm import tqdm
import shutil
import time

In [2]:
import gym

In [3]:
# Ask TensorFlow which GPU devices it can see; the result is displayed as the cell output.
tf.config.list_physical_devices('GPU')

[]

In [4]:
# Flag marking the action space as continuous (not read by the code visible in this notebook).
CONTINUOUS = True
# Gym environment id handed to gym.make() by run_experiment().
ENV_NAME = 'LunarLanderContinuous-v2'

## Reinforcement Learning
In the section below, a reinforcement learning agent is implemented and trained to land in the environment defined above. The reward function is provided by the environment.

The agent that will be implemented in this notebook is a PPO agent. 

### Actor and Critic Models

In [5]:
def get_value_network(input_size):
    '''
    Build the critic: a fully connected network mapping one observation to a
    single scalar value estimate.

    input_size: shape tuple of one observation (e.g. env.observation_space.shape);
                input_size[0] also sets the width of the first hidden layer.
    Returns an uncompiled keras.Model with one linear output unit.
    '''
    hidden_widths = (10*input_size[0], 30, 17, 5)

    observation = keras.Input(input_size)
    features = observation
    for width in hidden_widths:
        features = Dense(width, activation = 'relu')(features)
    state_value = Dense(1, activation = 'linear')(features)

    return keras.Model(observation, state_value)

In [6]:
def get_policy_network(agent, input_size, output_size):
    '''
    Build the actor: a shared fully connected trunk with two heads that
    parameterise a Gaussian policy.

    agent:       accepted for call compatibility; not used in this function.
    input_size:  shape tuple of one observation (env.observation_space.shape).
    output_size: shape tuple of one action (env.action_space.shape);
                 output_size[0] is the number of action dimensions.
    Returns an uncompiled keras.Model whose outputs are [mu, log_std]:
    mu is tanh-bounded, log_std is an unbounded linear output.
    '''
    n_actions = output_size[0]
    trunk_widths = (10*input_size[0], 100, 50, 10*n_actions)

    observation = keras.Input(input_size)
    hidden = observation
    for width in trunk_widths:
        hidden = Dense(width, activation='relu')(hidden)

    mean_head = Dense(n_actions, activation = 'tanh')(hidden)
    # The network predicts the logarithm of the standard deviation.
    log_std_head = Dense(n_actions, activation = 'linear')(hidden)

    return keras.Model(observation, [mean_head, log_std_head])

In [7]:
def get_actions(mu, log_std):
    '''
    Sample one action per row from Normal(mu, exp(log_std)).

    log_std is clipped to [-9, 2] before exponentiation and the sampled
    action is clipped to [-1, 1]. The result has the same shape as mu.
    '''
    scale = tf.exp(tf.clip_by_value(log_std, -9, 2))
    policy = tfp.distributions.Normal(loc = mu, scale = scale)

    # sample(1) prepends a sample axis of length one; remove exactly that axis.
    drawn = tf.squeeze(policy.sample(1), axis=0)
    assert mu.shape == drawn.shape, 'mu and actions shape is not the same, be careful with squeeze'

    return tf.clip_by_value(drawn, -1, 1)

## Losses

In [8]:
def calc_logprob(mu, log_std, action):
    '''
    Element-wise log-density of `action` under Normal(mu, exp(log_std)).

    mu, log_std: tensors of identical shape holding the policy mean and the
                 logarithm of its standard deviation.
    action:      values to evaluate; cast to float32 and required to have the
                 same shape as mu and log_std.
    Returns a tensor of that same shape; no reduction over action dimensions
    is performed here.
    Raises AssertionError on a shape mismatch or if any log-density is infinite.
    '''
    action = tf.cast(action, dtype=tf.float32)

    assert mu.shape == action.shape, 'mu and actions shape is not the same in calcprob: action ' + str(action.shape) + ' mu ' + str(mu.shape)
    assert log_std.shape == action.shape, 'log_std and actions shape is not the same in calcprob: action ' + str(action.shape) + ' log_std ' + str(log_std.shape)

    gaussian = tfp.distributions.Normal(loc = mu, scale = tf.exp(log_std))
    log_prob = gaussian.log_prob(action)

    # Check every element: summing first would let +inf and -inf cancel into
    # NaN and slip past an isinf() test on the total.
    assert not np.any(np.isinf(log_prob.numpy())), 'log_prob is infinite'

    return log_prob

In [9]:
def get_adv_ref(agent, trajectory):
    '''
    Compute generalized advantage estimates (GAE) and critic targets for the
    transitions of a single trajectory.

    agent:      object exposing `gamma`, `gae_lambda` and a callable `critic`
                that maps a batch of states to one value per row.
    trajectory: sequence of [state, action, reward, terminal, next_state]
                entries in time order; each state/next_state carries a leading
                axis so that np.concatenate stacks them into a batch.
    Returns (adv_v, ref_v): two lists in the same time order as `trajectory`,
    holding the advantage and the value target (advantage + value) per step.
    An empty trajectory yields two empty lists.
    '''
    if len(trajectory) == 0:
        return [], []

    gamma = agent.gamma
    gae_lambda = agent.gae_lambda

    states, _, rewards, terminals, next_states = map(list, zip(*trajectory))

    states = np.concatenate(states)
    next_states = np.concatenate(next_states)
    rewards = np.array(rewards)

    # Flatten the critic outputs to 1-D. A plain squeeze would collapse a
    # one-transition trajectory to a 0-d scalar, which cannot be iterated below.
    values = np.asarray(agent.critic(states)).reshape(-1)
    next_values = np.asarray(agent.critic(next_states)).reshape(-1)

    # Walk the trajectory backwards, accumulating the smoothed advantage.
    last_gae = 0.0
    result_adv = []
    result_ref = []

    for val, next_val, reward, terminal in zip(reversed(values),reversed(next_values),reversed(rewards),reversed(terminals)):
        if terminal:
            # No bootstrapping past a terminal step; the accumulator restarts.
            delta = reward - val
            last_gae = delta
        else:
            delta = reward + gamma*next_val - val
            last_gae = delta + gamma*gae_lambda*last_gae

        result_adv.append(last_gae)
        result_ref.append(last_gae + val)

    # Results were collected last-to-first; restore time order.
    adv_v = list(reversed(result_adv))
    ref_v = list(reversed(result_ref))

    return adv_v, ref_v

In [10]:
def loss_critic(ref_val,value_v):
    '''
    Squared-error loss between the critic predictions `value_v` and the value
    targets `ref_val`, as returned by tf.keras.losses.MSE.
    '''
    return tf.keras.losses.MSE(value_v,ref_val)

# Implementation

In [11]:
def optimize_network(agent,states,actions,adv_batch,ref_batch,batch_old_logprob_v):
    '''
    Run one gradient step on the critic and one on the actor for a mini-batch.

    agent:               object exposing `critic`, `actor`, `critic_optimizer`,
                         `actor_optimizer` and the clipping range `ppo_eps`.
    states, actions:     mini-batch of states and the actions taken in them.
    adv_batch:           advantages for the batch (multiplied with the ratio).
    ref_batch:           critic targets; must match the critic output shape.
    batch_old_logprob_v: log-probabilities of `actions` under the policy that
                         collected the data; must match the new log-prob shape.
    Returns None; the networks are updated in place through their optimizers.
    Raises AssertionError on shape mismatches or NaN gradients.
    '''
    with tf.GradientTape() as tape1, tf.GradientTape() as tape2:

        # critic training
        value_v = agent.critic(states)
        assert ref_batch.shape == value_v.shape, "ref_batch and value_v have different shapes"
        loss_value_v = loss_critic(ref_batch,value_v)

        # actor training
        mu, log_std = agent.actor(states)
        log_std = tf.clip_by_value(log_std, -9, 2)
        logprob_pi_v = calc_logprob(mu,log_std,actions)
        assert logprob_pi_v.shape == batch_old_logprob_v.shape, "logprob_pi_v.shape and batch_old_logprob_v have different shapes"
        ratio_v = tf.exp(logprob_pi_v - batch_old_logprob_v) # only positive values
        surr_obj_v = tf.math.multiply(adv_batch,ratio_v)

        # PPO clipped surrogate: take the pessimistic of the raw and clipped terms.
        clipped_ratio_v = tf.clip_by_value(ratio_v, 1.0 - agent.ppo_eps, 1 + agent.ppo_eps)
        clipped_surr_v = tf.math.multiply(adv_batch,clipped_ratio_v)
        loss_policy_v = - tf.math.reduce_mean(tf.math.minimum(surr_obj_v,clipped_surr_v))

    # Diagnostic dump when the policy loss turns NaN.
    if tf.reduce_all(tf.math.is_nan(loss_policy_v)):
        print('adv_batch',adv_batch)
        print('ratio_v',ratio_v)
        print('clipped_ratio_v',clipped_ratio_v)
        print('logprob_pi',logprob_pi_v)
        print('batch_old_logprob_v',batch_old_logprob_v)
        print('mu',mu)
        print('log_std',log_std)
        print('actions',actions)

    grads_crit = tape1.gradient(loss_value_v, agent.critic.trainable_variables)
    grads_act = tape2.gradient(loss_policy_v, agent.actor.trainable_variables)

    grads_act, _ = tf.clip_by_global_norm(grads_act, 0.5)

    # Refuse to apply an update containing any NaN: check every gradient
    # tensor, and any element of it, not just "all elements of the first one".
    assert not any(bool(tf.reduce_any(tf.math.is_nan(g))) for g in grads_crit if g is not None), "grads_crit have NaNs"
    assert not any(bool(tf.reduce_any(tf.math.is_nan(g))) for g in grads_act if g is not None), "grads_act have NaNs"

    agent.critic_optimizer.apply_gradients(zip(grads_crit, agent.critic.trainable_variables))
    agent.actor_optimizer.apply_gradients(zip(grads_act, agent.actor.trainable_variables))

In [12]:
 class Agent():
    '''
    PPO agent: actor and critic networks plus the rollout buffers they are
    trained from.

    Hyper-parameters are read from module-level constants in agent_init().
    The driver code in this notebook passes the class itself around and calls
    the methods as `agent.method(agent, ...)`, so all state is stored on
    whatever object is supplied as `self`.
    '''
    def __init__(self):
        self.name = 'A2C_Agent'
        
    def agent_init(self):
        '''Read the hyper-parameters, build both networks and create empty buffers.'''
        self.name = 'A2C_Agent'
        self.state_size = STATE_SIZE
        self.output_size = OUTPUT_SIZE
        self.test_iterations = TEST_ITERATIONS
        self.batch_size = BATCH_SIZE
        self.critic_lr = ADAM_LR_CRITIC
        self.actor_lr = ADAM_LR_ACTOR
        self.gamma = GAMMA
        self.gae_lambda = GAE_LAMBDA
        self.tolerance = TOLERANCE
        self.buffer_size = BUFFER_SIZE
        self.timeout = TIMEOUT
        self.ppo_eps = PPO_EPS
        self.test_episode = TEST_EPISODE
        self.epochs = TRAIN_EPOCHS
        
        # Generate NNs (`learning_rate` is the supported keyword; `lr` is a deprecated alias)
        self.actor = get_policy_network(self,self.state_size,self.output_size)
        self.actor_optimizer = keras.optimizers.Adam(learning_rate = self.actor_lr)
        self.critic = get_value_network(self.state_size)
        self.critic_optimizer = keras.optimizers.Adam(learning_rate = self.critic_lr)
        
        self.last_state = None
        self.last_action = None
        
        # Initialize buffers (all of them are emptied again after each optimization round)
        self.buffer_old_logprob_v = []
        self.adv_buffer = []            # advantages, one per stored transition
        self.ref_buffer = []            # critic targets, one per stored transition
        self.optimize_buffer = []       # transitions [state, action, reward, terminal, next_state]
        self.distribution_buffer = []   # [mu, log_std] the policy produced for each stored action
        self.extra_index = [] # Focused training: indices of transitions to replay additional times
        
    def agent_start(self,state,test=False):
        '''
        Begin an episode: reset the per-episode counters and choose the first action.

        state: first observation, with a leading batch axis.
        test:  when True the policy mean is used as the action and nothing is
               recorded; otherwise the action is sampled and its distribution
               parameters are stored.
        Returns the chosen action (also kept in self.last_action).
        '''
        self.sum_rewards = 0
        self.episode_steps = 0
        self.total_turns = 0
        self.total_time = 0
        self.total_cost = 0
        self.last_state = state
        self.episode_buffer = []
        mu,log_std = self.actor(self.last_state)
        log_std = tf.clip_by_value(log_std, -9, 2)

        self.last_action = get_actions(mu, log_std)

        if test:
            self.last_action = mu
        else:
            self.distribution_buffer.append([mu.numpy(),log_std.numpy()])

        return self.last_action
        
    def agent_step(self, reward, state,test=False):    
        '''
        Handle a non-terminal environment step.

        reward: reward obtained for the previous action.
        state:  observation that resulted from it, with a leading batch axis.
        test:   when True act with the policy mean and record nothing.
        The next action is stored in self.last_action; nothing is returned.
        '''
        self.sum_rewards += reward
        self.episode_steps += 1

        mu,log_std = self.actor(state)
        log_std = tf.clip_by_value(log_std, -9, 2)

        if test:
            actions = mu
        else:
            actions = get_actions(mu, log_std)       
            # Store the transition just completed (terminal flag 0) ...
            self.episode_buffer.append([self.last_state, self.last_action, reward, 0, state])
            # ... and the distribution that produced the action about to be taken.
            self.distribution_buffer.append([mu.numpy(),log_std.numpy()])

        # Update the last state and last action.
        self.last_state = state
        self.last_action = actions
    
    def agent_end(self, reward, state, test=False):
        '''
        Handle the terminal step of an episode.

        reward: final reward.
        state:  final observation; in training it is replaced by zeros.
        test:   when True only the reward and step counters are updated.
        In training the episode's transitions, advantages and critic targets
        are appended to the optimization buffers, and episodes with a positive
        return are registered for extra replay.
        '''
        self.sum_rewards += reward
        self.episode_steps += 1
        
        if not test:
            # Set terminal state to an array of zeros
            state = np.zeros_like(self.last_state)

            # Append new experience to replay buffer
            self.episode_buffer.append([self.last_state, self.last_action, reward, 1, state])

            # We get the advantages and references of all this episode
            adv_v, ref_v = get_adv_ref(self, self.episode_buffer)

            len_opt_before = len(self.optimize_buffer) # Focused training   
            
            self.adv_buffer.extend(adv_v) 
            self.ref_buffer.extend(ref_v) 
            self.optimize_buffer.extend(self.episode_buffer) 
            
            # Focused training: list this episode's buffer positions once more
            # (return > 0), twice (> 100) or four times (> 200).
            if self.sum_rewards > 0:
                len_opt_after = len(self.optimize_buffer) 
                index = np.arange(len_opt_before,len_opt_after)
                index = index.tolist()
                if self.sum_rewards>200:
                    index = 4*index
                elif self.sum_rewards > 100:
                    index = 2*index
                self.extra_index.extend(index)

    def agent_message(self, message):
        '''
        Answer a query from the driver.

        message: only "get_sum_reward" is understood; it returns the reward
                 accumulated in the current episode.
        Raises Exception for any other message.
        '''
        if message == "get_sum_reward":
            return self.sum_rewards
        else:
            raise Exception("Unrecognized Message!")

In [13]:
def _weights_path(weights_dir, role, tag):
    # Build the checkpoint file name for one network ('critic' or 'actor').
    return os.path.join(weights_dir, "PPO_Lander_" + role + "_" + str(tag) + ".h5")


def _ppo_update(agent):
    # Run agent.epochs passes of mini-batch updates over the collected buffers, then empty them.
    mu_v, log_std_v = map(list, zip(*agent.distribution_buffer))
    mu_v = tf.convert_to_tensor(np.concatenate(mu_v), dtype=tf.float32)
    log_std_v = tf.convert_to_tensor(np.concatenate(log_std_v), dtype=tf.float32)

    # Get states and actions
    states, actions, _, _, _ = map(list, zip(*agent.optimize_buffer))
    actions = np.concatenate(actions)
    states = np.concatenate(states)

    # Log-probabilities under the data-collecting policy, fixed for the whole update.
    old_logprob_v = calc_logprob(mu_v, log_std_v, actions).numpy()
    adv_buffer = np.array(agent.adv_buffer, dtype = np.float32)[:, np.newaxis] # size (sth, 1)
    ref_buffer = np.array(agent.ref_buffer, dtype = np.float32)[:, np.newaxis] # size (sth, 1)

    # Standardize the advantages; the std is clamped to avoid division by ~0.
    mean_adv = adv_buffer.mean(axis = 0, keepdims = True)
    std_adv = np.clip(adv_buffer.std(axis = 0, keepdims = True), 1e-8, 1e8)
    norm_adv = (adv_buffer - mean_adv) / std_adv

    # Every transition once, plus the extra repeats requested by focused training.
    idxs = np.arange(len(states))
    if len(agent.extra_index) > 0: # Focused training
        idxs = np.append(idxs, np.array(agent.extra_index, dtype = int))
    np.random.shuffle(idxs)

    # Only full batches are used; a trailing partial batch is dropped.
    num_batches = len(idxs) // agent.batch_size
    for epoch in range(agent.epochs):
        for batch in range(num_batches):
            start_num = batch*agent.batch_size
            batch_idx = idxs[start_num:start_num + agent.batch_size]

            optimize_network(agent,
                             states[batch_idx],
                             actions[batch_idx],
                             norm_adv[batch_idx],
                             ref_buffer[batch_idx],
                             old_logprob_v[batch_idx])

    # restart all lists
    del agent.buffer_old_logprob_v[:]
    del agent.adv_buffer[:]
    del agent.ref_buffer[:]
    del agent.optimize_buffer[:]
    del agent.distribution_buffer[:]
    del agent.extra_index[:]


def run_experiment(agent,num_episodes, max_count, load = False,
                   weights_dir = "G:/My Drive/JAXA/2020-2021/Transformer/weights/",
                   load_count = 108):
    '''
    Train `agent` on ENV_NAME, periodically testing it and saving the best weights.

    agent:        agent object; its methods are invoked as `agent.method(agent, ...)`.
    num_episodes: number of loop iterations (progress-bar length).
    max_count:    number of successful test episodes (reward > 200) a test round
                  must reach or exceed for the weights to be saved; it is raised
                  to the best count seen so far.
    load:         when True, initial weights are loaded from `weights_dir`.
    weights_dir:  directory holding the .h5 checkpoints (defaults to the
                  original hard-coded location).
    load_count:   numeric suffix of the checkpoint pair to load when `load` is True.
    Side effects: prints progress, writes weight files, and writes
    results/sum_reward_<agent.name>.npy plus results.zip.
    '''
    env = gym.make(ENV_NAME)
    
    agent_sum_reward = np.zeros((1,num_episodes))
    agent.agent_init(agent)
    
    if load:
        agent.critic.load_weights(_weights_path(weights_dir, "critic", load_count))
        agent.actor.load_weights(_weights_path(weights_dir, "actor", load_count))
        
    for episode in tqdm(range(1,num_episodes + 1)):
        run_episode(agent,env,test=False)
        episode_reward = agent.agent_message(agent,'get_sum_reward')
        agent_sum_reward[0,episode - 1] = episode_reward
            
        # Perform a test 
        if episode % agent.test_episode == 0:
            print('Running test')
            count = 0
            sum_test_reward = 0
            # Run exactly test_iterations episodes so the reported total and average are right.
            for t in range(1,agent.test_iterations + 1):
                run_episode(agent,env,test=True)
                test_reward = agent.agent_message(agent,'get_sum_reward')
                sum_test_reward += test_reward
                if test_reward > 200:
                    count += 1
                    print('Test:', t, 'Test_Reward:', test_reward)
            print('Test finished with success ' + str(count) + ' out of ' + str(agent.test_iterations) + ' iterations.')
            print('Test average reward is ' + str(sum_test_reward/agent.test_iterations))
            if count>=max_count:
                max_count = count
                os.makedirs(weights_dir, exist_ok = True)
                agent.critic.save_weights(_weights_path(weights_dir, "critic", count))
                agent.actor.save_weights(_weights_path(weights_dir, "actor", count))
        
        # NOTE(review): a second training rollout per iteration; its data feeds the
        # buffers but its return is not recorded in agent_sum_reward. Confirm this is intended.
        run_episode(agent,env,test=False)
        
        if len(agent.optimize_buffer) > agent.buffer_size:
            _ppo_update(agent)

        if episode%10 == 0 or episode == 1:
            print('Episodes:', episode, 'Episodic_Reward:', episode_reward)
            
        if episode%50 == 0 or episode == 1:
            env.render()

        if episode_reward > 200:
            print('Success in episode ' + str(episode) + ' with a reward of ' + str(episode_reward))
        elif episode_reward > 100:
            print('Looking good in ' + str(episode) + ' with a reward of ' + str(episode_reward))
        elif episode_reward > 0:
            print('Getting somewhere in ' + str(episode) + ' with a reward of ' + str(episode_reward))
        
    save_name = "{}".format(agent.name) 
    if not os.path.exists('results'):
        os.makedirs('results')
    np.save("results/sum_reward_{}".format(save_name), agent_sum_reward)
    shutil.make_archive('results', 'zip', 'results')

In [14]:
def get_objective_attitude(min_ang, max_ang):
    '''
    Draw three independent random angles between min_ang and max_ang
    (given in degrees) and return them, converted to radians, as a
    length-3 numpy array.
    '''
    span = max_ang - min_ang
    angles_rad = [(min_ang + rd.random()*span) * np.pi/180 for _ in range(3)]
    return np.array(angles_rad)

In [15]:
def run_episode(agent,env,test=False):
    '''
    Play one full episode of `env` with `agent`.

    agent: agent object; its methods are invoked as `agent.method(agent, ...)`
           and its `last_action` attribute is read to obtain each action.
    env:   environment whose reset() returns the first observation and whose
           step(action) returns (observation, reward, done, info).
    test:  forwarded unchanged to the agent callbacks.
    Returns None; all results are kept on the agent.
    '''
    # Start from the observation the environment actually produced, with a
    # leading batch axis, instead of an all-zeros placeholder state.
    first_observation = env.reset()
    last_state = np.array([first_observation])

    agent.agent_start(agent,last_state,test)

    done = False
    while not done:
        # Drop the batch axis from the stored action before handing it to the environment.
        action = np.squeeze(np.asarray(agent.last_action))
        state, reward, done, _ = env.step(action)
        state = np.array([state])
        if done:
            agent.agent_end(agent,reward, state,test)
        else:
            agent.agent_step(agent, reward, state,test)
        

In [16]:
# --- Network shapes ---
STATE_SIZE = (8,)        # observation shape fed to both networks
OUTPUT_SIZE = (2,)       # action shape produced by the policy network

# --- Optimization ---
ADAM_LR_ACTOR = 5e-5     # Adam learning rate of the actor
ADAM_LR_CRITIC = 5e-4    # Adam learning rate of the critic
BATCH_SIZE = 64          # mini-batch size of each gradient step
TRAIN_EPOCHS = 10        # passes over the collected data per optimization round
BUFFER_SIZE = 2048       # optimization starts once more transitions than this are stored

# --- PPO / GAE ---
GAMMA = 0.99             # discount factor
GAE_LAMBDA = 0.95        # GAE smoothing factor
PPO_EPS = 0.2            # half-width of the probability-ratio clipping range

# --- Evaluation ---
TEST_EPISODE = 50        # a test round runs every this many training episodes
TEST_ITERATIONS = 100    # number of test episodes reported per test round

# --- Stored on the agent but not read by the code in this notebook ---
TOLERANCE = None
TIMEOUT = None

In [17]:
# The driver calls agent methods as `agent.method(agent, ...)`, so it is
# handed the Agent class itself rather than an instance.
PPO_agent = Agent
run_experiment(PPO_agent, num_episodes=3000, max_count=0, load=False)

  0%|          | 0/3000 [00:00<?, ?it/s]

Episodes: 1 Episodic_Reward: -49.230032500672024


  0%|          | 10/3000 [00:08<1:07:56,  1.36s/it]

Episodes: 10 Episodic_Reward: -393.73575772356884


  1%|          | 20/3000 [00:16<49:14,  1.01it/s]  

Episodes: 20 Episodic_Reward: -169.5211486105952


  1%|          | 30/3000 [00:25<51:16,  1.04s/it]  

Episodes: 30 Episodic_Reward: -371.8780124993341


  1%|▏         | 40/3000 [00:33<39:27,  1.25it/s]  

Episodes: 40 Episodic_Reward: -290.4919907712265


  2%|▏         | 49/3000 [00:40<46:24,  1.06it/s]  

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -873.716769676374


  2%|▏         | 50/3000 [00:54<4:01:47,  4.92s/it]

Episodes: 50 Episodic_Reward: -260.4916592319884


  2%|▏         | 60/3000 [01:02<35:53,  1.37it/s]  

Episodes: 60 Episodic_Reward: -557.9632199298376


  2%|▏         | 70/3000 [01:10<34:43,  1.41it/s]  

Episodes: 70 Episodic_Reward: -292.8908391218957


  3%|▎         | 80/3000 [01:18<33:55,  1.43it/s]  

Episodes: 80 Episodic_Reward: -629.6652223702918


  3%|▎         | 90/3000 [01:26<40:55,  1.18it/s]  

Episodes: 90 Episodic_Reward: -417.8355075380188


  3%|▎         | 99/3000 [01:33<49:12,  1.02s/it]  

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -681.5657646521853


  3%|▎         | 100/3000 [01:45<3:31:02,  4.37s/it]

Episodes: 100 Episodic_Reward: -653.5609182778177


  4%|▎         | 110/3000 [01:53<35:58,  1.34it/s]  

Episodes: 110 Episodic_Reward: -256.3484059582445


  4%|▍         | 120/3000 [02:01<29:38,  1.62it/s]  

Episodes: 120 Episodic_Reward: -285.00187395319324


  4%|▍         | 130/3000 [02:10<26:58,  1.77it/s]  

Episodes: 130 Episodic_Reward: -179.78418034927466


  5%|▍         | 140/3000 [02:19<25:07,  1.90it/s]  

Episodes: 140 Episodic_Reward: -393.2738528394987


  5%|▍         | 149/3000 [02:27<26:22,  1.80it/s]  

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -595.5795399429273


  5%|▌         | 150/3000 [02:40<3:22:08,  4.26s/it]

Episodes: 150 Episodic_Reward: -450.0929062430653


  5%|▌         | 160/3000 [02:49<36:55,  1.28it/s]  

Episodes: 160 Episodic_Reward: -123.91433204577265


  6%|▌         | 170/3000 [03:02<1:19:53,  1.69s/it]

Episodes: 170 Episodic_Reward: -130.55623754442863


  6%|▌         | 180/3000 [03:07<20:35,  2.28it/s]  

Episodes: 180 Episodic_Reward: -345.97564491751757


  6%|▋         | 190/3000 [03:14<19:25,  2.41it/s]  

Episodes: 190 Episodic_Reward: -88.66977758652665


  7%|▋         | 199/3000 [03:22<26:15,  1.78it/s]  

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -424.91564646622743


  7%|▋         | 200/3000 [03:44<5:24:21,  6.95s/it]

Episodes: 200 Episodic_Reward: -569.2995835276454


  7%|▋         | 210/3000 [03:51<1:16:12,  1.64s/it]

Episodes: 210 Episodic_Reward: -32.113351908730536


  7%|▋         | 220/3000 [03:55<19:27,  2.38it/s]  

Episodes: 220 Episodic_Reward: -657.1502301895958


  8%|▊         | 230/3000 [04:03<21:30,  2.15it/s]  

Episodes: 230 Episodic_Reward: -314.1153713901574


  8%|▊         | 240/3000 [04:11<20:09,  2.28it/s]  

Episodes: 240 Episodic_Reward: -134.12006314064382


  8%|▊         | 249/3000 [04:19<27:55,  1.64it/s]  

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -230.20643107487402


  8%|▊         | 250/3000 [04:31<3:04:54,  4.03s/it]

Episodes: 250 Episodic_Reward: -98.62782670036246


  9%|▊         | 260/3000 [04:39<26:24,  1.73it/s]  

Episodes: 260 Episodic_Reward: -93.65470015070582


  9%|▉         | 270/3000 [04:48<27:42,  1.64it/s]  

Episodes: 270 Episodic_Reward: -87.29429570684158


  9%|▉         | 280/3000 [04:56<25:30,  1.78it/s]  

Episodes: 280 Episodic_Reward: -98.42354124288646


 10%|▉         | 290/3000 [05:08<1:06:54,  1.48s/it]

Episodes: 290 Episodic_Reward: -400.73409157848573


 10%|▉         | 299/3000 [05:16<1:10:14,  1.56s/it]

Running test
Test: 44 Test_Reward: 200.1043261849549
Test finished with success 1 out of 100 iterations.
Test average reward is -224.25109808454857


 10%|█         | 300/3000 [05:30<3:55:59,  5.24s/it]

Episodes: 300 Episodic_Reward: -159.84229163161527


 10%|█         | 310/3000 [05:39<47:00,  1.05s/it]  

Episodes: 310 Episodic_Reward: -134.04915725626554


 11%|█         | 320/3000 [05:49<43:26,  1.03it/s]  

Episodes: 320 Episodic_Reward: -466.2466652061688


 11%|█         | 328/3000 [06:00<45:25,  1.02s/it]  

Getting somewhere in 328 with a reward of 2.4328201033009407


 11%|█         | 330/3000 [06:07<1:29:47,  2.02s/it]

Episodes: 330 Episodic_Reward: -59.305997806255675


 11%|█         | 336/3000 [06:14<1:09:29,  1.57s/it]

Getting somewhere in 336 with a reward of 11.721046882849961


 11%|█▏        | 340/3000 [06:16<33:43,  1.31it/s]  

Episodes: 340 Episodic_Reward: -48.86635200796444


 12%|█▏        | 349/3000 [06:34<1:26:10,  1.95s/it]

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -393.5130714777821


 12%|█▏        | 350/3000 [06:54<5:22:40,  7.31s/it]

Episodes: 350 Episodic_Reward: -15.259345655633709


 12%|█▏        | 351/3000 [06:54<3:51:38,  5.25s/it]

Getting somewhere in 351 with a reward of 6.373358915472835


 12%|█▏        | 360/3000 [07:10<1:10:35,  1.60s/it]

Episodes: 360 Episodic_Reward: -16.466392904974725


 12%|█▏        | 370/3000 [07:20<32:26,  1.35it/s]  

Episodes: 370 Episodic_Reward: -178.47303972052592


 13%|█▎        | 378/3000 [07:35<1:13:36,  1.68s/it]

Getting somewhere in 378 with a reward of 11.648623641213447


 13%|█▎        | 380/3000 [07:37<53:51,  1.23s/it]  

Episodes: 380 Episodic_Reward: -113.05821413146386


 13%|█▎        | 386/3000 [07:46<50:38,  1.16s/it]  

Getting somewhere in 386 with a reward of 11.676937081110182


 13%|█▎        | 389/3000 [07:51<56:06,  1.29s/it]  

Getting somewhere in 389 with a reward of 7.646505682028149


 13%|█▎        | 390/3000 [07:53<58:00,  1.33s/it]

Episodes: 390 Episodic_Reward: -416.2418860210221


 13%|█▎        | 399/3000 [08:10<1:03:11,  1.46s/it]

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -364.7691674296831


 13%|█▎        | 400/3000 [08:31<5:13:29,  7.23s/it]

Episodes: 400 Episodic_Reward: -21.453953225217788


 13%|█▎        | 402/3000 [08:32<2:48:55,  3.90s/it]

Getting somewhere in 402 with a reward of 38.452842226674534


 13%|█▎        | 404/3000 [08:38<2:16:41,  3.16s/it]

Getting somewhere in 404 with a reward of 32.26266872382499


 14%|█▎        | 410/3000 [08:51<1:44:43,  2.43s/it]

Episodes: 410 Episodic_Reward: -174.51160872204437


 14%|█▍        | 420/3000 [09:18<2:35:29,  3.62s/it]

Episodes: 420 Episodic_Reward: -75.15893817113255


 14%|█▍        | 428/3000 [09:53<4:16:21,  5.98s/it]

Getting somewhere in 428 with a reward of 0.9250987593005675


 14%|█▍        | 430/3000 [09:54<2:25:00,  3.39s/it]

Episodes: 430 Episodic_Reward: -78.87461068074639


 15%|█▍        | 437/3000 [10:34<4:13:28,  5.93s/it]

Getting somewhere in 437 with a reward of 35.62367744565043


 15%|█▍        | 440/3000 [10:44<3:03:42,  4.31s/it]

Episodes: 440 Episodic_Reward: 22.359648410099012
Getting somewhere in 440 with a reward of 22.359648410099012


 15%|█▍        | 444/3000 [11:06<3:41:16,  5.19s/it]

Getting somewhere in 444 with a reward of 59.13705034529917


 15%|█▍        | 446/3000 [11:19<4:38:59,  6.55s/it]

Getting somewhere in 446 with a reward of 14.127583861253893


 15%|█▍        | 449/3000 [11:44<5:05:13,  7.18s/it]

Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -448.87006309425067


 15%|█▌        | 450/3000 [12:21<11:33:13, 16.31s/it]

Episodes: 450 Episodic_Reward: -280.5150264550041


 15%|█▌        | 451/3000 [12:22<8:15:57, 11.67s/it] 

Getting somewhere in 451 with a reward of 2.1562113528786853


 15%|█▌        | 457/3000 [13:03<4:10:35,  5.91s/it]

Getting somewhere in 457 with a reward of 17.79089643458539


 15%|█▌        | 460/3000 [13:23<4:30:39,  6.39s/it]

Episodes: 460 Episodic_Reward: -25.371397633761617


 15%|█▌        | 463/3000 [13:37<3:29:24,  4.95s/it]

Getting somewhere in 463 with a reward of 14.596073469735103


 16%|█▌        | 465/3000 [13:52<4:52:03,  6.91s/it]

Getting somewhere in 465 with a reward of 35.19873185414545


 16%|█▌        | 467/3000 [14:18<7:22:34, 10.48s/it]

Getting somewhere in 467 with a reward of 18.468881196984896


 16%|█▌        | 470/3000 [14:40<5:51:59,  8.35s/it]

Episodes: 470 Episodic_Reward: -74.66989468993751


 16%|█▌        | 473/3000 [15:11<6:49:33,  9.72s/it]

Getting somewhere in 473 with a reward of 35.20582918724143


 16%|█▌        | 475/3000 [15:36<8:14:53, 11.76s/it]

Getting somewhere in 475 with a reward of 14.945858107160376


 16%|█▌        | 476/3000 [15:43<7:08:40, 10.19s/it]

Getting somewhere in 476 with a reward of 57.55708164744824


 16%|█▌        | 477/3000 [16:03<9:19:03, 13.29s/it]

Getting somewhere in 477 with a reward of 78.05384649118747


 16%|█▌        | 479/3000 [16:27<9:13:02, 13.16s/it]

Looking good in 479 with a reward of 120.42914851538083


 16%|█▌        | 480/3000 [16:28<6:36:41,  9.45s/it]

Episodes: 480 Episodic_Reward: -31.271209553034083


 16%|█▌        | 481/3000 [16:46<8:27:19, 12.08s/it]

Looking good in 481 with a reward of 101.03933838561441


 16%|█▌        | 482/3000 [16:53<7:25:55, 10.63s/it]

Looking good in 482 with a reward of 132.01001397282988


 16%|█▌        | 483/3000 [17:16<10:02:37, 14.37s/it]

Looking good in 483 with a reward of 134.7897835263447


 16%|█▌        | 484/3000 [17:24<8:41:45, 12.44s/it] 

Getting somewhere in 484 with a reward of 98.79460636809237


 16%|█▌        | 485/3000 [17:47<10:58:17, 15.70s/it]

Looking good in 485 with a reward of 147.4242060087109


 16%|█▌        | 486/3000 [17:50<8:18:19, 11.89s/it] 

Looking good in 486 with a reward of 191.19374596430418


 16%|█▌        | 487/3000 [18:05<8:51:40, 12.69s/it]

Looking good in 487 with a reward of 107.37238272691592


 16%|█▋        | 489/3000 [18:30<9:24:15, 13.48s/it]

Looking good in 489 with a reward of 105.42393666678689


 16%|█▋        | 490/3000 [18:32<7:07:06, 10.21s/it]

Episodes: 490 Episodic_Reward: 236.02528386361357
Success in episode 490 with a reward of 236.02528386361357


 16%|█▋        | 491/3000 [18:55<9:41:30, 13.91s/it]

Looking good in 491 with a reward of 193.86689524107334


 16%|█▋        | 492/3000 [18:59<7:44:15, 11.11s/it]

Getting somewhere in 492 with a reward of 34.824139453234686


 16%|█▋        | 493/3000 [19:13<8:12:09, 11.78s/it]

Success in episode 493 with a reward of 213.48544186384768


 16%|█▋        | 494/3000 [19:15<6:07:08,  8.79s/it]

Success in episode 494 with a reward of 256.5430390514772


 16%|█▋        | 495/3000 [19:17<4:44:34,  6.82s/it]

Looking good in 495 with a reward of 199.66645262706396


 17%|█▋        | 496/3000 [19:35<7:04:47, 10.18s/it]

Success in episode 496 with a reward of 262.7194932055012


 17%|█▋        | 497/3000 [19:37<5:17:23,  7.61s/it]

Success in episode 497 with a reward of 222.5542370222244


 17%|█▋        | 498/3000 [19:39<4:08:46,  5.97s/it]

Success in episode 498 with a reward of 257.598067097578


 17%|█▋        | 499/3000 [19:57<6:45:19,  9.72s/it]

Success in episode 499 with a reward of 226.97347118335864
Running test
Test finished with success 0 out of 100 iterations.
Test average reward is -96.72802467683137


 17%|█▋        | 500/3000 [24:58<67:28:27, 97.16s/it]

Episodes: 500 Episodic_Reward: 282.3806908143275
Success in episode 500 with a reward of 282.3806908143275


 17%|█▋        | 501/3000 [25:01<47:51:27, 68.94s/it]

Success in episode 501 with a reward of 224.30906799481977


 17%|█▋        | 502/3000 [25:27<38:54:45, 56.08s/it]

Success in episode 502 with a reward of 209.47254161252044


 17%|█▋        | 503/3000 [25:30<27:40:48, 39.91s/it]

Success in episode 503 with a reward of 218.04018386680343


 17%|█▋        | 504/3000 [25:32<19:45:45, 28.50s/it]

Success in episode 504 with a reward of 242.44160865864984


 17%|█▋        | 505/3000 [25:51<17:47:28, 25.67s/it]

Success in episode 505 with a reward of 240.14020819145372


 17%|█▋        | 506/3000 [25:52<12:47:44, 18.47s/it]

Success in episode 506 with a reward of 237.20757940481877


 17%|█▋        | 507/3000 [25:54<9:15:45, 13.38s/it] 

Success in episode 507 with a reward of 263.8680259362055


 17%|█▋        | 508/3000 [26:15<10:59:43, 15.88s/it]

Looking good in 508 with a reward of 150.83058315892288


 17%|█▋        | 509/3000 [26:18<8:11:10, 11.83s/it] 

Success in episode 509 with a reward of 268.55137098821626


 17%|█▋        | 510/3000 [26:20<6:08:35,  8.88s/it]

Episodes: 510 Episodic_Reward: -378.16122883117777


 17%|█▋        | 511/3000 [26:40<8:33:42, 12.38s/it]

Success in episode 511 with a reward of 220.76998916552253


 17%|█▋        | 512/3000 [26:42<6:13:06,  9.00s/it]

Success in episode 512 with a reward of 235.97916395872505


 17%|█▋        | 514/3000 [26:44<3:31:36,  5.11s/it]

Success in episode 514 with a reward of 245.46353961104452


 17%|█▋        | 515/3000 [27:05<6:42:37,  9.72s/it]

Success in episode 515 with a reward of 221.25025036491775


 17%|█▋        | 516/3000 [27:06<5:01:01,  7.27s/it]

Success in episode 516 with a reward of 232.81394963234106


 17%|█▋        | 518/3000 [27:09<3:00:27,  4.36s/it]

Looking good in 518 with a reward of 184.86945184283832


 17%|█▋        | 519/3000 [27:30<6:16:20,  9.10s/it]

Success in episode 519 with a reward of 264.1880491418606


 17%|█▋        | 520/3000 [27:32<4:48:00,  6.97s/it]

Episodes: 520 Episodic_Reward: -15.510073744328167


 17%|█▋        | 521/3000 [27:33<3:34:53,  5.20s/it]

Getting somewhere in 521 with a reward of 34.28116806393328


 17%|█▋        | 522/3000 [27:34<2:49:41,  4.11s/it]

Success in episode 522 with a reward of 267.61100600364085


 18%|█▊        | 526/3000 [28:03<4:27:15,  6.48s/it]

Success in episode 526 with a reward of 246.93395749053735


 18%|█▊        | 527/3000 [28:05<3:25:07,  4.98s/it]

Success in episode 527 with a reward of 267.568250828626


 18%|█▊        | 528/3000 [28:06<2:44:06,  3.98s/it]

Success in episode 528 with a reward of 247.35330090243698


 18%|█▊        | 529/3000 [28:08<2:10:50,  3.18s/it]

Success in episode 529 with a reward of 270.88866483033394


 18%|█▊        | 530/3000 [28:26<5:13:08,  7.61s/it]

Episodes: 530 Episodic_Reward: 278.0125086400284
Success in episode 530 with a reward of 278.0125086400284


 18%|█▊        | 531/3000 [28:27<3:55:10,  5.72s/it]

Getting somewhere in 531 with a reward of 8.440259614785361


 18%|█▊        | 532/3000 [28:28<2:58:19,  4.34s/it]

Success in episode 532 with a reward of 267.4394276544593


 18%|█▊        | 533/3000 [28:29<2:20:12,  3.41s/it]

Getting somewhere in 533 with a reward of 41.135060323374205


 18%|█▊        | 534/3000 [28:46<5:04:39,  7.41s/it]

Success in episode 534 with a reward of 246.1456182802551


 18%|█▊        | 535/3000 [28:48<3:50:00,  5.60s/it]

Success in episode 535 with a reward of 257.5065880654324


 18%|█▊        | 536/3000 [28:50<3:13:25,  4.71s/it]

Success in episode 536 with a reward of 276.8932173535148


 18%|█▊        | 537/3000 [29:07<5:41:35,  8.32s/it]

Success in episode 537 with a reward of 260.6405926996159


 18%|█▊        | 538/3000 [29:08<4:13:10,  6.17s/it]

Success in episode 538 with a reward of 248.93382483586225


 18%|█▊        | 539/3000 [29:09<3:09:56,  4.63s/it]

Success in episode 539 with a reward of 265.42352391827296


 18%|█▊        | 540/3000 [29:10<2:29:09,  3.64s/it]

Episodes: 540 Episodic_Reward: 275.78809237650825
Success in episode 540 with a reward of 275.78809237650825


 18%|█▊        | 541/3000 [29:28<5:26:03,  7.96s/it]

Success in episode 541 with a reward of 226.10526607650547


 18%|█▊        | 542/3000 [29:30<4:04:20,  5.96s/it]

Success in episode 542 with a reward of 268.2783294905608


 18%|█▊        | 543/3000 [29:31<3:05:24,  4.53s/it]

Success in episode 543 with a reward of 300.1267724971206


 18%|█▊        | 545/3000 [29:52<5:44:12,  8.41s/it]

Success in episode 545 with a reward of 286.8055588836091


 18%|█▊        | 546/3000 [29:53<4:13:12,  6.19s/it]

Success in episode 546 with a reward of 236.728850658105


 18%|█▊        | 547/3000 [29:54<3:15:21,  4.78s/it]

Success in episode 547 with a reward of 294.15691033569294


 18%|█▊        | 548/3000 [29:56<2:34:39,  3.78s/it]

Success in episode 548 with a reward of 285.3518124044417


 18%|█▊        | 549/3000 [30:17<6:00:57,  8.84s/it]

Success in episode 549 with a reward of 299.6105486428929
Running test
Test: 1 Test_Reward: 231.15213520611263
Test: 2 Test_Reward: 212.04951743072587
Test: 3 Test_Reward: 260.73214823786805
Test: 5 Test_Reward: 225.28405885903322
Test: 10 Test_Reward: 237.79635393795166
Test: 11 Test_Reward: 266.57507310276367
Test: 12 Test_Reward: 231.80813550599748
Test: 14 Test_Reward: 239.28530856831347
Test: 16 Test_Reward: 250.35750002430132
Test: 17 Test_Reward: 228.38663243598958
Test: 18 Test_Reward: 279.6468785851551
Test: 20 Test_Reward: 266.3428636004782
Test: 21 Test_Reward: 213.83922510269184
Test: 23 Test_Reward: 300.4186875198236
Test: 25 Test_Reward: 248.23109206955658
Test: 26 Test_Reward: 261.4359062909871
Test: 27 Test_Reward: 273.45842875639715
Test: 28 Test_Reward: 230.31140855425488
Test: 29 Test_Reward: 228.60323408458405
Test: 30 Test_Reward: 208.27480295182943
Test: 33 Test_Reward: 215.30005966055006
Test: 34 Test_Reward: 227.8335778672869
Test: 35 Test_Reward: 236.1337097131

 18%|█▊        | 550/3000 [31:50<23:14:54, 34.16s/it]

Episodes: 550 Episodic_Reward: 264.6080769593706
Success in episode 550 with a reward of 264.6080769593706


 18%|█▊        | 551/3000 [31:51<16:30:08, 24.26s/it]

Success in episode 551 with a reward of 231.81030275127324


 18%|█▊        | 552/3000 [31:53<11:53:40, 17.49s/it]

Success in episode 552 with a reward of 294.5423307420052


 18%|█▊        | 553/3000 [32:12<12:10:46, 17.92s/it]

Success in episode 553 with a reward of 263.36228212364995


 18%|█▊        | 554/3000 [32:13<8:43:36, 12.84s/it] 

Success in episode 554 with a reward of 261.6403188935285


 18%|█▊        | 555/3000 [32:14<6:23:45,  9.42s/it]

Success in episode 555 with a reward of 287.4012745166409


 19%|█▊        | 556/3000 [32:15<4:42:28,  6.93s/it]

Success in episode 556 with a reward of 280.4131421607594


 19%|█▊        | 557/3000 [32:32<6:49:33, 10.06s/it]

Getting somewhere in 557 with a reward of 7.840477960712221


 19%|█▊        | 558/3000 [32:34<5:04:30,  7.48s/it]

Success in episode 558 with a reward of 293.11289035940297


 19%|█▊        | 559/3000 [32:35<3:46:32,  5.57s/it]

Success in episode 559 with a reward of 229.19755142995524


 19%|█▊        | 560/3000 [32:36<2:49:55,  4.18s/it]

Episodes: 560 Episodic_Reward: 224.3347326584579
Success in episode 560 with a reward of 224.3347326584579


 19%|█▊        | 561/3000 [32:55<5:46:29,  8.52s/it]

Success in episode 561 with a reward of 231.22332832341039


 19%|█▊        | 562/3000 [32:56<4:19:11,  6.38s/it]

Success in episode 562 with a reward of 250.71372708956363


 19%|█▉        | 563/3000 [32:57<3:15:54,  4.82s/it]

Success in episode 563 with a reward of 216.00612857568865


 19%|█▉        | 564/3000 [32:59<2:33:34,  3.78s/it]

Success in episode 564 with a reward of 246.79450527869977


 19%|█▉        | 565/3000 [33:21<6:16:58,  9.29s/it]

Success in episode 565 with a reward of 212.92332988721347


 19%|█▉        | 566/3000 [33:22<4:41:24,  6.94s/it]

Success in episode 566 with a reward of 221.49664472194644


 19%|█▉        | 567/3000 [33:24<3:44:29,  5.54s/it]

Success in episode 567 with a reward of 267.1568560780669


 19%|█▉        | 568/3000 [33:45<6:42:16,  9.92s/it]

Success in episode 568 with a reward of 279.43638406261


 19%|█▉        | 569/3000 [33:46<4:56:33,  7.32s/it]

Getting somewhere in 569 with a reward of 32.23835460675741


 19%|█▉        | 570/3000 [33:47<3:43:54,  5.53s/it]

Episodes: 570 Episodic_Reward: 278.14073935812075
Success in episode 570 with a reward of 278.14073935812075


 19%|█▉        | 571/3000 [33:48<2:52:17,  4.26s/it]

Success in episode 571 with a reward of 237.75189043581267


 19%|█▉        | 573/3000 [34:11<4:38:18,  6.88s/it]

Success in episode 573 with a reward of 264.1016225668285


 19%|█▉        | 574/3000 [34:12<3:32:38,  5.26s/it]

Success in episode 574 with a reward of 242.60840446627645


 19%|█▉        | 575/3000 [34:14<2:44:17,  4.07s/it]

Success in episode 575 with a reward of 288.06539343790416


 19%|█▉        | 576/3000 [34:34<5:59:12,  8.89s/it]

Success in episode 576 with a reward of 276.3427323766617


 19%|█▉        | 578/3000 [34:38<3:38:23,  5.41s/it]

Success in episode 578 with a reward of 248.36109673486214


 19%|█▉        | 579/3000 [34:49<4:42:17,  7.00s/it]

Success in episode 579 with a reward of 240.84005313408494


 19%|█▉        | 580/3000 [34:50<3:33:44,  5.30s/it]

Episodes: 580 Episodic_Reward: 282.28905403247063
Success in episode 580 with a reward of 282.28905403247063


 19%|█▉        | 581/3000 [34:52<2:46:19,  4.13s/it]

Success in episode 581 with a reward of 277.82585811366266


 19%|█▉        | 582/3000 [34:53<2:11:02,  3.25s/it]

Success in episode 582 with a reward of 258.02503851413815


 19%|█▉        | 583/3000 [35:17<6:24:24,  9.54s/it]

Success in episode 583 with a reward of 262.3649923877911


 19%|█▉        | 584/3000 [35:18<4:40:00,  6.95s/it]

Success in episode 584 with a reward of 264.7096308975491


 20%|█▉        | 585/3000 [35:19<3:30:18,  5.22s/it]

Success in episode 585 with a reward of 229.88565799269696


 20%|█▉        | 586/3000 [35:20<2:40:49,  4.00s/it]

Success in episode 586 with a reward of 291.67147932658224


 20%|█▉        | 587/3000 [35:21<2:05:09,  3.11s/it]

Success in episode 587 with a reward of 230.70298780026855


 20%|█▉        | 588/3000 [35:45<6:07:15,  9.14s/it]

Success in episode 588 with a reward of 285.35039288241444


 20%|█▉        | 589/3000 [35:46<4:36:53,  6.89s/it]

Success in episode 589 with a reward of 260.39759844856155


 20%|█▉        | 590/3000 [35:47<3:26:19,  5.14s/it]

Episodes: 590 Episodic_Reward: 275.0833975610732
Success in episode 590 with a reward of 275.0833975610732


 20%|█▉        | 591/3000 [35:49<2:44:08,  4.09s/it]

Success in episode 591 with a reward of 243.4871015188324


 20%|█▉        | 592/3000 [36:08<5:46:53,  8.64s/it]

Success in episode 592 with a reward of 263.5006702928962


 20%|█▉        | 593/3000 [36:11<4:35:33,  6.87s/it]

Success in episode 593 with a reward of 238.90632866661682


 20%|█▉        | 595/3000 [36:27<5:15:51,  7.88s/it]

Success in episode 595 with a reward of 242.137884686409


 20%|█▉        | 596/3000 [36:28<3:54:55,  5.86s/it]

Success in episode 596 with a reward of 251.02515043159158


 20%|█▉        | 597/3000 [36:29<2:58:00,  4.44s/it]

Success in episode 597 with a reward of 261.40669425584315


 20%|█▉        | 598/3000 [36:31<2:24:59,  3.62s/it]

Success in episode 598 with a reward of 277.07302732116784


 20%|█▉        | 599/3000 [36:47<5:01:28,  7.53s/it]

Success in episode 599 with a reward of 253.22018071963993
Running test
Test: 1 Test_Reward: 274.11599711389306
Test: 2 Test_Reward: 288.89793595091123
Test: 4 Test_Reward: 239.55239219173743
Test: 5 Test_Reward: 270.1576641236259
Test: 6 Test_Reward: 214.34011396408886
Test: 7 Test_Reward: 232.06358095320553
Test: 8 Test_Reward: 247.79798568964992
Test: 11 Test_Reward: 263.9410789933288
Test: 12 Test_Reward: 253.60354324714947
Test: 14 Test_Reward: 249.10741032527838
Test: 15 Test_Reward: 295.946721102664
Test: 16 Test_Reward: 294.45623916415104
Test: 17 Test_Reward: 284.4574130754103
Test: 18 Test_Reward: 218.7067073389087
Test: 19 Test_Reward: 282.54018574792786
Test: 20 Test_Reward: 266.86106421695865
Test: 21 Test_Reward: 231.2596731682846
Test: 22 Test_Reward: 201.67568295010764
Test: 23 Test_Reward: 201.25073156592896
Test: 24 Test_Reward: 242.26727158914585
Test: 25 Test_Reward: 254.87159469283733
Test: 26 Test_Reward: 311.56873459596443
Test: 27 Test_Reward: 269.8537442475101


 20%|██        | 600/3000 [37:54<16:46:21, 25.16s/it]

Episodes: 600 Episodic_Reward: 271.7148466290421
Success in episode 600 with a reward of 271.7148466290421


 20%|██        | 601/3000 [37:55<11:57:36, 17.95s/it]

Success in episode 601 with a reward of 250.833194134428


 20%|██        | 602/3000 [37:56<8:35:31, 12.90s/it] 

Success in episode 602 with a reward of 236.98525591092778


 20%|██        | 603/3000 [38:15<9:52:45, 14.84s/it]

Success in episode 603 with a reward of 267.6157524491431


 20%|██        | 604/3000 [38:17<7:09:44, 10.76s/it]

Success in episode 604 with a reward of 239.1099235680788


 20%|██        | 605/3000 [38:18<5:14:26,  7.88s/it]

Success in episode 605 with a reward of 263.70623620579994


 20%|██        | 606/3000 [38:21<4:17:14,  6.45s/it]

Success in episode 606 with a reward of 256.6433289204107


 20%|██        | 607/3000 [38:45<7:43:53, 11.63s/it]

Success in episode 607 with a reward of 259.84989200298844


 20%|██        | 608/3000 [38:46<5:42:38,  8.59s/it]

Success in episode 608 with a reward of 229.63149255407592


 20%|██        | 609/3000 [38:47<4:11:59,  6.32s/it]

Success in episode 609 with a reward of 243.76551201329988


 20%|██        | 610/3000 [38:49<3:20:48,  5.04s/it]

Episodes: 610 Episodic_Reward: 234.71767652295935
Success in episode 610 with a reward of 234.71767652295935


 20%|██        | 611/3000 [39:15<7:30:48, 11.32s/it]

Looking good in 611 with a reward of 132.64911888896899


 20%|██        | 612/3000 [39:16<5:29:04,  8.27s/it]

Success in episode 612 with a reward of 270.60464758376725


 20%|██        | 613/3000 [39:18<4:06:38,  6.20s/it]

Getting somewhere in 613 with a reward of 37.2460152972356


 20%|██        | 614/3000 [39:20<3:14:41,  4.90s/it]

Success in episode 614 with a reward of 240.63573295091845


 20%|██        | 615/3000 [39:43<6:57:41, 10.51s/it]

Success in episode 615 with a reward of 298.96115988421604


 21%|██        | 616/3000 [39:45<5:10:51,  7.82s/it]

Success in episode 616 with a reward of 269.02780148863064


 21%|██        | 617/3000 [39:46<3:48:51,  5.76s/it]

Success in episode 617 with a reward of 233.52774911234403


 21%|██        | 618/3000 [39:47<2:54:31,  4.40s/it]

Success in episode 618 with a reward of 252.1798812946433


 21%|██        | 619/3000 [39:48<2:16:40,  3.44s/it]

Success in episode 619 with a reward of 253.86083386612145


 21%|██        | 620/3000 [40:07<5:19:11,  8.05s/it]

Episodes: 620 Episodic_Reward: 234.27473367975227
Success in episode 620 with a reward of 234.27473367975227


 21%|██        | 621/3000 [40:08<3:57:22,  5.99s/it]

Success in episode 621 with a reward of 252.43648893163925


 21%|██        | 623/3000 [40:11<2:22:42,  3.60s/it]

Success in episode 623 with a reward of 285.41282756430365


 21%|██        | 624/3000 [40:26<4:43:53,  7.17s/it]

Getting somewhere in 624 with a reward of 25.908155425648786


 21%|██        | 625/3000 [40:27<3:31:16,  5.34s/it]

Success in episode 625 with a reward of 283.6578783313244


 21%|██        | 626/3000 [40:30<2:56:24,  4.46s/it]

Success in episode 626 with a reward of 237.39233383037097


 21%|██        | 627/3000 [40:49<5:57:17,  9.03s/it]

Success in episode 627 with a reward of 293.4455218443756


 21%|██        | 628/3000 [40:51<4:23:22,  6.66s/it]

Success in episode 628 with a reward of 264.6076772563671


 21%|██        | 630/3000 [40:54<2:42:29,  4.11s/it]

Episodes: 630 Episodic_Reward: 267.79441953191883
Success in episode 630 with a reward of 267.79441953191883


 21%|██        | 631/3000 [41:12<5:26:31,  8.27s/it]

Success in episode 631 with a reward of 294.2163339200528


 21%|██        | 632/3000 [41:13<4:00:27,  6.09s/it]

Success in episode 632 with a reward of 243.99266797028187


 21%|██        | 633/3000 [41:14<3:02:27,  4.62s/it]

Success in episode 633 with a reward of 278.35774111915987


 21%|██        | 634/3000 [41:15<2:22:38,  3.62s/it]

Success in episode 634 with a reward of 274.50507871035836


 21%|██        | 635/3000 [41:35<5:30:37,  8.39s/it]

Success in episode 635 with a reward of 238.4680169626034


 21%|██        | 636/3000 [41:36<4:04:50,  6.21s/it]

Success in episode 636 with a reward of 278.30722531179305


 21%|██        | 637/3000 [41:37<3:07:54,  4.77s/it]

Success in episode 637 with a reward of 229.45399549185484


 21%|██▏       | 638/3000 [41:39<2:25:12,  3.69s/it]

Success in episode 638 with a reward of 273.8565628435206


 21%|██▏       | 639/3000 [41:57<5:14:14,  7.99s/it]

Success in episode 639 with a reward of 259.66588602381205


 21%|██▏       | 640/3000 [41:58<3:52:54,  5.92s/it]

Episodes: 640 Episodic_Reward: 283.8098125353335
Success in episode 640 with a reward of 283.8098125353335


 21%|██▏       | 641/3000 [41:59<2:56:44,  4.50s/it]

Success in episode 641 with a reward of 281.8512009964736


 21%|██▏       | 642/3000 [42:19<6:00:08,  9.16s/it]

Success in episode 642 with a reward of 249.89299852898387


 21%|██▏       | 643/3000 [42:20<4:23:56,  6.72s/it]

Getting somewhere in 643 with a reward of 9.483559921299644


 21%|██▏       | 644/3000 [42:21<3:21:08,  5.12s/it]

Success in episode 644 with a reward of 262.99727851896887


 22%|██▏       | 645/3000 [42:23<2:35:41,  3.97s/it]

Success in episode 645 with a reward of 263.15026683213574


 22%|██▏       | 646/3000 [42:24<2:00:14,  3.06s/it]

Success in episode 646 with a reward of 257.44193443400894


 22%|██▏       | 647/3000 [42:46<5:45:31,  8.81s/it]

Success in episode 647 with a reward of 287.82198560888924


 22%|██▏       | 648/3000 [42:47<4:13:43,  6.47s/it]

Success in episode 648 with a reward of 285.43817331609125


 22%|██▏       | 649/3000 [42:48<3:12:40,  4.92s/it]

Success in episode 649 with a reward of 288.79574182050294
Running test
Test: 1 Test_Reward: 243.92298021044942
Test: 2 Test_Reward: 290.5893379427388
Test: 3 Test_Reward: 272.7246250828602
Test: 4 Test_Reward: 287.40318210253474
Test: 5 Test_Reward: 290.57102303445237
Test: 6 Test_Reward: 281.7918598648512
Test: 7 Test_Reward: 278.0670475255522
Test: 11 Test_Reward: 287.7144292210472
Test: 13 Test_Reward: 281.15243270436764
Test: 15 Test_Reward: 268.62835732316256
Test: 16 Test_Reward: 259.32336459102567
Test: 17 Test_Reward: 211.46487083574766
Test: 18 Test_Reward: 259.0178141125734
Test: 19 Test_Reward: 259.5328110201558
Test: 20 Test_Reward: 237.2484570062725
Test: 21 Test_Reward: 260.21127630564285
Test: 22 Test_Reward: 210.3622924931594
Test: 23 Test_Reward: 223.20954897794763
Test: 24 Test_Reward: 233.35522106850024
Test: 25 Test_Reward: 266.0757734345947
Test: 26 Test_Reward: 267.81896867845984
Test: 27 Test_Reward: 272.5608888752715
Test: 28 Test_Reward: 276.68247867599837
Tes

 22%|██▏       | 650/3000 [43:41<12:41:10, 19.43s/it]

Episodes: 650 Episodic_Reward: 256.6966286502968
Success in episode 650 with a reward of 256.6966286502968


 22%|██▏       | 651/3000 [43:59<12:14:44, 18.77s/it]

Success in episode 651 with a reward of 265.2681088208665


 22%|██▏       | 652/3000 [43:59<8:44:52, 13.41s/it] 

Getting somewhere in 652 with a reward of 47.419615684009216


 22%|██▏       | 653/3000 [44:01<6:23:26,  9.80s/it]

Success in episode 653 with a reward of 280.9599275857702


 22%|██▏       | 654/3000 [44:02<4:40:04,  7.16s/it]

Success in episode 654 with a reward of 275.64977712961695


 22%|██▏       | 655/3000 [44:20<6:44:20, 10.35s/it]

Success in episode 655 with a reward of 282.7303393815591


 22%|██▏       | 656/3000 [44:21<4:54:24,  7.54s/it]

Getting somewhere in 656 with a reward of 54.68627096921429


 22%|██▏       | 657/3000 [44:22<3:37:03,  5.56s/it]

Success in episode 657 with a reward of 257.0817021092234


 22%|██▏       | 658/3000 [44:23<2:45:51,  4.25s/it]

Success in episode 658 with a reward of 288.0234029898686


 22%|██▏       | 659/3000 [44:24<2:12:32,  3.40s/it]

Success in episode 659 with a reward of 246.1482377774787


 22%|██▏       | 660/3000 [44:46<5:44:16,  8.83s/it]

Episodes: 660 Episodic_Reward: 238.5171946710816
Success in episode 660 with a reward of 238.5171946710816


 22%|██▏       | 661/3000 [44:47<4:14:37,  6.53s/it]

Success in episode 661 with a reward of 270.00874288976866


 22%|██▏       | 662/3000 [44:48<3:13:59,  4.98s/it]

Success in episode 662 with a reward of 277.19749055084424


 22%|██▏       | 663/3000 [44:49<2:25:48,  3.74s/it]

Getting somewhere in 663 with a reward of 67.1611277057356


 22%|██▏       | 664/3000 [45:09<5:32:17,  8.54s/it]

Success in episode 664 with a reward of 276.0338010817211


 22%|██▏       | 665/3000 [45:10<4:03:45,  6.26s/it]

Success in episode 665 with a reward of 265.149251115944


 22%|██▏       | 666/3000 [45:13<3:28:34,  5.36s/it]

Success in episode 666 with a reward of 204.19751495762097


 22%|██▏       | 667/3000 [45:14<2:37:06,  4.04s/it]

Success in episode 667 with a reward of 243.70219121348504


 22%|██▏       | 668/3000 [45:38<6:27:36,  9.97s/it]

Success in episode 668 with a reward of 212.88029648315273


 22%|██▏       | 669/3000 [45:40<4:54:50,  7.59s/it]

Success in episode 669 with a reward of 209.65275101968004


 22%|██▏       | 670/3000 [45:41<3:42:30,  5.73s/it]

Episodes: 670 Episodic_Reward: 246.02771677232099
Success in episode 670 with a reward of 246.02771677232099


 22%|██▏       | 671/3000 [45:42<2:50:11,  4.38s/it]

Success in episode 671 with a reward of 287.64181356051233


 22%|██▏       | 672/3000 [46:05<6:25:07,  9.93s/it]

Success in episode 672 with a reward of 276.2683854824778


 22%|██▏       | 673/3000 [46:06<4:43:04,  7.30s/it]

Success in episode 673 with a reward of 256.1538009739518


 22%|██▏       | 674/3000 [46:08<3:30:59,  5.44s/it]

Success in episode 674 with a reward of 264.6494427854809


 22%|██▎       | 675/3000 [46:09<2:42:44,  4.20s/it]

Success in episode 675 with a reward of 268.69209512373243


 23%|██▎       | 676/3000 [46:27<5:20:59,  8.29s/it]

Success in episode 676 with a reward of 271.56491484840734


 23%|██▎       | 677/3000 [46:28<3:54:59,  6.07s/it]

Success in episode 677 with a reward of 284.48047153188793


 23%|██▎       | 678/3000 [46:28<2:53:35,  4.49s/it]

Success in episode 678 with a reward of 225.88656439768994


 23%|██▎       | 679/3000 [46:29<2:12:41,  3.43s/it]

Success in episode 679 with a reward of 246.7591028574239


 23%|██▎       | 680/3000 [46:30<1:45:44,  2.73s/it]

Episodes: 680 Episodic_Reward: 293.9068217727265
Success in episode 680 with a reward of 293.9068217727265


 23%|██▎       | 681/3000 [46:50<4:57:34,  7.70s/it]

Success in episode 681 with a reward of 237.28432682086964


 23%|██▎       | 682/3000 [46:51<3:37:22,  5.63s/it]

Success in episode 682 with a reward of 254.45273106418992


 23%|██▎       | 683/3000 [46:52<2:49:42,  4.39s/it]

Success in episode 683 with a reward of 281.2870303294085


 23%|██▎       | 684/3000 [46:53<2:08:42,  3.33s/it]

Success in episode 684 with a reward of 230.35410836022555


 23%|██▎       | 686/3000 [47:14<3:56:57,  6.14s/it]

Success in episode 686 with a reward of 265.8278222563995


 23%|██▎       | 688/3000 [47:15<2:12:08,  3.43s/it]

Success in episode 688 with a reward of 244.64295015753925


 23%|██▎       | 689/3000 [47:16<1:44:07,  2.70s/it]

Success in episode 689 with a reward of 267.03769099814104


 23%|██▎       | 690/3000 [47:37<5:14:47,  8.18s/it]

Episodes: 690 Episodic_Reward: 235.1987109693369
Success in episode 690 with a reward of 235.1987109693369


 23%|██▎       | 691/3000 [47:39<3:54:22,  6.09s/it]

Success in episode 691 with a reward of 291.5732575366071


 23%|██▎       | 692/3000 [47:40<2:55:22,  4.56s/it]

Success in episode 692 with a reward of 243.81441597970712


 23%|██▎       | 693/3000 [47:41<2:16:48,  3.56s/it]

Success in episode 693 with a reward of 259.08553306885335


 23%|██▎       | 694/3000 [48:03<5:50:24,  9.12s/it]

Success in episode 694 with a reward of 216.65502384682873


 23%|██▎       | 695/3000 [48:04<4:21:03,  6.80s/it]

Success in episode 695 with a reward of 238.09739536136806


 23%|██▎       | 696/3000 [48:05<3:16:01,  5.10s/it]

Success in episode 696 with a reward of 239.44053569585577


 23%|██▎       | 697/3000 [48:07<2:29:31,  3.90s/it]

Success in episode 697 with a reward of 260.0062221667597


 23%|██▎       | 699/3000 [48:28<4:11:08,  6.55s/it]

Success in episode 699 with a reward of 289.0992104112163
Running test
Test: 1 Test_Reward: 265.2364604471478
Test: 2 Test_Reward: 240.85341347062462
Test: 3 Test_Reward: 281.13957843956814
Test: 4 Test_Reward: 254.1595912607846
Test: 5 Test_Reward: 265.74701035804446
Test: 6 Test_Reward: 254.07059103865055
Test: 7 Test_Reward: 280.935956301686
Test: 8 Test_Reward: 259.667111027172
Test: 9 Test_Reward: 287.25495062128755
Test: 12 Test_Reward: 289.7923645589849
Test: 13 Test_Reward: 212.33782543481112
Test: 14 Test_Reward: 263.5277989122258
Test: 15 Test_Reward: 259.3977343162785
Test: 16 Test_Reward: 271.69268939667626
Test: 17 Test_Reward: 234.37648954588084
Test: 18 Test_Reward: 267.63144726702353
Test: 20 Test_Reward: 255.33862262458337
Test: 21 Test_Reward: 254.47858557870043
Test: 22 Test_Reward: 243.82016385263185
Test: 23 Test_Reward: 243.04412596944218
Test: 24 Test_Reward: 288.09782626297203
Test: 25 Test_Reward: 291.50874865219765
Test: 26 Test_Reward: 220.40131389516188
Test

 23%|██▎       | 700/3000 [49:31<14:55:08, 23.35s/it]

Episodes: 700 Episodic_Reward: 244.03229218187258
Success in episode 700 with a reward of 244.03229218187258


 23%|██▎       | 701/3000 [49:32<10:38:06, 16.65s/it]

Success in episode 701 with a reward of 267.4172184897974


 23%|██▎       | 702/3000 [49:51<11:06:51, 17.41s/it]

Success in episode 702 with a reward of 290.47412876678095


 23%|██▎       | 703/3000 [49:52<8:00:41, 12.56s/it] 

Success in episode 703 with a reward of 279.5456465964917


 23%|██▎       | 704/3000 [49:53<5:47:50,  9.09s/it]

Success in episode 704 with a reward of 271.1147928468465


 24%|██▎       | 705/3000 [49:54<4:16:37,  6.71s/it]

Getting somewhere in 705 with a reward of 30.69618783610605


 24%|██▎       | 706/3000 [50:12<6:26:48, 10.12s/it]

Success in episode 706 with a reward of 279.786900317364


 24%|██▎       | 707/3000 [50:13<4:43:00,  7.41s/it]

Getting somewhere in 707 with a reward of 49.37903099227839


 24%|██▎       | 708/3000 [50:14<3:27:30,  5.43s/it]

Success in episode 708 with a reward of 283.7476227603328


 24%|██▎       | 709/3000 [50:16<2:39:20,  4.17s/it]

Success in episode 709 with a reward of 252.60016830415822


 24%|██▎       | 710/3000 [50:17<2:04:06,  3.25s/it]

Episodes: 710 Episodic_Reward: 264.979373859501
Success in episode 710 with a reward of 264.979373859501


 24%|██▎       | 711/3000 [50:36<5:14:17,  8.24s/it]

Success in episode 711 with a reward of 258.7331938886053


 24%|██▍       | 713/3000 [50:53<5:27:14,  8.59s/it]

Looking good in 713 with a reward of 143.3814368368409


 24%|██▍       | 714/3000 [50:54<4:01:32,  6.34s/it]

Success in episode 714 with a reward of 273.2876104809467


 24%|██▍       | 715/3000 [50:55<3:01:37,  4.77s/it]

Success in episode 715 with a reward of 243.54749754558856


 24%|██▍       | 716/3000 [50:56<2:18:10,  3.63s/it]

Success in episode 716 with a reward of 271.28463516261684


 24%|██▍       | 717/3000 [50:57<1:48:53,  2.86s/it]

Success in episode 717 with a reward of 256.3111875016353


 24%|██▍       | 718/3000 [51:21<5:54:46,  9.33s/it]

Success in episode 718 with a reward of 298.93885867879567


 24%|██▍       | 719/3000 [51:23<4:22:39,  6.91s/it]

Success in episode 719 with a reward of 308.0849133750845


 24%|██▍       | 720/3000 [51:24<3:14:41,  5.12s/it]

Episodes: 720 Episodic_Reward: 255.4201778431616
Success in episode 720 with a reward of 255.4201778431616


 24%|██▍       | 721/3000 [51:25<2:29:01,  3.92s/it]

Success in episode 721 with a reward of 281.14934566737674


 24%|██▍       | 722/3000 [51:26<1:54:22,  3.01s/it]

Success in episode 722 with a reward of 227.27738915401386


 24%|██▍       | 724/3000 [51:50<4:16:51,  6.77s/it]

Success in episode 724 with a reward of 283.6658650310777


 24%|██▍       | 725/3000 [51:54<3:43:10,  5.89s/it]

Success in episode 725 with a reward of 293.67903412867855


 24%|██▍       | 726/3000 [52:11<5:45:19,  9.11s/it]

Success in episode 726 with a reward of 281.47378098560426


 24%|██▍       | 727/3000 [52:12<4:12:42,  6.67s/it]

Success in episode 727 with a reward of 231.78869562806375


 24%|██▍       | 728/3000 [52:13<3:14:08,  5.13s/it]

Success in episode 728 with a reward of 227.35996947324728


 24%|██▍       | 729/3000 [52:14<2:27:57,  3.91s/it]

Success in episode 729 with a reward of 289.8097747273751


 24%|██▍       | 730/3000 [52:32<5:08:19,  8.15s/it]

Episodes: 730 Episodic_Reward: 264.39302254787043
Success in episode 730 with a reward of 264.39302254787043


 24%|██▍       | 731/3000 [52:34<3:49:53,  6.08s/it]

Success in episode 731 with a reward of 284.3832297228921


 24%|██▍       | 732/3000 [52:38<3:25:46,  5.44s/it]

Success in episode 732 with a reward of 286.04229303189857


 24%|██▍       | 733/3000 [52:53<5:16:41,  8.38s/it]

Success in episode 733 with a reward of 273.0502931914758


 24%|██▍       | 734/3000 [52:54<3:57:08,  6.28s/it]

Success in episode 734 with a reward of 289.5458289261221


 24%|██▍       | 735/3000 [52:55<2:58:51,  4.74s/it]

Success in episode 735 with a reward of 277.32663238391075


 25%|██▍       | 736/3000 [52:57<2:17:30,  3.64s/it]

Success in episode 736 with a reward of 229.17373968098804


 25%|██▍       | 737/3000 [52:58<1:48:28,  2.88s/it]

Success in episode 737 with a reward of 249.03510112161223


 25%|██▍       | 738/3000 [53:18<5:07:52,  8.17s/it]

Success in episode 738 with a reward of 258.46815621013013


 25%|██▍       | 739/3000 [53:19<3:45:36,  5.99s/it]

Getting somewhere in 739 with a reward of 68.17615270552102


 25%|██▍       | 740/3000 [53:20<2:49:15,  4.49s/it]

Episodes: 740 Episodic_Reward: 274.0810660134489
Success in episode 740 with a reward of 274.0810660134489


 25%|██▍       | 741/3000 [53:21<2:10:30,  3.47s/it]

Success in episode 741 with a reward of 294.5743786297148


 25%|██▍       | 742/3000 [53:22<1:41:57,  2.71s/it]

Success in episode 742 with a reward of 273.16525005488023


 25%|██▍       | 743/3000 [53:40<4:35:47,  7.33s/it]

Getting somewhere in 743 with a reward of 36.7874423519591


 25%|██▍       | 744/3000 [53:41<3:27:55,  5.53s/it]

Success in episode 744 with a reward of 281.4486901336955


 25%|██▍       | 745/3000 [53:42<2:36:48,  4.17s/it]

Success in episode 745 with a reward of 229.54218214395803


 25%|██▍       | 746/3000 [53:44<2:07:49,  3.40s/it]

Success in episode 746 with a reward of 302.5592536191109


 25%|██▍       | 747/3000 [54:03<4:57:52,  7.93s/it]

Success in episode 747 with a reward of 272.7975181783495


 25%|██▍       | 748/3000 [54:04<3:40:50,  5.88s/it]

Success in episode 748 with a reward of 293.3198491639214


 25%|██▍       | 749/3000 [54:05<2:46:33,  4.44s/it]

Success in episode 749 with a reward of 247.5126296430211
Running test
Test: 1 Test_Reward: 278.99156682967924
Test: 3 Test_Reward: 279.3240089747013
Test: 4 Test_Reward: 263.2908254791437
Test: 5 Test_Reward: 271.39584028942073
Test: 6 Test_Reward: 279.6235558284562
Test: 7 Test_Reward: 263.8212753482501
Test: 8 Test_Reward: 235.03973952541855
Test: 9 Test_Reward: 263.75529331143844
Test: 10 Test_Reward: 231.91784571679904
Test: 11 Test_Reward: 266.72749425390293
Test: 12 Test_Reward: 273.6957218635681
Test: 13 Test_Reward: 263.1009675813282
Test: 14 Test_Reward: 267.39961342616414
Test: 15 Test_Reward: 260.62072306728146
Test: 16 Test_Reward: 272.82611935884273
Test: 17 Test_Reward: 272.92263608953397
Test: 18 Test_Reward: 203.20673128768308
Test: 19 Test_Reward: 272.35090988318643
Test: 20 Test_Reward: 298.03759310321317
Test: 21 Test_Reward: 232.26556463243702
Test: 22 Test_Reward: 285.3424111152388
Test: 23 Test_Reward: 254.3898201942634
Test: 24 Test_Reward: 237.05384574134098
Te

 25%|██▌       | 750/3000 [54:48<9:58:20, 15.96s/it]

Episodes: 750 Episodic_Reward: 264.19379369997364
Success in episode 750 with a reward of 264.19379369997364


 25%|██▌       | 751/3000 [55:07<10:38:13, 17.03s/it]

Success in episode 751 with a reward of 292.02491156008307


 25%|██▌       | 752/3000 [55:08<7:37:23, 12.21s/it] 

Success in episode 752 with a reward of 260.86624190833334


 25%|██▌       | 753/3000 [55:09<5:34:05,  8.92s/it]

Success in episode 753 with a reward of 298.65367675228754


 25%|██▌       | 754/3000 [55:11<4:07:10,  6.60s/it]

Success in episode 754 with a reward of 262.7358546723425


 25%|██▌       | 756/3000 [55:32<4:52:52,  7.83s/it]

Success in episode 756 with a reward of 263.9943794917545


 25%|██▌       | 757/3000 [55:33<3:35:56,  5.78s/it]

Success in episode 757 with a reward of 283.9487361794582


 25%|██▌       | 758/3000 [55:34<2:43:39,  4.38s/it]

Success in episode 758 with a reward of 254.8789129181119


 25%|██▌       | 759/3000 [55:35<2:05:37,  3.36s/it]

Success in episode 759 with a reward of 276.5612061057941


 25%|██▌       | 760/3000 [55:57<5:30:15,  8.85s/it]

Episodes: 760 Episodic_Reward: 242.3476303193914
Success in episode 760 with a reward of 242.3476303193914


 25%|██▌       | 761/3000 [55:58<4:02:16,  6.49s/it]

Success in episode 761 with a reward of 251.45011927869885


 25%|██▌       | 762/3000 [56:00<3:14:12,  5.21s/it]

Success in episode 762 with a reward of 271.61122917619025


 25%|██▌       | 763/3000 [56:01<2:28:02,  3.97s/it]

Success in episode 763 with a reward of 283.94761334019745


 25%|██▌       | 764/3000 [56:23<5:48:42,  9.36s/it]

Success in episode 764 with a reward of 295.49554857246204


 26%|██▌       | 765/3000 [56:24<4:16:32,  6.89s/it]

Success in episode 765 with a reward of 260.49063265218194


 26%|██▌       | 766/3000 [56:25<3:12:02,  5.16s/it]

Success in episode 766 with a reward of 263.0355658364578


 26%|██▌       | 767/3000 [56:26<2:28:47,  4.00s/it]

Success in episode 767 with a reward of 245.9373250014926


 26%|██▌       | 768/3000 [56:45<5:11:36,  8.38s/it]

Success in episode 768 with a reward of 275.91930898918247


 26%|██▌       | 769/3000 [56:46<3:51:28,  6.23s/it]

Success in episode 769 with a reward of 265.45969832363005


 26%|██▌       | 770/3000 [56:48<2:56:27,  4.75s/it]

Episodes: 770 Episodic_Reward: 252.2460963012819
Success in episode 770 with a reward of 252.2460963012819


 26%|██▌       | 771/3000 [56:49<2:16:01,  3.66s/it]

Success in episode 771 with a reward of 278.3977140348551


 26%|██▌       | 772/3000 [57:08<5:04:59,  8.21s/it]

Success in episode 772 with a reward of 275.8692263534516


 26%|██▌       | 773/3000 [57:09<3:45:19,  6.07s/it]

Success in episode 773 with a reward of 288.35557829616937


 26%|██▌       | 774/3000 [57:10<2:48:52,  4.55s/it]

Success in episode 774 with a reward of 283.97531032327487


 26%|██▌       | 775/3000 [57:11<2:10:05,  3.51s/it]

Success in episode 775 with a reward of 263.42027712384225


 26%|██▌       | 776/3000 [57:29<4:53:04,  7.91s/it]

Success in episode 776 with a reward of 249.39948017142103


 26%|██▌       | 777/3000 [57:30<3:39:49,  5.93s/it]

Success in episode 777 with a reward of 274.72141840247633


 26%|██▌       | 778/3000 [57:31<2:45:11,  4.46s/it]

Success in episode 778 with a reward of 222.57969089690687


 26%|██▌       | 779/3000 [57:32<2:07:05,  3.43s/it]

Success in episode 779 with a reward of 299.10503482997456


 26%|██▌       | 780/3000 [57:50<4:49:52,  7.83s/it]

Episodes: 780 Episodic_Reward: 284.30682947567345
Success in episode 780 with a reward of 284.30682947567345


 26%|██▌       | 781/3000 [57:51<3:35:19,  5.82s/it]

Success in episode 781 with a reward of 261.90677108486227


 26%|██▌       | 782/3000 [57:54<3:03:39,  4.97s/it]

Success in episode 782 with a reward of 290.21846489545396


 26%|██▌       | 783/3000 [58:11<5:06:41,  8.30s/it]

Success in episode 783 with a reward of 284.8595214502297


 26%|██▌       | 784/3000 [58:12<3:48:21,  6.18s/it]

Success in episode 784 with a reward of 272.45338274985454


 26%|██▌       | 785/3000 [58:13<2:58:26,  4.83s/it]

Success in episode 785 with a reward of 216.64249324595045


 26%|██▌       | 786/3000 [58:14<2:16:01,  3.69s/it]

Success in episode 786 with a reward of 245.35246203353458


 26%|██▌       | 787/3000 [58:37<5:46:20,  9.39s/it]

Success in episode 787 with a reward of 258.81798828099795


 26%|██▋       | 788/3000 [58:38<4:14:03,  6.89s/it]

Success in episode 788 with a reward of 246.3235423079377


 26%|██▋       | 789/3000 [58:39<3:09:27,  5.14s/it]

Success in episode 789 with a reward of 275.5218707964756


 26%|██▋       | 790/3000 [58:41<2:29:25,  4.06s/it]

Episodes: 790 Episodic_Reward: 262.9847170389437
Success in episode 790 with a reward of 262.9847170389437


 26%|██▋       | 791/3000 [59:00<5:19:30,  8.68s/it]

Success in episode 791 with a reward of 292.7222303416834


 26%|██▋       | 792/3000 [59:03<4:14:17,  6.91s/it]

Success in episode 792 with a reward of 259.80901703462314


 26%|██▋       | 793/3000 [59:05<3:23:02,  5.52s/it]

Success in episode 793 with a reward of 263.30787970459596


 26%|██▋       | 794/3000 [59:26<6:06:26,  9.97s/it]

Success in episode 794 with a reward of 293.0166756437885


 26%|██▋       | 795/3000 [59:27<4:28:54,  7.32s/it]

Success in episode 795 with a reward of 258.7655895105303


 27%|██▋       | 796/3000 [59:29<3:33:30,  5.81s/it]

Success in episode 796 with a reward of 287.4707347108672


 27%|██▋       | 797/3000 [59:31<2:44:59,  4.49s/it]

Success in episode 797 with a reward of 299.9863135154968


 27%|██▋       | 798/3000 [59:56<6:30:27, 10.64s/it]

Success in episode 798 with a reward of 241.148841320683


 27%|██▋       | 799/3000 [59:57<4:44:24,  7.75s/it]

Success in episode 799 with a reward of 290.30564213718037
Running test
Test: 1 Test_Reward: 272.3154876878052
Test: 2 Test_Reward: 249.40550303590138
Test: 3 Test_Reward: 227.28062983220346
Test: 4 Test_Reward: 247.6180976220382
Test: 5 Test_Reward: 274.79449474573346
Test: 6 Test_Reward: 249.44866350882944
Test: 7 Test_Reward: 264.12477789021403
Test: 8 Test_Reward: 285.3428117505544
Test: 9 Test_Reward: 278.3368575206314
Test: 10 Test_Reward: 268.5409624135469
Test: 11 Test_Reward: 293.25695271482687
Test: 12 Test_Reward: 253.98415765511942
Test: 13 Test_Reward: 250.1891197438865
Test: 14 Test_Reward: 268.0050813289109
Test: 15 Test_Reward: 277.5939129745587
Test: 16 Test_Reward: 263.86301304274605
Test: 17 Test_Reward: 259.5981545609607
Test: 18 Test_Reward: 257.3456816988593
Test: 19 Test_Reward: 304.24497299679325
Test: 20 Test_Reward: 229.2518564356319
Test: 21 Test_Reward: 239.60946491228853
Test: 22 Test_Reward: 271.12054257302344
Test: 23 Test_Reward: 286.31874883722753
Test:

 27%|██▋       | 800/3000 [1:00:42<11:37:42, 19.03s/it]

Episodes: 800 Episodic_Reward: 253.39203255449135
Success in episode 800 with a reward of 253.39203255449135


 27%|██▋       | 801/3000 [1:00:43<8:18:56, 13.61s/it] 

Success in episode 801 with a reward of 227.83836817238398


 27%|██▋       | 802/3000 [1:00:44<5:59:54,  9.82s/it]

Success in episode 802 with a reward of 247.46727872352426


 27%|██▋       | 803/3000 [1:01:04<7:52:58, 12.92s/it]

Success in episode 803 with a reward of 259.41544716160445


 27%|██▋       | 804/3000 [1:01:05<5:41:31,  9.33s/it]

Success in episode 804 with a reward of 274.3063969788376


 27%|██▋       | 805/3000 [1:01:06<4:11:50,  6.88s/it]

Success in episode 805 with a reward of 279.54047156866613


 27%|██▋       | 806/3000 [1:01:07<3:06:58,  5.11s/it]

Success in episode 806 with a reward of 264.45311117694644


 27%|██▋       | 807/3000 [1:01:08<2:25:22,  3.98s/it]

Success in episode 807 with a reward of 279.08348501510113


 27%|██▋       | 808/3000 [1:01:35<6:31:51, 10.73s/it]

Success in episode 808 with a reward of 227.72207342681597


 27%|██▋       | 809/3000 [1:01:36<4:45:06,  7.81s/it]

Success in episode 809 with a reward of 266.4096938718221


 27%|██▋       | 810/3000 [1:01:37<3:31:44,  5.80s/it]

Episodes: 810 Episodic_Reward: 264.06789842790454
Success in episode 810 with a reward of 264.06789842790454


 27%|██▋       | 811/3000 [1:01:38<2:40:34,  4.40s/it]

Success in episode 811 with a reward of 304.484790329921


 27%|██▋       | 812/3000 [1:01:58<5:25:01,  8.91s/it]

Success in episode 812 with a reward of 217.34953470423943


 27%|██▋       | 813/3000 [1:01:59<4:00:26,  6.60s/it]

Success in episode 813 with a reward of 290.1904340133876


 27%|██▋       | 814/3000 [1:02:00<3:01:09,  4.97s/it]

Success in episode 814 with a reward of 277.7609417128152


 27%|██▋       | 815/3000 [1:02:01<2:18:42,  3.81s/it]

Success in episode 815 with a reward of 234.54266517231878


 27%|██▋       | 816/3000 [1:02:19<4:50:01,  7.97s/it]

Success in episode 816 with a reward of 261.5669920813999


 27%|██▋       | 817/3000 [1:02:20<3:33:52,  5.88s/it]

Success in episode 817 with a reward of 271.70230709246886


 27%|██▋       | 818/3000 [1:02:22<2:56:27,  4.85s/it]

Success in episode 818 with a reward of 253.67587696648079


 27%|██▋       | 819/3000 [1:02:23<2:14:41,  3.71s/it]

Success in episode 819 with a reward of 255.31681347096333


 27%|██▋       | 820/3000 [1:02:41<4:51:39,  8.03s/it]

Episodes: 820 Episodic_Reward: 283.8701258990028
Success in episode 820 with a reward of 283.8701258990028


 27%|██▋       | 821/3000 [1:02:42<3:35:30,  5.93s/it]

Success in episode 821 with a reward of 272.87129482648527


 27%|██▋       | 822/3000 [1:02:44<2:44:18,  4.53s/it]

Success in episode 822 with a reward of 295.4883501365157


 27%|██▋       | 823/3000 [1:02:45<2:05:28,  3.46s/it]

Success in episode 823 with a reward of 264.2450718043252


 27%|██▋       | 824/3000 [1:02:46<1:39:50,  2.75s/it]

Success in episode 824 with a reward of 242.13737629601738


 28%|██▊       | 825/3000 [1:03:07<5:02:36,  8.35s/it]

Success in episode 825 with a reward of 270.75125680697863


 28%|██▊       | 826/3000 [1:03:08<3:45:34,  6.23s/it]

Success in episode 826 with a reward of 274.6760444062171


 28%|██▊       | 827/3000 [1:03:10<2:51:36,  4.74s/it]

Success in episode 827 with a reward of 277.33478725077487


 28%|██▊       | 828/3000 [1:03:11<2:10:58,  3.62s/it]

Success in episode 828 with a reward of 236.663294740191


 28%|██▊       | 829/3000 [1:03:12<1:44:20,  2.88s/it]

Success in episode 829 with a reward of 301.79209402853166


 28%|██▊       | 830/3000 [1:03:33<5:03:47,  8.40s/it]

Episodes: 830 Episodic_Reward: 251.4656667638942
Success in episode 830 with a reward of 251.4656667638942


 28%|██▊       | 831/3000 [1:03:34<3:44:37,  6.21s/it]

Success in episode 831 with a reward of 269.02081222906605


 28%|██▊       | 832/3000 [1:03:36<2:56:54,  4.90s/it]

Looking good in 832 with a reward of 175.03873575227053


 28%|██▊       | 833/3000 [1:03:37<2:17:17,  3.80s/it]

Success in episode 833 with a reward of 292.1969682538354


 28%|██▊       | 834/3000 [1:03:56<4:54:07,  8.15s/it]

Success in episode 834 with a reward of 284.4773508646136


 28%|██▊       | 835/3000 [1:03:57<3:36:29,  6.00s/it]

Success in episode 835 with a reward of 242.2335957155164


 28%|██▊       | 836/3000 [1:03:58<2:43:59,  4.55s/it]

Success in episode 836 with a reward of 295.41002114665923


 28%|██▊       | 837/3000 [1:03:59<2:04:25,  3.45s/it]

Success in episode 837 with a reward of 272.4301103103923


 28%|██▊       | 838/3000 [1:04:00<1:38:18,  2.73s/it]

Success in episode 838 with a reward of 286.41565753727


 28%|██▊       | 839/3000 [1:04:21<4:58:50,  8.30s/it]

Success in episode 839 with a reward of 259.7773369860863


 28%|██▊       | 840/3000 [1:04:22<3:40:44,  6.13s/it]

Episodes: 840 Episodic_Reward: 279.7947090814636
Success in episode 840 with a reward of 279.7947090814636


 28%|██▊       | 841/3000 [1:04:23<2:45:44,  4.61s/it]

Success in episode 841 with a reward of 270.8641638855746


 28%|██▊       | 842/3000 [1:04:24<2:05:56,  3.50s/it]

Getting somewhere in 842 with a reward of 19.171282289375156


 28%|██▊       | 843/3000 [1:04:25<1:42:42,  2.86s/it]

Success in episode 843 with a reward of 299.40832331574893


 28%|██▊       | 844/3000 [1:04:45<4:45:37,  7.95s/it]

Success in episode 844 with a reward of 244.93501423129652


 28%|██▊       | 845/3000 [1:04:46<3:30:24,  5.86s/it]

Success in episode 845 with a reward of 225.78597570752055


 28%|██▊       | 846/3000 [1:04:47<2:37:08,  4.38s/it]

Success in episode 846 with a reward of 256.41996233171926


 28%|██▊       | 847/3000 [1:04:48<2:02:30,  3.41s/it]

Success in episode 847 with a reward of 283.6371715208824


 28%|██▊       | 848/3000 [1:04:50<1:42:01,  2.84s/it]

Success in episode 848 with a reward of 219.6829264304321


 28%|██▊       | 849/3000 [1:05:11<4:59:42,  8.36s/it]

Success in episode 849 with a reward of 296.51137152554287
Running test
Test: 1 Test_Reward: 256.39920988149237
Test: 2 Test_Reward: 288.1853984740023
Test: 3 Test_Reward: 294.0282431714601
Test: 4 Test_Reward: 284.45373109511104
Test: 5 Test_Reward: 292.3750186199287
Test: 6 Test_Reward: 273.44592887902604
Test: 7 Test_Reward: 284.6798713169168
Test: 8 Test_Reward: 305.7333941919125
Test: 9 Test_Reward: 235.70636944770035
Test: 10 Test_Reward: 227.93713096483557
Test: 11 Test_Reward: 284.0760129995546
Test: 12 Test_Reward: 265.9728969809547
Test: 14 Test_Reward: 257.4921907766627
Test: 15 Test_Reward: 215.13199845443881
Test: 17 Test_Reward: 276.94764442403306
Test: 18 Test_Reward: 295.26616470444844
Test: 19 Test_Reward: 246.99185518633547
Test: 20 Test_Reward: 293.5592511120219
Test: 21 Test_Reward: 267.9573173052393
Test: 22 Test_Reward: 258.1254257781412
Test: 24 Test_Reward: 261.3463622492693
Test: 25 Test_Reward: 260.0207928180115
Test: 26 Test_Reward: 243.97934846922226
Test: 2

 28%|██▊       | 850/3000 [1:05:54<11:12:22, 18.76s/it]

Episodes: 850 Episodic_Reward: -48.899501160208324


 28%|██▊       | 851/3000 [1:05:55<8:02:41, 13.48s/it] 

Success in episode 851 with a reward of 290.7231246332794


 28%|██▊       | 852/3000 [1:05:56<5:50:26,  9.79s/it]

Success in episode 852 with a reward of 294.3401008479414


 28%|██▊       | 853/3000 [1:06:13<6:59:16, 11.72s/it]

Success in episode 853 with a reward of 296.84662872153535


 28%|██▊       | 854/3000 [1:06:14<5:04:47,  8.52s/it]

Success in episode 854 with a reward of 279.6240102460574


 28%|██▊       | 855/3000 [1:06:15<3:45:08,  6.30s/it]

Success in episode 855 with a reward of 292.415857182953


 29%|██▊       | 856/3000 [1:06:16<2:50:25,  4.77s/it]

Success in episode 856 with a reward of 295.0128502514423


 29%|██▊       | 857/3000 [1:06:17<2:09:34,  3.63s/it]

Success in episode 857 with a reward of 238.85950573688086


 29%|██▊       | 858/3000 [1:06:44<6:19:12, 10.62s/it]

Success in episode 858 with a reward of 282.14054406269054


 29%|██▊       | 859/3000 [1:06:45<4:36:52,  7.76s/it]

Success in episode 859 with a reward of 256.5018183785013


 29%|██▊       | 860/3000 [1:06:46<3:26:53,  5.80s/it]

Episodes: 860 Episodic_Reward: 248.00604127563966
Success in episode 860 with a reward of 248.00604127563966


 29%|██▊       | 861/3000 [1:06:47<2:35:34,  4.36s/it]

Success in episode 861 with a reward of 301.8275655789405


 29%|██▊       | 862/3000 [1:07:08<5:26:52,  9.17s/it]

Success in episode 862 with a reward of 246.27531454060792


 29%|██▉       | 863/3000 [1:07:09<3:59:33,  6.73s/it]

Success in episode 863 with a reward of 275.946547430968


 29%|██▉       | 864/3000 [1:07:10<3:07:39,  5.27s/it]

Success in episode 864 with a reward of 227.46727811004152


 29%|██▉       | 865/3000 [1:07:11<2:22:06,  3.99s/it]

Success in episode 865 with a reward of 276.57750686098757


 29%|██▉       | 866/3000 [1:07:31<5:05:48,  8.60s/it]

Success in episode 866 with a reward of 285.3287057970239


 29%|██▉       | 867/3000 [1:07:32<3:44:03,  6.30s/it]

Success in episode 867 with a reward of 274.6859055187435


 29%|██▉       | 868/3000 [1:07:33<2:47:57,  4.73s/it]

Success in episode 868 with a reward of 239.79100211317441


 29%|██▉       | 869/3000 [1:07:34<2:08:42,  3.62s/it]

Success in episode 869 with a reward of 253.44039866939582


 29%|██▉       | 870/3000 [1:07:35<1:40:52,  2.84s/it]

Episodes: 870 Episodic_Reward: 261.6639103464534
Success in episode 870 with a reward of 261.6639103464534


 29%|██▉       | 872/3000 [1:07:55<3:19:22,  5.62s/it]

Success in episode 872 with a reward of 276.3311232264406


 29%|██▉       | 873/3000 [1:07:56<2:30:33,  4.25s/it]

Success in episode 873 with a reward of 236.47372881365504


 29%|██▉       | 874/3000 [1:07:57<1:58:24,  3.34s/it]

Success in episode 874 with a reward of 294.1576011822071


 29%|██▉       | 875/3000 [1:08:15<4:40:55,  7.93s/it]

Success in episode 875 with a reward of 253.8739155869244


 29%|██▉       | 876/3000 [1:08:16<3:27:40,  5.87s/it]

Success in episode 876 with a reward of 289.1619942524188


 29%|██▉       | 877/3000 [1:08:17<2:36:02,  4.41s/it]

Success in episode 877 with a reward of 256.127031853722


 29%|██▉       | 878/3000 [1:08:19<2:01:48,  3.44s/it]

Success in episode 878 with a reward of 302.77621382722805


 29%|██▉       | 879/3000 [1:08:37<4:39:07,  7.90s/it]

Success in episode 879 with a reward of 268.7050768862464


 29%|██▉       | 880/3000 [1:08:40<3:48:42,  6.47s/it]

Episodes: 880 Episodic_Reward: 284.0694923629172
Success in episode 880 with a reward of 284.0694923629172


 29%|██▉       | 881/3000 [1:08:41<2:51:09,  4.85s/it]

Success in episode 881 with a reward of 264.1470585825733


 29%|██▉       | 882/3000 [1:08:57<4:43:59,  8.05s/it]

Success in episode 882 with a reward of 254.70496561496114


 29%|██▉       | 883/3000 [1:08:58<3:32:32,  6.02s/it]

Success in episode 883 with a reward of 244.07142279194775


 29%|██▉       | 884/3000 [1:08:59<2:40:29,  4.55s/it]

Getting somewhere in 884 with a reward of 18.64301074275032


 30%|██▉       | 885/3000 [1:09:19<5:23:40,  9.18s/it]

Success in episode 885 with a reward of 254.895862543264


 30%|██▉       | 886/3000 [1:09:20<3:58:30,  6.77s/it]

Success in episode 886 with a reward of 309.50864250138045


 30%|██▉       | 887/3000 [1:09:21<2:57:55,  5.05s/it]

Getting somewhere in 887 with a reward of 66.00225471797935


 30%|██▉       | 888/3000 [1:09:22<2:16:50,  3.89s/it]

Success in episode 888 with a reward of 280.1628203565919


 30%|██▉       | 889/3000 [1:09:40<4:37:18,  7.88s/it]

Success in episode 889 with a reward of 311.580671103887


 30%|██▉       | 890/3000 [1:09:41<3:24:22,  5.81s/it]

Episodes: 890 Episodic_Reward: 301.37435558767845
Success in episode 890 with a reward of 301.37435558767845


 30%|██▉       | 891/3000 [1:09:44<2:56:19,  5.02s/it]

Success in episode 891 with a reward of 284.8877376791803


 30%|██▉       | 892/3000 [1:09:59<4:42:20,  8.04s/it]

Success in episode 892 with a reward of 252.99296482850625


 30%|██▉       | 893/3000 [1:10:00<3:30:31,  5.99s/it]

Success in episode 893 with a reward of 283.5911192198984


 30%|██▉       | 894/3000 [1:10:02<2:42:30,  4.63s/it]

Success in episode 894 with a reward of 295.152943146956


 30%|██▉       | 895/3000 [1:10:03<2:04:50,  3.56s/it]

Success in episode 895 with a reward of 282.0054639198336


 30%|██▉       | 896/3000 [1:10:22<4:53:09,  8.36s/it]

Success in episode 896 with a reward of 305.9037421843301


 30%|██▉       | 897/3000 [1:10:23<3:35:28,  6.15s/it]

Success in episode 897 with a reward of 301.3681947865818


 30%|██▉       | 898/3000 [1:10:25<2:45:04,  4.71s/it]

Success in episode 898 with a reward of 284.6320586386828


 30%|██▉       | 899/3000 [1:10:26<2:08:00,  3.66s/it]

Success in episode 899 with a reward of 232.18701545079134
Running test
Test: 1 Test_Reward: 300.4174515534189
Test: 2 Test_Reward: 269.1880929892857
Test: 3 Test_Reward: 232.42593289194838
Test: 4 Test_Reward: 273.6500263811837
Test: 5 Test_Reward: 226.1602468803434
Test: 6 Test_Reward: 287.3812678309893
Test: 8 Test_Reward: 284.53310376749425
Test: 9 Test_Reward: 281.5909848941346
Test: 10 Test_Reward: 230.83513028376706
Test: 11 Test_Reward: 283.48206863143764
Test: 12 Test_Reward: 270.530306277858
Test: 13 Test_Reward: 265.93506246224615
Test: 14 Test_Reward: 294.9238454651828
Test: 15 Test_Reward: 274.3982911667542
Test: 16 Test_Reward: 273.37854454201965
Test: 17 Test_Reward: 290.9974351488406
Test: 18 Test_Reward: 297.15854609750704
Test: 19 Test_Reward: 293.29487598216633
Test: 20 Test_Reward: 256.2934396680771
Test: 21 Test_Reward: 281.81252065873093
Test: 22 Test_Reward: 280.0502988607407
Test: 23 Test_Reward: 250.234617687204
Test: 24 Test_Reward: 294.4018493710514
Test: 25 

 30%|███       | 900/3000 [1:11:28<12:24:24, 21.27s/it]

Episodes: 900 Episodic_Reward: 264.79653561274733
Success in episode 900 with a reward of 264.79653561274733


 30%|███       | 901/3000 [1:11:29<8:53:00, 15.24s/it] 

Success in episode 901 with a reward of 258.3355260097435


 30%|███       | 902/3000 [1:11:30<6:23:14, 10.96s/it]

Success in episode 902 with a reward of 282.19777869954873


 30%|███       | 903/3000 [1:11:31<4:40:44,  8.03s/it]

Success in episode 903 with a reward of 229.49575961794247


 30%|███       | 904/3000 [1:11:32<3:27:30,  5.94s/it]

Success in episode 904 with a reward of 253.70308080873204


 30%|███       | 906/3000 [1:11:54<4:21:10,  7.48s/it]

Success in episode 906 with a reward of 271.3386265877655


 30%|███       | 907/3000 [1:11:55<3:13:54,  5.56s/it]

Success in episode 907 with a reward of 237.50536901287626


 30%|███       | 908/3000 [1:11:56<2:27:36,  4.23s/it]

Success in episode 908 with a reward of 255.73510892792373


 30%|███       | 909/3000 [1:11:57<1:54:38,  3.29s/it]

Success in episode 909 with a reward of 267.16414727371466


 30%|███       | 910/3000 [1:12:18<4:58:54,  8.58s/it]

Episodes: 910 Episodic_Reward: 225.38423146387902
Success in episode 910 with a reward of 225.38423146387902


 30%|███       | 911/3000 [1:12:19<3:40:26,  6.33s/it]

Success in episode 911 with a reward of 257.33008568080436


 30%|███       | 912/3000 [1:12:20<2:45:54,  4.77s/it]

Success in episode 912 with a reward of 257.3508828649315


 30%|███       | 913/3000 [1:12:21<2:08:26,  3.69s/it]

Success in episode 913 with a reward of 270.8123953750912


 30%|███       | 914/3000 [1:12:23<1:42:34,  2.95s/it]

Success in episode 914 with a reward of 274.19803084604035


 30%|███       | 914/3000 [1:12:26<2:45:20,  4.76s/it]


KeyboardInterrupt: 

In [18]:
# Write the agent's current critic and actor weights to disk, critic first.
# Each target is a path stem; the ".h5" suffix is appended at save time.
# NOTE(review): the directory is an absolute, machine-specific path — consider
# moving it to a configurable constant near the top of the notebook.
# NOTE(review): the files are named "BEST", but this stores whatever weights
# the agent holds when the cell runs — confirm that is the intended snapshot.
for checkpoint_stem, network in (
    ("G:/My Drive/JAXA/2020-2021/Transformer/weights/PPO_Lander_critic_BEST", PPO_agent.critic),
    ("G:/My Drive/JAXA/2020-2021/Transformer/weights/PPO_Lander_actor_BEST", PPO_agent.actor),
):
    network.save_weights(checkpoint_stem + ".h5")

In [None]:
# Scratch inspection: show the agent's `distribution_buffer` attribute as the cell output.
# NOTE(review): the buffer's contents and size are not visible here; if it is
# large, consider displaying only a slice or its length.
PPO_agent.distribution_buffer


In [None]:
# Build a Gym environment from ENV_NAME ('LunarLanderContinuous-v2', set in the config cell).
# NOTE(review): if `env` was already bound earlier in the notebook, this rebinds it — confirm intended.
env = gym.make(ENV_NAME)

In [None]:
# Draw one random action from the environment's action space; used below as a shape reference.
a = env.action_space.sample()

In [None]:
# Scratch inspection: show the agent's `last_action` attribute as the cell output.
PPO_agent.last_action

In [None]:
# Shape of the action sampled from the environment (presumably the reference to compare `b.shape` against).
a.shape

In [None]:
# Drop all size-1 dimensions from the agent's last action and convert the result to a NumPy array.
# NOTE(review): `.numpy()` assumes `last_action` is an eager tensor (or tensor-convertible) — confirm.
b = tf.squeeze(PPO_agent.last_action).numpy()

In [None]:
# Shape of the squeezed agent action, shown for comparison with `a.shape` above.
b.shape