In [1]:
import os
import sys
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.normal import Normal
from tqdm import tqdm
from collections import deque

import time
import psutil
import datetime
import subprocess
# import torch
import torchvision
from tensorboard import program
import webbrowser
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter

# ANSI escape sequences used to colorize console messages printed by this notebook.
RED = "\033[31m"
GREEN = "\033[32m"
YELLOW = "\033[33m"
BLUE = "\033[34m"
MAGENTA = "\033[35m"
CYAN = "\033[36m"
RESET = "\033[0m"

# Torch device for the notebook: CUDA when available, otherwise CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Timestamp taken when this cell runs; it names the per-run output directory.
current_time = datetime.datetime.now().strftime("%Y%m%d_%H-%M-%S")
# Relative to the notebook's working directory. Checkpoints and reward logs are
# written here by the code below.
log_dir = f"../runs/{current_time}/"
os.makedirs(log_dir, exist_ok=True)
# TensorBoard writer bound to the run directory.
writer = SummaryWriter(log_dir)
print(f"{YELLOW}[MODEL/TENSORBOARD]{RESET} The data will be saved in {YELLOW}{log_dir}{RESET} directory!")

# Disabled: launch a TensorBoard server and open it in a browser.
# NOTE(review): the --logdir below points at ../runs/franka_cabinet/, not at log_dir.
# tb = program.TensorBoard()
# tb.configure(argv=[None, '--logdir', f"../runs/franka_cabinet/{current_time}", '--port', '6300'])
# url = tb.launch()
# webbrowser.open_new(url)

# Learning rate intended for the actor (policy) network.
# NOTE(review): not read by the PPO class defined below, which sets its own optimizer rates.
lr_actor = 0.0003
# Learning rate intended for the critic (state-value) network.
# NOTE(review): not read by the PPO class defined below either.
lr_critic = 0.0003
# Number of training iterations (rollout collection + one PPO update each) run by main().
Iter = 100000
# Maximum number of environment steps in a single episode.
MAX_STEP = 1000
# Discount factor applied to future rewards.
gamma =0.98
# GAE lambda: trades bias against variance in the advantage estimate.
lambd = 0.95
# Mini-batch size used for each gradient step.
batch_size = 64
# PPO clipping range: the probability ratio is clamped to [1 - epsilon, 1 + epsilon].
epsilon = 0.2
# Weight-decay coefficient intended for the critic's Adam optimizer.
# NOTE(review): not read by the PPO class defined below.
l2_rate = 0.001

# Save checkpoints every `save_freq` training iterations.
save_freq = 100

# NOTE(review): main() assigns its own local `save_flag`; this module-level value
# is not read by the code visible in this notebook.
save_flag = False

[33m[MODEL/TENSORBOARD][0m The data will be saved in [33m../runs/20240712_02-38-11/[0m directory!


In [2]:
# Actor class: Used to choose actions of a continuous action space.

class Actor(nn.Module):
    """Gaussian policy network for a continuous action space.

    Two tanh hidden layers of 64 units feed two linear heads: ``mu`` (the
    per-dimension mean) and ``sigma`` (the per-dimension log standard
    deviation, exponentiated in :meth:`forward`).

    Args:
        N_S: Size of the state (observation) vector.
        N_A: Size of the action vector.
        chkpt_dir: Directory in which the ``_actor`` checkpoint file lives.
    """

    def __init__(self, N_S, N_A, chkpt_dir):
        super(Actor, self).__init__()
        # Layers are created in this order on purpose: it fixes the order in
        # which the global RNG is consumed for weight initialisation.
        self.fc1 = nn.Linear(N_S, 64)
        self.fc2 = nn.Linear(64, 64)
        self.sigma = nn.Linear(64, N_A)
        self.mu = nn.Linear(64, N_A)
        # Shrink the mean head so initial actions stay close to zero.
        self.mu.weight.data.mul_(0.1)
        self.mu.bias.data.mul_(0.0)
        # Actions are drawn from a diagonal Gaussian; another distribution
        # class with the same (loc, scale) constructor could be substituted.
        self.distribution = torch.distributions.Normal

        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, '_actor')

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def set_init(self, layers):
        """Re-initialise ``layers`` with N(0, 0.1) weights and zero biases."""
        for layer in layers:
            nn.init.normal_(layer.weight, mean=0., std=0.1)
            nn.init.constant_(layer.bias, 0.)

    def forward(self, s):
        """Return ``(mu, sigma)`` of the action distribution for states ``s``.

        ``s`` is a float tensor whose last dimension is ``N_S``. ``sigma`` is
        strictly positive because it is the exponential of the ``sigma`` head.
        """
        # tanh keeps activations bounded in [-1, 1].
        x = torch.tanh(self.fc1(s))
        x = torch.tanh(self.fc2(x))
        mu = self.mu(x)
        log_sigma = self.sigma(x)
        sigma = torch.exp(log_sigma)
        return mu, sigma

    def choose_action(self, s):
        """Sample one action for a single state.

        Args:
            s: Array-like state of shape ``(N_S,)`` (anything ``np.array``
                accepts; not a device tensor).

        Returns:
            ``numpy.ndarray`` of shape ``(N_A,)`` sampled from the policy.
        """
        s = torch.from_numpy(np.array(s).astype(np.float32)).unsqueeze(0).to(self.device)
        # Acting never needs gradients; skip building the autograd graph.
        with torch.no_grad():
            mu, sigma = self.forward(s)
            Pi = self.distribution(mu, sigma)
            return Pi.sample().cpu().numpy().squeeze(0)

    def save_model(self):
        """Write the network weights to ``self.checkpoint_file``."""
        torch.save(self.state_dict(), self.checkpoint_file)

    def load_model(self):
        """Load the network weights from ``self.checkpoint_file``.

        ``map_location`` remaps the stored tensors onto this model's device so
        that a checkpoint written on a GPU machine loads on a CPU-only one.
        """
        self.load_state_dict(torch.load(self.checkpoint_file, map_location=self.device))

In [3]:
# Critic class : Used to estimate V(state) the state value function through a NN.
class Critic(nn.Module):
    """State-value network V(s): two tanh hidden layers of 64 units and a scalar head.

    Args:
        N_S: Size of the state (observation) vector.
        chkpt_dir: Directory in which the ``_critic`` checkpoint file lives.
    """

    def __init__(self, N_S, chkpt_dir):
        super(Critic, self).__init__()
        self.fc1 = nn.Linear(N_S, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)
        # Scale the initial output weights by 0.1 so early value estimates are
        # small, which tends to stabilise training with tanh/sigmoid units.
        self.fc3.weight.data.mul_(0.1)
        # Set every element of the output bias to zero.
        self.fc3.bias.data.mul_(0.0)

        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, '_critic')

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)

    def set_init(self, layers):
        """Re-initialise ``layers`` with N(0, 0.1) weights and zero biases."""
        for layer in layers:
            nn.init.normal_(layer.weight, mean=0., std=0.1)
            nn.init.constant_(layer.bias, 0.)

    def forward(self, s):
        """Return value estimates of shape ``(..., 1)`` for states ``s``."""
        x = torch.tanh(self.fc1(s))
        x = torch.tanh(self.fc2(x))
        values = self.fc3(x)
        return values

    def save_model(self):
        """Write the network weights to ``self.checkpoint_file``."""
        torch.save(self.state_dict(), self.checkpoint_file)

    def load_model(self):
        """Load the network weights from ``self.checkpoint_file``.

        ``map_location`` remaps the stored tensors onto this model's device so
        that a checkpoint written on a GPU machine loads on a CPU-only one.
        """
        self.load_state_dict(torch.load(self.checkpoint_file, map_location=self.device))

In [4]:
class PPO:
    """Proximal Policy Optimization (clipped surrogate) trainer.

    Owns an ``Actor`` and a ``Critic`` together with their Adam optimizers.
    The module-level hyperparameters ``gamma``, ``lambd``, ``batch_size`` and
    ``epsilon`` are read at call time by :meth:`train` and :meth:`get_gae`.

    Args:
        N_S: Size of the state vector.
        N_A: Size of the action vector.
        log_dir: Directory handed to the networks for their checkpoints.
        actor_lr: Learning rate of the actor's Adam optimizer.
        critic_lr: Learning rate of the critic's Adam optimizer.
        critic_weight_decay: L2 weight decay of the critic's Adam optimizer.
    """

    def __init__(self, N_S, N_A, log_dir, actor_lr=1e-4, critic_lr=1e-3, critic_weight_decay=1e-3):
        self.log_dir = log_dir

        self.actor_net = Actor(N_S, N_A, log_dir)
        self.critic_net = Critic(N_S, log_dir)
        self.actor_optim = optim.Adam(self.actor_net.parameters(), lr=actor_lr)
        self.critic_optim = optim.Adam(
            self.critic_net.parameters(), lr=critic_lr, weight_decay=critic_weight_decay
        )
        self.critic_loss_func = torch.nn.MSELoss()

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def train(self, memory):
        """Run one PPO update over a collected rollout.

        Args:
            memory: Sequence of ``[state, action, reward, mask]`` transitions
                in time order; ``mask`` is 0 on the last step of an episode
                and 1 otherwise.

        Transitions beyond the last full mini-batch (``len(memory) %
        batch_size``) are not used. An empty ``memory`` is a no-op.
        """
        if len(memory) == 0:
            return

        states = torch.tensor(np.array([m[0] for m in memory]), dtype=torch.float32).to(self.device)
        actions = torch.tensor(np.array([m[1] for m in memory]), dtype=torch.float32).to(self.device)
        rewards = torch.tensor(np.array([m[2] for m in memory]), dtype=torch.float32).to(self.device)
        masks = torch.tensor(np.array([m[3] for m in memory]), dtype=torch.float32).to(self.device)

        # Everything computed from the pre-update ("old") networks is a fixed
        # target, so no autograd graph is needed for it.
        with torch.no_grad():
            # Current state values V(s) from the critic.
            values = self.critic_net(states)
            returns, advants = self.get_gae(rewards, masks, values)
            # Log-probability of the taken actions under the old policy.
            old_mu, old_std = self.actor_net(states)
            old_pi = self.actor_net.distribution(old_mu, old_std)
            old_log_prob = old_pi.log_prob(actions).sum(1, keepdim=True)

        n = len(states)
        arr = np.arange(n)
        for epoch in range(1):
            np.random.shuffle(arr)
            for i in range(n // batch_size):
                b_index = arr[batch_size * i:batch_size * (i + 1)]
                b_states = states[b_index]
                b_advants = advants[b_index].unsqueeze(1)
                b_actions = actions[b_index]
                b_returns = returns[b_index].unsqueeze(1)

                # Log-probability of the same actions under the current policy.
                mu, std = self.actor_net(b_states)
                pi = self.actor_net.distribution(mu, std)
                new_prob = pi.log_prob(b_actions).sum(1, keepdim=True)
                old_prob = old_log_prob[b_index]
                # A fixed KL penalty (see kl_divergence) is an alternative to
                # clipping; the original author found clipping works better.
                ratio = torch.exp(new_prob - old_prob)
                surrogate_loss = ratio * b_advants

                # Critic step: regress V(s) onto the discounted returns.
                b_values = self.critic_net(b_states)
                critic_loss = self.critic_loss_func(b_values, b_returns)
                self.critic_optim.zero_grad()
                critic_loss.backward()
                self.critic_optim.step()

                # Clipped surrogate objective.
                clipped_ratio = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon)
                clipped_loss = clipped_ratio * b_advants
                actor_loss = -torch.min(surrogate_loss, clipped_loss).mean()

                # Add the log-barrier term on the per-sample x-velocity.
                # Stacking keeps the values on the device instead of
                # round-tripping each scalar through Python.
                # NOTE(review): this term is computed from the stored states
                # only, so it does not depend on the actor's parameters and
                # contributes no gradient to the actor step -- confirm intent.
                walker_xvel = torch.stack([get_walker_x_velocity(state) for state in b_states])
                actor_loss = augmented_objective(actor_loss, walker_xvel, 3, 20)

                # Actor step.
                self.actor_optim.zero_grad()
                actor_loss.backward()
                self.actor_optim.step()

    def kl_divergence(self, old_mu, old_sigma, mu, sigma):
        """Return KL(old || new) between two diagonal Gaussians, summed over dim 1.

        The old parameters are detached so gradients flow only through the new
        policy. Per dimension:
        ``log(sigma / old_sigma) + (old_sigma^2 + (old_mu - mu)^2) / (2 sigma^2) - 1/2``.
        Currently unused by :meth:`train`; kept for KL-penalty experiments.
        """
        old_mu = old_mu.detach()
        old_sigma = old_sigma.detach()

        # The log term is log(sigma_new) - log(sigma_old); the opposite sign
        # would allow a negative result, which a KL divergence cannot have.
        kl = torch.log(sigma) - torch.log(old_sigma) + (old_sigma.pow(2) + (old_mu - mu).pow(2)) / \
             (2.0 * sigma.pow(2)) - 0.5
        return kl.sum(1, keepdim=True)

    def get_gae(self, rewards, masks, values):
        """Compute discounted returns and normalized GAE advantages.

        Args:
            rewards: Per-step rewards, length ``T``.
            masks: Per-step continuation flags (0 at episode end), length ``T``.
            values: Critic estimates for the same steps, ``(T, 1)`` or ``(T,)``.

        Returns:
            ``(returns, advants)``, both of shape ``(T,)``. ``advants`` is
            standardized to zero mean and unit standard deviation.
        """
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=self.device)
        masks = torch.as_tensor(masks, dtype=torch.float32, device=self.device)
        # Values are treated as constants here; flatten so each step is a scalar.
        values = values.detach().reshape(-1)

        returns = torch.zeros_like(rewards)
        advants = torch.zeros_like(rewards)

        running_returns = 0
        previous_value = 0
        running_advants = 0
        # Walk the rollout backwards; a zero mask cuts the recursion at an
        # episode boundary.
        for t in reversed(range(0, len(rewards))):
            # Discounted return.
            running_returns = rewards[t] + gamma * running_returns * masks[t]
            # TD error: r_t + gamma * V(s_{t+1}) - V(s_t).
            running_tderror = rewards[t] + gamma * previous_value * masks[t] - values[t]
            # GAE recursion.
            running_advants = running_tderror + gamma * lambd * running_advants * masks[t]

            returns[t] = running_returns
            previous_value = values[t]
            advants[t] = running_advants

        # Standardize. The epsilon (and the single-sample guard) prevent a
        # 0/0 = NaN when all advantages are equal or only one step exists.
        std = advants.std() if advants.numel() > 1 else advants.new_zeros(())
        advants = (advants - advants.mean()) / (std + 1e-8)
        return returns, advants

    def save(self, filename):
        """Save both networks and both optimizers under the prefix ``filename``."""
        filename = str(filename)
        torch.save(self.actor_net.state_dict(), filename + "_actor")
        torch.save(self.critic_net.state_dict(), filename + "_critic")
        torch.save(self.actor_optim.state_dict(), filename + "_actor_optimizer")
        torch.save(self.critic_optim.state_dict(), filename + "_critic_optimizer")

    def load(self, filename):
        """Load what :meth:`save` wrote, remapping tensors onto ``self.device``."""
        filename = str(filename)
        self.actor_net.load_state_dict(torch.load(filename + "_actor", map_location=self.device))
        self.critic_net.load_state_dict(torch.load(filename + "_critic", map_location=self.device))
        self.actor_optim.load_state_dict(torch.load(filename + "_actor_optimizer", map_location=self.device))
        self.critic_optim.load_state_dict(torch.load(filename + "_critic_optimizer", map_location=self.device))

In [5]:
# Creation of a class to normalize the states
class Normalize:
    """Running (Welford) mean/std normalizer for state vectors.

    Calling the instance standardizes ``x`` with the running statistics and
    clips the result to [-5, 5]. By default each call also folds ``x`` into
    the statistics.

    Args:
        N_S: Size of the state vector.
        chkpt_dir: Directory in which the ``_normalize.npy`` file lives.
    """

    def __init__(self, N_S, chkpt_dir):
        self.mean = np.zeros((N_S,))
        self.std = np.zeros((N_S, ))
        # Running sum of squared deviations from the mean (Welford's M2).
        self.stdd = np.zeros((N_S, ))
        # Number of samples folded into the statistics so far.
        self.n = 0

        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir, '_normalize')

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    def _checkpoint_path(self):
        """Return the on-disk path: ``np.save`` appends ``.npy`` when it is missing."""
        path = self.checkpoint_file
        return path if path.endswith('.npy') else path + '.npy'

    def __call__(self, x, update=True):
        """Normalize ``x``; when ``update`` is true, first fold it into the statistics.

        Args:
            x: Array-like state of shape ``(N_S,)``.
            update: Pass ``False`` to apply frozen statistics (e.g. at
                evaluation time after :meth:`load_params`).

        Returns:
            ``(x - mean) / (std + 1e-8)`` clipped to [-5, 5].
        """
        x = np.asarray(x)
        if update:
            self.n += 1
            if self.n == 1:
                # Copy so later in-place changes to the caller's array cannot
                # alter the stored mean. A single sample has no spread yet.
                self.mean = x.copy()
                self.std = np.zeros(x.shape)
            else:
                old_mean = self.mean
                self.mean = old_mean + (x - old_mean) / self.n
                self.stdd = self.stdd + (x - old_mean) * (x - self.mean)
                self.std = np.sqrt(self.stdd / (self.n - 1))

        x = x - self.mean
        x = x / (self.std + 1e-8)
        x = np.clip(x, -5, +5)
        return x

    def update(self, x):
        """Overwrite mean/std with the column statistics of the batch ``x``."""
        self.mean = np.mean(x, axis=0)
        self.std = np.std(x, axis=0) + 1e-8

    def save_params(self):
        """Persist the full running state (mean, std, M2 and sample count)."""
        np.save(
            self._checkpoint_path(),
            {'mean': self.mean, 'std': self.std, 'stdd': self.stdd, 'n': self.n},
        )

    def load_params(self):
        """Restore the state written by :meth:`save_params`.

        Files that contain only ``mean`` and ``std`` are still accepted; the
        sample count and M2 then keep their current values.
        """
        # SECURITY: allow_pickle=True executes pickled code; only load files
        # this program wrote itself.
        params = np.load(self._checkpoint_path(), allow_pickle=True).item()
        self.mean = params['mean']
        self.std = params['std']
        self.stdd = params.get('stdd', self.stdd)
        self.n = params.get('n', self.n)

In [6]:
def test_model(env, model, episodes=10):
    scores = []
    for episode in range(episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        while not done:
            env.render()
            state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(DEVICE)
            action = model.actor_net.choose_action(state)
            state, reward, done, _, _ = env.step(action)
            total_reward += reward
        scores.append(total_reward)
        print(f"Episode {episode + 1}: Total Reward: {total_reward}")
    print(f"Average Reward over {episodes} episodes: {np.mean(scores)}")
    env.close()
    
def get_walker_x_velocity(state):
    """Return element 8 of ``state``.

    NOTE(review): presumably the torso x-velocity entry of the Walker2d
    observation -- confirm against the environment's observation layout.
    """
    x_velocity_index = 8
    return state[x_velocity_index]

def logarithmic_barrier(state, constraint_max, eps=1e-8):
    """Log-barrier penalty ``-log(constraint_max - state)`` for ``state < constraint_max``.

    Args:
        state: Tensor of constrained quantities.
        constraint_max: Upper bound the quantities should stay below.
        eps: Smallest slack passed to the logarithm. Values at or beyond the
            bound are clamped to it, so they yield the large finite penalty
            ``-log(eps)`` instead of ``inf``/``NaN``.

    Returns:
        Tensor of penalties with the same shape as ``state``.
    """
    slack = torch.clamp(constraint_max - state, min=eps)
    return -torch.log(slack)

def augmented_objective(actor_loss, state, constraint_max, t):
    """Add the mean log-barrier penalty, scaled by ``1 / t``, to ``actor_loss``.

    ``state`` and ``constraint_max`` are forwarded to ``logarithmic_barrier``;
    a larger ``t`` weakens the penalty.
    """
    penalty = logarithmic_barrier(state, constraint_max)
    scaled_penalty = penalty / t
    return actor_loss + scaled_penalty.mean()

In [None]:
def main():
    """Train a PPO agent on ``Walker2d-v4``.

    Each iteration collects at least ``horizon`` environment steps of whole
    episodes, prints the mean episode score and mean recorded x-velocity, and
    then runs one PPO update. Every ``save_freq`` iterations the networks, the
    normalizer and the reward history are written to ``log_dir``.
    """
    # Minimum number of environment steps gathered before each PPO update.
    horizon = 2048

    env = gym.make('Walker2d-v4', render_mode='rgb_array')
    try:
        # Sizes of the state and action vectors.
        N_S = env.observation_space.shape[0]
        N_A = env.action_space.shape[0]

        # Random seed initialization
        # env.seed(500)
        # torch.manual_seed(500)
        # np.random.seed(500)

        ppo = PPO(N_S, N_A, log_dir)
        # ppo.actor_net.load_model("../runs/20240708_11-19-08/ppo/100000/")
        # ppo.critic_net.load_model("../runs/20240708_11-19-08/ppo/100000/")

        # Running state normalization; its statistics are updated on every call.
        normalize = Normalize(N_S, log_dir)
        episodes = 0
        episode_data = []

        for iteration in tqdm(range(Iter)):
            memory = deque()
            scores = []
            steps = 0
            xvel = []
            while steps < horizon:
                episodes += 1
                state, _ = env.reset()
                s = normalize(state)
                score = 0
                for _ in range(MAX_STEP):
                    steps += 1
                    # Sample an action vector of size N_A from the current policy.
                    a = ppo.actor_net.choose_action(s)
                    # Gymnasium returns (obs, reward, terminated, truncated, info).
                    s_, r, terminated, truncated, info = env.step(a)
                    s_ = normalize(s_)
                    done = terminated or truncated

                    # mask is 0 on the last step of an episode, 1 otherwise.
                    mask = 0 if done else 1
                    memory.append([s, a, r, mask])
                    # NOTE(review): `s` is the normalized state, so this is the
                    # normalized value of element 8, not a raw velocity.
                    xvel.append(s[8])
                    score += r
                    s = s_

                    if done:
                        break
                # with open('log_' + args.env_name  + '.txt', 'a') as outfile:
                #     outfile.write('\t' + str(episodes)  + '\t' + str(score) + '\n')
                scores.append(score)
            score_avg = np.mean(scores)
            xvel_avg = np.mean(xvel)
            print('{} episode score is {:.2f}, average_xvel is {:.3f}'.format(episodes, score_avg, xvel_avg))
            episode_data.append([iteration + 1, score_avg])

            # Checkpoint before the update of this iteration is applied.
            if (iteration + 1) % save_freq == 0:
                ppo.actor_net.save_model()
                ppo.critic_net.save_model()
                normalize.save_params()
                print(f"{GREEN} >> Successfully saved models! {RESET}")
                # path = log_dir + "ppo/" + str((iteration + 1)) + "/"
                # os.makedirs(path, exist_ok=True)
                # ppo.save(path)

                np.save(log_dir + "reward.npy", episode_data)

            ppo.train(memory)
    finally:
        # Release the simulator/renderer even if training is interrupted.
        env.close()


if __name__ == "__main__":
    main()

  0%|          | 0/100000 [00:00<?, ?it/s]

109 episode score is -0.10, average_xvel is 0.046


  0%|          | 1/100000 [00:03<110:32:47,  3.98s/it]

210 episode score is 0.33, average_xvel is 0.029


  0%|          | 2/100000 [00:07<98:59:16,  3.56s/it] 

315 episode score is 0.53, average_xvel is 0.027


  0%|          | 3/100000 [00:10<95:19:39,  3.43s/it]

421 episode score is 1.35, average_xvel is 0.078


  0%|          | 4/100000 [00:13<93:41:34,  3.37s/it]

523 episode score is 0.81, average_xvel is 0.032


  0%|          | 5/100000 [00:17<92:21:36,  3.33s/it]

617 episode score is 2.13, average_xvel is 0.100


  0%|          | 6/100000 [00:20<91:32:36,  3.30s/it]

718 episode score is 1.69, average_xvel is 0.072


  0%|          | 7/100000 [00:23<91:17:10,  3.29s/it]

806 episode score is 3.34, average_xvel is 0.142


  0%|          | 8/100000 [00:26<91:05:50,  3.28s/it]

895 episode score is 4.46, average_xvel is 0.199


  0%|          | 9/100000 [00:30<91:00:37,  3.28s/it]

988 episode score is 3.83, average_xvel is 0.149


  0%|          | 10/100000 [00:33<90:56:10,  3.27s/it]

1070 episode score is 5.34, average_xvel is 0.195


  0%|          | 11/100000 [00:36<90:43:14,  3.27s/it]

1141 episode score is 6.82, average_xvel is 0.205


  0%|          | 12/100000 [00:39<91:14:45,  3.29s/it]

1213 episode score is 7.70, average_xvel is 0.249


  0%|          | 13/100000 [00:43<91:00:25,  3.28s/it]

1279 episode score is 8.69, average_xvel is 0.235


  0%|          | 14/100000 [00:46<91:04:55,  3.28s/it]

1332 episode score is 9.78, average_xvel is 0.179


  0%|          | 15/100000 [00:49<90:54:05,  3.27s/it]

1382 episode score is 16.20, average_xvel is 0.373


  0%|          | 16/100000 [00:53<91:00:23,  3.28s/it]

1431 episode score is 19.89, average_xvel is 0.458


  0%|          | 17/100000 [00:56<91:00:48,  3.28s/it]

1477 episode score is 21.03, average_xvel is 0.426


  0%|          | 18/100000 [00:59<91:05:58,  3.28s/it]

1519 episode score is 22.74, average_xvel is 0.387


  0%|          | 19/100000 [01:02<90:49:18,  3.27s/it]

1558 episode score is 30.70, average_xvel is 0.529


  0%|          | 20/100000 [01:06<90:58:10,  3.28s/it]

1595 episode score is 33.89, average_xvel is 0.523


  0%|          | 21/100000 [01:09<91:20:30,  3.29s/it]

1627 episode score is 43.69, average_xvel is 0.593


  0%|          | 22/100000 [01:12<91:15:24,  3.29s/it]

1657 episode score is 43.32, average_xvel is 0.475


  0%|          | 23/100000 [01:16<91:43:03,  3.30s/it]

1675 episode score is 113.08, average_xvel is 0.944


  0%|          | 24/100000 [01:19<91:21:24,  3.29s/it]

1693 episode score is 123.20, average_xvel is 0.966


  0%|          | 25/100000 [01:22<91:18:10,  3.29s/it]

1710 episode score is 120.56, average_xvel is 0.777


  0%|          | 26/100000 [01:26<93:30:04,  3.37s/it]

1726 episode score is 118.12, average_xvel is 0.691


  0%|          | 27/100000 [01:29<92:48:11,  3.34s/it]

1742 episode score is 133.53, average_xvel is 0.779


  0%|          | 28/100000 [01:32<93:01:00,  3.35s/it]

1756 episode score is 176.43, average_xvel is 0.946


  0%|          | 29/100000 [01:36<92:45:25,  3.34s/it]

1772 episode score is 127.82, average_xvel is 0.671


  0%|          | 30/100000 [01:39<92:16:23,  3.32s/it]

1784 episode score is 205.17, average_xvel is 0.881


  0%|          | 31/100000 [01:42<91:53:15,  3.31s/it]

1796 episode score is 190.28, average_xvel is 0.738


  0%|          | 32/100000 [01:45<91:48:49,  3.31s/it]

1811 episode score is 188.20, average_xvel is 0.984


  0%|          | 33/100000 [01:49<92:05:49,  3.32s/it]

1826 episode score is 186.28, average_xvel is 0.966


  0%|          | 34/100000 [01:52<91:39:26,  3.30s/it]

1840 episode score is 188.32, average_xvel is 0.819


  0%|          | 35/100000 [01:55<91:34:50,  3.30s/it]

1854 episode score is 233.87, average_xvel is 1.138


  0%|          | 36/100000 [01:59<91:22:06,  3.29s/it]

1868 episode score is 216.79, average_xvel is 0.961


  0%|          | 37/100000 [02:02<91:43:31,  3.30s/it]

1878 episode score is 297.35, average_xvel is 0.790


  0%|          | 38/100000 [02:06<93:48:15,  3.38s/it]

1892 episode score is 237.53, average_xvel is 0.892


  0%|          | 39/100000 [02:09<95:50:51,  3.45s/it]

1903 episode score is 234.76, average_xvel is 0.642


  0%|          | 40/100000 [02:12<94:34:08,  3.41s/it]

1919 episode score is 182.08, average_xvel is 0.717


  0%|          | 41/100000 [02:16<95:13:53,  3.43s/it]

1932 episode score is 226.09, average_xvel is 0.721


  0%|          | 42/100000 [02:19<95:24:21,  3.44s/it]

1947 episode score is 209.61, average_xvel is 0.794


  0%|          | 43/100000 [02:23<95:47:57,  3.45s/it]

1958 episode score is 298.08, average_xvel is 0.795


  0%|          | 44/100000 [02:26<96:43:45,  3.48s/it]

1966 episode score is 321.36, average_xvel is 0.493


  0%|          | 45/100000 [02:30<96:05:12,  3.46s/it]

1976 episode score is 316.39, average_xvel is 0.808


  0%|          | 46/100000 [02:33<95:02:57,  3.42s/it]

1988 episode score is 250.75, average_xvel is 0.726


  0%|          | 47/100000 [02:36<94:05:41,  3.39s/it]

2000 episode score is 273.83, average_xvel is 0.752


  0%|          | 48/100000 [02:40<95:04:20,  3.42s/it]

2013 episode score is 213.97, average_xvel is 0.586


  0%|          | 49/100000 [02:43<93:39:38,  3.37s/it]

2025 episode score is 274.70, average_xvel is 0.662


  0%|          | 50/100000 [02:47<96:04:17,  3.46s/it]

2037 episode score is 226.95, average_xvel is 0.505


  0%|          | 51/100000 [02:50<95:07:00,  3.43s/it]

2048 episode score is 265.25, average_xvel is 0.566


  0%|          | 52/100000 [02:54<95:01:39,  3.42s/it]

2060 episode score is 280.22, average_xvel is 0.734


  0%|          | 53/100000 [02:57<95:33:30,  3.44s/it]

2072 episode score is 297.06, average_xvel is 0.799


  0%|          | 54/100000 [03:01<96:06:57,  3.46s/it]

2085 episode score is 234.26, average_xvel is 0.531


  0%|          | 55/100000 [03:04<96:33:46,  3.48s/it]

2096 episode score is 230.00, average_xvel is 0.370


  0%|          | 56/100000 [03:07<94:50:57,  3.42s/it]

2108 episode score is 272.73, average_xvel is 0.713


  0%|          | 57/100000 [03:11<93:45:17,  3.38s/it]

2122 episode score is 210.46, average_xvel is 0.514


  0%|          | 58/100000 [03:14<93:36:55,  3.37s/it]

2134 episode score is 259.02, average_xvel is 0.606


  0%|          | 59/100000 [03:18<94:09:38,  3.39s/it]

2146 episode score is 276.21, average_xvel is 0.623


  0%|          | 60/100000 [03:21<94:49:45,  3.42s/it]

2161 episode score is 214.33, average_xvel is 0.615


  0%|          | 61/100000 [03:24<94:23:13,  3.40s/it]

2173 episode score is 256.61, average_xvel is 0.537


  0%|          | 62/100000 [03:28<94:06:22,  3.39s/it]

2184 episode score is 282.54, average_xvel is 0.562


  0%|          | 63/100000 [03:31<93:37:35,  3.37s/it]

2195 episode score is 269.93, average_xvel is 0.512


  0%|          | 64/100000 [03:34<92:42:13,  3.34s/it]

2208 episode score is 264.45, average_xvel is 0.587


  0%|          | 65/100000 [03:38<94:41:15,  3.41s/it]

2221 episode score is 244.99, average_xvel is 0.500


  0%|          | 66/100000 [03:41<95:26:26,  3.44s/it]

2235 episode score is 247.16, average_xvel is 0.578


  0%|          | 67/100000 [03:45<96:31:25,  3.48s/it]

2249 episode score is 245.98, average_xvel is 0.672


  0%|          | 68/100000 [03:48<95:15:02,  3.43s/it]

2263 episode score is 220.60, average_xvel is 0.503


  0%|          | 69/100000 [03:52<94:18:23,  3.40s/it]

2276 episode score is 252.47, average_xvel is 0.565


  0%|          | 70/100000 [03:55<94:02:31,  3.39s/it]

2290 episode score is 212.57, average_xvel is 0.429


  0%|          | 71/100000 [03:58<93:19:20,  3.36s/it]

2304 episode score is 227.51, average_xvel is 0.556


  0%|          | 72/100000 [04:02<92:25:09,  3.33s/it]

2315 episode score is 297.69, average_xvel is 0.493


  0%|          | 73/100000 [04:05<93:34:20,  3.37s/it]

2326 episode score is 304.60, average_xvel is 0.588


  0%|          | 74/100000 [04:08<93:07:45,  3.36s/it]

2337 episode score is 288.96, average_xvel is 0.492


  0%|          | 75/100000 [04:12<92:49:58,  3.34s/it]

2349 episode score is 269.39, average_xvel is 0.519


  0%|          | 76/100000 [04:15<92:56:05,  3.35s/it]

2361 episode score is 273.33, average_xvel is 0.564


  0%|          | 77/100000 [04:18<92:26:10,  3.33s/it]

2374 episode score is 235.44, average_xvel is 0.376


  0%|          | 78/100000 [04:22<93:17:43,  3.36s/it]

2385 episode score is 264.56, average_xvel is 0.261


  0%|          | 79/100000 [04:25<94:47:21,  3.42s/it]

2396 episode score is 281.42, average_xvel is 0.455


  0%|          | 80/100000 [04:29<93:33:32,  3.37s/it]

2406 episode score is 305.86, average_xvel is 0.373


  0%|          | 81/100000 [04:32<93:56:52,  3.38s/it]

2419 episode score is 229.84, average_xvel is 0.401


  0%|          | 82/100000 [04:35<93:05:21,  3.35s/it]

2431 episode score is 274.96, average_xvel is 0.430


  0%|          | 83/100000 [04:39<94:23:27,  3.40s/it]

2443 episode score is 243.91, average_xvel is 0.277


  0%|          | 84/100000 [04:42<94:44:33,  3.41s/it]

2457 episode score is 232.08, average_xvel is 0.320


  0%|          | 85/100000 [04:46<97:09:44,  3.50s/it]

2468 episode score is 309.55, average_xvel is 0.533


  0%|          | 86/100000 [04:49<96:09:52,  3.46s/it]

2482 episode score is 215.53, average_xvel is 0.369


  0%|          | 87/100000 [04:53<94:54:02,  3.42s/it]

2494 episode score is 247.77, average_xvel is 0.354


  0%|          | 88/100000 [04:56<93:46:46,  3.38s/it]

2505 episode score is 283.40, average_xvel is 0.416


  0%|          | 89/100000 [04:59<93:14:35,  3.36s/it]

2518 episode score is 260.19, average_xvel is 0.367


  0%|          | 90/100000 [05:03<96:07:05,  3.46s/it]

2529 episode score is 237.23, average_xvel is 0.113


  0%|          | 91/100000 [05:06<95:31:31,  3.44s/it]

2542 episode score is 262.64, average_xvel is 0.555


  0%|          | 92/100000 [05:10<94:30:32,  3.41s/it]

2554 episode score is 287.10, average_xvel is 0.578


  0%|          | 93/100000 [05:13<92:59:03,  3.35s/it]

2566 episode score is 277.85, average_xvel is 0.446


  0%|          | 94/100000 [05:16<93:35:58,  3.37s/it]

2577 episode score is 297.61, average_xvel is 0.445


  0%|          | 95/100000 [05:20<93:24:29,  3.37s/it]

2589 episode score is 281.27, average_xvel is 0.491


  0%|          | 96/100000 [05:23<93:23:42,  3.37s/it]

2603 episode score is 243.77, average_xvel is 0.471


  0%|          | 97/100000 [05:26<93:45:04,  3.38s/it]

2615 episode score is 297.20, average_xvel is 0.506


  0%|          | 98/100000 [05:30<94:37:34,  3.41s/it]

2626 episode score is 306.97, average_xvel is 0.490


  0%|          | 99/100000 [05:33<93:57:28,  3.39s/it]

2640 episode score is 239.46, average_xvel is 0.476
[32m >> Successfully saved models! [0m


  0%|          | 100/100000 [05:37<93:42:48,  3.38s/it]

2652 episode score is 269.51, average_xvel is 0.439


  0%|          | 101/100000 [05:40<92:43:00,  3.34s/it]

2666 episode score is 272.35, average_xvel is 0.586


  0%|          | 102/100000 [05:43<94:09:29,  3.39s/it]

2679 episode score is 276.70, average_xvel is 0.540


  0%|          | 103/100000 [05:47<94:19:01,  3.40s/it]

2691 episode score is 275.44, average_xvel is 0.432


  0%|          | 104/100000 [05:50<93:51:55,  3.38s/it]

2704 episode score is 267.38, average_xvel is 0.430


  0%|          | 105/100000 [05:54<94:55:20,  3.42s/it]

2716 episode score is 277.41, average_xvel is 0.310


  0%|          | 106/100000 [05:57<96:26:12,  3.48s/it]

2729 episode score is 276.34, average_xvel is 0.549


  0%|          | 107/100000 [06:01<95:29:01,  3.44s/it]

2742 episode score is 272.86, average_xvel is 0.467


  0%|          | 108/100000 [06:04<97:22:13,  3.51s/it]

2755 episode score is 256.27, average_xvel is 0.391


  0%|          | 109/100000 [06:08<96:17:03,  3.47s/it]

2767 episode score is 286.26, average_xvel is 0.500


  0%|          | 110/100000 [06:11<94:45:32,  3.42s/it]

2779 episode score is 272.64, average_xvel is 0.356


  0%|          | 111/100000 [06:14<94:34:46,  3.41s/it]

2791 episode score is 276.55, average_xvel is 0.353


  0%|          | 112/100000 [06:18<94:48:23,  3.42s/it]

2802 episode score is 285.92, average_xvel is 0.335


  0%|          | 113/100000 [06:21<93:45:35,  3.38s/it]

2816 episode score is 280.22, average_xvel is 0.532


  0%|          | 114/100000 [06:25<96:05:23,  3.46s/it]

2827 episode score is 292.05, average_xvel is 0.364


  0%|          | 115/100000 [06:28<94:43:43,  3.41s/it]

2841 episode score is 253.29, average_xvel is 0.436


  0%|          | 116/100000 [06:31<95:09:22,  3.43s/it]

2853 episode score is 296.80, average_xvel is 0.410


  0%|          | 117/100000 [06:35<96:01:29,  3.46s/it]

2865 episode score is 273.57, average_xvel is 0.376


  0%|          | 118/100000 [06:38<94:43:47,  3.41s/it]

2878 episode score is 282.68, average_xvel is 0.491


  0%|          | 119/100000 [06:42<95:10:17,  3.43s/it]

2890 episode score is 276.13, average_xvel is 0.313


  0%|          | 120/100000 [06:45<95:15:26,  3.43s/it]

2903 episode score is 255.31, average_xvel is 0.376


  0%|          | 121/100000 [06:49<94:19:30,  3.40s/it]

2915 episode score is 287.14, average_xvel is 0.352


  0%|          | 122/100000 [06:52<95:01:11,  3.42s/it]

2928 episode score is 272.91, average_xvel is 0.518


  0%|          | 123/100000 [06:55<93:37:52,  3.37s/it]

2941 episode score is 281.49, average_xvel is 0.468


  0%|          | 124/100000 [06:59<94:28:05,  3.41s/it]

2955 episode score is 263.89, average_xvel is 0.562


  0%|          | 125/100000 [07:02<93:49:00,  3.38s/it]

2968 episode score is 279.61, average_xvel is 0.541


  0%|          | 126/100000 [07:05<93:16:27,  3.36s/it]

2982 episode score is 265.57, average_xvel is 0.494


  0%|          | 127/100000 [07:09<94:09:27,  3.39s/it]

2997 episode score is 256.53, average_xvel is 0.533


  0%|          | 128/100000 [07:12<95:25:08,  3.44s/it]

3011 episode score is 265.15, average_xvel is 0.499


  0%|          | 129/100000 [07:16<95:31:25,  3.44s/it]

3023 episode score is 297.25, average_xvel is 0.453


  0%|          | 130/100000 [07:19<95:07:56,  3.43s/it]

3036 episode score is 277.36, average_xvel is 0.515


  0%|          | 131/100000 [07:23<94:05:06,  3.39s/it]

3049 episode score is 283.25, average_xvel is 0.480


  0%|          | 132/100000 [07:26<94:25:30,  3.40s/it]

3061 episode score is 286.49, average_xvel is 0.408


  0%|          | 133/100000 [07:29<93:36:17,  3.37s/it]

3074 episode score is 255.77, average_xvel is 0.361


  0%|          | 134/100000 [07:33<93:06:26,  3.36s/it]

3087 episode score is 278.55, average_xvel is 0.449


  0%|          | 135/100000 [07:36<93:31:37,  3.37s/it]

3099 episode score is 260.30, average_xvel is 0.160


  0%|          | 136/100000 [07:39<93:59:09,  3.39s/it]

3112 episode score is 280.23, average_xvel is 0.476


  0%|          | 137/100000 [07:43<93:39:11,  3.38s/it]

3125 episode score is 281.93, average_xvel is 0.472


  0%|          | 138/100000 [07:46<93:45:37,  3.38s/it]

3137 episode score is 293.40, average_xvel is 0.436


  0%|          | 139/100000 [07:49<93:10:29,  3.36s/it]

3150 episode score is 284.52, average_xvel is 0.504


  0%|          | 140/100000 [07:53<92:47:02,  3.34s/it]

3163 episode score is 279.47, average_xvel is 0.493


  0%|          | 141/100000 [07:56<92:25:24,  3.33s/it]

3176 episode score is 292.43, average_xvel is 0.486


  0%|          | 142/100000 [08:00<94:33:03,  3.41s/it]

3190 episode score is 274.39, average_xvel is 0.515


  0%|          | 143/100000 [08:03<94:39:06,  3.41s/it]

3201 episode score is 302.36, average_xvel is 0.211


  0%|          | 144/100000 [08:07<95:26:26,  3.44s/it]

3213 episode score is 285.72, average_xvel is 0.237


  0%|          | 145/100000 [08:10<96:29:59,  3.48s/it]

3227 episode score is 250.62, average_xvel is 0.373


  0%|          | 146/100000 [08:14<95:55:04,  3.46s/it]

3240 episode score is 276.93, average_xvel is 0.459


  0%|          | 147/100000 [08:17<94:37:52,  3.41s/it]

3254 episode score is 270.07, average_xvel is 0.486


  0%|          | 148/100000 [08:20<94:09:14,  3.39s/it]

3268 episode score is 265.82, average_xvel is 0.436


  0%|          | 149/100000 [08:24<94:26:29,  3.40s/it]

3281 episode score is 254.08, average_xvel is 0.313


  0%|          | 150/100000 [08:27<93:22:09,  3.37s/it]

3295 episode score is 265.56, average_xvel is 0.495


  0%|          | 151/100000 [08:30<92:46:10,  3.34s/it]

3308 episode score is 248.73, average_xvel is 0.209


  0%|          | 152/100000 [08:34<93:24:55,  3.37s/it]

3321 episode score is 285.85, average_xvel is 0.471


  0%|          | 153/100000 [08:37<93:22:15,  3.37s/it]

3334 episode score is 278.91, average_xvel is 0.458


  0%|          | 154/100000 [08:40<92:51:18,  3.35s/it]

3347 episode score is 272.03, average_xvel is 0.328


  0%|          | 155/100000 [08:44<93:39:16,  3.38s/it]

3361 episode score is 278.33, average_xvel is 0.493


  0%|          | 156/100000 [08:47<94:28:59,  3.41s/it]

3375 episode score is 269.97, average_xvel is 0.469


  0%|          | 157/100000 [08:51<94:21:57,  3.40s/it]

3388 episode score is 278.70, average_xvel is 0.451


  0%|          | 158/100000 [08:54<93:26:23,  3.37s/it]

3403 episode score is 265.58, average_xvel is 0.541


  0%|          | 159/100000 [08:57<94:07:38,  3.39s/it]

3416 episode score is 268.14, average_xvel is 0.308


  0%|          | 160/100000 [09:01<94:14:17,  3.40s/it]

3430 episode score is 272.83, average_xvel is 0.476


  0%|          | 161/100000 [09:04<94:12:53,  3.40s/it]

3444 episode score is 269.17, average_xvel is 0.491


  0%|          | 162/100000 [09:07<93:26:30,  3.37s/it]

3457 episode score is 289.09, average_xvel is 0.440


  0%|          | 163/100000 [09:11<93:49:04,  3.38s/it]

3470 episode score is 293.51, average_xvel is 0.423


  0%|          | 164/100000 [09:14<95:25:37,  3.44s/it]

3482 episode score is 305.35, average_xvel is 0.450


  0%|          | 165/100000 [09:18<94:01:53,  3.39s/it]

3494 episode score is 306.91, average_xvel is 0.384


  0%|          | 166/100000 [09:22<98:18:29,  3.54s/it]

3506 episode score is 282.33, average_xvel is 0.314


  0%|          | 167/100000 [09:25<96:00:21,  3.46s/it]

3519 episode score is 304.96, average_xvel is 0.494


  0%|          | 168/100000 [09:28<96:09:53,  3.47s/it]

3532 episode score is 300.54, average_xvel is 0.472


  0%|          | 169/100000 [09:32<98:52:12,  3.57s/it]

3546 episode score is 281.78, average_xvel is 0.501


  0%|          | 170/100000 [09:36<103:00:06,  3.71s/it]

3559 episode score is 296.72, average_xvel is 0.470


  0%|          | 171/100000 [09:40<100:18:37,  3.62s/it]

3571 episode score is 329.22, average_xvel is 0.434


  0%|          | 172/100000 [09:43<99:47:38,  3.60s/it] 

3583 episode score is 298.73, average_xvel is 0.325


  0%|          | 173/100000 [09:47<98:18:04,  3.54s/it]

In [None]:
# Report the two running averages, one per line, in a fixed order.
# NOTE(review): score_avg and xvel_avg are presumably left in the kernel by
# the training cell — confirm that cell has run before executing this one.
for _label, _value in (
    ('average score: ', score_avg),
    ('average xvel:  ', xvel_avg),
):
    # Same positional print as before: label, default single-space separator, value.
    print(_label, _value)

In [None]:
ppo.actor_net.eval()