# **Implementation of PPO in a MuJoCo environment.**

## Requirements

In [None]:
# Install the Python and system packages this notebook relies on when run in Google Colab.
# `mujoco` and `pyvirtualdisplay` are imported by the import cell of this notebook.
!pip install mujoco
!pip install pyvirtualdisplay
# System packages: OpenGL bindings and the Xvfb virtual framebuffer
# (presumably the backend used by pyvirtualdisplay's Display — confirm).
!apt-get install python-opengl -y
!apt install xvfb -y
# NOTE(review): "piglet" may be a typo for "pyglet" — confirm which package is actually needed.
!pip install piglet

In [None]:
# Connect Google Drive: mount it under /content/drive so files stored there can be accessed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Change the working directory to your project folder (replace /your_dir with your own path).
# The import cell loads `parameters` and `PPO` as modules, so they are presumably
# expected to live in this directory — confirm.
%cd /your_dir

/content/drive/MyDrive/testppo/PPO ant/PPO-pytorch-Mujoco-master


In [None]:
# Import the required libraries and the project's core classes (Ppo, Normalize).
import gym
import sys
import torch
import mujoco
import gym
import torch
import numpy as np
import argparse
import torch
import torch.optim as optim
import pandas as pd
from collections import deque
from parameters import *
from PPO import Ppo,Normalize
from collections import deque
from pyvirtualdisplay import Display
from IPython import display
import matplotlib.pyplot as plt
import imageio
%matplotlib inline


## Run - MAIN

These are the parameters from the *parameters.py* file; they can be adjusted in this notebook.

In [None]:
# Hyperparameters. The import cell runs `from parameters import *`; the assignments
# below rebind the same names in the notebook namespace.
# NOTE(review): whether PPO.py sees the values set here depends on how it imports
# `parameters` (not visible in this notebook) — confirm before relying on edits made here.

# Learning rate for the actor (action/policy) network updates (author: paper recommendation).
lr_actor = 0.0003
# Learning rate for the critic (state-value estimation) network updates (author: paper recommendation).
lr_critic = 0.0003
# Number of learning iterations to perform (outer loop of the training cells).
Iter = 300
# Maximum number of steps performed in one episode.
MAX_STEP = 1000
# Discount factor applied to rewards (author: paper recommendation).
gamma =0.98
# Variance-stabilising coefficient used in the return computation
# (presumably the GAE lambda — confirm in PPO.py) (author: paper recommendation).
lambd = 0.95
# Size of the batches used for training.
batch_size = 64
# Limits how much a large policy change is taken into account
# (presumably the PPO clipping range — confirm in PPO.py).
epsilon = 0.2
# Weight-decay coefficient given to Adam for the state-value optimiser.
l2_rate = 0.001



This cell runs the complete PPO training loop.

In [None]:
# Configuration is read through argparse so the environment can be switched by name.
# Environments to choose from:
#   'InvertedPendulum-v4'
#   'Humanoid-v4' (default)
parser = argparse.ArgumentParser()
parser.add_argument(
    '--env_name',
    default='Humanoid-v4',
    type=str,
    help='name of Mujoco environement',
)
# Optional "-f" option; presumably declared so that the "-f <file>" argument a
# notebook kernel is started with does not make parse_args() fail — confirm.
parser.add_argument("-f", required=False)
args = parser.parse_args()

# Build the environment with gym, backed by MuJoCo.
# Humanoid-v4 reference: https://www.gymlibrary.dev/environments/mujoco/humanoid/
env = gym.make(args.env_name)

# Sizes of the observation (state) vector and of the action vector.
N_S, N_A = (
    space.shape[0] for space in (env.observation_space, env.action_space)
)

# Fix every random number generator involved to the same seed.
torch.manual_seed(500)
np.random.seed(500)
env.seed(500)

# Train PPO. Each iteration collects at least 2048 environment steps with the
# current policy (episodes are always run to their end), logs every episode
# score, then hands the collected transitions to ppo.train (see PPO.py).
# `frames` and `eva_episodes` are not read by the training loop; they are kept
# because other cells may expect them to exist.
frames = []
ppo = Ppo(N_S, N_A)
# Observation normaliser, applied to every state before it is used or stored.
normalize = Normalize(N_S)
episodes = 0
eva_episodes = 0
# The log path does not change during training, so build it once.
log_path = 'log_' + args.env_name + '.txt'

# The loop variable is deliberately not called `iter`: in a notebook it would
# stay bound afterwards and shadow the builtin iter() for every later cell.
for iteration in range(Iter):
    memory = deque()
    scores = []
    steps = 0
    while steps < 2048:  # Horizon
        episodes += 1
        s = normalize(env.reset())
        score = 0
        for _ in range(MAX_STEP):
            steps += 1
            # Choose an action (detailed in PPO.py). The state is passed as a
            # float32 tensor with a leading batch dimension of 1, and [0]
            # takes the first row of the result.
            # Per the original author, for Humanoid-v4 the action is a numpy
            # array of 17 continuous values (e.g. the first one is the torque
            # applied on the y-coordinate hinge of the abdomen).
            state_tensor = torch.from_numpy(
                np.array(s).astype(np.float32)
            ).unsqueeze(0)
            a = ppo.actor_net.choose_action(state_tensor)[0]

            # Environment reaction to the action. Per the original author, for
            # Humanoid-v4 the state has 376 elements (e.g. the first one is
            # the z-coordinate of the torso).
            s_, r, done, info = env.step(a)
            s_ = normalize(s_)

            # mask is 0 on the step that ends the episode and 1 otherwise.
            mask = (1 - done) * 1
            memory.append([s, a, r, mask])

            score += r
            s = s_
            if done:
                break
        # Append the result of every episode to the log file as soon as it ends.
        with open(log_path, 'a') as outfile:
            outfile.write('\t' + str(episodes) + '\t' + str(score) + '\n')
        scores.append(score)
    score_avg = np.mean(scores)
    print('{} episode score is {:.2f}'.format(episodes, score_avg))
    ppo.train(memory)

To continue training after a first session, run the following cell.
It stops early (via `break`) once the average score of an iteration exceeds 2000.

In [None]:
# Continue training after a break. This cell reuses `ppo`, `normalize`, `env`,
# `args` and the running `episodes` counter created by the training cell, so
# that cell must have been run first.
# Each iteration collects at least 2048 environment steps (episodes are always
# run to their end), logs every episode score, then calls ppo.train (see
# PPO.py). Training stops early once the average score exceeds 2000.

# The log path does not change during training, so build it once.
log_path = 'log_' + args.env_name + '.txt'

# The loop variable is deliberately not called `iter`: in a notebook it would
# stay bound afterwards and shadow the builtin iter() for every later cell.
for iteration in range(Iter):
    memory = deque()
    scores = []
    steps = 0
    while steps < 2048:  # Horizon
        episodes += 1
        s = normalize(env.reset())
        score = 0
        for _ in range(MAX_STEP):
            steps += 1
            # Choose an action (detailed in PPO.py). The state is passed as a
            # float32 tensor with a leading batch dimension of 1, and [0]
            # takes the first row of the result.
            # Per the original author, for Humanoid-v4 the action is a numpy
            # array of 17 continuous values (e.g. the first one is the torque
            # applied on the y-coordinate hinge of the abdomen).
            state_tensor = torch.from_numpy(
                np.array(s).astype(np.float32)
            ).unsqueeze(0)
            a = ppo.actor_net.choose_action(state_tensor)[0]

            # Environment reaction to the action. Per the original author, for
            # Humanoid-v4 the state has 376 elements (e.g. the first one is
            # the z-coordinate of the torso).
            s_, r, done, info = env.step(a)
            s_ = normalize(s_)

            # mask is 0 on the step that ends the episode and 1 otherwise.
            mask = (1 - done) * 1
            memory.append([s, a, r, mask])

            score += r
            s = s_
            if done:
                break
        # Store the result of every episode in a txt file as soon as it ends.
        with open(log_path, 'a') as outfile:
            outfile.write('\t' + str(episodes) + '\t' + str(score) + '\n')
        scores.append(score)
    score_avg = np.mean(scores)
    print('{} episode score is {:.2f}'.format(episodes, score_avg))
    # The policy update on the collected transitions happens here.
    ppo.train(memory)
    # Stop as soon as the average score is high enough.
    if score_avg > 2000:
        break

## Visualisation

### GIF

In [None]:
# Roll out the current policy for at most 500 steps, show each rendered frame
# in the notebook and save all frames as a GIF.
# Requires `env`, `ppo` and `normalize` from the training cell. The environment
# and the virtual display are both released when this cell finishes, even if
# it fails part-way.

# Reset the environment and normalise the initial state.
s = normalize(env.reset())

# Start a virtual display for rendering; keep the handle so it can be stopped.
virtual_display = Display()
virtual_display.start()
try:
    img = plt.imshow(env.render('rgb_array'))
    # List to store the frames of the GIF.
    frames = []

    for _ in range(500):
        # Render once per step and reuse the image for both the live view
        # and the GIF (the state does not change between the two uses).
        frame = env.render('rgb_array')
        img.set_data(frame)
        frames.append(frame)

        # Update the notebook display.
        display.display(plt.gcf())
        display.clear_output(wait=True)

        # Choose an action for the current (normalised) state s.
        state_tensor = torch.from_numpy(
            np.array(s).astype(np.float32)
        ).unsqueeze(0)
        a = ppo.actor_net.choose_action(state_tensor)[0]

        # Environment reaction to the action; the next state is normalised
        # before being used as the new current state.
        s_, r, done, info = env.step(a)
        s_ = normalize(s_)
        s = s_

        # Stop recording when the episode ends.
        if done:
            break

    # Save the frames as a GIF.
    imageio.mimsave('Humanoid_afterv3.gif', frames)
finally:
    # Close the environment first, then the virtual display it rendered to.
    env.close()
    virtual_display.stop()


### Learning curve

In [None]:
# Plot the learning curve from the training log and save it as a PNG.
# Each log line is "<tab>episode<tab>score"; strip() removes the leading tab,
# so field 0 is the episode number and field 1 its score.
# The parsed values get their own names so the `episodes` counter and the
# `scores` list used by the training cells are not overwritten (otherwise
# re-running the continuation cell fails on `episodes += 1`).
logged_episodes = []
logged_scores = []
with open('log_' + args.env_name + '.txt', 'r') as infile:
    for line in infile:
        fields = line.strip().split('\t')
        # Skip blank or incomplete lines instead of crashing on them.
        if len(fields) < 2:
            continue
        logged_episodes.append(int(fields[0]))
        logged_scores.append(float(fields[1]))

df = pd.DataFrame({'episode': logged_episodes, 'score': logged_scores})
# Rows sharing an episode number (e.g. several runs appended to the same log
# file) are aggregated per episode.
grouped = df.groupby('episode')['score']
mean_scores = grouped.mean()
upper_bounds = grouped.max()
lower_bounds = grouped.min()
plt.figure(figsize=(10, 6))
# Plot the mean score per episode.
plt.plot(mean_scores.index, mean_scores.values, label='Score', color='red')
# Optional: shade the min/max band around the mean.
#plt.fill_between(mean_scores.index, lower_bounds.values, upper_bounds.values, color='tomato', alpha=0.3, label='Bounds')
plt.title('Score evolution '+' of ' + args.env_name)
plt.xlabel('Episode')
plt.ylabel('Score')
plt.legend()
plt.grid(True)
plt.tight_layout()
# Save before show(): once the figure has been shown, savefig() would
# typically write an empty image.
plt.savefig('Humanoid_learning.png')
plt.show()