<h1> MJRL training </h1>
MJRL Algorithms: TRPO - MBAC - PPO - NPG - DAPG <br>
source: https://github.com/aravindr93/mjrl/tree/master/mjrl/algos

In [1]:
import warnings
warnings.filterwarnings('ignore')

import gym
from mjrl.utils.gym_env import GymEnv
from mjrl.policies.gaussian_mlp import MLP
from mjrl.baselines.mlp_baseline import MLPBaseline
from mjrl.algos.ppo_clip import PPO
from mjrl.algos.npg_cg import NPG
from mjrl.utils.train_agent import train_agent
import myosuite

# Create the environment
# Find tasks in: \envs\myo\myobase or \envs\myo\myochallenge
env = gym.make('CenterReachOut-v0')
env.reset()


MyoSuite:> Registering Myo Envs
[36m    MyoSuite: A contact-rich simulation suite for musculoskeletal motor control
        Vittorio Caggiano, Huawei Wang, Guillaume Durandau, Massimo Sartori, Vikash Kumar
        L4DC-2019 | https://sites.google.com/view/myosuite
    [0m


array([-2.2856e-02,  1.1130e-02, -8.2680e-02,  1.0125e-01, -4.9400e-02,
        1.6578e-01,  1.1868e-01, -1.9320e-01, -1.9648e-01,  3.8760e-02,
        1.6806e-01,  1.7281e-01, -2.5407e-01, -1.0998e-01,  1.4748e+00,
        1.2882e+00,  1.0647e-01, -2.7489e-01,  3.3200e-01,  4.1140e-01,
       -1.7716e-01, -2.6182e-01,  1.8852e-01,  1.7017e-01,  4.4773e-01,
        2.6707e-01,  2.8278e-01, -1.0472e-02,  1.6854e-02,  7.0198e-03,
        9.1390e-02, -6.5450e-02,  2.4350e-01,  3.3776e-01,  2.3565e-01,
       -1.0184e-01,  6.0707e-04,  3.1420e-02, -2.0000e-01,  0.0000e+00,
       -2.0000e-02,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
        0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.00

<h1> Hyperparameter tuning </h1>
Still working on it

In [2]:
import numpy as np
from sklearn.model_selection import ParameterSampler

param_grid = {
    'policy_size': [(256, 256), (128, 128), (64, 64)],
    'rl_step_size': np.logspace(-4, 0, num=10),
    'learnrate': np.logspace(-4, 0, num=10),
    'entcoeff': np.logspace(-4, 0, num=10),
    'batchsize': [16, 32, 64, 128],
    'num_epoch': [5, 10, 20],
    'vf_hidden_size': [(256, 256), (128, 128), (64, 64)],
    'regcoef': np.logspace(-4, 0, num=10),
    'gamma': np.linspace(0.9, 0.999, num=10),
    'gae_lambda': np.linspace(0.9, 1.0, num=10)
}

n_iter_search = 50  

random_search = list(ParameterSampler(param_grid, n_iter=n_iter_search, random_state=123))

In [3]:
#Policy hyperparameters
policy_size = (256, 256)  # Size of the policy network
seed = 123  # Random seed used for reproducibility
rl_step_size = 0.1
learnrate = 0.025
entcoeff = 0.01
batchsize = 32
num_epoch = 10

#Value Function hyperparameters
vf_hidden_size = (256, 256) # Value Function Network size
seed = 123  # Random seed used for reproducibility
rl_step_size = 0.1
learnrate = 0.025
entcoeff = 0.01
batchsize = 32
num_epoch = 10
regcoef = 1e-3

#Training hyperparameters
iterations = 1000
gamma = 0.995
gae_lambda = 0.97


# Wrap the environment
e = GymEnv(env)

<H1>Set Policies </h1>

In [4]:
# Initialize policy and baseline
policy = MLP(e.spec, hidden_sizes=policy_size, seed=seed, init_log_std=-0.25, min_log_std=-1.0)
baseline = MLPBaseline(e.spec, reg_coef=regcoef, batch_size=batchsize , hidden_sizes=vf_hidden_size, epochs=num_epoch, learn_rate=learnrate)

# Initialize NPG/PPO agent
agent = NPG(e, policy, baseline, normalized_step_size=rl_step_size, seed=seed, save_logs=True,ent_coeff=entcoeff ,tensorboard_log="./ppo_objhold_tensorboard/")


<h1> Start Training </h1>

In [5]:
print("========================================")
print("Starting policy learning")
print("========================================")

# Train the agent
train_agent(job_name='.',
            agent=agent,
            seed=seed,
            niter=10000,
            gamma=gamma,
            gae_lambda=gae_lambda,
            sample_mode="trajectories",
            num_traj=96,
            num_samples=0,
            save_freq=100,
            evaluation_rollouts=10)

print("========================================")
print("Job Finished.")
print("========================================")

Starting policy learning
......................................................................................
ITERATION : 0 
Performing evaluation rollouts ........
Iter | Stoc Pol | Mean Pol | Best (Stoc) 

[ Fri May 17 15:33:54 2024 ]    0 -4226.35 -2994.37 -100000000.00 
------------------  -------------
VF_error_after          0.017988
VF_error_before         0.999976
alpha                   0.420992
delta                   0.1
env_samples          8259
eval_score          -2994.37
eval_success            0
kl_dist                 0.0496465
num_samples          8259
rollout_success         0
running_score       -4226.35
rwd_dense             -42.2635
rwd_sparse             -0.401014
stoc_pol_max           19.5302
stoc_pol_mean       -4226.35
stoc_pol_min        -5050.48
stoc_pol_std         1736.28
success_percentage      0
surr_improvement        0.276954
time_VF                 5.30057
time_npg                2.10066
time_sampling          19.6421
time_vpg                0.3422

KeyboardInterrupt: 

<h1> Logging hyperparameters

In [None]:
# Data to be written into the file
data = f"""
# Policy hyperparameters
policy_size = {policy_size}
seed = {seed}
rl_step_size = {rl_step_size}
learnrate = {learnrate}
entcoeff = {entcoeff}
batchsize = {batchsize}
num_epoch = {num_epoch}

# Value Function hyperparameters
vf_hidden_size = {vf_hidden_size}
seed = {seed}
rl_step_size = {rl_step_size}
learnrate = {learnrate}
entcoeff = {entcoeff}
batchsize = {batchsize}
num_epoch = {num_epoch}
regcoef = {regcoef}


"""

# Write data to the file
with open("myoarmreach#9/readme.txt", "w") as file: #Change the folder name 
    file.write(data)

FileNotFoundError: [Errno 2] No such file or directory: 'myoarmreach#9/readme.txt'