[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/PolitoVandal/project-sim2real-delli-modi-necerini_project/blob/main/project_RL.ipynb)

# Before starting

## Clone GitHub repo

In [None]:
import os

if not os.path.isdir('/content/project-sim2real-delli-modi-necerini_project'):
  !git clone -b main https://github.com/RonPlusSign/RobotLearningLabs.git

# Rename the folder
!mv /content/RobotLearningLabs /content/sim2real
!cd /content/sim2real && git pull

## Download libraries

In [None]:
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

!apt-get install -y patchelf
!apt-get install python-opengl -y
!apt install xvfb -y

!pip install gym
!pip install free-mujoco-py
!pip install pyvirtualdisplay
!pip install piglet
!pip install stable-baselines3[extra]
!pip install shimmy

# Start a virtual display (pyvirtualdisplay) before anything tries to render,
# since the notebook runs without a physical screen.
from pyvirtualdisplay import Display
Display().start()

import gym
import torch
import random
import numpy as np
from IPython import display
# NOTE(review): wildcard import — presumably kept so that importing the module
# registers the CustomHopper-* environments; confirm and import explicit names if possible.
from sim2real.env.custom_hopper import *
import matplotlib.pyplot as plt
%matplotlib inline

# Lab 4

## PPO Training on Custom Hopper environment


In [None]:
import os
import warnings

# Ignore warnings inside this kernel...
warnings.filterwarnings("ignore")
# ...and inside the `!python train.py` subprocesses launched by later cells.
# The in-process filter above does not reach child interpreters, whereas
# PYTHONWARNINGS is inherited through the environment. setdefault keeps any
# value the user has already chosen.
os.environ.setdefault("PYTHONWARNINGS", "ignore")

# Create the 'models' and 'plots' directories if they don't exist.
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
for output_dir in ('sim2real/models', 'sim2real/plots'):
    os.makedirs(output_dir, exist_ok=True)

### Source training


In [None]:
# Training 1 - 500k timesteps
!cd sim2real; python train.py --env=CustomHopper-source-v0 --total_timesteps=500000

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Training time: 844.9242844581604
Test reward (avg +/- std): (610.2515685785082 +/- 173.88491636395824) - Num episodes: 100


In [None]:
# Training 2 - 5M timesteps
!cd sim2real; python train.py --env=CustomHopper-source-v0 --total_timesteps=5000000

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Training time: 8449.11818575859
Test reward (avg +/- std): (1544.6508584914657 +/- 304.80255938674514) - Num episodes: 100


In [None]:
# Training 3 (test LINEAR LR schedule - 500k timesteps)
!cd sim2real; python train.py --env=CustomHopper-source-v0 --total_timesteps=500000

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Training time: 962.2152454853058
Test reward (avg +/- std): (619.0097090676186 +/- 205.1345712081536) - Num episodes: 100


In [None]:
# Training 4 (test EXPONENTIAL LR schedule - 500k timesteps)
!cd sim2real; python train.py --env=CustomHopper-source-v0 --total_timesteps=500000 --lr_schedule='exponential'

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Training time: 991.3553211688995
Test reward (avg +/- std): (921.7609026647743 +/- 355.72170703486603) - Num episodes: 100


In [None]:
# Training 5 (test EXPONENTIAL LR schedule - 2M timesteps)
# Uses model id "source_5": this is the source baseline loaded by the
# "Test baseline models" cell as models/PPO_model_source_5.
!cd sim2real; python train.py --env=CustomHopper-source-v0 --total_timesteps=2000000 --lr_schedule='exponential' --model_id="source_5"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Training time: 3698.463559150696
Test reward (avg +/- std): (1226.324881008564 +/- 411.15862480353917) - Num episodes: 100


### Target training

In [None]:
# Training 1
!cd sim2real; python train.py --env=CustomHopper-target-v0 --total_timesteps=500000

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Training time: 844.8844656944275
Test reward (avg +/- std): (651.8186004186551 +/- 87.96238364152971) - Num episodes: 100


In [None]:
# Training 2
!cd sim2real; python train.py --env=CustomHopper-target-v0 --total_timesteps=5000000

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Training time: 8560.386691331863
Test reward (avg +/- std): (1243.2276279905634 +/- 494.55743941516295) - Num episodes: 100


In [None]:
# Training 3 (EXPONENTIAL LR schedule - 2M timesteps on the TARGET environment)
# Uses model id "target_3": this is the target baseline loaded by the
# "Test baseline models" cell as models/PPO_model_target_3.
!cd sim2real; python train.py --env=CustomHopper-target-v0 --model_id="target_3" --total_timesteps=2000000 --lr_schedule='exponential'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Training time: 3662.456332921982
Test reward (avg +/- std): (1580.8654151466749 +/- 288.70644216599095) - Num episodes: 100


### Testing the trained models

In [None]:
# Test the 5 models trained on SOURCE and 3 models trained on TARGET
models = ['source_1', 'source_2', 'source_3', 'source_4', 'source_5', 'target_1', 'target_2', 'target_3']

for model_id in models:
  print(f"\n----- {model_id} -----")
  env_name = model_id.split('_')[0]
  !cd sim2real; python train.py --test=models/PPO_model_{model_id} --env=CustomHopper-{env_name}-v0 --test_episodes=1000 --test_rendering --video_name=hopper_{env_name}_to_{env_name}_test_model_{model_id}.mp4

----- source_1 -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_source_to_source_test_model_source_1.mp4
Test reward (avg +/- std): (1202.062789947555 +/- 324.21626837181344) - Num episodes: 1000

----- source_2 -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_source_to_source_test_model_source_2.mp4
Test reward (avg +/- std): (1534.4865589557387 +/- 266.00493908574356) - Num episodes: 1000

----- source_3 -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_source_to_source_test_model_source

### Test baseline models (source_5 and target_3)

In [None]:
# Baseline evaluation, 1000 test episodes per run (with --test_rendering):
#   1. source_5 (trained on source) tested on source
#   2. source_5 (trained on source) tested on target
#   3. target_3 (trained on target) tested on target
# Each run is given its own --video_name.
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_source_5" --env=CustomHopper-source-v0 --test_episodes=1000 --test_rendering --video_name="hopper_source_to_source_test_model_source_5.mp4"

print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_source_5" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="hopper_source_to_target_test_model_source_5.mp4"

print("\n----- TARGET ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_target_3" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="hopper_target_to_target_test_model_target_3.mp4"

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_source_to_source_test_model_source_5.mp4
Test reward (avg +/- std): (1471.4690957737257 +/- 311.72029948778004) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_source_to_target_test_model_source_5.mp4
Test reward (avg +/- std): (1495.0460916799027 +/- 253.6159042769558) - Num episodes: 1000

----- TARGET ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_target_to_tar

## Grid search of parameters

In [None]:
# Grid search with 100k steps for each model
# With --grid_search, train.py trains one model per hyperparameter combination
# (n_epochs, clip_range, gae_lambda, gamma, batch_size, as listed in the log)
# on the source environment and reports the mean reward of each.
!cd sim2real; python train.py --env=CustomHopper-source-v0 --total_timesteps=100000 --grid_search --verbose=0

-------- TRAINING ON SOURCE ENVIRONMENT --------

Starting grid search...
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.95, batch_size=32
   => Mean reward: 332.0210466647148
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.95, batch_size=64
   => Mean reward: 338.30170162439344
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.95, batch_size=128
   => Mean reward: 333.1047730827332
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.99, batch_size=32
   => Mean reward: 315.7885083663464
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.99, batch_size=64
   => Mean reward: 367.29987224698067
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.99, batch_size=128
   => Mean reward: 183.92171713232995
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.999, batch_size=32
   => Mean reward: 375.8082304239273
Training with n_epochs=5, clip_range=0.1, gae_lambda=0.9, gamma=0.999, ba

In [None]:
# Long training (2M timesteps) on the SOURCE environment with best parameters from grid search
# Grid search finished. Best reward: 1461.237786039114 with params: {'n_epochs': 20, 'clip_range': 0.3, 'gae_lambda': 0.99, 'gamma': 0.999, 'batch_size': 128}
# The command passes those five values explicitly and adds the exponential LR schedule.

!cd sim2real; python train.py --env=CustomHopper-source-v0 --model_id="best_source" --total_timesteps=2000000 --n_epochs=20 --clip_range=0.3 --gae_lambda=0.99 --gamma=0.999 --batch_size=128 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1103.78 +/- 211.34
Episode length: 378.80 +/- 99.57
Eval num_timesteps=1700000, episode_reward=1062.21 +/- 391.01
Episode length: 427.60 +/- 144.80
Eval num_timesteps=1800000, episode_reward=1044.69 +/- 266.07
Episode length: 412.20 +/- 111.15
Eval num_timesteps=1900000, episode_reward=960.97 +/- 69.81
Episode length: 418.80 +/- 28.53
Eval num_timesteps=2000000, episode_reward=1264.52 +/- 160.34
Episode length: 460.40 +/- 79.20
Training time: 4237.585305452347
Test reward (avg +/- std): (1077.0278314890043 +/- 329.15435129331354) - Num episodes: 100


In [None]:
# Long training (2M timesteps) on the TARGET environment with best parameters from grid search
# (the grid search itself was run on the source environment).
# Grid search finished. Best reward: 1461.237786039114 with params: {'n_epochs': 20, 'clip_range': 0.3, 'gae_lambda': 0.99, 'gamma': 0.999, 'batch_size': 128}

!cd sim2real; python train.py --env=CustomHopper-target-v0 --model_id="best_target" --total_timesteps=2000000 --n_epochs=20 --clip_range=0.3 --gae_lambda=0.99 --gamma=0.999 --batch_size=128 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1217.10 +/- 105.70
Episode length: 483.20 +/- 23.22
Eval num_timesteps=1700000, episode_reward=1098.67 +/- 46.55
Episode length: 500.00 +/- 0.00
Eval num_timesteps=1800000, episode_reward=1135.02 +/- 21.49
Episode length: 500.00 +/- 0.00
Eval num_timesteps=1900000, episode_reward=1179.07 +/- 27.77
Episode length: 500.00 +/- 0.00
Eval num_timesteps=2000000, episode_reward=1140.78 +/- 211.08
Episode length: 456.20 +/- 87.60
Training time: 4240.038843870163
Test reward (avg +/- std): (1155.7295022985215 +/- 215.3178084996668) - Num episodes: 100


## Domain Randomization

### Train model using randomization with uniform distribution

In [None]:
# Uniform Domain Randomization (2M timesteps)
# Trains on the source environment with --domain_rand=uniform; the model is
# later loaded as models/PPO_model_source_randomized_uniform_1 for testing.
!cd sim2real; python train.py --model_id="source_randomized_uniform_1" --domain_rand=uniform --env=CustomHopper-source-v0 --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1375.79 +/- 237.27
Episode length: 403.20 +/- 79.63
Eval num_timesteps=1700000, episode_reward=1655.96 +/- 160.54
Episode length: 458.00 +/- 51.54
Eval num_timesteps=1800000, episode_reward=1766.72 +/- 25.05
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1900000, episode_reward=1665.03 +/- 107.70
Episode length: 478.00 +/- 44.00
Eval num_timesteps=2000000, episode_reward=1110.96 +/- 99.67
Episode length: 306.00 +/- 18.25
Training time: 3481.914088487625
Test reward (avg +/- std): (1457.7240589302537 +/- 355.89458405554495) - Num episodes: 100


### Train model using randomization with truncated normal distribution

In [None]:
# Truncated Normal Domain Randomization (2M timesteps)
# Trains on the source environment; the model is later loaded as
# models/PPO_model_source_randomized_gaussian_1 for testing.
# NOTE(review): this cell passes --domain_rand=gaussian while the single-mass and
# Walker2D cells pass --domain_rand=normal — confirm which value the current train.py accepts.
!cd sim2real; python train.py --model_id="source_randomized_gaussian_1" --domain_rand=gaussian --env=CustomHopper-source-v0 --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=869.20 +/- 434.94
Episode length: 257.00 +/- 114.59
Eval num_timesteps=1700000, episode_reward=1326.87 +/- 328.24
Episode length: 394.20 +/- 98.88
Eval num_timesteps=1800000, episode_reward=1194.24 +/- 267.77
Episode length: 349.60 +/- 86.34
Eval num_timesteps=1900000, episode_reward=1568.43 +/- 172.29
Episode length: 452.80 +/- 49.77
Eval num_timesteps=2000000, episode_reward=1419.25 +/- 155.85
Episode length: 411.60 +/- 44.10
Training time: 3603.520294904709
Test reward (avg +/- std): (1338.7608543905403 +/- 283.3275855214356) - Num episodes: 100


### Test randomized models

In [None]:
# Test PPO_model_source_randomized_uniform_1 (UDR)
# The same model is evaluated twice, 1000 episodes each: first on the source
# environment it was trained on, then on the target environment.

print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_source_randomized_uniform_1" --env=CustomHopper-source-v0 --test_episodes=1000 --test_rendering --video_name="hopper_randomized_uniform_source_to_source_test_video.mp4"

print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_source_randomized_uniform_1" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="hopper_randomized_uniform_source_to_target_test_video.mp4"


----- SOURCE ➔ SOURCE -----

--- WORKING ON SOURCE ENVIRONMENT ---

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_randomized_uniform_source_to_source_test_video.mp4
Test reward (avg +/- std): (1719.8699071218934 +/- 8.322910960511003) - Num episodes: 1000

----- SOURCE ➔ TARGET -----

--- WORKING ON TARGET ENVIRONMENT ---

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_randomized_uniform_source_to_target_test_video.mp4
Test reward (avg +/- std): (1721.6464029399863 +/- 9.576295941109272) - Num episodes: 1000


In [None]:
# Test PPO_model_source_randomized_gaussian_1 (TNR)
# The same model is evaluated twice, 1000 episodes each: first on the source
# environment it was trained on, then on the target environment.

print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_source_randomized_gaussian_1" --env=CustomHopper-source-v0 --test_episodes=1000 --test_rendering --video_name="hopper_randomized_gaussian_source_to_source_test_video.mp4"

print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_source_randomized_gaussian_1" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="hopper_randomized_gaussian_source_to_target_test_video.mp4"


----- SOURCE ➔ SOURCE -----

--- WORKING ON SOURCE ENVIRONMENT ---

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_randomized_gaussian_source_to_source_test_video.mp4
Test reward (avg +/- std): (1286.190773661852 +/- 366.4682855154399) - Num episodes: 1000

----- SOURCE ➔ TARGET -----

--- WORKING ON TARGET ENVIRONMENT ---

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/hopper_randomized_gaussian_source_to_target_test_video.mp4
Test reward (avg +/- std): (1351.565737943985 +/- 133.70407167964783) - Num episodes: 1000


# Extensions

## 1. Learning rate schedule (Hopper)

Train and test different learning rate schedules, in order to find the most appropriate one.

Models trained: CustomHopper on the source environment, whose torso mass is shifted by -1 kg with respect to the target environment.

Initial Learning Rate: 0.0003

Tested schedules:
- Constant (keep 0.0003 for all training)
- Linear (lower the LR linearly at each episode, from initial to 0)
- Exponential (LR decreases drastically only at the end of training)

#### Training with 3 different LR schedules: constant, linear, exponential

In [None]:
# Constant Learning Rate (default: 0.0003)
# One of three 2M-timestep runs on the source environment that differ only in
# --lr_schedule (and the matching --model_id).
!cd sim2real && python train.py --model_id="customHopper_source_lr_schedule_constant" --env=CustomHopper-source-v0 --total_timesteps=2000000 --lr_schedule="constant"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1673.45 +/- 6.51
Episode length: 500.00 +/- 0.00
Eval num_timesteps=1700000, episode_reward=1589.74 +/- 243.96
Episode length: 460.40 +/- 79.20
Eval num_timesteps=1800000, episode_reward=1716.16 +/- 9.27
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1900000, episode_reward=1692.53 +/- 108.59
Episode length: 474.60 +/- 33.54
Eval num_timesteps=2000000, episode_reward=1718.50 +/- 17.79
Episode length: 500.00 +/- 0.00
New best mean reward!
Training time: 3634.022445201874
Test reward (avg +/- std): (1521.829145932036 +/- 267.6017010840901) - Num episodes: 100


In [None]:
# Linear schedule for Learning Rate (initial: 0.0003)
# One of three 2M-timestep runs on the source environment that differ only in
# --lr_schedule (and the matching --model_id).
!cd sim2real && python train.py --model_id="customHopper_source_lr_schedule_linear" --env=CustomHopper-source-v0 --total_timesteps=2000000 --lr_schedule="linear"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1753.37 +/- 3.66
Episode length: 500.00 +/- 0.00
New best mean reward!
Eval num_timesteps=1700000, episode_reward=1760.51 +/- 24.54
Episode length: 491.00 +/- 18.00
New best mean reward!
Eval num_timesteps=1800000, episode_reward=1789.12 +/- 104.57
Episode length: 464.00 +/- 25.85
New best mean reward!
Eval num_timesteps=1900000, episode_reward=1770.97 +/- 38.11
Episode length: 485.00 +/- 21.45
Eval num_timesteps=2000000, episode_reward=1805.28 +/- 73.07
Episode length: 489.40 +/- 21.20
New best mean reward!
Training time: 3638.6890399456024
Test reward (avg +/- std): (1433.9130158764106 +/- 394.01911175889694) - Num episodes: 100


In [None]:
# Exponential schedule for Learning Rate (initial: 0.0003, decay rate: 0.1)
# One of three 2M-timestep runs on the source environment that differ only in
# --lr_schedule (and the matching --model_id).
!cd sim2real && python train.py --model_id="customHopper_source_lr_schedule_exponential" --env=CustomHopper-source-v0 --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1311.39 +/- 152.52
Episode length: 388.60 +/- 56.30
Eval num_timesteps=1700000, episode_reward=1594.51 +/- 6.32
Episode length: 500.00 +/- 0.00
Eval num_timesteps=1800000, episode_reward=1199.89 +/- 211.93
Episode length: 357.20 +/- 73.43
Eval num_timesteps=1900000, episode_reward=1623.56 +/- 5.66
Episode length: 493.40 +/- 13.20
Eval num_timesteps=2000000, episode_reward=1231.96 +/- 564.41
Episode length: 369.00 +/- 158.17
Training time: 3940.0959384441376
Test reward (avg +/- std): (1079.0912841794334 +/- 498.22934691469777) - Num episodes: 100


#### Testing the different LR schedules

In [None]:
print("\n===== TESTING CUSTOM HOPPER MODELS WITH DIFFERENT LR SCHEDULES =====")

print("\n----- Constant LR -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_source_lr_schedule_constant" --env=CustomHopper-source-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_source_lr_schedule_constant.mp4"

print("\n----- Linear LR scheduler -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_source_lr_schedule_linear" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_source_lr_schedule_linear.mp4"

print("\n----- Exponential LR scheduler -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_source_lr_schedule_exponential" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_source_lr_schedule_exponential.mp4"



===== TESTING CUSTOM HOPPER MODELS WITH DIFFERENT LR SCHEDULES =====

----- Constant LR -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/customHopper_source_lr_schedule_constant.mp4
Test reward (avg +/- std): (1706.7078160890396 +/- 96.13198545528981) - Num episodes: 1000

----- Linear LR scheduler -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/customHopper_source_lr_schedule_linear.mp4
Test reward (avg +/- std): (1811.0124509290624 +/- 61.16579476494211) - Num episodes: 1000

----- Exponential LR scheduler -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 80

## 2. Study the effect of single mass randomization (Hopper)

In [None]:
randomization_types = ["uniform", "normal"]
randomize_masses = ["randomize_thigh", "randomize_leg", "randomize_foot"]

for domain_rand in randomization_types:
    for mass in randomize_masses:
        print(f"Randomization type: {domain_rand} - Randomizing mass: {mass}")
        # name of the model
        model_id = f"source_randomized_{mass.split('_')[1]}_{domain_rand}_1"

        !cd sim2real && python train.py --model_id={model_id} --domain_rand={domain_rand} --{mass} --env=CustomHopper-source-v0 --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=527.13 +/- 521.87
Episode length: 164.00 +/- 130.79
Eval num_timesteps=1700000, episode_reward=1672.68 +/- 102.34
Episode length: 476.40 +/- 47.20
Eval num_timesteps=1800000, episode_reward=1470.21 +/- 292.08
Episode length: 396.40 +/- 86.49
Eval num_timesteps=1900000, episode_reward=1681.30 +/- 177.33
Episode length: 467.40 +/- 65.20
Eval num_timesteps=2000000, episode_reward=1769.59 +/- 39.65
Episode length: 500.00 +/- 0.00
New best mean reward!
Training time: 3473.8241922855377
Test reward (avg +/- std): (1599.8340286678326 +/- 258.0012141317806) - Num episodes: 100


In [None]:
# Test the models
randomization_types = ["uniform", "normal"]
randomize_masses = ["randomize_thigh", "randomize_leg", "randomize_foot"]

for domain_rand in randomization_types:
    for mass in randomize_masses:
        print()
        print(f"Randomization type: {domain_rand} - Randomizing mass: {mass}")
        # name of the model
        model_name = f"models/PPO_model_source_randomized_{mass.split('_')[1]}_{domain_rand}_1"
        video_name_source_to_source = f"hopper_randomized_{domain_rand}_{mass.split('_')[1]}_source_to_source_test_video.mp4"
        video_name_source_to_target = f"hopper_randomized_{domain_rand}_{mass.split('_')[1]}_source_to_target_test_video.mp4"

        print("\n----- SOURCE ➔ SOURCE -----")
        !cd sim2real && python train.py --test={model_name} --env=CustomHopper-source-v0 --test_episodes=1000 --test_rendering --video_name={video_name_source_to_source}

        print("\n----- SOURCE ➔ TARGET -----")
        !cd sim2real && python train.py --test={model_name} --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name={video_name_source_to_target}


Randomization type: uniform - Randomizing mass: randomize_thigh

----- SOURCE ➔ SOURCE -----
Testing...
Test video save as videos/hopper_randomized_uniform_thigh_source_to_source_test_video.mp4
Test reward (avg +/- std): (1459.0225557492859 +/- 101.61108837663537) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Test video save as videos/hopper_randomized_uniform_thigh_source_to_target_test_video.mp4
Test reward (avg +/- std): (1466.2309842430482 +/- 112.99349119251956) - Num episodes: 1000

Randomization type: uniform - Randomizing mass: randomize_leg

----- SOURCE ➔ SOURCE -----
Testing...
Test video save as videos/hopper_randomized_uniform_leg_source_to_source_test_video.mp4
Test reward (avg +/- std): (1721.9927571648263 +/- 104.93903780154353) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Test video save as videos/hopper_randomized_uniform_leg_source_to_target_test_video.mp4
Test reward (avg +/- std): (1723.1341117016257 +/- 103.06011278500056) - Num epi

## 3. ThinHopper: Training on Custom Hopper Environment by Shifting Torso Mass by 2kg Relative to the Target Domain

**Note:**

Before conducting these training and test sessions, modify the following line in the `custom_hopper.py` file:

`self.sim.model.body_mass[1] -= 1.0`

to:

`self.sim.model.body_mass[1] -= 2.0`

and revert it if you wish to return to the standard configuration.

In [None]:
# Training on SOURCE environment without Domain Randomization
# Source Torso Mass = Target Torso Mass - 2
# (requires the custom_hopper.py edit described in the note above this cell)
!cd sim2real; python train.py --env=CustomHopper-source-v0 --model_id="customHopper_source_Torso-2" --total_timesteps=2000000 --lr_schedule="exponential"

--- WORKING ON CUSTOM HOPPER SOURCE ENVIRONMENT ---

Eval num_timesteps=100000, episode_reward=538.64 +/- 10.68
Episode length: 172.20 +/- 1.47
New best mean reward!
Eval num_timesteps=200000, episode_reward=517.71 +/- 19.57
Episode length: 162.20 +/- 3.76
Eval num_timesteps=300000, episode_reward=675.42 +/- 7.98
Episode length: 188.80 +/- 1.33
New best mean reward!
Eval num_timesteps=400000, episode_reward=1246.24 +/- 160.82
Episode length: 390.60 +/- 50.35
New best mean reward!
Eval num_timesteps=500000, episode_reward=917.72 +/- 34.56
Episode length: 272.60 +/- 11.04
Eval num_timesteps=600000, episode_reward=1311.55 +/- 300.74
Episode length: 387.40 +/- 95.43
New best mean reward!
Eval num_timesteps=700000, episode_reward=1288.59 +/- 262.99
Episode length: 364.60 +/- 74.66
Eval num_timesteps=800000, episode_reward=1428.41 +/- 276.42
Episode length: 423.80 +/- 93.33
New best mean reward!
Eval num_timesteps=900000, episode_reward=1549.30 +/- 162.88
Episode length: 469.20 +/- 41.24
New

In [None]:
# Training on SOURCE environment with Uniform Domain Randomization (UDR)
# Source Torso Mass = Target Torso Mass - 2
!cd sim2real; python train.py --env=CustomHopper-source-v0 --domain_rand=uniform --model_id="customHopper_source_Torso-2" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Eval num_timesteps=1600000, episode_reward=1735.94 +/- 29.62
Episode length: 487.40 +/- 25.20
New best mean reward!
Eval num_timesteps=1700000, episode_reward=1085.43 +/- 761.76
Episode length: 309.00 +/- 192.92
Eval num_timesteps=1800000, episode_reward=1420.31 +/- 208.29
Episode length: 384.20 +/- 63.89
Eval num_timesteps=1900000, episode_reward=1222.55 +/- 353.50
Episode length: 333.40 +/- 101.59
Eval num_timesteps=2000000, episode_reward=1405.96 +/- 278.32
Episode length: 374.20 +/- 81.46
Training time: 3640.432469844818
Test reward (avg +/- std): (1514.802774384299 +/- 303.7605455648267) - Num episodes: 100


In [None]:
# Training on TARGET environment (no --domain_rand flag is passed)
# Source Torso Mass = Target Torso Mass - 2
# Saved under model id "customHopper_target_Torso-2".
!cd sim2real; python train.py --env=CustomHopper-target-v0 --model_id="customHopper_target_Torso-2" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1038.64 +/- 160.64
Episode length: 291.60 +/- 39.75
Eval num_timesteps=1700000, episode_reward=1576.91 +/- 269.28
Episode length: 453.80 +/- 81.39
New best mean reward!
Eval num_timesteps=1800000, episode_reward=1264.98 +/- 336.28
Episode length: 355.00 +/- 99.35
Eval num_timesteps=1900000, episode_reward=1228.60 +/- 286.89
Episode length: 335.20 +/- 72.78
Eval num_timesteps=2000000, episode_reward=1545.48 +/- 184.79
Episode length: 442.40 +/- 70.95
Training time: 3656.999368906021
Test reward (avg +/- std): (1471.2034422351117 +/- 278.5423733257934) - Num episodes: 100


Testing:

In [None]:
print("\n===== TESTING CUSTOM HOPPER MODELS (without DR) =====")

# SOURCE ➔ SOURCE: model trained on the source environment, tested on the source environment
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_source_Torso-2" --env=CustomHopper-source-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_Torso-2_source_to_source_test.mp4"

# SOURCE ➔ TARGET: same model, tested on the target environment
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_source_Torso-2" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_Torso-2_source_to_target_test.mp4"

# TARGET ➔ TARGET (disabled in this cell; the UDR test cell further down runs a TARGET ➔ TARGET test)
# print("\n----- TARGET ➔ TARGET -----")
# !cd sim2real; python train.py --test="models/PPO_model_customHopper_target_Torso-2" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_Torso-2_target_to_target_test.mp4"


===== TESTING CUSTOM HOPPER MODELS (without DR) =====

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/customHopper_Torso-2_source_to_source_test.mp4
Test reward (avg +/- std): (1190.825767679774 +/- 154.54275328296958) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/customHopper_Torso-2_source_to_target_test.mp4
Test reward (avg +/- std): (1190.5782180375388 +/- 160.12238760127784) - Num episodes: 1000


In [None]:
print("\n===== TESTING CUSTOM HOPPER MODELS (UDR) =====")

# SOURCE ➔ SOURCE
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_source_Torso-2_udr" --env=CustomHopper-source-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_Torso-2_udr_source_to_source_test.mp4"

# SOURCE ➔ TARGET
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_source_Torso-2_udr" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_Torso-2_udr_source_to_target_test.mp4"

# TARGET ➔ TARGET
print("\n----- TARGET ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_customHopper_target_Torso-2_udr" --env=CustomHopper-target-v0 --test_episodes=1000 --test_rendering --video_name="customHopper_Torso-2_target_to_target_test.mp4"


===== TESTING CUSTOM HOPPER MODELS (UDR) =====

----- SOURCE (with UDR) ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/customHopper_Torso-2_udr_source_to_source_test.mp4
Test reward (avg +/- std): (1409.5016840324822 +/- 251.97632647441876) - Num episodes: 1000

----- SOURCE (with UDR) ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/customHopper_Torso-2_udr_source_to_target_test.mp4
Test reward (avg +/- std): (1416.2140568271516 +/- 256.3556054363525) - Num episodes: 1000

----- TARGET ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Epi

## 4. Add new environment: Walker2D

### Training on Custom Walker2D environment using PPO

In [None]:
# Training on source environment (2M timesteps)
# Walker2D counterpart of the Hopper baseline: CustomWalker2D-source-v0,
# exponential LR schedule, no --domain_rand flag.
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --model_id="walker2d_source_1" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=2929.04 +/- 736.45
Episode length: 785.20 +/- 189.82
Eval num_timesteps=1700000, episode_reward=2691.23 +/- 1148.30
Episode length: 720.80 +/- 290.36
Eval num_timesteps=1800000, episode_reward=2097.49 +/- 1507.20
Episode length: 554.60 +/- 364.74
Eval num_timesteps=1900000, episode_reward=1781.86 +/- 258.93
Episode length: 495.20 +/- 76.48
Eval num_timesteps=2000000, episode_reward=3361.88 +/- 852.17
Episode length: 874.40 +/- 211.71
New best mean reward!
Training time: 4247.864328622818
Test reward (avg +/- std): (2052.202233350383 +/- 1001.7444958567927) - Num episodes: 100


In [None]:
# Training on target environment (2M timesteps)
# Same settings as the Walker2D source run, but on CustomWalker2D-target-v0.
!cd sim2real; python train.py --env=CustomWalker2D-target-v0 --model_id="walker2d_target_1" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1151.61 +/- 250.39
Episode length: 330.60 +/- 67.38
Eval num_timesteps=1700000, episode_reward=1105.69 +/- 632.67
Episode length: 334.80 +/- 196.25
Eval num_timesteps=1800000, episode_reward=1134.83 +/- 328.08
Episode length: 334.80 +/- 71.86
Eval num_timesteps=1900000, episode_reward=804.65 +/- 338.75
Episode length: 240.40 +/- 81.51
Eval num_timesteps=2000000, episode_reward=1019.71 +/- 262.54
Episode length: 304.80 +/- 60.24
Training time: 4263.223496198654
Test reward (avg +/- std): (1235.8550028249076 +/- 439.848499813264) - Num episodes: 100


### Training on Custom Walker2D environment, with complete domain randomization (all the masses except the torso), using PPO

In [None]:
# Training using Uniform Domain Randomization (all masses except the torso)
# Source environment, 2M timesteps, exponential LR schedule, --domain_rand=uniform.
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --model_id="walker2d_source_udr_uniform_1" --domain_rand=uniform --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1547.27 +/- 1099.27
Episode length: 452.20 +/- 280.28
Eval num_timesteps=1700000, episode_reward=2546.70 +/- 1262.40
Episode length: 716.40 +/- 349.14
Eval num_timesteps=1800000, episode_reward=2462.83 +/- 1349.13
Episode length: 702.60 +/- 365.93
Eval num_timesteps=1900000, episode_reward=3586.17 +/- 107.74
Episode length: 993.00 +/- 14.00
New best mean reward!
Eval num_timesteps=2000000, episode_reward=2282.12 +/- 1081.24
Episode length: 624.40 +/- 278.25
Training time: 3766.763552427292
Test reward (avg +/- std): (2548.07213067711 +/- 1217.4967016170572) - Num episodes: 100


In [None]:
# Train on the SOURCE Walker2D environment with Truncated Normal Domain Randomization
# (--domain_rand=normal) applied to all masses except the torso, for 2M timesteps.
# Model id "walker2d_source_udr_normal_1" is loaded by the complete-randomization
# tests as models/PPO_model_walker2d_source_udr_normal_1.zip.
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --model_id="walker2d_source_udr_normal_1" --domain_rand=normal --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1136.91 +/- 271.86
Episode length: 357.60 +/- 60.79
Eval num_timesteps=1700000, episode_reward=1766.80 +/- 648.77
Episode length: 525.80 +/- 208.62
Eval num_timesteps=1800000, episode_reward=1837.84 +/- 890.00
Episode length: 578.00 +/- 265.57
Eval num_timesteps=1900000, episode_reward=1526.10 +/- 880.21
Episode length: 482.40 +/- 267.11
Eval num_timesteps=2000000, episode_reward=1118.11 +/- 331.35
Episode length: 323.20 +/- 81.42
Training time: 3788.8086972236633
Test reward (avg +/- std): (1433.6702532590398 +/- 696.907142093376) - Num episodes: 100


### Training on Custom Walker2D environment, with single mass randomization, using PPO

In [None]:
randomization_types = ["uniform", "normal"]
randomize_masses = ["randomize_thigh", "randomize_leg", "randomize_foot"]

walker_env = "CustomWalker2D-source-v0"

for domain_rand in randomization_types:
    for mass in randomize_masses:
        model_id = f"walker2d_source_{mass.split('_')[1]}_{domain_rand}"

        print(f"Training Walker2D - Environment: {walker_env} - Randomization: {domain_rand} - Mass: {mass}")
        !cd sim2real && python train.py --env={walker_env} --model_id={model_id} --domain_rand={domain_rand} --{mass} --total_timesteps=2000000 --lr_schedule="exponential" --verbose=0

Training Walker2D - Environment: CustomWalker2D-source-v0 - Randomization: uniform - Mass: randomize_thigh

--- WORKING ON CUSTOM WALKER 2D SOURCE ENVIRONMENT ---

Eval num_timesteps=100000, episode_reward=283.34 +/- 6.53
Episode length: 161.20 +/- 6.05
New best mean reward!
Eval num_timesteps=200000, episode_reward=372.42 +/- 5.57
Episode length: 184.20 +/- 4.45
New best mean reward!
Eval num_timesteps=300000, episode_reward=453.64 +/- 4.11
Episode length: 194.20 +/- 1.72
New best mean reward!
Eval num_timesteps=400000, episode_reward=550.83 +/- 36.47
Episode length: 205.40 +/- 19.09
New best mean reward!
Eval num_timesteps=500000, episode_reward=1876.98 +/- 797.39
Episode length: 726.40 +/- 335.38
New best mean reward!
Eval num_timesteps=600000, episode_reward=1668.96 +/- 874.06
Episode length: 696.60 +/- 372.27
Eval num_timesteps=700000, episode_reward=905.95 +/- 70.41
Episode length: 311.40 +/- 23.25
Eval num_timesteps=800000, episode_reward=1949.02 +/- 701.43
Episode length: 653.2

### Test models

**Without mass randomization:**

In [None]:
# Evaluate the models trained WITHOUT mass randomization over 1000 test episodes each.
# Every run renders and writes a video (reported under videos/<video_name>) and
# prints the average test reward +/- std.

# Source-trained model evaluated on the source environment.
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_1" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_source_to_source_test_video.mp4"

# Same source-trained model evaluated on the target environment (transfer case).
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_1" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_source_to_target_test_video.mp4"

# Target-trained model evaluated on the target environment.
print("\n----- TARGET ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_target_1" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_target_to_target_test_video.mp4"

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_source_to_source_test_video.mp4
Test reward (avg +/- std): (2376.825577337019 +/- 765.3996059101355) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_source_to_target_test_video.mp4
Test reward (avg +/- std): (2039.6931135233854 +/- 940.5783974139155) - Num episodes: 1000

----- TARGET ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_target_to_target_test_video

**With complete mass randomization:**

In [None]:
models = {
    "uniform": "models/PPO_model_walker2d_source_udr_uniform_1.zip",
    "normal": "models/PPO_model_walker2d_source_udr_normal_1.zip"
}

# Test for each type of randomization
for rand_type, model_path in models.items():
    print(f"\n===== TESTING {rand_type.upper()} RANDOMIZATION =====")

    # Test SOURCE ➔ SOURCE
    print("\n----- SOURCE ➔ SOURCE -----")
    !cd sim2real; python train.py --test={model_path} --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_{rand_type}_source_to_source_test_video.mp4"

    # Test SOURCE ➔ TARGET
    print("\n----- SOURCE ➔ TARGET -----")
    !cd sim2real; python train.py --test={model_path} --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_{rand_type}_source_to_target_test_video.mp4"


===== TESTING UNIFORM RANDOMIZATION =====

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_uniform_source_to_source_test_video.mp4
Test reward (avg +/- std): (2854.67438447908 +/- 583.6749631509572) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_uniform_source_to_target_test_video.mp4
Test reward (avg +/- std): (2861.9709338686243 +/- 576.7817310062762) - Num episodes: 1000

===== TESTING NORMAL RANDOMIZATION =====

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/

**With single mass randomization:**

In [None]:

randomization_types = ["uniform", "normal"]
randomize_masses = ["randomize_thigh", "randomize_leg", "randomize_foot"]

for domain_rand in randomization_types:
    for mass in randomize_masses:
        print()
        print(f"Randomization type: {domain_rand} - Randomizing mass: {mass}")

        model_name = f"models/PPO_model_walker2d_source_{mass.split('_')[1]}_{domain_rand}"
        video_name_source_to_source = f"walker2d_randomized_{domain_rand}_{mass.split('_')[1]}_source_to_source_test_video.mp4"
        video_name_source_to_target = f"walker2d_randomized_{domain_rand}_{mass.split('_')[1]}_source_to_target_test_video.mp4"

        print("\n----- SOURCE ➔ SOURCE -----")
        !cd sim2real && python train.py --test={model_name} --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name={video_name_source_to_source}

        print("\n----- SOURCE ➔ TARGET -----")
        !cd sim2real && python train.py --test={model_name} --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name={video_name_source_to_target}


Randomization type: uniform - Randomizing mass: randomize_thigh

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_randomized_uniform_thigh_source_to_source_test_video.mp4
Test reward (avg +/- std): (3360.9074214536113 +/- 785.0970548114917) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_randomized_uniform_thigh_source_to_target_test_video.mp4
Test reward (avg +/- std): (3345.010270724061 +/- 812.0615812999199) - Num episodes: 1000

Randomization type: uniform - Randomizing mass: randomize_leg

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Ep

## 5. BigFoot: Training on a custom Walker2D environment with foot masses tripled compared to the standard configuration (in the source env)

In [None]:
# BigFoot baseline: train on the SOURCE environment with no domain randomization
# (no --domain_rand flag), 2M timesteps.
# NOTE(review): nothing on this command line selects the tripled foot masses;
# presumably they are configured inside the environment definition — confirm.
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --model_id="walker2d_source_heavy_feet_no_dr" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1825.08 +/- 197.38
Episode length: 514.80 +/- 45.18
Eval num_timesteps=1700000, episode_reward=2652.27 +/- 1050.10
Episode length: 730.00 +/- 279.65
Eval num_timesteps=1800000, episode_reward=2502.61 +/- 1206.94
Episode length: 690.40 +/- 304.09
Eval num_timesteps=1900000, episode_reward=2580.78 +/- 1063.27
Episode length: 700.80 +/- 279.16
Eval num_timesteps=2000000, episode_reward=3491.44 +/- 532.00
Episode length: 888.60 +/- 120.64
New best mean reward!
Training time: 4293.380982160568
Test reward (avg +/- std): (2165.3284343592622 +/- 887.5447310093815) - Num episodes: 100


In [None]:
# BigFoot: train on the TARGET environment with no domain randomization, 2M timesteps.
# The run is tagged with model id "walker2d_target_heavy_feet_no_dr".
!cd sim2real; python train.py --env=CustomWalker2D-target-v0 --model_id="walker2d_target_heavy_feet_no_dr" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=2421.09 +/- 1118.74
Episode length: 665.20 +/- 275.14
Eval num_timesteps=1700000, episode_reward=3792.43 +/- 109.03
Episode length: 983.00 +/- 34.00
New best mean reward!
Eval num_timesteps=1800000, episode_reward=3073.00 +/- 490.45
Episode length: 888.40 +/- 136.68
Eval num_timesteps=1900000, episode_reward=3151.44 +/- 742.09
Episode length: 840.80 +/- 198.19
Eval num_timesteps=2000000, episode_reward=3101.77 +/- 1013.94
Episode length: 840.20 +/- 265.00
Training time: 4349.737284421921
Test reward (avg +/- std): (1885.2291754981588 +/- 946.7469577493339) - Num episodes: 100


In [None]:
# BigFoot: train on the SOURCE environment with Uniform Domain Randomization
# (--domain_rand=uniform), 2M timesteps.
# Model id "walker2d_source_heavy_feet_with_dr" is the checkpoint loaded by the
# "With domain randomization (UNIFORM)" tests.
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --domain_rand=uniform --model_id="walker2d_source_heavy_feet_with_dr" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=3327.23 +/- 745.29
Episode length: 853.60 +/- 167.70
Eval num_timesteps=1700000, episode_reward=3330.25 +/- 1490.77
Episode length: 758.40 +/- 305.95
Eval num_timesteps=1800000, episode_reward=3428.10 +/- 1626.68
Episode length: 741.40 +/- 316.91
Eval num_timesteps=1900000, episode_reward=3515.70 +/- 1266.67
Episode length: 793.00 +/- 254.25
Eval num_timesteps=2000000, episode_reward=3053.05 +/- 1522.19
Episode length: 732.80 +/- 327.29
Training time: 4388.302843570709
Test reward (avg +/- std): (2610.2305453573854 +/- 1339.756606106864) - Num episodes: 100


In [None]:
# BigFoot: train on the SOURCE environment with Truncated Normal Domain Randomization
# (--domain_rand=normal), 2M timesteps.
# Model id "walker2d_source_heavy_feet_with_dr_normal" is the checkpoint loaded by the
# "With domain randomization (NORMAL)" tests.
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --domain_rand=normal --model_id="walker2d_source_heavy_feet_with_dr_normal" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=1997.32 +/- 1226.82
Episode length: 487.40 +/- 257.07
Eval num_timesteps=1700000, episode_reward=3281.67 +/- 1467.58
Episode length: 830.60 +/- 330.37
New best mean reward!
Eval num_timesteps=1800000, episode_reward=3828.55 +/- 592.40
Episode length: 899.20 +/- 153.79
New best mean reward!
Eval num_timesteps=1900000, episode_reward=3204.10 +/- 1241.59
Episode length: 713.00 +/- 255.90
Eval num_timesteps=2000000, episode_reward=1464.28 +/- 837.48
Episode length: 374.60 +/- 176.18
Training time: 4394.285305023193
Test reward (avg +/- std): (2472.0408017628083 +/- 1532.396057900067) - Num episodes: 100


### BigFoot: Testing

NO Domain randomization:

In [None]:
# Evaluate the BigFoot model trained on the source environment WITHOUT domain
# randomization, 1000 episodes per run, with rendering and a saved video.

# Source-trained model on the source environment.
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_heavy_feet_no_dr" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_heavy_feet_no_dr_source_to_source_test.mp4"

# Same model on the target environment (transfer case).
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_heavy_feet_no_dr" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_heavy_feet_no_dr_source_to_target_test.mp4"


----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_heavy_feet_no_dr_source_to_source_test.mp4
Test reward (avg +/- std): (2957.9146232488424 +/- 894.1275858110675) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_heavy_feet_no_dr_source_to_target_test.mp4
Test reward (avg +/- std): (2894.8308793413203 +/- 896.5872505254711) - Num episodes: 1000


With domain randomization (UNIFORM)

In [None]:
# SOURCE --> SOURCE
# BigFoot model trained with uniform domain randomization, evaluated on the
# source environment for 1000 episodes (rendered, video saved).
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_heavy_feet_with_dr" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_heavy_feet_with_dr_source_to_source_test.mp4"

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_heavy_feet_with_dr_source_to_source_test.mp4
Test reward (avg +/- std): (4158.027530824367 +/- 757.5385500423763) - Num episodes: 1000


In [None]:
# SOURCE --> TARGET
# BigFoot model trained with uniform domain randomization, evaluated on the
# target environment for 1000 episodes (rendered, video saved).
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_heavy_feet_with_dr" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_heavy_feet_with_dr_source_to_target_test.mp4"

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_heavy_feet_with_dr_source_to_target_test.mp4
Test reward (avg +/- std): (4140.04923921812 +/- 784.2544573615471) - Num episodes: 1000


With domain randomization (NORMAL)

In [None]:
# SOURCE --> SOURCE
# BigFoot model trained with truncated normal domain randomization, evaluated on
# the source environment for 1000 episodes (rendered, video saved).
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_heavy_feet_with_dr_normal" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_heavy_feet_with_dr_normal_source_to_source_test.mp4"

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_heavy_feet_with_dr_normal_source_to_source_test.mp4
Test reward (avg +/- std): (2353.8253454171468 +/- 1781.6232670314496) - Num episodes: 1000


In [None]:
# SOURCE --> TARGET
# BigFoot model trained with truncated normal domain randomization, evaluated on
# the target environment for 1000 episodes (rendered, video saved).
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_heavy_feet_with_dr_normal" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_heavy_feet_with_dr_normal_source_to_target_test.mp4"

Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_heavy_feet_with_dr_normal_source_to_target_test.mp4
Test reward (avg +/- std): (2314.2939306035905 +/- 1770.9294288478968) - Num episodes: 1000


## 6. Fixed seed during training of the walker2d environment

**Seed Configuration**

To ensure reproducibility and analyze the impact of randomness, the seed was consistently set across all components of the project:

*NumPy:* For generating random numbers in numerical operations.

*PyTorch:* For initializing neural network weights and other stochastic operations.

*Gym:* To control randomness in the environment and action space.

*Python Random Module:* For any standard random functions.

The seed is specified with the `--seed` parameter, so that every source of randomness listed above is initialized from the same value and a run can be reproduced.



In [None]:
# Two baseline runs on the SOURCE environment that differ only in --seed (and
# model id); no --domain_rand flag is passed, so no domain randomization is requested.
# The seeded tests later load these checkpoints as
# models/PPO_model_walker2d_source_seed42 and models/PPO_model_walker2d_source_seed123.

# Training on SOURCE environment WITHOUT Uniform Domain Randomization (UDR) and SEED = 42
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --seed=42 --model_id="walker2d_source_seed42" --total_timesteps=2000000 --lr_schedule="exponential"

# Training on SOURCE environment WITHOUT Uniform Domain Randomization (UDR) and SEED = 123
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --seed=123 --model_id="walker2d_source_seed123" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=986.91 +/- 493.98
Episode length: 285.20 +/- 115.96
Eval num_timesteps=1700000, episode_reward=2631.03 +/- 1235.79
Episode length: 678.20 +/- 298.80
Eval num_timesteps=1800000, episode_reward=2793.55 +/- 1337.50
Episode length: 680.80 +/- 308.18
New best mean reward!
Eval num_timesteps=1900000, episode_reward=1376.64 +/- 1255.50
Episode length: 388.40 +/- 312.59
Eval num_timesteps=2000000, episode_reward=2601.54 +/- 1124.36
Episode length: 639.80 +/- 243.12
Training time: 4025.4372096061707
Test reward (avg +/- std): (1881.1000667336657 +/- 993.4588977343566) - Num episodes: 100


In [None]:
# Training on SOURCE environment with Uniform Domain Randomization (UDR) and SEED = 42
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --domain_rand=uniform --seed=42 --model_id="walker2d_source_udr_seed42" --total_timesteps=2000000 --lr_schedule="exponential"

# Training on TARGET environment with SEED = 42
!cd sim2real; python train.py --env=CustomWalker2D-target-v0 --seed=42 --model_id="walker2d_target_udr_seed42" --total_timesteps=2000000 --lr_schedule="exponential"

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=2872.74 +/- 1189.07
Episode length: 745.80 +/- 284.63
Eval num_timesteps=1700000, episode_reward=3648.63 +/- 1024.54
Episode length: 882.60 +/- 234.80
Eval num_timesteps=1800000, episode_reward=3055.46 +/- 1586.04
Episode length: 720.00 +/- 344.21
Eval num_timesteps=1900000, episode_reward=3631.84 +/- 1235.82
Episode length: 864.20 +/- 271.60
Eval num_timesteps=2000000, episode_reward=3836.39 +/- 1078.95
Episode length: 881.60 +/- 236.80
Training time: 3665.834237098694
Test reward (avg +/- std): (3386.996509596658 +/- 1294.4795078592426) - Num episodes: 100


In [None]:
# Seed-123 runs, executed back to back (2M timesteps each).

# Training on SOURCE environment with Uniform Domain Randomization (UDR) and SEED = 123
!cd sim2real; python train.py --env=CustomWalker2D-source-v0 --domain_rand=uniform --seed=123 --model_id="walker2d_source_udr_seed123" --total_timesteps=2000000 --lr_schedule="exponential"

# Training on TARGET environment with SEED = 123 (no --domain_rand flag is passed)
!cd sim2real; python train.py --env=CustomWalker2D-target-v0 --seed=123 --model_id="walker2d_target_seed123" --total_timesteps=2000000 --lr_schedule="exponential"



[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Eval num_timesteps=1600000, episode_reward=986.91 +/- 493.98
Episode length: 285.20 +/- 115.96
Eval num_timesteps=1700000, episode_reward=2631.03 +/- 1235.79
Episode length: 678.20 +/- 298.80
Eval num_timesteps=1800000, episode_reward=2793.55 +/- 1337.50
Episode length: 680.80 +/- 308.18
New best mean reward!
Eval num_timesteps=1900000, episode_reward=1376.64 +/- 1255.50
Episode length: 388.40 +/- 312.59
Eval num_timesteps=2000000, episode_reward=2601.54 +/- 1124.36
Episode length: 639.80 +/- 243.12
Training time: 3812.2031903266907
Test reward (avg +/- std): (1881.1000667336657 +/- 993.4588977343566) - Num episodes: 100


In [None]:
# Evaluate the seed-42 models trained without UDR, 1000 episodes per run,
# with rendering and a saved video for each configuration.
print("\n===== TESTING MODELS WITH SEED=42 WITHOUT UDR=====")

# SOURCE ➔ SOURCE: source-trained model on the source environment
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_seed42" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed42_no_udr_source_to_source_test.mp4"

# SOURCE ➔ TARGET: same source-trained model on the target environment
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_seed42" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed42_no_udr_source_to_target_test.mp4"

# TARGET ➔ TARGET: target-trained model on the target environment
print("\n----- TARGET ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_target_seed42" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed42_no_udr_target_to_target_test.mp4"


===== TESTING MODELS WITH SEED=42 WITHOUT UDR=====

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed42_no_udr_source_to_source_test.mp4
Test reward (avg +/- std): (4171.7620666513085 +/- 729.9510470707787) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed42_no_udr_source_to_target_test.mp4
Test reward (avg +/- std): (4222.225900392046 +/- 633.1598529827234) - Num episodes: 1000

----- TARGET ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 90

In [None]:
# Evaluate the seed-123 models trained without UDR, 1000 episodes per run,
# with rendering and a saved video for each configuration.
print("\n===== TESTING MODELS WITH SEED=123 WITHOUT UDR=====")

# SOURCE ➔ SOURCE: source-trained model on the source environment
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_seed123" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed123_no_udr_source_to_source_test.mp4"

# SOURCE ➔ TARGET: same source-trained model on the target environment
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_seed123" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed123_no_udr_source_to_target_test.mp4"

# TARGET ➔ TARGET: target-trained model on the target environment
print("\n----- TARGET ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_target_seed123" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed123_no_udr_target_to_target_test.mp4"


===== TESTING MODELS WITH SEED=123 WITHOUT UDR=====

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed123_no_udr_source_to_source_test.mp4
Test reward (avg +/- std): (2409.717632176848 +/- 1119.5198288526162) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed123_no_udr_source_to_target_test.mp4
Test reward (avg +/- std): (2450.2931949044223 +/- 1127.6418051425496) - Num episodes: 1000

----- TARGET ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episo

In [None]:
# Evaluate the seed-42 model trained WITH uniform domain randomization
# (walker2d_source_udr_seed42) on both environments, plus the seed-42 target model
# as reference; 1000 episodes per run, rendered, video saved.
print("\n===== TESTING MODELS WITH SEED=42 AND UDR=====")

# SOURCE ➔ SOURCE: UDR source-trained model on the source environment
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_udr_seed42" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed42_source_to_source_test.mp4"

# SOURCE ➔ TARGET: same UDR source-trained model on the target environment
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_udr_seed42" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed42_source_to_target_test.mp4"

# TARGET ➔ TARGET: target-trained model on the target environment
print("\n----- TARGET ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_target_seed42" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed42_target_to_target_test.mp4"


===== TESTING MODELS WITH SEED=42 =====

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed42_source_to_source_test.mp4
Test reward (avg +/- std): (1161.746603501175 +/- 137.62568608540383) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed42_source_to_target_test.mp4
Test reward (avg +/- std): (1161.746603501175 +/- 137.62568608540383) - Num episodes: 1000

----- TARGET ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save a

In [None]:
print("\n===== TESTING MODELS WITH SEED=123 AND UDR =====")

# SOURCE ➔ SOURCE
print("\n----- SOURCE ➔ SOURCE -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_seed123" --env=CustomWalker2D-source-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed123_source_to_source_test.mp4"

# SOURCE ➔ TARGET
print("\n----- SOURCE ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_source_seed123" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed123_source_to_target_test.mp4"

# TARGET ➔ TARGET
print("\n----- TARGET ➔ TARGET -----")
!cd sim2real; python train.py --test="models/PPO_model_walker2d_target_seed123" --env=CustomWalker2D-target-v0 --test_episodes=1000 --test_rendering --video_name="walker2d_seed123_target_to_target_test.mp4"


===== TESTING MODELS WITH SEED=123 =====

----- SOURCE ➔ SOURCE -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed123_source_to_source_test.mp4
Test reward (avg +/- std): (3857.710568920987 +/- 110.33301119525349) - Num episodes: 1000

----- SOURCE ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video save as videos/walker2d_seed123_source_to_target_test.mp4
Test reward (avg +/- std): (3857.710568920987 +/- 110.33301119525349) - Num episodes: 1000

----- TARGET ➔ TARGET -----
Testing...
Episode 0/1000
Episode 100/1000
Episode 200/1000
Episode 300/1000
Episode 400/1000
Episode 500/1000
Episode 600/1000
Episode 700/1000
Episode 800/1000
Episode 900/1000
Test video sav