In [None]:
!unzip custom_hopper.zip

In [1]:

import gym
from stable_baselines3 import SAC
from env.custom_hopper import *
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.sac.policies import MlpPolicy
from stable_baselines3.common.monitor import Monitor
#from optimize_hyperparam import optimize
from os.path import exists
#from statistics import mean

  from .autonotebook import tqdm as notebook_tqdm


<h3>Create the source-domain environment and train the agent</h3>

In [None]:
# Source-domain Hopper, wrapped in Monitor before it is handed to the agent.
source_env_id = "CustomHopper-source-v0"
env = Monitor(gym.make(source_env_id))

In [None]:
# Source-domain agent: train once and cache the result on disk;
# later runs reload the saved checkpoint instead of retraining.
if not exists("SAC_source_env.zip"):
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=50000, log_interval=10)
    model.save("SAC_source_env")
else:
    model = SAC.load("SAC_source_env")

<h3>Evaluate the agent on the source environment</h3>

In [None]:
# Separate Monitor-wrapped source-domain instance used only for evaluation.
eval_env_id = 'CustomHopper-source-v0'
eval_env = Monitor(gym.make(eval_env_id))

In [None]:
# Evaluate `model` on `eval_env` with deterministic actions and report
# the mean and standard deviation of the episode reward.
n_eval = 50
eval_stats = evaluate_policy(
    model,
    eval_env,
    n_eval_episodes=n_eval,
    deterministic=True,
)
mean_reward, std_reward = eval_stats
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

<h3>Create the target-domain environment and train the agent</h3>

In [2]:
# Target-domain Hopper. Wrapped in Monitor explicitly, like the source-domain
# environments in this notebook; without it SB3 adds the wrapper implicitly
# (the training log reports "Wrapping the env with a `Monitor` wrapper").
# NOTE: this rebinds `env`, which previously referred to the source domain.
env = Monitor(gym.make("CustomHopper-target-v0"))

In [3]:
# Target-domain agent: train and cache on the first run; on later runs
# reload the checkpoint and print the learning rate stored with it.
# NOTE: this rebinds `model`, replacing the source-domain agent.
if not exists("SAC_target_env.zip"):
    model = SAC(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=50000, log_interval=50)
    model.save("SAC_target_env")
else:
    model = SAC.load("SAC_target_env")
    print(f"Learning rate: target domain: {model.learning_rate}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 34.4     |
|    ep_rew_mean     | 43.1     |
| time/              |          |
|    episodes        | 50       |
|    fps             | 52       |
|    time_elapsed    | 32       |
|    total_timesteps | 1718     |
| train/             |          |
|    actor_loss      | -19.7    |
|    critic_loss     | 2.43     |
|    ent_coef        | 0.623    |
|    ent_coef_loss   | -2.14    |
|    learning_rate   | 0.0003   |
|    n_updates       | 1617     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 64.9     |
|    ep_rew_mean     | 126      |
| time/              |          |
|    episodes        | 100      |
|    fps             | 39       |
|    time_elapsed    | 162      |
|    total_timesteps | 6487     |
| train/             |

<h3>Hyperparameter optimization for source domain</h3>

In [None]:
# Launch the external hyperparameter search script for SAC on the source domain.
# The flag semantics are defined by optimize_hyperparam.py, which is not part of
# this notebook; presumably -n is the training budget per trial, --n-trials the
# number of trials and --n-jobs the number of parallel workers -- TODO confirm.
!python3 optimize_hyperparam.py --algo sac --env CustomHopper-source-v0 -n 50000 --n-trials 10 -optimize --n-jobs 8 --conf-file standard_config.yml

<h3>Train the source-domain agent using optimized hyperparameters</h3>

In [None]:
# Source-domain agent trained with the tuned hyperparameters.
# Reload the cached checkpoint when present; otherwise train and save it.
# The file name checked here must match the one written by save() below
# (SB3 appends ".zip" to the path given to save()).
if exists("SAC_source_env_opt.zip"):
    model_opt = SAC.load("SAC_source_env_opt")
else:
    # Build the source env here: by this point `env` has been rebound to the
    # target domain by an earlier cell, so reusing it would train on the wrong domain.
    source_env = Monitor(gym.make("CustomHopper-source-v0"))

    # Tuned hyperparameters (hard-coded values for this run).
    gamma = 0.999
    lr = 0.003
    batch_size = 128
    buff_size = 10000
    learning_starts = 10000
    train_freq = 10
    tau = 0.01
    log_std_init = -3.064007572504874

    model_opt = SAC(
        MlpPolicy,
        source_env,
        verbose=1,
        gamma=gamma,
        learning_rate=lr,
        batch_size=batch_size,
        buffer_size=buff_size,
        learning_starts=learning_starts,
        train_freq=train_freq,
        tau=tau,
        # log_std_init is a policy argument, so it is forwarded via policy_kwargs
        # (it was previously defined but never passed to the model).
        policy_kwargs=dict(log_std_init=log_std_init),
    )
    model_opt.learn(total_timesteps=50000, log_interval=50)
    # Save the optimized model itself (not the unrelated `model` variable).
    model_opt.save("SAC_source_env_opt")

<h3>Evaluate the optimized-hyperparameter agent on the source environment</h3>

In [None]:
# Evaluate `model_opt` on `eval_env` with deterministic actions and report
# the mean and standard deviation of the episode reward.
n_eval = 50
opt_eval_stats = evaluate_policy(
    model_opt,
    eval_env,
    n_eval_episodes=n_eval,
    deterministic=True,
)
mean_reward, std_reward = opt_eval_stats
print(f"mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

<h3>Hyperparameter optimization for target domain</h3>

In [None]:
# Launch the external hyperparameter search script for SAC on the target domain.
# The flag semantics are defined by optimize_hyperparam.py, which is not part of
# this notebook.
# NOTE(review): unlike the source-domain search, this run uses -n 500 (vs 50000),
# omits --n-trials and uses --n-jobs 4 (vs 8) -- confirm this is intentional and
# not a leftover from a quick smoke test.
!python3 optimize_hyperparam.py --algo sac --env CustomHopper-target-v0 -n 500 -optimize --n-jobs 4 --conf-file standard_config.yml