In [2]:
import os
import time
import random

import gym 
import optuna
from FurutaPendulum import FurutaPendulum
from wrappers import StepTimeCtrl, HistoryWrapper

from stable_baselines3 import SAC
from stable_baselines3 import PPO
from stable_baselines3 import TD3
from stable_baselines3.common.vec_env import VecFrameStack
from stable_baselines3.common.evaluation import evaluate_policy

In [3]:
MAX_STEP_TIME = 1/60 #s
env = HistoryWrapper(StepTimeCtrl(FurutaPendulum(1000, MAX_STEP_TIME)),alpha=1e0)

In [None]:
env.uC.read_state()

In [None]:
env.update_state()
env.get_obs()

In [None]:
from stable_baselines3.common.env_checker import check_env
check_env(env, warn=True)

In [None]:
def optimize_sac(trial):
    # Define the hyperparameters to optimize
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2)
    train_freq = trial.suggest_int('train_freq', 1, 10)
    target_entropy = trial.suggest_float('target_entropy', -5, 5)
    sde_sample_freq = trial.suggest_float('sde_sample_freq', 10, 70)

    # Create the environment
    MAX_STEP_TIME = 1/60 #s
    env = HistoryWrapper(StepTimeCtrl(FurutaPendulum(1000, MAX_STEP_TIME)))
    # Create the SAC agent
    model = SAC("MlpPolicy", env, use_sde=True, sde_sample_freq=sde_sample_freq, 
                learning_rate=learning_rate,
                target_entropy=target_entropy, learning_starts=500,
                train_freq=(train_freq, "episode"))
    # Train the agent
    model.learn(total_timesteps=5000)
    env.close()
    time.sleep(60 * 5)
    # Get the final reward
    return model.get_average_reward()

study = optuna.create_study(direction='maximize')
study.optimize(optimize_sac, n_trials=100)

print(study.best_params)

In [None]:
env.close()


In [None]:
env.reset()

In [5]:
log_path = os.path.join('Training', 'Logs')
model = SAC("MlpPolicy", env, verbose=1, tensorboard_log=log_path, use_sde=True, sde_sample_freq=64)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
log_path = os.path.join('Training', 'Logs')
model = TD3("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

In [None]:
log_path = os.path.join('Training', 'Logs')
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path, use_sde=True, sde_sample_freq=128)

In [None]:
model_path = os.path.join('Training', 'Models','SAC_280K')
model = SAC.load(model_path, env)

In [6]:
for i in range(0,20):
    model.learn(total_timesteps=20000)
    model_path = os.path.join('Training', 'Models','SAC_20k'+ '_' + str(i))
    model.save(model_path)
    env.uC.reset()
    time.sleep(60 * 3)

Logging to Training\Logs\SAC_1












----------------------------------
| rollout/           |           |
|    ep_len_mean     | 270       |
|    ep_rew_mean     | -3.45e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 10        |
|    time_elapsed    | 100       |
|    total_timesteps | 1079      |
| train/             |           |
|    actor_loss      | 47.5      |
|    critic_loss     | 49.4      |
|    ent_coef        | 0.888     |
|    ent_coef_loss   | -0.112    |
|    learning_rate   | 0.0003    |
|    n_updates       | 978       |
|    std             | 0.0501    |
----------------------------------


----------------------------------
| rollout/           |           |
|    ep_len_mean     | 151       |
|    ep_rew_mean     | -2.45e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 9         |
|    time_elapsed    | 124       |
|    total_timesteps | 1210      |
| train/             |           |
|    actor_loss      | 57        |
|    critic_loss     | 6.9e+03   |
|    ent_coef        | 0.872     |
|    ent_coef_loss   | 0.0855    |
|    learning_rate   | 0.0003    |
|    n_updates       | 1109      |
|    std             | 0.0501    |
----------------------------------




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 128       |
|    ep_rew_mean     | -2.19e+03 |
| time/              |           |
|    episodes        | 12        |
|    fps             | 9         |
|    time_elapsed    | 162       |
|    total_timesteps | 1540      |
| train/             |           |
|    actor_loss      | 79.7      |
|    critic_loss     | 2.64e+03  |
|    ent_coef        | 0.865     |
|    ent_coef_loss   | 0.0602    |
|    learning_rate   | 0.0003    |
|    n_updates       | 1439      |
|    std             | 0.0499    |
----------------------------------


----------------------------------
| rollout/           |           |
|    ep_len_mean     | 108       |
|    ep_rew_mean     | -2.09e+03 |
| time/              |           |
|    episodes        | 16        |
|    fps             | 9         |
|    time_elapsed    | 189       |
|    total_timesteps | 1728      |
| train/             |           |
|    actor_loss      | 92.6      |
|    critic_loss     | 1.26e+04  |
|    ent_coef        | 0.873     |
|    ent_coef_loss   | 0.106     |
|    learning_rate   | 0.0003    |
|    n_updates       | 1627      |
|    std             | 0.0499    |
----------------------------------




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 97.1      |
|    ep_rew_mean     | -2.09e+03 |
| time/              |           |
|    episodes        | 20        |
|    fps             | 8         |
|    time_elapsed    | 220       |
|    total_timesteps | 1942      |
| train/             |           |
|    actor_loss      | 118       |
|    critic_loss     | 1.06e+04  |
|    ent_coef        | 0.879     |
|    ent_coef_loss   | -0.0152   |
|    learning_rate   | 0.0003    |
|    n_updates       | 1841      |
|    std             | 0.0498    |
----------------------------------














----------------------------------
| rollout/           |           |
|    ep_len_mean     | 127       |
|    ep_rew_mean     | -2.32e+03 |
| time/              |           |
|    episodes        | 24        |
|    fps             | 9         |
|    time_elapsed    | 319       |
|    total_timesteps | 3037      |
| train/             |           |
|    actor_loss      | 163       |
|    critic_loss     | 1.63e+03  |
|    ent_coef        | 0.996     |
|    ent_coef_loss   | 0.00304   |
|    learning_rate   | 0.0003    |
|    n_updates       | 2936      |
|    std             | 0.0495    |
----------------------------------






















----------------------------------
| rollout/           |           |
|    ep_len_mean     | 169       |
|    ep_rew_mean     | -2.64e+03 |
| time/              |           |
|    episodes        | 28        |
|    fps             | 10        |
|    time_elapsed    | 466       |
|    total_timesteps | 4742      |
| train/             |           |
|    actor_loss      | 242       |
|    critic_loss     | 1.84e+03  |
|    ent_coef        | 1.35      |
|    ent_coef_loss   | -0.142    |
|    learning_rate   | 0.0003    |
|    n_updates       | 4641      |
|    std             | 0.0495    |
----------------------------------


























----------------------------------
| rollout/           |           |
|    ep_len_mean     | 211       |
|    ep_rew_mean     | -2.98e+03 |
| time/              |           |
|    episodes        | 32        |
|    fps             | 10        |
|    time_elapsed    | 638       |
|    total_timesteps | 6763      |
| train/             |           |
|    actor_loss      | 318       |
|    critic_loss     | 6.78e+03  |
|    ent_coef        | 1.63      |
|    ent_coef_loss   | -0.261    |
|    learning_rate   | 0.0003    |
|    n_updates       | 6662      |
|    std             | 0.0498    |
----------------------------------




















----------------------------------
| rollout/           |           |
|    ep_len_mean     | 231       |
|    ep_rew_mean     | -3.12e+03 |
| time/              |           |
|    episodes        | 36        |
|    fps             | 10        |
|    time_elapsed    | 782       |
|    total_timesteps | 8311      |
| train/             |           |
|    actor_loss      | 377       |
|    critic_loss     | 721       |
|    ent_coef        | 1.64      |
|    ent_coef_loss   | -0.0122   |
|    learning_rate   | 0.0003    |
|    n_updates       | 8210      |
|    std             | 0.0499    |
----------------------------------














----------------------------------
| rollout/           |           |
|    ep_len_mean     | 236       |
|    ep_rew_mean     | -3.19e+03 |
| time/              |           |
|    episodes        | 40        |
|    fps             | 10        |
|    time_elapsed    | 884       |
|    total_timesteps | 9427      |
| train/             |           |
|    actor_loss      | 428       |
|    critic_loss     | 3.56e+03  |
|    ent_coef        | 1.47      |
|    ent_coef_loss   | -0.109    |
|    learning_rate   | 0.0003    |
|    n_updates       | 9326      |
|    std             | 0.0498    |
----------------------------------




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 220      |
|    ep_rew_mean     | -3.1e+03 |
| time/              |          |
|    episodes        | 44       |
|    fps             | 10       |
|    time_elapsed    | 917      |
|    total_timesteps | 9672     |
| train/             |          |
|    actor_loss      | 444      |
|    critic_loss     | 1.03e+03 |
|    ent_coef        | 1.45     |
|    ent_coef_loss   | 0.0217   |
|    learning_rate   | 0.0003   |
|    n_updates       | 9571     |
|    std             | 0.0499   |
---------------------------------




----------------------------------
| rollout/           |           |
|    ep_len_mean     | 208       |
|    ep_rew_mean     | -2.99e+03 |
| time/              |           |
|    episodes        | 48        |
|    fps             | 10        |
|    time_elapsed    | 956       |
|    total_timesteps | 10002     |
| train/             |           |
|    actor_loss      | 454       |
|    critic_loss     | 1.52e+03  |
|    ent_coef        | 1.42      |
|    ent_coef_loss   | -0.0587   |
|    learning_rate   | 0.0003    |
|    n_updates       | 9901      |
|    std             | 0.0499    |
----------------------------------




























----------------------------------
| rollout/           |           |
|    ep_len_mean     | 233       |
|    ep_rew_mean     | -3.18e+03 |
| time/              |           |
|    episodes        | 52        |
|    fps             | 10        |
|    time_elapsed    | 1132      |
|    total_timesteps | 12102     |
| train/             |           |
|    actor_loss      | 523       |
|    critic_loss     | 9.87e+03  |
|    ent_coef        | 1.53      |
|    ent_coef_loss   | 0.0703    |
|    learning_rate   | 0.0003    |
|    n_updates       | 12001     |
|    std             | 0.0502    |
----------------------------------




























----------------------------------
| rollout/           |           |
|    ep_len_mean     | 253       |
|    ep_rew_mean     | -3.35e+03 |
| time/              |           |
|    episodes        | 56        |
|    fps             | 10        |
|    time_elapsed    | 1306      |
|    total_timesteps | 14183     |
| train/             |           |
|    actor_loss      | 574       |
|    critic_loss     | 587       |
|    ent_coef        | 1.61      |
|    ent_coef_loss   | 0.319     |
|    learning_rate   | 0.0003    |
|    n_updates       | 14082     |
|    std             | 0.0507    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 303       |
|    ep_rew_mean     | -3.73e+03 |
| time/              |           |
|    episodes        | 60        |
|    fps             | 11        |
|    time_elapsed    | 1623      |
|    total_timesteps | 18183     |
| train/             |           |
|    actor_loss      | 645       |
|    critic_loss     | 282       |
|    ent_coef        | 1.32      |
|    ent_coef_loss   | -0.0646   |
|    learning_rate   | 0.0003    |
|    n_updates       | 18082     |
|    std             | 0.0512    |
----------------------------------


























Logging to Training\Logs\SAC_2


















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -8.25e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 12        |
|    time_elapsed    | 325       |
|    total_timesteps | 4000      |
| train/             |           |
|    actor_loss      | 737       |
|    critic_loss     | 253       |
|    ent_coef        | 0.933     |
|    ent_coef_loss   | -0.0574   |
|    learning_rate   | 0.0003    |
|    n_updates       | 23799     |
|    std             | 0.0516    |
----------------------------------












































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 931       |
|    ep_rew_mean     | -7.43e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 11        |
|    time_elapsed    | 624       |
|    total_timesteps | 7447      |
| train/             |           |
|    actor_loss      | 783       |
|    critic_loss     | 168       |
|    ent_coef        | 0.811     |
|    ent_coef_loss   | -0.0321   |
|    learning_rate   | 0.0003    |
|    n_updates       | 27246     |
|    std             | 0.0518    |
----------------------------------










































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 887       |
|    ep_rew_mean     | -7.42e+03 |
| time/              |           |
|    episodes        | 12        |
|    fps             | 12        |
|    time_elapsed    | 881       |
|    total_timesteps | 10644     |
| train/             |           |
|    actor_loss      | 810       |
|    critic_loss     | 295       |
|    ent_coef        | 0.81      |
|    ent_coef_loss   | 0.0314    |
|    learning_rate   | 0.0003    |
|    n_updates       | 30443     |
|    std             | 0.0521    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 915       |
|    ep_rew_mean     | -7.78e+03 |
| time/              |           |
|    episodes        | 16        |
|    fps             | 12        |
|    time_elapsed    | 1201      |
|    total_timesteps | 14644     |
| train/             |           |
|    actor_loss      | 847       |
|    critic_loss     | 201       |
|    ent_coef        | 0.786     |
|    ent_coef_loss   | -0.00464  |
|    learning_rate   | 0.0003    |
|    n_updates       | 34443     |
|    std             | 0.0526    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 932       |
|    ep_rew_mean     | -7.94e+03 |
| time/              |           |
|    episodes        | 20        |
|    fps             | 12        |
|    time_elapsed    | 1521      |
|    total_timesteps | 18644     |
| train/             |           |
|    actor_loss      | 881       |
|    critic_loss     | 278       |
|    ent_coef        | 0.716     |
|    ent_coef_loss   | 0.0891    |
|    learning_rate   | 0.0003    |
|    n_updates       | 38443     |
|    std             | 0.0531    |
----------------------------------


















Logging to Training\Logs\SAC_3


















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -9.14e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 13        |
|    time_elapsed    | 304       |
|    total_timesteps | 4000      |
| train/             |           |
|    actor_loss      | 909       |
|    critic_loss     | 82.9      |
|    ent_coef        | 0.589     |
|    ent_coef_loss   | 0.12      |
|    learning_rate   | 0.0003    |
|    n_updates       | 43699     |
|    std             | 0.0534    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -8.78e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 12        |
|    time_elapsed    | 625       |
|    total_timesteps | 8000      |
| train/             |           |
|    actor_loss      | 922       |
|    critic_loss     | 369       |
|    ent_coef        | 0.557     |
|    ent_coef_loss   | -0.0943   |
|    learning_rate   | 0.0003    |
|    n_updates       | 47699     |
|    std             | 0.0536    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -7.91e+03 |
| time/              |           |
|    episodes        | 12        |
|    fps             | 12        |
|    time_elapsed    | 946       |
|    total_timesteps | 12000     |
| train/             |           |
|    actor_loss      | 940       |
|    critic_loss     | 187       |
|    ent_coef        | 0.553     |
|    ent_coef_loss   | -0.467    |
|    learning_rate   | 0.0003    |
|    n_updates       | 51699     |
|    std             | 0.0542    |
----------------------------------








































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 938       |
|    ep_rew_mean     | -7.36e+03 |
| time/              |           |
|    episodes        | 16        |
|    fps             | 12        |
|    time_elapsed    | 1189      |
|    total_timesteps | 15004     |
| train/             |           |
|    actor_loss      | 941       |
|    critic_loss     | 170       |
|    ent_coef        | 0.564     |
|    ent_coef_loss   | 0.101     |
|    learning_rate   | 0.0003    |
|    n_updates       | 54703     |
|    std             | 0.0547    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 950       |
|    ep_rew_mean     | -7.29e+03 |
| time/              |           |
|    episodes        | 20        |
|    fps             | 12        |
|    time_elapsed    | 1509      |
|    total_timesteps | 19004     |
| train/             |           |
|    actor_loss      | 951       |
|    critic_loss     | 101       |
|    ent_coef        | 0.578     |
|    ent_coef_loss   | 0.0225    |
|    learning_rate   | 0.0003    |
|    n_updates       | 58703     |
|    std             | 0.0554    |
----------------------------------














Logging to Training\Logs\SAC_4


















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -8.87e+03 |
| time/              |           |
|    episodes        | 4         |
|    fps             | 13        |
|    time_elapsed    | 305       |
|    total_timesteps | 4000      |
| train/             |           |
|    actor_loss      | 952       |
|    critic_loss     | 94        |
|    ent_coef        | 0.578     |
|    ent_coef_loss   | -0.863    |
|    learning_rate   | 0.0003    |
|    n_updates       | 63599     |
|    std             | 0.0569    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -8.68e+03 |
| time/              |           |
|    episodes        | 8         |
|    fps             | 12        |
|    time_elapsed    | 628       |
|    total_timesteps | 8000      |
| train/             |           |
|    actor_loss      | 956       |
|    critic_loss     | 51.1      |
|    ent_coef        | 0.51      |
|    ent_coef_loss   | -0.692    |
|    learning_rate   | 0.0003    |
|    n_updates       | 67599     |
|    std             | 0.0582    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -8.42e+03 |
| time/              |           |
|    episodes        | 12        |
|    fps             | 12        |
|    time_elapsed    | 949       |
|    total_timesteps | 12000     |
| train/             |           |
|    actor_loss      | 954       |
|    critic_loss     | 121       |
|    ent_coef        | 0.493     |
|    ent_coef_loss   | -0.384    |
|    learning_rate   | 0.0003    |
|    n_updates       | 71599     |
|    std             | 0.0592    |
----------------------------------




















































----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1e+03     |
|    ep_rew_mean     | -8.21e+03 |
| time/              |           |
|    episodes        | 16        |
|    fps             | 12        |
|    time_elapsed    | 1282      |
|    total_timesteps | 16000     |
| train/             |           |
|    actor_loss      | 956       |
|    critic_loss     | 174       |
|    ent_coef        | 0.467     |
|    ent_coef_loss   | 0.0625    |
|    learning_rate   | 0.0003    |
|    n_updates       | 75599     |
|    std             | 0.0598    |
----------------------------------


































KeyboardInterrupt: 

In [None]:
episodes = 20
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

In [None]:
env.observation_space.shape

In [None]:
evaluate_policy(model, env, n_eval_episodes=10, render=False)

In [None]:
obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    if done is True:
        obs = env.reset()

# 8. View logs

In [None]:
log_path = os.path.join('Training', 'Logs')
training_log_path = os.path.join(log_path,)
!tensorboard --logdir={training_log_path} --host localhost --port 8088