In [1]:
from email.mime import image
from urllib.request import FancyURLopener
import torch
import numpy as np
import argparse
from torch import nn as nn
from torch.nn import functional as F
from dataclasses import asdict
from dataclasses import dataclass
import os
import gtimer as gt
import random
import time


from lifelong_rl.torch.modules import LayerNorm
from lifelong_rl.core import logger
import lifelong_rl.torch.pytorch_util as ptu
from scripts.get_config import get_rorl_config
from experiment_utils.prepare_data import load_hdf5
from lifelong_rl.core.logging.logging_setup import setup_logger
from lifelong_rl.data_management.replay_buffers.env_replay_buffer import EnvReplayBuffer
from lifelong_rl.data_management.replay_buffers.mujoco_replay_buffer import MujocoReplayBuffer
from lifelong_rl.envs.env_processor import make_env
from lifelong_rl.envs.env_utils import get_dim
from lifelong_rl.samplers.data_collector.path_collector import MdpPathCollector, LatentPathCollector
from lifelong_rl.samplers.data_collector.step_collector import MdpStepCollector, RFCollector, \
    GoalConditionedReplayStepCollector
from experiment_utils.launch_experiment import launch_experiment
from experiment_configs.algorithms.offline import get_offline_algorithm
from lifelong_rl.samplers.utils.model_rollout_functions import policy
from lifelong_rl.samplers.utils.rollout_functions import rollout_with_attack, rollout


No module named 'flow'
No module named 'carla'
pybullet build time: Oct 11 2021 20:59:00


In [None]:

# Data class of Namespace(base_log_dir='results', deterministic_backup=False, env_name='halfcheetah-medium-v2', epoch=3000, eta=-1.0, eval_attack=False, eval_attack_eps=0.01, eval_attack_mode='random', eval_no_training=False, exp_prefix='RORL', load_Qs='', load_config_type='attack', load_path='', log_to_tensorboard=False, max_q_backup=False, norm_input=True, num_qs=10, num_samples=20, plr=0.0003, policy_smooth_eps=0.0, policy_smooth_reg=1, q_ood_eps=0.0, q_ood_reg=0, q_ood_uncertainty_decay=0, q_ood_uncertainty_reg=0, q_ood_uncertainty_reg_min=0, q_smooth_eps=0.0, q_smooth_reg=0.005, q_smooth_tau=0.2, qlr=0.0003, reward_mean=False, reward_norm=False, reward_std=False, seed=0, shift_reward_minzero=False, use_cpu=False)

@dataclass
class Args:
    """Notebook stand-in for the argparse ``Namespace`` of the RORL training script.

    Field names and defaults follow the ``Namespace(...)`` dump recorded in the
    comment above this class, so an instance can be passed wherever that
    namespace was expected (e.g. ``get_config_from_args``).

    The regularisation / decay coefficients are annotated as ``float``: the
    experiment configuration in this notebook stores them as floats (``1.0``,
    ``0.0``), so the previous ``int`` annotations were misleading. Their default
    values are numerically unchanged.
    """

    # --- experiment bookkeeping ---
    base_log_dir: str = 'results'
    deterministic_backup: bool = False
    env_name: str = 'halfcheetah-medium-v2'
    epoch: int = 3000  # copied into offline_kwargs['num_epochs']
    eta: float = -1.0  # only values > 0 are reflected in the run-name postfix

    # --- evaluation / checkpoint loading ---
    eval_attack: bool = False
    eval_attack_eps: float = 0.01
    eval_attack_mode: str = 'random'
    eval_no_training: bool = False
    exp_prefix: str = 'RORL'
    load_Qs: str = ''
    # Non-empty: smoothing/OOD hyperparameters are taken from get_rorl_config
    # instead of from the fields below.
    load_config_type: str = 'attack'
    load_path: str = ''
    log_to_tensorboard: bool = False

    # --- trainer hyperparameters ---
    max_q_backup: bool = False
    norm_input: bool = True
    num_qs: int = 10
    num_samples: int = 20
    plr: float = 0.0003  # copied into trainer_kwargs['policy_lr']
    policy_smooth_eps: float = 0.0
    policy_smooth_reg: float = 1.0
    q_ood_eps: float = 0.0
    q_ood_reg: float = 0.0
    q_ood_uncertainty_decay: float = 0.0
    q_ood_uncertainty_reg: float = 0.0
    q_ood_uncertainty_reg_min: float = 0.0
    q_smooth_eps: float = 0.0
    q_smooth_reg: float = 0.005
    q_smooth_tau: float = 0.2
    qlr: float = 0.0003  # copied into trainer_kwargs['qf_lr']

    # --- reward preprocessing flags ---
    reward_mean: bool = False
    reward_norm: bool = False
    reward_std: bool = False

    # --- misc ---
    seed: int = 0
    shift_reward_minzero: bool = False
    use_cpu: bool = False


In [None]:
def _experiment_kwargs_from_args(args, variant):
    """Build the launcher keyword arguments (run name, device, logging, reward options)."""
    trainer_kwargs = variant['trainer_kwargs']

    # Run-name postfix: prefix + ensemble size, then one tag per non-default option.
    exp_postfix = '{}_{}'.format(args.exp_prefix, args.num_qs)
    if trainer_kwargs['max_q_backup']:
        exp_postfix += '_maxq'
    if trainer_kwargs['deterministic_backup']:
        exp_postfix += '_detq'
    if args.eta > 0:
        exp_postfix += '_eta{:.2f}'.format(args.eta)
    if args.reward_mean:
        exp_postfix += '_mean'
    if args.reward_std > 0:
        exp_postfix += '_std'

    # ``use_cpu`` is checked first so CUDA is not even queried when the CPU is forced.
    use_gpu = (not getattr(args, 'use_cpu', False)) and bool(torch.cuda.is_available())

    return dict(
        exp_postfix=exp_postfix,
        use_gpu=use_gpu,
        # Falls back to True (the previously hard-coded value) if ``args`` has no such field.
        log_to_tensorboard=getattr(args, 'log_to_tensorboard', True),
        base_log_dir=args.base_log_dir,
        data_args={
            'reward_mean': args.reward_mean,
            'reward_std': args.reward_std,
            'shift_reward_minzero': args.shift_reward_minzero,
            'reward_norm': args.reward_norm,
        },
    )


def get_config_from_args(args, return_experiment_kwargs=False):
    """Translate parsed command-line options into the experiment ``variant`` dict.

    Args:
        args: object exposing the script options as attributes (an argparse
            ``Namespace`` or the ``Args`` dataclass).
        return_experiment_kwargs: when True, also return the launcher keyword
            arguments (``exp_postfix``, ``use_gpu``, ``log_to_tensorboard``,
            ``base_log_dir``, ``data_args``). These used to be computed and
            then thrown away; the flag makes them available without changing
            the default return value.

    Returns:
        ``variant`` (dict) by default, or ``(variant, experiment_kwargs)`` when
        ``return_experiment_kwargs`` is True.

    Notes:
        If ``args.load_config_type`` is non-empty, the smoothing / OOD trainer
        hyperparameters are read from
        ``get_rorl_config(args.env_name, args.load_config_type)``; otherwise
        they are taken from the same-named attributes of ``args``.
    """
    # Trainer hyperparameters supplied either by a named RORL config or by ``args``.
    smoothing_keys = (
        'num_samples', 'policy_smooth_eps', 'policy_smooth_reg', 'q_smooth_eps',
        'q_smooth_reg', 'q_smooth_tau', 'q_ood_eps', 'q_ood_reg', 'q_ood_uncertainty_reg',
        'q_ood_uncertainty_reg_min', 'q_ood_uncertainty_decay',
    )

    trainer_kwargs = dict(
        discount=0.99,
        soft_target_tau=5e-3,
        # SAC-N
        policy_lr=args.plr,
        qf_lr=args.qlr,
        use_automatic_entropy_tuning=True,
        policy_eval_start=0,
        num_qs=args.num_qs,
        target_update_period=1,
        max_q_backup=args.max_q_backup,
        deterministic_backup=args.deterministic_backup,
        # EDAC
        eta=args.eta,
    )

    # smooth
    if args.load_config_type != '':
        rorl_config = get_rorl_config(args.env_name, args.load_config_type)
        for key in smoothing_keys:
            trainer_kwargs[key] = rorl_config[key]
    else:
        for key in smoothing_keys:
            trainer_kwargs[key] = getattr(args, key)

    variant = dict(
        algorithm='',
        collector_type='step',
        env_name=args.env_name,
        env_kwargs=dict(),
        replay_buffer_size=int(2e6),
        reward_mean=args.reward_mean,  # added for easy config checking
        reward_std=args.reward_std,  # added for easy config checking
        policy_kwargs=dict(
            layer_size=256,
            num_q_layers=3,
            num_p_layers=3,
        ),
        trainer_kwargs=trainer_kwargs,
        offline_kwargs=dict(
            num_epochs=args.epoch,
            num_eval_steps_per_epoch=1000,
            num_trains_per_train_loop=1000,
            max_path_length=1000,
            batch_size=256,
            save_snapshot_freq=500,
        ),
        seed=args.seed,
        load_path=args.load_path,
        load_Qs=args.load_Qs,
        eval_no_training=args.eval_no_training,
        eval_attack=args.eval_attack,
        eval_attack_eps=args.eval_attack_eps,
        eval_attack_mode=args.eval_attack_mode,
        # normalize input
        norm_input=args.norm_input,
    )

    if return_experiment_kwargs:
        return variant, _experiment_kwargs_from_args(args, variant)
    return variant

In [2]:
@dataclass
class ExperimentConfig:
    """Typed container for the experiment variant used in this notebook.

    The fields mirror the keys of the variant dict built by
    ``get_config_from_args``. Every field is required (no defaults); the
    instance is converted with ``dataclasses.asdict`` before being handed to
    ``launch_experiment``.
    """
    # --- environment / data ---
    algorithm: str
    collector_type: str
    env_name: str
    env_kwargs: dict
    replay_buffer_size: int
    reward_mean: bool
    reward_std: bool
    # --- nested keyword-argument groups ---
    policy_kwargs: dict  # read by get_config: layer_size, num_q_layers, num_p_layers
    trainer_kwargs: dict  # expanded into SACTrainer(...) by get_config
    offline_kwargs: dict
    # --- run control ---
    seed: int
    load_path: str  # non-empty: get_config calls trainer.load_snapshot on it
    load_Qs: str  # non-empty: get_config calls trainer.load_qfs on it
    eval_no_training: bool
    eval_attack: bool
    eval_attack_eps: float
    eval_attack_mode: str
    norm_input: bool

# Experiment variant for halfcheetah-medium-v2, written out explicitly.
config = ExperimentConfig(
    algorithm='',
    collector_type='step',
    env_name='halfcheetah-medium-v2',
    env_kwargs=dict(),
    replay_buffer_size=2_000_000,
    reward_mean=False,
    reward_std=False,
    policy_kwargs=dict(layer_size=256, num_q_layers=3, num_p_layers=3),
    trainer_kwargs=dict(
        # SAC / ensemble settings
        discount=0.99,
        soft_target_tau=5e-3,
        policy_lr=3e-4,
        qf_lr=3e-4,
        use_automatic_entropy_tuning=True,
        policy_eval_start=0,
        num_qs=10,
        target_update_period=1,
        max_q_backup=False,
        deterministic_backup=False,
        eta=-1.0,
        # smoothing / OOD settings
        num_samples=20,
        policy_smooth_eps=0.05,
        policy_smooth_reg=1.0,
        q_smooth_eps=0.03,
        q_smooth_reg=1e-4,
        q_smooth_tau=0.2,
        q_ood_eps=0.0,
        q_ood_reg=0.0,
        q_ood_uncertainty_reg=0.0,
        q_ood_uncertainty_reg_min=0.0,
        q_ood_uncertainty_decay=0.0,
    ),
    offline_kwargs=dict(
        num_epochs=3000,
        num_eval_steps_per_epoch=1000,
        num_trains_per_train_loop=1000,
        max_path_length=1000,
        batch_size=256,
        save_snapshot_freq=500,
    ),
    seed=0,
    load_path='',
    load_Qs='',
    eval_no_training=False,
    eval_attack=False,
    eval_attack_eps=0.01,
    eval_attack_mode='random',
    norm_input=True,
)

# Plain-dict form of the config; this is what gets passed as ``variant``.
config_dict = asdict(config)

In [3]:
from lifelong_rl.models.networks import ParallelizedEnsembleFlattenMLP
from lifelong_rl.policies.base.base import MakeDeterministic
from lifelong_rl.policies.models.tanh_gaussian_policy import TanhGaussianPolicy
from lifelong_rl.trainers.q_learning.sac import SACTrainer
import lifelong_rl.util.pythonplusplus as ppp
from torch.nn import functional as F


def get_config(
        variant,
        expl_env,
        eval_env,
        obs_dim,
        action_dim,
        replay_buffer,
):
    """Build the Q-function ensembles, the policy and the SAC trainer for a run.

    Reads the network sizes from ``variant['policy_kwargs']``, the ensemble
    size from ``variant['trainer_kwargs']['num_qs']`` and the observation
    normalisation statistics from ``variant['normalization_info']``.
    Optionally restores a snapshot and/or Q-functions when
    ``variant['load_path']`` / ``variant['load_Qs']`` are non-empty strings.

    Returns:
        dict with keys ``trainer``, ``exploration_policy``,
        ``evaluation_policy``, ``exploration_env``, ``evaluation_env``,
        ``replay_buffer``, ``qfs`` and ``algorithm_kwargs``.
    """
    ensemble_size = variant['trainer_kwargs']['num_qs']
    net_kwargs = variant['policy_kwargs']
    hidden_width = net_kwargs['layer_size']
    q_depth = net_kwargs['num_q_layers']
    p_depth = net_kwargs['num_p_layers']

    # Input-normalisation settings shared by the Q ensembles and the policy.
    norm_input = variant['norm_input']
    norm_info = variant['normalization_info']
    obs_norm_mean = norm_info['obs_mean']
    obs_norm_std = norm_info['obs_std']
    shared_net_kwargs = dict(
        layer_norm=None,
        norm_input=norm_input,
        obs_norm_mean=obs_norm_mean,
        obs_norm_std=obs_norm_std,
    )

    # Two identically configured ensembles: online and target Q-functions.
    qfs, target_qfs = ppp.group_init(
        2,
        ParallelizedEnsembleFlattenMLP,
        ensemble_size=ensemble_size,
        hidden_sizes=[hidden_width] * q_depth,
        input_size=obs_dim + action_dim,
        output_size=1,
        **shared_net_kwargs,
    )

    actor = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[hidden_width] * p_depth,
        **shared_net_kwargs,
    )

    trainer = SACTrainer(
        env=eval_env,
        policy=actor,
        qfs=qfs,
        target_qfs=target_qfs,
        replay_buffer=replay_buffer,
        norm_input=norm_input,
        obs_std=obs_norm_std,
        **variant['trainer_kwargs'],
    )

    # Optional restore of a full snapshot and/or just the Q-functions.
    snapshot_path = variant['load_path']
    if snapshot_path != '':
        trainer.load_snapshot(snapshot_path)
    qfs_path = variant['load_Qs']
    if qfs_path != '':
        trainer.load_qfs(qfs_path)

    # Everything the launcher needs, gathered in one dict.
    return {
        'trainer': trainer,
        'exploration_policy': actor,
        'evaluation_policy': MakeDeterministic(actor),
        'exploration_env': expl_env,
        'evaluation_env': eval_env,
        'replay_buffer': replay_buffer,
        'qfs': qfs,
        'algorithm_kwargs': variant.get('algorithm_kwargs', dict()),
    }


In [4]:

# Launcher options: no run-name postfix, GPU when CUDA is available,
# TensorBoard logging on, results written under ./results.
experiment_kwargs = {
    'exp_postfix': '',
    'use_gpu': torch.cuda.is_available(),
    'log_to_tensorboard': True,
    'base_log_dir': 'results',
}

# Start the offline run with the explicit variant defined earlier in the notebook.
launch_experiment(
    variant=config_dict,
    get_config=get_config,
    get_offline_algorithm=get_offline_algorithm,
    **experiment_kwargs,
)

base_experiment begin
setup logger
logging to: results/halfcheetah-medium-v2_Tue_Feb_28_21:53:36_2023_0
2023-02-28 21:53:36.977903 KST | Variant:
2023-02-28 21:53:36.979933 KST | {
  "algorithm": "",
  "collector_type": "step",
  "env_name": "halfcheetah-medium-v2",
  "env_kwargs": {},
  "replay_buffer_size": 2000000,
  "reward_mean": false,
  "reward_std": false,
  "policy_kwargs": {
    "layer_size": 256,
    "num_q_layers": 3,
    "num_p_layers": 3
  },
  "trainer_kwargs": {
    "discount": 0.99,
    "soft_target_tau": 0.005,
    "policy_lr": 0.0003,
    "qf_lr": 0.0003,
    "use_automatic_entropy_tuning": true,
    "policy_eval_start": 0,
    "num_qs": 10,
    "target_update_period": 1,
    "max_q_backup": false,
    "deterministic_backup": false,
    "eta": -1.0,
    "num_samples": 20,
    "policy_smooth_eps": 0.05,
    "policy_smooth_reg": 1.0,
    "q_smooth_eps": 0.03,
    "q_smooth_reg": 0.0001,
    "q_smooth_tau": 0.2,
    "q_ood_eps": 0.0,
    "q_ood_reg": 0.0,
    "q_ood_unc

load datafile: 100%|██████████| 21/21 [00:02<00:00,  9.21it/s]



Rewards stats before preprocessing
mean: 4.7702
std: 1.2103
max: 8.3267
min: -2.8353

Rewards stats after preprocessing
mean: 4.7702
std: 1.2103
max: 8.3267
min: -2.8353

Replay buffer size : 999000
obs dim            :  (999000, 17)
action dim         :  (999000, 6)
# terminals: 0
Mean rewards       : 4.77


KeyboardInterrupt: 