In [57]:
from gym.envs.mujoco import humanoid_v3

import rlkit.torch.pytorch_util as ptu
from rlkit.data_management.env_replay_buffer import EnvReplayBuffer
from rlkit.envs.wrappers import NormalizedBoxEnv
from rlkit.launchers.launcher_util import setup_logger
from rlkit.samplers.data_collector import MdpPathCollector
from rlkit.torch.sac.policies import TanhGaussianPolicy, MakeDeterministic
from rlkit.torch.sac.sac import SACTrainer
from rlkit.torch.networks import FlattenMlp
from rlkit.torch.torch_rl_algorithm import TorchBatchRLAlgorithm

import abc
from collections import OrderedDict

import gtimer as gt
import torch
import os
import copy

from rlkit.core import logger, eval_util
from rlkit.data_management.replay_buffer import ReplayBuffer
from rlkit.samplers.data_collector import DataCollector

import abc
# import torch
import gtimer as gt
from rlkit.core.rl_algorithm import BaseRLAlgorithm
from rlkit.data_management.replay_buffer import ReplayBuffer
from rlkit.samplers.data_collector import PathCollector

In [58]:
import abc
from collections import OrderedDict

import gtimer as gt
import os
import copy

from rlkit.core import logger, eval_util
from rlkit.data_management.replay_buffer import ReplayBuffer
from rlkit.samplers.data_collector import DataCollector

In [59]:
from rlkit.launchers.launcher_util import run_experiment

In [60]:
# Experiment configuration consumed by experiment_function / run_experiment.
# `algorithm_kwargs` is splatted into the RL algorithm constructor and
# `trainer_kwargs` into the SAC trainer constructor.
variant = {
    "algorithm": "SAC",
    "version": "normal",
    "layer_size": 256,
    "replay_buffer_size": int(1e6),
    "algorithm_kwargs": {
        "num_epochs": 1,
        "num_eval_steps_per_epoch": 5000,
        "num_trains_per_train_loop": 1000,
        "num_expl_steps_per_train_loop": 1000,
        "min_num_steps_before_training": 1000,
        "max_path_length": 1000,
        "batch_size": 256,
    },
    "trainer_kwargs": {
        "discount": 0.99,
        "soft_target_tau": 5e-3,
        "target_update_period": 1,
        "policy_lr": 3e-4,
        "qf_lr": 3e-4,
        "reward_scale": 1,
        "use_automatic_entropy_tuning": True,
    },
}

In [61]:



def _get_epoch_timings():
    """Collect the latest gtimer stamp durations into an ordered dict.

    For every stamp name recorded under ``gt.get_times().stamps.itrs`` the
    most recent entry is reported as ``'time/<name> (s)'`` (names in sorted
    order). Two summary entries follow: ``'time/epoch (s)'`` (sum of the
    per-stamp values) and ``'time/total (s)'`` (``gt.get_times().total``).
    """
    per_stamp_history = gt.get_times().stamps.itrs
    timings = OrderedDict()
    for stamp_name in sorted(per_stamp_history):
        latest = per_stamp_history[stamp_name][-1]
        timings['time/{} (s)'.format(stamp_name)] = latest
    # Only per-stamp entries are present at this point, so summing the values
    # (left to right, starting from 0) yields the epoch duration.
    timings['time/epoch (s)'] = sum(timings.values())
    timings['time/total (s)'] = gt.get_times().total
    return timings


class BaseRLAlgorithm2(object, metaclass=abc.ABCMeta):
    """Base RL training loop with periodic and best-so-far checkpointing.

    Subclasses implement ``_train`` (the actual loop) and ``training_mode``.
    At the end of every epoch ``_end_epoch`` logs statistics, writes a full
    checkpoint every 10th epoch and writes a separate "best" checkpoint
    whenever the evaluation 'Returns Mean' beats the best value stored on disk.
    """

    # Checkpoint locations. Kept at class level so a subclass (or a test) can
    # redirect them without touching the methods below.
    CKPT_PATH = 'results/humanoid/ckpt.pkl'
    BEST_CKPT_PATH = 'results/humanoid/ckpt-best.pkl'
    BEST_METRIC_PATH = 'results/humanoid/cur_best_avg_rewards.pkl'

    def __init__(
            self,
            trainer,
            exploration_env,
            evaluation_env,
            exploration_data_collector: DataCollector,
            evaluation_data_collector: DataCollector,
            replay_buffer: ReplayBuffer,
            initial_epoch
    ):
        """Store the collaborating objects and the epoch to start from."""
        self.trainer = trainer
        self.expl_env = exploration_env
        self.eval_env = evaluation_env
        self.expl_data_collector = exploration_data_collector
        self.eval_data_collector = evaluation_data_collector
        self.replay_buffer = replay_buffer
        self._start_epoch = initial_epoch

        # Callables invoked as ``func(self, epoch)`` at the end of each epoch.
        self.post_epoch_funcs = []

    def train(self, initial_epoch=0, epochs=None, dir_=None, exp_no=None):
        """Public entry point; forwards all arguments to ``_train``."""
        self._train(initial_epoch, epochs, dir_, exp_no)

    def _train(self, initial_epoch=0, epochs=None, dir_=None, exp_no=None):
        """
        Train model.

        Accepts the same arguments ``train`` forwards, so that a subclass
        that forgets to override it gets the NotImplementedError below rather
        than a TypeError about unexpected positional arguments.
        """
        raise NotImplementedError('_train must implemented by inherited class')

    # ------------------------------------------------------------------
    # Checkpoint helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _save_to(payload, path):
        """``torch.save`` ``payload`` to ``path``, creating the parent dir."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        torch.save(payload, path)

    def _restore_from(self, path):
        """Load a checkpoint and copy the saved algorithm's state into self.

        :param path: file previously written by ``save_snapshot_2`` or
            ``save_snapshot_2_best_only``.
        :return: the epoch stored in the checkpoint.
        """
        # NOTE: torch.load unpickles arbitrary objects; only load checkpoints
        # that this notebook produced itself.
        ckpt = torch.load(path)
        # Rebinding ``self`` inside a method has no effect on the caller's
        # object, so the state is copied attribute by attribute instead.
        vars(self).update(vars(ckpt['algorithm']))
        return ckpt['epoch']

    def get_cur_best_metric_val(self):
        """Return the best metric stored on disk, or -inf if none exists."""
        if os.path.exists(self.BEST_METRIC_PATH):
            return torch.load(self.BEST_METRIC_PATH)['cur_best_metric_val']
        return -float('inf')

    def _end_epoch(self, epoch):
        """Snapshot, checkpoint, log and notify collaborators for ``epoch``."""
        print('in _end_epoch, epoch is: {}'.format(epoch))
        snapshot = self._get_snapshot()
        logger.save_itr_params(epoch, snapshot)
        # Full (resumable) checkpoint only every 10th epoch.
        if epoch % 10 == 0:
            self.save_snapshot_2(epoch)

        # The "best" checkpoint is selected on the evaluation mean return.
        eval_paths = self.eval_data_collector.get_epoch_paths()
        metric_val = eval_util.get_generic_path_information(eval_paths)['Returns Mean']
        cur_best_metric_val = self.get_cur_best_metric_val()
        self.save_snapshot_2_best_only(
            metric_val=metric_val,
            cur_best_metric_val=cur_best_metric_val,
            min_or_max='max',
            epoch=epoch,
        )
        self._log_stats(epoch)

        self.expl_data_collector.end_epoch(epoch)
        self.eval_data_collector.end_epoch(epoch)
        self.replay_buffer.end_epoch(epoch)
        self.trainer.end_epoch(epoch)

        for post_epoch_func in self.post_epoch_funcs:
            post_epoch_func(self, epoch)

    def save_snapshot_2(self, epoch):
        """Write the whole algorithm object plus ``epoch`` to ``CKPT_PATH``."""
        print('Saving snapshot 2')
        # torch.save serialises synchronously, so no defensive deepcopy of
        # the algorithm (and its replay buffer) is needed before saving.
        self._save_to({'algorithm': self, 'epoch': epoch}, self.CKPT_PATH)

    def get_snapshot_2(self):
        """Restore this object from ``CKPT_PATH`` and return the saved epoch."""
        print('in get_snapshot_2')
        return self._restore_from(self.CKPT_PATH)

    def get_snapshot_best(self):
        """Restore this object from ``BEST_CKPT_PATH``; return the saved epoch."""
        print('in get_snapshot_best')
        return self._restore_from(self.BEST_CKPT_PATH)

    def save_snapshot_2_best_only(self, metric_val, cur_best_metric_val, min_or_max='min', epoch=0):
        """Save a "best" checkpoint if ``metric_val`` improves on the record.

        :param metric_val: metric obtained in this epoch.
        :param cur_best_metric_val: best value seen so far.
        :param min_or_max: ``'min'`` if lower is better, ``'max'`` if higher
            is better. Any other value never saves.
        :param epoch: epoch number stored alongside the algorithm.
        """
        improved = (
            (min_or_max == 'min' and metric_val < cur_best_metric_val)
            or (min_or_max == 'max' and metric_val > cur_best_metric_val)
        )
        if not improved:
            return
        print('Saving snapshot best')
        print(metric_val)
        print(cur_best_metric_val)
        self._save_to({'algorithm': self, 'epoch': epoch}, self.BEST_CKPT_PATH)
        # Store a plain Python float so the record does not depend on the
        # numeric type the metric happened to have.
        self._save_to(
            {'cur_best_metric_val': float(metric_val)},
            self.BEST_METRIC_PATH,
        )

    def _get_snapshot(self):
        """Merge the component snapshots into one dict with prefixed keys."""
        snapshot = {}
        sources = (
            ('trainer/', self.trainer),
            ('exploration/', self.expl_data_collector),
            ('evaluation/', self.eval_data_collector),
            ('replay_buffer/', self.replay_buffer),
        )
        for prefix, component in sources:
            for k, v in component.get_snapshot().items():
                snapshot[prefix + k] = v
        return snapshot

    def _log_stats(self, epoch):
        """Record diagnostics from every component and dump them via logger."""
        logger.log("Epoch {} finished".format(epoch), with_timestamp=True)

        # Replay buffer
        logger.record_dict(
            self.replay_buffer.get_diagnostics(),
            prefix='replay_buffer/'
        )

        # Trainer
        logger.record_dict(self.trainer.get_diagnostics(), prefix='trainer/')

        # Exploration
        logger.record_dict(
            self.expl_data_collector.get_diagnostics(),
            prefix='exploration/'
        )
        expl_paths = self.expl_data_collector.get_epoch_paths()
        if hasattr(self.expl_env, 'get_diagnostics'):
            logger.record_dict(
                self.expl_env.get_diagnostics(expl_paths),
                prefix='exploration/',
            )
        logger.record_dict(
            eval_util.get_generic_path_information(expl_paths),
            prefix="exploration/",
        )

        # Evaluation
        logger.record_dict(
            self.eval_data_collector.get_diagnostics(),
            prefix='evaluation/',
        )
        eval_paths = self.eval_data_collector.get_epoch_paths()
        if hasattr(self.eval_env, 'get_diagnostics'):
            logger.record_dict(
                self.eval_env.get_diagnostics(eval_paths),
                prefix='evaluation/',
            )
        logger.record_dict(
            eval_util.get_generic_path_information(eval_paths),
            prefix="evaluation/",
        )

        # Misc
        logger.record_dict(_get_epoch_timings())
        logger.record_tabular('Epoch', epoch)
        logger.dump_tabular(with_prefix=False, with_timestamp=False, file_name='logs/humanoid/log1.txt', file_name2='logs/humanoid/log2')

    @abc.abstractmethod
    def training_mode(self, mode):
        """
        Set training mode to `mode`.
        :param mode: If True, training will happen (e.g. set the dropout
        probabilities to not all ones).
        """
        pass


In [62]:
import abc
import torch
import gtimer as gt
from rlkit.core.rl_algorithm import BaseRLAlgorithm
from rlkit.data_management.replay_buffer import ReplayBuffer
from rlkit.samplers.data_collector import PathCollector


class BatchRLAlgorithm2(BaseRLAlgorithm2, metaclass=abc.ABCMeta):
    """Batch-mode training loop.

    Each epoch collects evaluation paths, then runs
    ``num_train_loops_per_epoch`` rounds of "collect exploration paths, add
    them to the replay buffer, take ``num_trains_per_train_loop`` trainer
    steps", and finally calls ``_end_epoch``.
    """

    def __init__(
            self,
            trainer,
            exploration_env,
            evaluation_env,
            exploration_data_collector: PathCollector,
            evaluation_data_collector: PathCollector,
            replay_buffer: ReplayBuffer,
            batch_size,
            max_path_length,
            num_epochs,
            num_eval_steps_per_epoch,
            num_expl_steps_per_train_loop,
            num_trains_per_train_loop,
            num_train_loops_per_epoch=1,
            min_num_steps_before_training=0,
            initial_epoch=0
    ):
        """Forward the shared components to the base class and store the
        loop-sizing hyper-parameters on the instance."""
        super().__init__(
            trainer=trainer,
            exploration_env=exploration_env,
            evaluation_env=evaluation_env,
            exploration_data_collector=exploration_data_collector,
            evaluation_data_collector=evaluation_data_collector,
            replay_buffer=replay_buffer,
            initial_epoch=initial_epoch,
        )
        # Sampling sizes.
        self.max_path_length = max_path_length
        self.num_eval_steps_per_epoch = num_eval_steps_per_epoch
        self.num_expl_steps_per_train_loop = num_expl_steps_per_train_loop
        self.min_num_steps_before_training = min_num_steps_before_training
        # Optimisation schedule.
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.num_train_loops_per_epoch = num_train_loops_per_epoch
        self.num_trains_per_train_loop = num_trains_per_train_loop

    def _explore_and_train_once(self):
        """One train loop: gather exploration data, then run trainer steps."""
        fresh_paths = self.expl_data_collector.collect_new_paths(
            self.max_path_length,
            self.num_expl_steps_per_train_loop,
            discard_incomplete_paths=False,
        )
        self.replay_buffer.add_paths(fresh_paths)

        self.training_mode(True)
        for _step in range(self.num_trains_per_train_loop):
            batch = self.replay_buffer.random_batch(self.batch_size)
            self.trainer.train(batch)
        self.training_mode(False)

    def _train(self, initial_epoch=0, epochs=None, dir_=None, exp_no=None):
        """Run epochs ``initial_epoch`` .. ``num_epochs - 1``.

        :param initial_epoch: first epoch index to run.
        :param epochs: if given, replaces ``self.num_epochs``.
        :param dir_: accepted but not used here.
        :param exp_no: accepted but not used here.
        """
        self._start_epoch = initial_epoch
        if epochs is not None:
            self.num_epochs = epochs

        # Optional warm-up: seed the replay buffer before any training step.
        if self.min_num_steps_before_training > 0:
            warmup_paths = self.expl_data_collector.collect_new_paths(
                self.max_path_length,
                self.min_num_steps_before_training,
                discard_incomplete_paths=False,
            )
            self.replay_buffer.add_paths(warmup_paths)
            self.expl_data_collector.end_epoch(-1)

        # gtimer's timed_for / stamp calls are intentionally not used here.
        for epoch in range(self._start_epoch, self.num_epochs):
            self.eval_data_collector.collect_new_paths(
                self.max_path_length,
                self.num_eval_steps_per_epoch,
                discard_incomplete_paths=True,
            )

            for _loop in range(self.num_train_loops_per_epoch):
                self._explore_and_train_once()

            self._end_epoch(epoch)


In [63]:
import abc
from collections import OrderedDict

from typing import Iterable
from torch import nn as nn

from rlkit.core.batch_rl_algorithm import BatchRLAlgorithm
from rlkit.core.online_rl_algorithm import OnlineRLAlgorithm
from rlkit.core.trainer import Trainer
from rlkit.torch.core import np_to_pytorch_batch


class TorchOnlineRLAlgorithm(OnlineRLAlgorithm):
    """Online RL algorithm whose trainer exposes torch networks."""

    def to(self, device):
        """Call ``.to(device)`` on each of the trainer's networks."""
        for network in self.trainer.networks:
            network.to(device)

    def training_mode(self, mode):
        """Call ``.train(mode)`` on each of the trainer's networks."""
        for network in self.trainer.networks:
            network.train(mode)


class TorchBatchRLAlgorithm2(BatchRLAlgorithm2):
    """Batch RL algorithm whose trainer exposes torch networks."""

    def to(self, device):
        """Call ``.to(device)`` on each of the trainer's networks."""
        for network in self.trainer.networks:
            network.to(device)

    def training_mode(self, mode):
        """Call ``.train(mode)`` on each of the trainer's networks."""
        for network in self.trainer.networks:
            network.train(mode)


class TorchTrainer(Trainer, metaclass=abc.ABCMeta):
    """Trainer base that converts numpy batches to torch and counts calls.

    Concrete trainers implement ``train_from_torch`` and ``networks``.
    """

    def __init__(self):
        # Number of times ``train`` has been invoked.
        self._num_train_steps = 0

    def train(self, np_batch):
        """Count the call, convert ``np_batch`` with ``np_to_pytorch_batch``
        and pass the result to ``train_from_torch``."""
        self._num_train_steps += 1
        torch_batch = np_to_pytorch_batch(np_batch)
        self.train_from_torch(torch_batch)

    def get_diagnostics(self):
        """Return an ordered dict holding the number of ``train`` calls."""
        diagnostics = OrderedDict()
        diagnostics['num train calls'] = self._num_train_steps
        return diagnostics

    @abc.abstractmethod
    def train_from_torch(self, batch):
        """Perform one training step on an already-converted batch."""
        pass

    @property
    @abc.abstractmethod
    def networks(self) -> Iterable[nn.Module]:
        """The modules that should be moved between devices / train modes."""
        pass


In [64]:
def get_snapshot_3(ckpt_path='results/humanoid/ckpt.pkl'):
    """Load a checkpoint and return ``(epoch, algorithm)``.

    :param ckpt_path: file to read; it must hold a dict with the keys
        ``'epoch'`` and ``'algorithm'``. Defaults to the periodic checkpoint
        location, so existing zero-argument calls keep working; pass another
        path (e.g. the best-so-far checkpoint) to load that instead.
    :return: tuple ``(epoch, algorithm)`` taken from the checkpoint.
    """
    print('in get_snapshot_3')
    # NOTE: torch.load unpickles arbitrary objects; only load trusted files.
    ckpt = torch.load(ckpt_path)
    # The objects were just deserialised, so they are already private to the
    # caller; a further deepcopy would only duplicate the memory.
    return ckpt['epoch'], ckpt['algorithm']

In [65]:
def experiment_function(variant):
    """Build the SAC components for Humanoid and train (or resume training).

    :param variant: configuration dict; reads ``'layer_size'``,
        ``'replay_buffer_size'``, ``'trainer_kwargs'`` and
        ``'algorithm_kwargs'``.

    The ``resume`` / ``resume_from_best`` switches below select between a
    fresh run, resuming from the periodic checkpoint, and resuming from the
    best-so-far checkpoint.
    """
    setup_logger('experiment-5', variant=variant)
    # Ask for the GPU only when CUDA is actually present, so the notebook
    # also runs on CPU-only machines.
    ptu.set_gpu_mode(torch.cuda.is_available(), 0)

    expl_env = NormalizedBoxEnv(humanoid_v3.HumanoidEnv())
    eval_env = NormalizedBoxEnv(humanoid_v3.HumanoidEnv())
    obs_dim = expl_env.observation_space.low.size
    action_dim = eval_env.action_space.low.size

    M = variant['layer_size']

    def make_q_network():
        # All four critics share one architecture: (obs, action) -> scalar.
        return FlattenMlp(
            input_size=obs_dim + action_dim,
            output_size=1,
            hidden_sizes=[M, M],
        )

    # Construction order is kept (qf1, qf2, target_qf1, target_qf2, policy)
    # so seeded weight initialisation is unchanged.
    qf1 = make_q_network()
    qf2 = make_q_network()
    target_qf1 = make_q_network()
    target_qf2 = make_q_network()
    policy = TanhGaussianPolicy(
        obs_dim=obs_dim,
        action_dim=action_dim,
        hidden_sizes=[M, M],
    )
    eval_policy = MakeDeterministic(policy)
    eval_path_collector = MdpPathCollector(
        eval_env,
        eval_policy,
    )
    expl_path_collector = MdpPathCollector(
        expl_env,
        policy,
    )
    replay_buffer = EnvReplayBuffer(
        variant['replay_buffer_size'],
        expl_env,
    )
    trainer = SACTrainer(
        env=eval_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        **variant['trainer_kwargs']
    )

    resume = False
    resume_from_best = False
    if not resume:
        algorithm = TorchBatchRLAlgorithm2(
            trainer=trainer,
            exploration_env=expl_env,
            evaluation_env=eval_env,
            exploration_data_collector=expl_path_collector,
            evaluation_data_collector=eval_path_collector,
            replay_buffer=replay_buffer,
            **variant['algorithm_kwargs']
        )
        algorithm.to(ptu.device)
        # `epochs` here overrides num_epochs from variant['algorithm_kwargs'].
        algorithm.train(initial_epoch=0, epochs=4)
    else:
        if resume_from_best:
            # There is no module-level get_snapshot_best(); read the best
            # checkpoint file directly.
            # NOTE: torch.load unpickles arbitrary objects; trusted files only.
            ckpt = torch.load('results/humanoid/ckpt-best.pkl')
            initial_epoch, algorithm = ckpt['epoch'], ckpt['algorithm']
        else:
            initial_epoch, algorithm = get_snapshot_3()
        # The checkpoint stores the last finished epoch; continue after it.
        initial_epoch += 1
        algorithm.to(ptu.device)
        algorithm.train(initial_epoch=initial_epoch, epochs=1000)

In [66]:
# Seed torch's default generator; the same value is passed to run_experiment.
seed = 1
torch.manual_seed(seed)

<torch._C.Generator at 0x7ff467f20970>

In [67]:
# Launch the experiment with the configuration and seed defined above.
run_experiment(
    experiment_function,
    variant=variant,
    seed=seed,
)

Doodad not set up! Running experiment here.
setting seed to 1




starting super init
super done
setting torch manual seed and numpy random seed here to 1
assigning done
calculating np.prod term
thats done
initializing alpha optimizer 
thats done
auto entropy thing done
criterions done
optimizers done
done
in _end_epoch, epoch is: 0
Saving snapshot 2


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


in _end_epoch, epoch is: 1
in _end_epoch, epoch is: 2
in _end_epoch, epoch is: 3


In [68]:
# import git

In [69]:
# !ls results/

In [70]:
# algorithm = TorchBatchRLAlgorithm2(
#         trainer=trainer,
#         exploration_env=expl_env,
#         evaluation_env=eval_env,
#         exploration_data_collector=expl_path_collector,
#         evaluation_data_collector=eval_path_collector,
#         replay_buffer=replay_buffer,
#         **variant['algorithm_kwargs']
#     )
# algorithm_ = copy.deepcopy(algorithm)

In [71]:
# epoch, algorithm = get_snapshot_3()

In [72]:
# algorithm.trainer.qf1.fcs[0].weight.data

In [73]:
# algorithm_.trainer.qf1.fcs[0].weight.data

In [74]:
# !rm logs/humanoid/log2
# !>logs/humanoid/log1.txt
# !rm results/humanoid/*.pkl

In [75]:
# l = [('replay_buffer/size', '2000'), ('trainer/QF1 Loss', '22.540974'), ('trainer/QF2 Loss', '22.461214'), ('trainer/Policy Loss', '-5.3794346'), ('trainer/Q1 Predictions Mean', '-0.013111563'), ('trainer/Q1 Predictions Std', '0.006633578'), ('trainer/Q1 Predictions Max', '0.004102228'), ('trainer/Q1 Predictions Min', '-0.031592093'), ('trainer/Q2 Predictions Mean', '-0.0048158974'), ('trainer/Q2 Predictions Std', '0.0035169804'), ('trainer/Q2 Predictions Max', '0.0017666357'), ('trainer/Q2 Predictions Min', '-0.018819787'), ('trainer/Q Targets Mean', '4.6183643'), ('trainer/Q Targets Std', '1.043318'), ('trainer/Q Targets Max', '8.902103'), ('trainer/Q Targets Min', '0.38821536'), ('trainer/Log Pis Mean', '-5.393486'), ('trainer/Log Pis Std', '0.62483275'), ('trainer/Log Pis Max', '-3.6818123'), ('trainer/Log Pis Min', '-6.818282'), ('trainer/Policy mu Mean', '-8.544093e-05'), ('trainer/Policy mu Std', '0.0021631739'), ('trainer/Policy mu Max', '0.006241211'), ('trainer/Policy mu Min', '-0.008601425'), ('trainer/Policy log std Mean', '0.00043238376'), ('trainer/Policy log std Std', '0.0024977052'), ('trainer/Policy log std Max', '0.007946691'), ('trainer/Policy log std Min', '-0.007701927'), ('trainer/Alpha', '0.9997000694274902'), ('trainer/Alpha Loss', '-0.0'), ('exploration/num steps total', '2000'), ('exploration/num paths total', '14'), ('exploration/path length Mean', '142.85714285714286'), ('exploration/path length Std', '203.16174314459656'), ('exploration/path length Max', '635'), ('exploration/path length Min', '24'), ('exploration/Rewards Mean', '-0.6064347764021665'), ('exploration/Rewards Std', '0.7633642575876102'), ('exploration/Rewards Max', '2.8795207953591153'), ('exploration/Rewards Min', '-3.6407089958048813'), ('exploration/Returns Mean', '-86.63353948602385'), ('exploration/Returns Std', '113.0231192346682'), ('exploration/Returns Max', '-3.105460048148942'), ('exploration/Returns Min', '-353.8074739814667'), ('exploration/Actions Mean', 
'-0.007008702'), ('exploration/Actions Std', '0.6308291'), ('exploration/Actions Max', '0.99908894'), ('exploration/Actions Min', '-0.9995207'), ('exploration/Num Paths', '7'), ('exploration/Average Returns', '-86.63353948602385'), ('exploration/env_infos/final/reward_forward Mean', '0.19075281818276574'), ('exploration/env_infos/final/reward_forward Std', '0.8342500879174919'), ('exploration/env_infos/final/reward_forward Max', '1.2204116659980113'), ('exploration/env_infos/final/reward_forward Min', '-1.1107150441758207'), ('exploration/env_infos/initial/reward_forward Mean', '-0.19105378778638557'), ('exploration/env_infos/initial/reward_forward Std', '0.1324312690431134'), ('exploration/env_infos/initial/reward_forward Max', '-0.00048826690484476964'), ('exploration/env_infos/initial/reward_forward Min', '-0.3908516428560994'), ('exploration/env_infos/reward_forward Mean', '-0.014457019663503719'), ('exploration/env_infos/reward_forward Std', '0.6431258664770597'), ('exploration/env_infos/reward_forward Max', '2.5457147574562833'), ('exploration/env_infos/reward_forward Min', '-2.3861966329817252'), ('exploration/env_infos/final/reward_ctrl Mean', '-1.5349954196384974'), ('exploration/env_infos/final/reward_ctrl Std', '0.15762081865148822'), ('exploration/env_infos/final/reward_ctrl Max', '-1.3189051151275635'), ('exploration/env_infos/final/reward_ctrl Min', '-1.7733919620513916'), ('exploration/env_infos/initial/reward_ctrl Mean', '-1.7483757819448198'), ('exploration/env_infos/initial/reward_ctrl Std', '0.4890421746133651'), ('exploration/env_infos/initial/reward_ctrl Max', '-0.8430193066596985'), ('exploration/env_infos/initial/reward_ctrl Min', '-2.4695911407470703'), ('exploration/env_infos/reward_ctrl Mean', '-1.5919777567386628'), ('exploration/env_infos/reward_ctrl Std', '0.43263451266003733'), ('exploration/env_infos/reward_ctrl Max', '-0.45637834072113037'), ('exploration/env_infos/reward_ctrl Min', '-2.8311009407043457'), 
('exploration/env_infos/final/reward_contact Mean', '0.0'), ('exploration/env_infos/final/reward_contact Std', '0.0'), ('exploration/env_infos/final/reward_contact Max', '-0.0'), ('exploration/env_infos/final/reward_contact Min', '-0.0'), ('exploration/env_infos/initial/reward_contact Mean', '0.0'), ('exploration/env_infos/initial/reward_contact Std', '0.0'), ('exploration/env_infos/initial/reward_contact Max', '-0.0'), ('exploration/env_infos/initial/reward_contact Min', '-0.0'), ('exploration/env_infos/reward_contact Mean', '0.0'), ('exploration/env_infos/reward_contact Std', '0.0'), ('exploration/env_infos/reward_contact Max', '-0.0'), ('exploration/env_infos/reward_contact Min', '-0.0'), ('exploration/env_infos/final/reward_survive Mean', '1.0'), ('exploration/env_infos/final/reward_survive Std', '0.0'), ('exploration/env_infos/final/reward_survive Max', '1.0'), ('exploration/env_infos/final/reward_survive Min', '1.0'), ('exploration/env_infos/initial/reward_survive Mean', '1.0'), ('exploration/env_infos/initial/reward_survive Std', '0.0'), ('exploration/env_infos/initial/reward_survive Max', '1.0'), ('exploration/env_infos/initial/reward_survive Min', '1.0'), ('exploration/env_infos/reward_survive Mean', '1.0'), ('exploration/env_infos/reward_survive Std', '0.0'), ('exploration/env_infos/reward_survive Max', '1.0'), ('exploration/env_infos/reward_survive Min', '1.0'), ('exploration/env_infos/final/x_position Mean', '-0.10269478382571114'), ('exploration/env_infos/final/x_position Std', '1.0156116367327344'), ('exploration/env_infos/final/x_position Max', '0.9451816637118873'), ('exploration/env_infos/final/x_position Min', '-1.897179957278178'), ('exploration/env_infos/initial/x_position Mean', '-0.008983047047146756'), ('exploration/env_infos/initial/x_position Std', '0.055510131153156526'), ('exploration/env_infos/initial/x_position Max', '0.06390724254550409'), ('exploration/env_infos/initial/x_position Min', '-0.09279374271110855'), 
('exploration/env_infos/x_position Mean', '0.6017503453335674'), ('exploration/env_infos/x_position Std', '0.5726451692146537'), ('exploration/env_infos/x_position Max', '1.427549938991314'), ('exploration/env_infos/x_position Min', '-1.897179957278178'), ('exploration/env_infos/final/y_position Mean', '0.4782543400247053'), ('exploration/env_infos/final/y_position Std', '0.7855700043684598'), ('exploration/env_infos/final/y_position Max', '1.550614516782081'), ('exploration/env_infos/final/y_position Min', '-1.0481446711405873'), ('exploration/env_infos/initial/y_position Mean', '0.004535862316807871'), ('exploration/env_infos/initial/y_position Std', '0.06869960071253207'), ('exploration/env_infos/initial/y_position Max', '0.10433425346335046'), ('exploration/env_infos/initial/y_position Min', '-0.09282306532235318'), ('exploration/env_infos/y_position Mean', '0.8093149087377335'), ('exploration/env_infos/y_position Std', '0.49822720634726986'), ('exploration/env_infos/y_position Max', '1.85534103577678'), ('exploration/env_infos/y_position Min', '-1.0481446711405873'), ('exploration/env_infos/final/distance_from_origin Mean', '1.2936266169235655'), ('exploration/env_infos/final/distance_from_origin Std', '0.4630235812512934'), ('exploration/env_infos/final/distance_from_origin Max', '2.007668134959712'), ('exploration/env_infos/final/distance_from_origin Min', '0.5422062892864513'), ('exploration/env_infos/initial/distance_from_origin Mean', '0.08576195513360897'), ('exploration/env_infos/initial/distance_from_origin Std', '0.023391580356425377'), ('exploration/env_infos/initial/distance_from_origin Max', '0.11835900951257046'), ('exploration/env_infos/initial/distance_from_origin Min', '0.04765527722351595'), ('exploration/env_infos/distance_from_origin Mean', '1.1705145479324601'), ('exploration/env_infos/distance_from_origin Std', '0.47237975365397095'), ('exploration/env_infos/distance_from_origin Max', '2.007668134959712'), 
('exploration/env_infos/distance_from_origin Min', '0.026864676182351343'), ('exploration/env_infos/final/x_velocity Mean', '0.19075281818276574'), ('exploration/env_infos/final/x_velocity Std', '0.8342500879174919'), ('exploration/env_infos/final/x_velocity Max', '1.2204116659980113'), ('exploration/env_infos/final/x_velocity Min', '-1.1107150441758207'), ('exploration/env_infos/initial/x_velocity Mean', '-0.19105378778638557'), ('exploration/env_infos/initial/x_velocity Std', '0.1324312690431134'), ('exploration/env_infos/initial/x_velocity Max', '-0.00048826690484476964'), ('exploration/env_infos/initial/x_velocity Min', '-0.3908516428560994'), ('exploration/env_infos/x_velocity Mean', '-0.014457019663503719'), ('exploration/env_infos/x_velocity Std', '0.6431258664770597'), ('exploration/env_infos/x_velocity Max', '2.5457147574562833'), ('exploration/env_infos/x_velocity Min', '-2.3861966329817252'), ('exploration/env_infos/final/y_velocity Mean', '-0.40199064342737'), ('exploration/env_infos/final/y_velocity Std', '0.5971681663931471'), ('exploration/env_infos/final/y_velocity Max', '0.4077673083866995'), ('exploration/env_infos/final/y_velocity Min', '-1.259094084272049'), ('exploration/env_infos/initial/y_velocity Mean', '-0.0018872918578139997'), ('exploration/env_infos/initial/y_velocity Std', '0.14466164945930343'), ('exploration/env_infos/initial/y_velocity Max', '0.18527643991947776'), ('exploration/env_infos/initial/y_velocity Min', '-0.24354054905268124'), ('exploration/env_infos/y_velocity Mean', '0.06630737583610093'), ('exploration/env_infos/y_velocity Std', '0.5814660994205909'), ('exploration/env_infos/y_velocity Max', '2.4733737635902187'), ('exploration/env_infos/y_velocity Min', '-1.7773695514104038'), ('exploration/env_infos/final/forward_reward Mean', '0.19075281818276574'), ('exploration/env_infos/final/forward_reward Std', '0.8342500879174919'), ('exploration/env_infos/final/forward_reward Max', '1.2204116659980113'), 
('exploration/env_infos/final/forward_reward Min', '-1.1107150441758207'), ('exploration/env_infos/initial/forward_reward Mean', '-0.19105378778638557'), ('exploration/env_infos/initial/forward_reward Std', '0.1324312690431134'), ('exploration/env_infos/initial/forward_reward Max', '-0.00048826690484476964'), ('exploration/env_infos/initial/forward_reward Min', '-0.3908516428560994'), ('exploration/env_infos/forward_reward Mean', '-0.014457019663503719'), ('exploration/env_infos/forward_reward Std', '0.6431258664770597'), ('exploration/env_infos/forward_reward Max', '2.5457147574562833'), ('exploration/env_infos/forward_reward Min', '-2.3861966329817252'), ('evaluation/num steps total', '5000'), ('evaluation/num paths total', '5'), ('evaluation/path length Mean', '1000.0'), ('evaluation/path length Std', '0.0'), ('evaluation/path length Max', '1000'), ('evaluation/path length Min', '1000'), ('evaluation/Rewards Mean', '0.9983870785388878'), ('evaluation/Rewards Std', '0.04913844925352009'), ('evaluation/Rewards Max', '1.9170770036140046'), ('evaluation/Rewards Min', '-0.13733900279654443'), ('evaluation/Returns Mean', '998.3870785388877'), ('evaluation/Returns Std', '3.6701104981273907'), ('evaluation/Returns Max', '1004.5377664821074'), ('evaluation/Returns Min', '994.229294090189'), ('evaluation/Actions Mean', '-3.18751e-05'), ('evaluation/Actions Std', '0.0010393225'), ('evaluation/Actions Max', '0.0051202956'), ('evaluation/Actions Min', '-0.0029151689'), ('evaluation/Num Paths', '5'), ('evaluation/Average Returns', '998.3870785388877'), ('evaluation/env_infos/final/reward_forward Mean', '0.0006460700812505518'), ('evaluation/env_infos/final/reward_forward Std', '0.000683088293812967'), ('evaluation/env_infos/final/reward_forward Max', '0.0015115855220859675'), ('evaluation/env_infos/final/reward_forward Min', '-6.473937490869552e-05'), ('evaluation/env_infos/initial/reward_forward Mean', '-0.03758082171038936'), ('evaluation/env_infos/initial/reward_forward 
Std', '0.12706800538907964'), ('evaluation/env_infos/initial/reward_forward Max', '0.10714505461736121'), ('evaluation/env_infos/initial/reward_forward Min', '-0.22169929465191252'), ('evaluation/env_infos/reward_forward Mean', '-0.001608596634589209'), ('evaluation/env_infos/reward_forward Std', '0.04913838565616579'), ('evaluation/env_infos/reward_forward Max', '0.9170830225171546'), ('evaluation/env_infos/reward_forward Min', '-1.13733215985436'), ('evaluation/env_infos/final/reward_ctrl Mean', '-4.327109218138503e-06'), ('evaluation/env_infos/final/reward_ctrl Std', '2.26174565875007e-08'), ('evaluation/env_infos/final/reward_ctrl Max', '-4.29289639214403e-06'), ('evaluation/env_infos/final/reward_ctrl Min', '-4.362076651887037e-06'), ('evaluation/env_infos/initial/reward_ctrl Mean', '-3.2657525480317417e-06'), ('evaluation/env_infos/initial/reward_ctrl Std', '2.1728433187299645e-07'), ('evaluation/env_infos/initial/reward_ctrl Max', '-2.892852990044048e-06'), ('evaluation/env_infos/initial/reward_ctrl Min', '-3.553457645466551e-06'), ('evaluation/env_infos/reward_ctrl Mean', '-4.3248265230431574e-06'), ('evaluation/env_infos/reward_ctrl Std', '8.365455465240826e-07'), ('evaluation/env_infos/reward_ctrl Max', '-2.892852990044048e-06'), ('evaluation/env_infos/reward_ctrl Min', '-2.5335128157166764e-05'), ('evaluation/env_infos/final/reward_contact Mean', '0.0'), ('evaluation/env_infos/final/reward_contact Std', '0.0'), ('evaluation/env_infos/final/reward_contact Max', '-0.0'), ('evaluation/env_infos/final/reward_contact Min', '-0.0'), ('evaluation/env_infos/initial/reward_contact Mean', '0.0'), ('evaluation/env_infos/initial/reward_contact Std', '0.0'), ('evaluation/env_infos/initial/reward_contact Max', '-0.0'), ('evaluation/env_infos/initial/reward_contact Min', '-0.0'), ('evaluation/env_infos/reward_contact Mean', '0.0'), ('evaluation/env_infos/reward_contact Std', '0.0'), ('evaluation/env_infos/reward_contact Max', '-0.0'), 
('evaluation/env_infos/reward_contact Min', '-0.0'), ('evaluation/env_infos/final/reward_survive Mean', '1.0'), ('evaluation/env_infos/final/reward_survive Std', '0.0'), ('evaluation/env_infos/final/reward_survive Max', '1.0'), ('evaluation/env_infos/final/reward_survive Min', '1.0'), ('evaluation/env_infos/initial/reward_survive Mean', '1.0'), ('evaluation/env_infos/initial/reward_survive Std', '0.0'), ('evaluation/env_infos/initial/reward_survive Max', '1.0'), ('evaluation/env_infos/initial/reward_survive Min', '1.0'), ('evaluation/env_infos/reward_survive Mean', '1.0'), ('evaluation/env_infos/reward_survive Std', '0.0'), ('evaluation/env_infos/reward_survive Max', '1.0'), ('evaluation/env_infos/reward_survive Min', '1.0'), ('evaluation/env_infos/final/x_position Mean', '-0.08246357007500449'), ('evaluation/env_infos/final/x_position Std', '0.21278604707291804'), ('evaluation/env_infos/final/x_position Max', '0.2762546681301775'), ('evaluation/env_infos/final/x_position Min', '-0.31765979047571263'), ('evaluation/env_infos/initial/x_position Mean', '-0.003912779431063491'), ('evaluation/env_infos/initial/x_position Std', '0.0330324112758067'), ('evaluation/env_infos/initial/x_position Max', '0.05434083206249111'), ('evaluation/env_infos/initial/x_position Min', '-0.03564545654418633'), ('evaluation/env_infos/x_position Mean', '-0.0980639978100126'), ('evaluation/env_infos/x_position Std', '0.2219732113910267'), ('evaluation/env_infos/x_position Max', '0.28447775987369606'), ('evaluation/env_infos/x_position Min', '-0.4154507641762448'), ('evaluation/env_infos/final/y_position Mean', '-0.21619497009687377'), ('evaluation/env_infos/final/y_position Std', '0.11467158032691588'), ('evaluation/env_infos/final/y_position Max', '-0.06849864226409814'), ('evaluation/env_infos/final/y_position Min', '-0.3834214088146994'), ('evaluation/env_infos/initial/y_position Mean', '0.03461183079311117'), ('evaluation/env_infos/initial/y_position Std', '0.03786474659873482'), 
('evaluation/env_infos/initial/y_position Max', '0.0727745156466771'), ('evaluation/env_infos/initial/y_position Min', '-0.031890929939110295'), ('evaluation/env_infos/y_position Mean', '-0.18847055253266243'), ('evaluation/env_infos/y_position Std', '0.12900336667512582'), ('evaluation/env_infos/y_position Max', '0.11318113181673488'), ('evaluation/env_infos/y_position Min', '-0.3843479847936035'), ('evaluation/env_infos/final/distance_from_origin Mean', '0.31742811785734604'), ('evaluation/env_infos/final/distance_from_origin Std', '0.10586486027873725'), ('evaluation/env_infos/final/distance_from_origin Max', '0.41272207970165087'), ('evaluation/env_infos/final/distance_from_origin Min', '0.11858511872844475'), ('evaluation/env_infos/initial/distance_from_origin Mean', '0.05664015906534127'), ('evaluation/env_infos/initial/distance_from_origin Std', '0.023023038124026837'), ('evaluation/env_infos/initial/distance_from_origin Max', '0.090824314788785'), ('evaluation/env_infos/initial/distance_from_origin Min', '0.020441331046195648'), ('evaluation/env_infos/distance_from_origin Mean', '0.31550155666738033'), ('evaluation/env_infos/distance_from_origin Std', '0.10728671757338556'), ('evaluation/env_infos/distance_from_origin Max', '0.41675052787889233'), ('evaluation/env_infos/distance_from_origin Min', '0.006118439492803657'), ('evaluation/env_infos/final/x_velocity Mean', '0.0006460700812505518'), ('evaluation/env_infos/final/x_velocity Std', '0.000683088293812967'), ('evaluation/env_infos/final/x_velocity Max', '0.0015115855220859675'), ('evaluation/env_infos/final/x_velocity Min', '-6.473937490869552e-05'), ('evaluation/env_infos/initial/x_velocity Mean', '-0.03758082171038936'), ('evaluation/env_infos/initial/x_velocity Std', '0.12706800538907964'), ('evaluation/env_infos/initial/x_velocity Max', '0.10714505461736121'), ('evaluation/env_infos/initial/x_velocity Min', '-0.22169929465191252'), ('evaluation/env_infos/x_velocity Mean', '-0.001608596634589209'), 
('evaluation/env_infos/x_velocity Std', '0.04913838565616579'), ('evaluation/env_infos/x_velocity Max', '0.9170830225171546'), ('evaluation/env_infos/x_velocity Min', '-1.13733215985436'), ('evaluation/env_infos/final/y_velocity Mean', '-0.00019060288440309048'), ('evaluation/env_infos/final/y_velocity Std', '0.0006822993069578147'), ('evaluation/env_infos/final/y_velocity Max', '0.0003515250043489848'), ('evaluation/env_infos/final/y_velocity Min', '-0.0015195556259323117'), ('evaluation/env_infos/initial/y_velocity Mean', '0.021937314969317126'), ('evaluation/env_infos/initial/y_velocity Std', '0.08664719085850509'), ('evaluation/env_infos/initial/y_velocity Max', '0.15565432328174394'), ('evaluation/env_infos/initial/y_velocity Min', '-0.08328165113421354'), ('evaluation/env_infos/y_velocity Mean', '-0.004994198702830382'), ('evaluation/env_infos/y_velocity Std', '0.04931474746785201'), ('evaluation/env_infos/y_velocity Max', '0.8030629918524109'), ('evaluation/env_infos/y_velocity Min', '-1.2179716906352427'), ('evaluation/env_infos/final/forward_reward Mean', '0.0006460700812505518'), ('evaluation/env_infos/final/forward_reward Std', '0.000683088293812967'), ('evaluation/env_infos/final/forward_reward Max', '0.0015115855220859675'), ('evaluation/env_infos/final/forward_reward Min', '-6.473937490869552e-05'), ('evaluation/env_infos/initial/forward_reward Mean', '-0.03758082171038936'), ('evaluation/env_infos/initial/forward_reward Std', '0.12706800538907964'), ('evaluation/env_infos/initial/forward_reward Max', '0.10714505461736121'), ('evaluation/env_infos/initial/forward_reward Min', '-0.22169929465191252'), ('evaluation/env_infos/forward_reward Mean', '-0.001608596634589209'), ('evaluation/env_infos/forward_reward Std', '0.04913838565616579'), ('evaluation/env_infos/forward_reward Max', '0.9170830225171546'), ('evaluation/env_infos/forward_reward Min', '-1.13733215985436'), ('time/epoch (s)', '0'), ('time/total (s)', '70.16498186439276'), ('Epoch', '0')]

In [76]:
# for ll in l:
#     print(ll[0]+'\t'+ll[1])
#     print(ll[1])

In [77]:
# import rlkit

In [78]:
# d1 = {}
# i1 = 0
# for line in rlkit.core.tabulate.tabulate(l).split('\n'):
#     print(line)
#     print(i1)
#     print(l[i1][0])
#     print(l[i1][1])
#     d1[l[i1][0]] = l[i1][1]; i1+=1
# #     self.log(line, *args, **kwargs, file_name=file_name)
# d1

In [79]:
# print(len(l))

In [80]:
# !ls logs

In [81]:
# !rm *.pkl logs/*

In [82]:
# !ls | grep pkl

In [83]:
# !cat logs/2020-04-13\ 18\:51\:02.545425.txt

In [84]:
# f = open('temp3', 'a')

In [85]:
# f.write('Hi')

In [86]:
# f.flush()

In [87]:
# import os
# os.getpid()

In [88]:
# !cat temp3

In [89]:
# !rm temp*

In [90]:
# ckpt = torch.load('ckpt.pkl')

In [91]:
# device = torch.device("cuda:0")

In [92]:
# algorithm = ckpt['algorithm']

In [93]:
# initial_epoch

In [94]:
# dir_ = 'results/humanoid/'
# f1 = dir_ + 'ckpt.pkl'
# f2 = dir_ + 'ckpt-best.pkl'
# x1 = torch.load(f1)
# x2 = torch.load(f2)
# x1

In [95]:
# x2

In [96]:
# x1['algorithm'].trainer.qf1.fc1 == x2['algorithm'].trainer.qf1.fc1

In [97]:
# fc1 = x1['algorithm'].trainer.qf1.fcs[0]

In [98]:
# fc2 = x2['algorithm'].trainer.qf1.fcs[0]

In [99]:
# fc1.weight.data

In [100]:
# fc2.weight.data

In [101]:
# import torch

In [102]:
# algorithm.train(initial_epoch=initial_epoch, epochs = 2502)

In [103]:
# type_ = 'ant'

In [104]:
# a = None
# with open('results/'+type_+'/tmp3/ckpt.pkl', 'rb') as f:
#     a = torch.load(f)

In [105]:
# b = None
# with open('results/'+type_+'/ckpt.pkl', 'rb') as f:
#     b = torch.load(f)

In [106]:
# a['algorithm'].trainer.qf1.fcs[0].weight.data

In [107]:
# a['epoch']

In [108]:
# aa = a['algorithm']
# aa2 = copy.deepcopy(aa)
# bb = b['algorithm']

In [109]:
# aa.trainer.use_automatic_entropy_tuning

In [110]:
# e, aa3 = get_snapshot_3()

In [111]:
# def get_snapshot_3():
#     print('in get_snapshot_3')
#     ckpt = {}
#     ckpt = torch.load('results/humanoid/ckpt.pkl')
#     # self = copy.deepcopy(ckpt['algorithm'])
#     epoch = ckpt['epoch']
#     return epoch, copy.deepcopy(ckpt['algorithm'])

In [112]:
# e