In [None]:
# System OpenGL / GLEW / OSMesa development libraries, installed before the MuJoCo
# Python bindings below (presumably needed to build and load mujoco_py — confirm).
!apt-get install -y \
    libgl1-mesa-dev \
    libgl1-mesa-glx \
    libglew-dev \
    libosmesa6-dev \
    software-properties-common

# patchelf plus the free-mujoco-py bindings.
!apt-get install -y patchelf
!pip install free-mujoco-py

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libgl1-mesa-dev is already the newest version (20.0.8-0ubuntu1~18.04.1).
libgl1-mesa-dev set to manually installed.
software-properties-common is already the newest version (0.96.24.32.18).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
Suggested packages:
  glew-utils
The following NEW packages will be installed:
  libgl1-mesa-glx libglew-dev libglew2.0 libosmesa6 libosmesa6-dev
0 upgraded, 5 newly installed, 0 to remove and 19 not upgraded.
Need to get 2,916 kB of archives.
After this operation, 12.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libgl1-mesa-glx amd64 20.0.8-0ubuntu1~18.04.1 [5,532 B]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libglew2.0 amd64 2.0.0-5 [140 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/univ

## MAML-TRPO with cherry and learn2learn

In [None]:
# Install the RL libraries into the kernel's own environment (%pip rather than !pip).
# pip's -q flag keeps the output short without discarding error messages, which the
# previous `&> /dev/null` redirect silently threw away.
%pip install -q cherry-rl learn2learn

In [None]:
import math
import os
import random
import time
from copy import deepcopy

import cherry as ch
import gym
import learn2learn as l2l
import numpy as np
import torch
import torch as th
import torch.nn as nn
from cherry.algorithms import a2c, trpo
from cherry.models.robotics import LinearValue
from torch import autograd
from torch.distributions import Normal, Categorical
from torch.distributions.kl import kl_divergence
from torch.nn.utils import parameters_to_vector, vector_to_parameters
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

In [None]:
# Smoke test: build the task once and reset it so the MuJoCo extension is compiled
# and loaded up front. This `env` is not used by MAMLTRPO, which creates its own
# environments from the environment name.
env = gym.make('HalfCheetahForwardBackward-v1')
env.reset()

Compiling /usr/local/lib/python3.7/dist-packages/mujoco_py/cymj.pyx because it changed.
[1/1] Cythonizing /usr/local/lib/python3.7/dist-packages/mujoco_py/cymj.pyx
running build_ext
building 'mujoco_py.cymj' extension
creating /usr/local/lib/python3.7/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_37_linuxcpuextensionbuilder
creating /usr/local/lib/python3.7/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_37_linuxcpuextensionbuilder/temp.linux-x86_64-3.7
creating /usr/local/lib/python3.7/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_37_linuxcpuextensionbuilder/temp.linux-x86_64-3.7/usr
creating /usr/local/lib/python3.7/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_37_linuxcpuextensionbuilder/temp.linux-x86_64-3.7/usr/local
creating /usr/local/lib/python3.7/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_37_linuxcpuextensionbuilder/temp.linux-x86_64-3.7/usr/local/lib
creating /usr/local/lib/python3.7/dist-packages/mujoco_py/generated/_pyxbld_2.0.2.13_37_linuxcpuexten

array([-0.08513404,  0.08605558, -0.08362168, -0.07544462,  0.08005954,
        0.04693294,  0.03585278, -0.04871276,  0.16829012,  0.03828093,
       -0.04322466, -0.0109521 , -0.03779795,  0.06077007, -0.17164099,
       -0.0534065 ,  0.21574007,  0.0142859 ,  0.        ,  0.61486596],
      dtype=float32)

In [None]:
# Smallest standard deviation DiagNormalPolicy will use: density() clamps the learned
# log-std at log(EPSILON) before exponentiating it.
EPSILON = 1e-6

def linear_init(module):
    """Initialise an ``nn.Linear`` layer in place and return it.

    Linear layers get Xavier-uniform weights and, when they have one, a zeroed bias.
    Any other module is returned untouched, so the function can be applied
    unconditionally.

    Args:
        module: The module to initialise.

    Returns:
        The same ``module`` object that was passed in.
    """
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        # Layers built with bias=False carry no bias tensor to reset.
        if module.bias is not None:
            nn.init.zeros_(module.bias)
    return module


class DiagNormalPolicy(nn.Module):
    """Gaussian policy with an MLP mean and a state-independent diagonal scale.

    The mean of the action distribution is produced by a fully connected network;
    the log standard deviation is a single learned vector (``sigma``) shared by all
    states.
    """

    def __init__(self, input_size, output_size, hiddens=None, activation='relu', device='cpu'):
        """Build the mean network and the log-std parameter.

        Args:
            input_size: Number of input (state) features.
            output_size: Number of action dimensions.
            hiddens: Non-empty list of hidden layer widths; defaults to ``[100, 100]``.
            activation: ``'relu'``, ``'tanh'``, or a callable that returns an
                activation module when called with no arguments (e.g. ``nn.ELU``).
            device: Device that incoming states are moved to in ``density``. The
                module's own parameters are not moved here.

        Raises:
            ValueError: If ``hiddens`` is empty or ``activation`` is neither a known
                name nor a callable.
        """
        super().__init__()
        self.device = device
        if hiddens is None:
            hiddens = [100, 100]
        if not hiddens:
            raise ValueError("hiddens must contain at least one layer size")
        if activation == 'relu':
            activation = nn.ReLU
        elif activation == 'tanh':
            activation = nn.Tanh
        elif not callable(activation):
            raise ValueError(
                "activation must be 'relu', 'tanh' or a callable returning a module, "
                "got {!r}".format(activation)
            )
        layers = [linear_init(nn.Linear(input_size, hiddens[0])), activation()]
        for i, o in zip(hiddens[:-1], hiddens[1:]):
            layers.append(linear_init(nn.Linear(i, o)))
            layers.append(activation())
        layers.append(linear_init(nn.Linear(hiddens[-1], output_size)))
        self.mean = nn.Sequential(*layers)
        # Log standard deviation, initialised to log(1) = 0, i.e. unit standard deviation.
        self.sigma = nn.Parameter(torch.zeros(output_size))

    def density(self, state):
        """Return the ``Normal`` action distribution for ``state``.

        The state is first moved to ``self.device``. The scale is ``exp(sigma)`` with
        ``sigma`` clamped from below at ``log(EPSILON)``.
        """
        state = state.to(self.device, non_blocking=True)
        loc = self.mean(state)
        scale = torch.exp(torch.clamp(self.sigma, min=math.log(EPSILON)))
        return Normal(loc=loc, scale=scale)

    def log_prob(self, state, action):
        """Return the log-probability of ``action`` under the policy at ``state``.

        Per-dimension log-probabilities are averaged (not summed) over dim 1, and
        that dimension is kept with size 1.
        """
        density = self.density(state)
        return density.log_prob(action).mean(dim=1, keepdim=True)

    def forward(self, state):
        """Sample an action for ``state`` from the current action distribution."""
        density = self.density(state)
        action = density.sample()
        return action


In [None]:
def layer_init(layer, std=np.sqrt(2), bias_const=0.0):
    """Apply orthogonal weight initialisation and a constant bias to ``layer``.

    Args:
        layer: A module exposing ``weight`` and ``bias`` attributes (e.g. ``nn.Linear``).
        std: Gain passed to ``torch.nn.init.orthogonal_``.
        bias_const: Value every bias entry is set to.

    Returns:
        The same ``layer`` object, initialised in place.
    """
    torch.nn.init.orthogonal_(layer.weight, std)
    # Layers built with bias=False carry no bias tensor to fill.
    if layer.bias is not None:
        torch.nn.init.constant_(layer.bias, bias_const)
    return layer

In [None]:
class Actor(nn.Module):
    """Two-hidden-layer policy network whose output feeds a cherry action distribution."""

    def __init__(self, env, hidden_size=100):
        """Size the network from ``env``'s observation and action spaces.

        Args:
            env: Environment providing ``observation_space`` and ``action_space``;
                it is also handed to ``ch.distributions.ActionDistribution``.
            hidden_size: Width of both hidden layers.
        """
        super().__init__()
        obs_dim = env.observation_space.shape[0]
        act_dim = env.action_space.shape[0]
        self.input_size = obs_dim
        self.actor_output_size = act_dim

        # The output layer uses a much smaller gain (0.01) than the hidden layers.
        self.l1 = layer_init(nn.Linear(obs_dim, hidden_size))
        self.l2 = layer_init(nn.Linear(hidden_size, hidden_size))
        self.output = layer_init(nn.Linear(hidden_size, act_dim), std=0.01)
        self.activation = nn.ReLU()
        self.distribution = ch.distributions.ActionDistribution(env)

    def forward(self, x):
        """Map observations ``x`` to whatever ``self.distribution`` returns for the network output."""
        hidden = self.activation(self.l1(x))
        hidden = self.activation(self.l2(hidden))
        return self.distribution(self.output(hidden))

In [None]:
class Critic(nn.Module):
    """Two-hidden-layer state-value network that owns its Adam optimizer."""

    def __init__(self, env, lr, hidden_size=32):
        """Size the network from ``env``'s observation space and create the optimizer.

        Args:
            env: Environment providing ``observation_space``.
            lr: Learning rate for the Adam optimizer stored on ``self.optimizer``.
            hidden_size: Width of both hidden layers.
        """
        super().__init__()
        obs_dim = env.observation_space.shape[0]
        self.input_size = obs_dim
        self.critic_output_size = 1

        self.l1 = layer_init(nn.Linear(obs_dim, hidden_size))
        self.l2 = layer_init(nn.Linear(hidden_size, hidden_size))
        self.critic_head = layer_init(nn.Linear(hidden_size, self.critic_output_size), std=1.)
        self.activation = nn.ReLU()

        # Created last so that it sees every parameter registered above.
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr, eps=1e-5)

    def forward(self, x):
        """Return the value estimate produced by the critic head for observations ``x``."""
        hidden = self.activation(self.l1(x))
        hidden = self.activation(self.l2(hidden))
        return self.critic_head(hidden)

In [None]:
class MAMLTRPO():
    def __init__(self, env_name,
                 actor_class=Actor, critic_class=Critic, 
                 actor_args=dict(), critic_args=dict(),
                 adapt_lr=0.1, meta_lr=1.0, 
                 adapt_steps=3,
                 adapt_batch_size=20, meta_batch_size=20,
                 gamma=0.95, tau=1.0,
                 backtrack_factor=0.5, ls_max_steps=15, max_kl=0.01,
                 num_workers=10,
                 seed=42,
                 device=None, name="MAMLTRPO", tensorboard_log="./logs"):
        """Seed the RNGs, build the vectorised environment, the policy and the baseline.

        Args:
            env_name: Gym id of the task environment; it must support
                ``sample_tasks`` / ``set_task`` through the vectorised wrapper.
            actor_class, critic_class, actor_args, critic_args: Currently unused;
                the policy is always a ``DiagNormalPolicy`` and the baseline a
                ``LinearValue``.
            adapt_lr: Step size of the inner-loop update in ``fast_adapt``.
            meta_lr: Scale applied to the step in the line search of ``meta_optimize``.
            adapt_steps: Number of inner-loop adaptation steps per task.
            adapt_batch_size: Episodes collected per call to ``collect_steps``.
            meta_batch_size: Tasks sampled per meta-iteration.
            gamma: Discount factor for returns and advantages.
            tau: ``tau`` passed to the generalized advantage estimator.
            backtrack_factor: Multiplicative shrink of the step size per line-search step.
            ls_max_steps: Maximum number of line-search steps.
            max_kl: KL bound used to scale the step and to accept line-search candidates.
            num_workers: Number of environment copies in the async vector environment.
            seed: Seed for ``random``, NumPy, PyTorch and the environment.
            device: Optional device override; otherwise CUDA is used when available.
            name: Prefix of the TensorBoard run name.
            tensorboard_log: Directory for TensorBoard runs, or ``None`` to disable logging.
        """
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

        cuda_available = torch.cuda.is_available()
        if cuda_available:
            torch.cuda.manual_seed(seed)
        # An explicit `device` wins over auto-detection.
        if device:
            self.device = torch.device(device)
        else:
            self.device = torch.device("cuda" if cuda_available else "cpu")
        print("Running on: " + str(self.device))

        def make_env():
            env = gym.make(env_name)
            env = ch.envs.ActionSpaceScaler(env)
            return env

        env = l2l.gym.AsyncVectorEnv([make_env for _ in range(num_workers)])
        env.seed(seed)
        # Start from one sampled task; train() switches tasks itself afterwards.
        env.set_task(env.sample_tasks(1)[0])
        self.env = ch.envs.Torch(env)

        self.gamma = gamma
        self.tau = tau
        self.adapt_lr = adapt_lr
        self.meta_lr = meta_lr
        self.adapt_steps = adapt_steps
        self.adapt_batch_size = adapt_batch_size
        self.meta_batch_size = meta_batch_size
        self.backtrack_factor = backtrack_factor
        self.ls_max_steps = ls_max_steps
        self.max_kl = max_kl
        self.global_iteration = 0

        self.policy = DiagNormalPolicy(self.env.state_size, self.env.action_size, device=self.device)
        # LinearValue's second positional argument is the ridge-regularisation
        # coefficient `reg`, not an output size. Passing the action dimension there
        # silently regularised the baseline fit by that amount, so the library
        # default is used instead. NOTE(review): this changes the fitted baseline and
        # therefore training numbers relative to earlier runs.
        self.baseline = LinearValue(self.env.state_size)

        self.policy.to(self.device)
        self.baseline.to(self.device)

        if tensorboard_log is not None:
            self.run_name = name + "_" + str(int(time.time()))
            self.writer = SummaryWriter(f"{tensorboard_log}/{self.run_name}")
        else:
            self.writer = None

    
    def save(self, path="./"):
        """Write the baseline and policy state dicts to ``baseline.pt`` and ``policy.pt``.

        Args:
            path: Target directory; it is created if it does not exist yet.
        """
        if path:
            os.makedirs(path, exist_ok=True)
        torch.save(self.baseline.state_dict(), os.path.join(path, "baseline.pt"))
        torch.save(self.policy.state_dict(), os.path.join(path, "policy.pt"))


    def load(self, path="./"):
        """Restore the baseline and policy from ``baseline.pt`` and ``policy.pt``.

        Args:
            path: Directory containing the two checkpoint files.
        """
        # map_location lets a checkpoint saved on a GPU be restored on whatever
        # device this instance runs on (e.g. a CPU-only machine).
        baseline_state = torch.load(os.path.join(path, "baseline.pt"), map_location=self.device)
        policy_state = torch.load(os.path.join(path, "policy.pt"), map_location=self.device)
        self.baseline.load_state_dict(baseline_state)
        self.policy.load_state_dict(policy_state)


    def collect_steps(self, policy, n_episodes):
        """Roll out ``policy`` and attach returns and advantages to every transition.

        Args:
            policy: Callable used by the cherry runner to pick actions.
            n_episodes: Number of episodes to collect.

        Returns:
            The collected replay (moved to ``self.device``); each transition carries
            ``returns`` and ``advantage`` attributes.
        """
        self.env.reset()
        runner = ch.envs.Runner(self.env)
        replay = runner.run(policy, episodes=n_episodes).to(self.device)

        states = replay.state()
        rewards = replay.reward()
        dones = replay.done()

        # The baseline is refit on this batch's discounted returns before it is queried.
        returns = ch.td.discount(self.gamma, rewards, dones)
        self.baseline.fit(states, returns)
        state_values = self.baseline(states)
        successor_values = self.baseline(replay.next_state())
        # Mix of state / next-state values selected by the done flag.
        bootstraps = state_values * (1.0 - dones) + successor_values * dones
        advantages = ch.pg.generalized_advantage(
            tau=self.tau,
            gamma=self.gamma,
            rewards=rewards,
            dones=dones,
            values=bootstraps,
            next_value=torch.zeros(1, device=state_values.device),
        )
        # Normalised advantages are detached so no gradient reaches the baseline.
        advantages = ch.normalize(advantages, epsilon=1e-8).detach()

        for index, transition in enumerate(replay):
            transition.returns = returns[index]
            transition.advantage = advantages[index]

        return replay


    # def collect_steps(self, policy, n_episodes):
    #     # replay = ch.ExperienceReplay(device=self.device)
    #     # for i in range(n_episodes):
    #     #     state = self.env.reset()

    #     #     while True:
    #     #         with torch.no_grad():
    #     #             mass = policy(state)
    #     #         action = mass.sample()
    #     #         log_prob = mass.log_prob(action).mean(dim=1, keepdim=True)
    #     #         next_state, reward, done, _ = self.env.step(action)

    #     #         replay.append(state,
    #     #                     action,
    #     #                     reward,
    #     #                     next_state,
    #     #                     done,
    #     #                     log_prob=log_prob)
                
    #     #         if done.any():
    #     #             break

    #     #         state = next_state
        
    #     self.env.reset()
    #     task = ch.envs.Runner(self.env)
    #     replay = task.run(policy, episodes=n_episodes).to(self.device)

    #     with torch.no_grad():
    #         next_state_value = self.baseline(replay[-1].next_state)
    #     values = self.baseline(replay.state())

    #     advantages = ch.generalized_advantage(self.gamma,
    #                                             self.tau,
    #                                             replay.reward(),
    #                                             replay.done(),
    #                                             values.detach(),
    #                                             next_state_value)
    #     returns = advantages + values.detach()
    #     advantages = ch.normalize(advantages, epsilon=1e-8)

    #     for i, sars in enumerate(replay):
    #         sars.returns = returns[i]
    #         sars.advantage = advantages[i]

    #     # value_loss = a2c.state_value_loss(returns, values)
    #     # self.baseline.optimizer.zero_grad()
    #     # value_loss.backward()
    #     # self.baseline.optimizer.step()

    #     self.baseline.fit(replay.state(), returns)
    #     return replay


    def maml_a2c_loss(self, train_episodes, learner):
        """Compute the A2C policy loss of ``learner`` on ``train_episodes``.

        Nothing is updated here; the loss is differentiated by ``fast_adapt``.

        Args:
            train_episodes: Replay whose transitions carry an ``advantage`` attribute
                (as attached by ``collect_steps``).
            learner: Policy exposing ``density(states)``.

        Returns:
            The value of ``a2c.policy_loss`` for the stored actions and advantages.
        """
        states = train_episodes.state()
        actions = train_episodes.action()
        advantages = train_episodes.advantage()
        # Log-probabilities are averaged over dim 1, matching meta_surrogate_loss.
        log_probs = learner.density(states).log_prob(actions).mean(dim=1, keepdim=True)
        return a2c.policy_loss(log_probs, advantages)


    def fast_adapt(self, clone, train_episodes, first_order=False):
        """Take one inner-loop adaptation step on ``clone`` using ``train_episodes``.

        Args:
            clone: Policy to adapt.
            train_episodes: Replay used to compute the adaptation loss.
            first_order: When True, the gradient is computed without building a
                graph, so later differentiation cannot flow through this step.

        Returns:
            Whatever ``l2l.algorithms.maml.maml_update`` returns for ``clone``
            (presumably the adapted module; ``train`` ignores the return value and
            appears to rely on an in-place update — confirm against learn2learn).
        """
        adaptation_loss = self.maml_a2c_loss(train_episodes, clone)
        # The graph is only kept and extended for the second-order variant.
        keep_graph = not first_order
        grads = autograd.grad(
            adaptation_loss,
            clone.parameters(),
            retain_graph=keep_graph,
            create_graph=keep_graph,
        )
        return l2l.algorithms.maml.maml_update(clone, self.adapt_lr, grads)


    def meta_surrogate_loss(self, iteration_replays, iteration_policies, policy):
        """Average the TRPO surrogate loss and KL divergence over all tasks.

        For every task, ``policy`` is cloned and re-adapted (second order) on the
        task's training replays, then compared with the task's stored adapted policy
        on the validation replay.

        Args:
            iteration_replays: Per task, a list of replays whose last entry is the
                validation replay and whose other entries are the adaptation replays.
            iteration_policies: Per task, the adapted policy that collected the
                validation replay.
            policy: Policy to evaluate.

        Returns:
            Tuple ``(mean_loss, mean_kl)`` averaged over the tasks.
        """
        num_tasks = len(iteration_replays)
        loss_total = 0.0
        kl_total = 0.0
        task_iter = tqdm(zip(iteration_replays, iteration_policies),
                         total=num_tasks,
                         desc='Surrogate Loss',
                         leave=False)
        for task_replays, old_policy in task_iter:
            adaptation_replays = task_replays[:-1]
            valid_episodes = task_replays[-1]

            # Replay the inner loop on a differentiable clone of `policy`.
            adapted = l2l.clone_module(policy)
            for train_episodes in adaptation_replays:
                adapted = self.fast_adapt(adapted, train_episodes, first_order=False)

            states = valid_episodes.state()
            actions = valid_episodes.action()

            # KL between the re-adapted policy and the one that collected the data.
            old_densities = old_policy.density(states)
            new_densities = adapted.density(states)
            kl_total += kl_divergence(new_densities, old_densities).mean()

            # Surrogate loss; the old log-probabilities are constants.
            advantages = valid_episodes.advantage()
            old_log_probs = old_densities.log_prob(actions).mean(dim=1, keepdim=True).detach()
            new_log_probs = new_densities.log_prob(actions).mean(dim=1, keepdim=True)
            loss_total += trpo.policy_loss(new_log_probs, old_log_probs, advantages)

        return loss_total / num_tasks, kl_total / num_tasks


    def meta_optimize(self, iteration_replays, iteration_policies):
        """Take one TRPO meta-step on ``self.policy`` with a backtracking line search.

        The step direction comes from conjugate gradient on the surrogate-loss
        gradient and the KL Hessian-vector product, scaled using ``max_kl``.
        Candidate step sizes ``meta_lr * backtrack_factor ** k`` are tried in turn;
        the first one that lowers the surrogate loss while keeping the KL below
        ``max_kl`` is applied to ``self.policy``. If none qualifies, the policy is
        left unchanged.

        Args:
            iteration_replays: Per-task replay lists, as built by ``train``.
            iteration_policies: Per-task adapted policies, as built by ``train``.
        """
        # Compute CG step direction
        old_loss, old_kl = self.meta_surrogate_loss(iteration_replays, iteration_policies, self.policy)

        grad = autograd.grad(old_loss,
                             self.policy.parameters(),
                             retain_graph=True)
        grad = parameters_to_vector([g.detach() for g in grad])
        Fvp = trpo.hessian_vector_product(old_kl, self.policy.parameters())
        step = trpo.conjugate_gradient(Fvp, grad)
        shs = 0.5 * torch.dot(step, Fvp(step))
        lagrange_multiplier = torch.sqrt(shs / self.max_kl)
        step = step / lagrange_multiplier
        # Unflatten the step vector into one tensor per policy parameter.
        step_ = [torch.zeros_like(p.data) for p in self.policy.parameters()]
        vector_to_parameters(step, step_)
        step = step_
        del old_kl, Fvp, grad
        old_loss.detach_()

        # Line-search
        new_loss, kl = None, None
        for ls_step in range(self.ls_max_steps):
            stepsize = self.backtrack_factor ** ls_step * self.meta_lr
            clone = deepcopy(self.policy)
            for p, u in zip(clone.parameters(), step):
                # Keyword `alpha` replaces the deprecated add_(Number, Tensor) overload.
                p.data.add_(u.data, alpha=-stepsize)
            new_loss, kl = self.meta_surrogate_loss(iteration_replays, iteration_policies, clone)
            if new_loss < old_loss and kl < self.max_kl:
                for p, u in zip(self.policy.parameters(), step):
                    p.data.add_(u.data, alpha=-stepsize)
                break

        # The logged values belong to the last candidate evaluated, whether or not it
        # was accepted; nothing is logged if no candidate was evaluated at all.
        if self.writer is not None and new_loss is not None:
            self.writer.add_scalar("loss", new_loss, self.global_iteration)
            self.writer.add_scalar("kl", kl, self.global_iteration)


    def train(self, num_iterations=100):
        """Run ``num_iterations`` meta-iterations of data collection and meta-optimisation.

        Each iteration samples ``meta_batch_size`` tasks. For every task a copy of
        the policy is adapted for ``adapt_steps`` first-order steps, a validation
        replay is collected with the adapted copy, and the mean validation reward
        is printed (and logged when a writer is configured) before
        ``meta_optimize`` updates the policy.

        Args:
            num_iterations: Number of meta-iterations to run.
        """
        for _ in range(num_iterations):
            self.global_iteration += 1
            reward_total = 0.0
            replays_per_task = []
            adapted_policies = []

            task_configs = self.env.sample_tasks(self.meta_batch_size)
            for task_config in tqdm(task_configs, leave=False, desc='Data'):
                learner = deepcopy(self.policy)
                self.env.set_task(task_config)
                task_replay = []

                # Inner loop: collect with the current learner, then adapt it.
                for _adapt_step in range(self.adapt_steps):
                    episodes = self.collect_steps(learner, n_episodes=self.adapt_batch_size)
                    self.fast_adapt(learner, episodes, first_order=True)
                    task_replay.append(episodes)

                # Validation data comes from the adapted learner.
                valid_episodes = self.collect_steps(learner, n_episodes=self.adapt_batch_size)
                task_replay.append(valid_episodes)
                reward_total += valid_episodes.reward().sum().item() / self.adapt_batch_size
                replays_per_task.append(task_replay)
                adapted_policies.append(learner)

            # Report statistics
            print('\nIteration', self.global_iteration)
            adaptation_reward = reward_total / self.meta_batch_size
            print('adaptation_reward', adaptation_reward)

            if self.writer is not None:
                self.writer.add_scalar("adaptation_reward", adaptation_reward, self.global_iteration)

            self.meta_optimize(replays_per_task, adapted_policies)

In [None]:
# Build the meta-learner for the HalfCheetah forward/backward task with the default
# hyperparameters; the constructor prints the device it selected.
metalearner = MAMLTRPO("HalfCheetahForwardBackward-v1")

Running on: cuda




In [None]:
# Run 300 meta-iterations; each one prints the mean reward of the post-adaptation
# validation episodes.
metalearner.train(300)




Iteration 1
adaptation_reward -8.28092391014099


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  ../torch/csrc/utils/python_arg_parser.cpp:1174.)



Iteration 2
adaptation_reward -0.34856413841247547





Iteration 3
adaptation_reward 2.080847492218017





Iteration 4
adaptation_reward -0.5971503734588618





Iteration 5
adaptation_reward -1.7480440235137933





Iteration 6
adaptation_reward 1.8982534980773924





Iteration 7
adaptation_reward 11.04148238182068





Iteration 8
adaptation_reward 10.341353340148926





Iteration 9
adaptation_reward 17.28411301612854





Iteration 10
adaptation_reward 14.976770763397216





Iteration 11
adaptation_reward 19.711226229667666





Iteration 12
adaptation_reward 13.943080797195435





Iteration 13
adaptation_reward 17.968124423027035





Iteration 14
adaptation_reward 20.62150827407837





Iteration 15
adaptation_reward 17.79564896583557





Iteration 16
adaptation_reward 22.115227460861206





Iteration 17
adaptation_reward 23.308125534057613





Iteration 18
adaptation_reward 25.411133508682248





Iteration 19
adaptation_reward 30.91326232910156





Iteration 20
adaptation_reward 17.19877754688263





Iteration 21
adaptation_reward 26.66049586296081





Iteration 22
adaptation_reward 26.200458589792255





Iteration 23
adaptation_reward 28.52642942428589





Iteration 24
adaptation_reward 29.991001434326172





Iteration 25
adaptation_reward 36.24410778045654





Iteration 26
adaptation_reward 25.23564931869507





Iteration 27
adaptation_reward 37.48611532211304





Iteration 28
adaptation_reward 36.27908054351807





Iteration 29
adaptation_reward 34.98307246685028





Iteration 30
adaptation_reward 36.3601388168335





Iteration 31
adaptation_reward 48.249526987075804





Iteration 32
adaptation_reward 48.20226264953613





Iteration 33
adaptation_reward 55.135260162353504





Iteration 34
adaptation_reward 37.87402187347412





Iteration 35
adaptation_reward 42.60499000549316





Iteration 36
adaptation_reward 55.382625274658196





Iteration 37
adaptation_reward 50.53114169120788





Iteration 38
adaptation_reward 53.6613903427124





Iteration 39
adaptation_reward 53.0059388923645





Iteration 40
adaptation_reward 71.97300415039062





Iteration 41
adaptation_reward 64.63682746887207





Iteration 42
adaptation_reward 71.24325111389162





Iteration 43
adaptation_reward 72.58702003479003





Iteration 44
adaptation_reward 70.17856033325195





Iteration 45
adaptation_reward 72.90884925842285





Iteration 46
adaptation_reward 71.92757469177248





Iteration 47
adaptation_reward 77.81718391418457





Iteration 48
adaptation_reward 89.65004516601563





Iteration 49
adaptation_reward 38.41054718017578





Iteration 50
adaptation_reward 79.5917359161377





Iteration 51
adaptation_reward 79.2995180130005





Iteration 52
adaptation_reward 100.92394172668459





Iteration 53
adaptation_reward 109.95098472595217





Iteration 54
adaptation_reward 92.09172195434573





Iteration 55
adaptation_reward 75.43868385314943





Iteration 56
adaptation_reward 84.92533065795898





Iteration 57
adaptation_reward 103.89073081970214





Iteration 58
adaptation_reward 80.22187103271483





Iteration 59
adaptation_reward 99.12112686157226





Iteration 60
adaptation_reward 94.24565719604493





Iteration 61
adaptation_reward 97.87074371337891





Iteration 62
adaptation_reward 97.86229125976561





Iteration 63
adaptation_reward 99.85719619750978





Iteration 64
adaptation_reward 119.92230667114256





Iteration 65
adaptation_reward 99.23022994995118





Iteration 66
adaptation_reward 109.11082901000978





Iteration 67
adaptation_reward 103.466604309082





Iteration 68
adaptation_reward 95.31423065185545





Iteration 69
adaptation_reward 101.64668262481689





Iteration 70
adaptation_reward 105.79614547729493





Iteration 71
adaptation_reward 131.46594146728518





Iteration 72
adaptation_reward 93.83898712158204





Iteration 73
adaptation_reward 119.8161575317383





Iteration 74
adaptation_reward 129.66812576293944





Iteration 75
adaptation_reward 114.6803157043457





Iteration 76
adaptation_reward 126.83172302246093





Iteration 77
adaptation_reward 126.06679794311526





Iteration 78
adaptation_reward 118.24338623046876





Iteration 79
adaptation_reward 106.25312591552736





Iteration 80
adaptation_reward 108.03101013183593





Iteration 81
adaptation_reward 129.00718734741213





Iteration 82
adaptation_reward 136.9614614868164





Iteration 83
adaptation_reward 125.6036264038086





Iteration 84
adaptation_reward 126.58405395507812





Iteration 85
adaptation_reward 132.63417694091794





Iteration 86
adaptation_reward 112.66816223144531





Iteration 87
adaptation_reward 123.53140808105468





Iteration 88
adaptation_reward 152.90267028808594





Iteration 89
adaptation_reward 101.66463790893553





Iteration 90
adaptation_reward 120.25252044677737





Iteration 91
adaptation_reward 122.0551712036133





Iteration 92
adaptation_reward 107.14428894042969





Iteration 93
adaptation_reward 121.33952697753907





Iteration 94
adaptation_reward 115.83540771484374





Iteration 95
adaptation_reward 135.24816589355473





Iteration 96
adaptation_reward 118.84354217529297





Iteration 97
adaptation_reward 122.44311462402341





Iteration 98
adaptation_reward 150.19780670166017





Iteration 99
adaptation_reward 124.96628875732422





Iteration 100
adaptation_reward 143.41025878906248





Iteration 101
adaptation_reward 128.2265719604492





Iteration 102
adaptation_reward 136.34365600585934





Iteration 103
adaptation_reward 142.16400939941408





Iteration 104
adaptation_reward 133.19400268554688





Iteration 105
adaptation_reward 129.6954800415039





Iteration 106
adaptation_reward 136.14401367187503





Iteration 107
adaptation_reward 122.77431167602538





Iteration 108
adaptation_reward 133.239375





Iteration 109
adaptation_reward 134.83941772460938





Iteration 110
adaptation_reward 147.72953155517578





Iteration 111
adaptation_reward 128.5400161743164





Iteration 112
adaptation_reward 149.35482818603512





Iteration 113
adaptation_reward 125.15053131103517





Iteration 114
adaptation_reward 126.9016830444336





Iteration 115
adaptation_reward 148.26574218750002





Iteration 116
adaptation_reward 143.71024047851563





Iteration 117
adaptation_reward 136.3840689086914





Iteration 118
adaptation_reward 126.05021697998048





Iteration 119
adaptation_reward 109.76238739013672





Iteration 120
adaptation_reward 140.29455780029295





Iteration 121
adaptation_reward 145.6919470214844





Iteration 122
adaptation_reward 136.8255487060547





Iteration 123
adaptation_reward 145.49706024169922





Iteration 124
adaptation_reward 149.16690704345703





Iteration 125
adaptation_reward 122.23747314453121





Iteration 126
adaptation_reward 134.81828521728514





Iteration 127
adaptation_reward 139.4824737548828





Iteration 128
adaptation_reward 118.47483276367186





Iteration 129
adaptation_reward 125.37716156005858





Iteration 130
adaptation_reward 146.32498046875003





Iteration 131
adaptation_reward 130.77628509521486





Iteration 132
adaptation_reward 132.6172528076172





Iteration 133
adaptation_reward 136.2187088012695





Iteration 134
adaptation_reward 152.79599586486816





Iteration 135
adaptation_reward 111.99664566040039





Iteration 136
adaptation_reward 138.6254864501953





Iteration 137
adaptation_reward 123.96364501953124





Iteration 138
adaptation_reward 147.68623992919922





Iteration 139
adaptation_reward 114.55794769287111





Iteration 140
adaptation_reward 146.8922604370117





Iteration 141
adaptation_reward 125.73159271240232





Iteration 142
adaptation_reward 146.35274490356443





Iteration 143
adaptation_reward 145.07391937255858





Iteration 144
adaptation_reward 138.98472991943356





Iteration 145
adaptation_reward 139.694778137207





Iteration 146
adaptation_reward 130.5045462036133





Iteration 147
adaptation_reward 155.26862365722656





Iteration 148
adaptation_reward 151.41507919311522





Iteration 149
adaptation_reward 153.05163543701173





Iteration 150
adaptation_reward 124.78086975097658





Iteration 151
adaptation_reward 148.61216430664064





Iteration 152
adaptation_reward 141.75603240966797





Iteration 153
adaptation_reward 134.48184783935545





Iteration 154
adaptation_reward 139.4728946495056





Iteration 155
adaptation_reward 132.10548950195312





Iteration 156
adaptation_reward 130.5357879638672





Iteration 157
adaptation_reward 135.4056819152832





Iteration 158
adaptation_reward 148.00340179443361





Iteration 159
adaptation_reward 137.65231994628905





Iteration 160
adaptation_reward 159.55751281738281





Iteration 161
adaptation_reward 123.18832916259764





Iteration 162
adaptation_reward 122.95544776916503





Iteration 163
adaptation_reward 140.3839978027344





Iteration 164
adaptation_reward 130.6942874145508





Iteration 165
adaptation_reward 136.51457015991213





Iteration 166
adaptation_reward 148.25066467285154





Iteration 167
adaptation_reward 121.42385833740234





Iteration 168
adaptation_reward 146.7616128540039





Iteration 169
adaptation_reward 128.13209068298343





Iteration 170
adaptation_reward 132.6983905029297





Iteration 171
adaptation_reward 141.76994323730472





Iteration 172
adaptation_reward 162.26768615722654





Iteration 173
adaptation_reward 139.9373455810547





Iteration 174
adaptation_reward 148.18710937500003





Iteration 175
adaptation_reward 146.22262979507448





Iteration 176
adaptation_reward 145.0174154663086





Iteration 177
adaptation_reward 153.28139739990235





Iteration 178
adaptation_reward 123.85475067138671





Iteration 179
adaptation_reward 157.55834472656252





Iteration 180
adaptation_reward 156.54908020019533





Iteration 181
adaptation_reward 135.69312530517576





Iteration 182
adaptation_reward 146.76217895507813





Iteration 183
adaptation_reward 125.34659515380858





Iteration 184
adaptation_reward 141.78922424316406





Iteration 185
adaptation_reward 154.67402893066406





Iteration 186
adaptation_reward 131.17592987060544





Iteration 187
adaptation_reward 155.29208404541015





Iteration 188
adaptation_reward 149.42689178466796





Iteration 189
adaptation_reward 138.42941284179685





Iteration 190
adaptation_reward 145.46338439941408





Iteration 191
adaptation_reward 147.56166473388672





Iteration 192
adaptation_reward 137.19742553710935





Iteration 193
adaptation_reward 129.47724914550784





Iteration 194
adaptation_reward 144.3224789428711





Iteration 195
adaptation_reward 154.6925854492187





Iteration 196
adaptation_reward 147.83824768066407





Iteration 197
adaptation_reward 162.41989044189452





Iteration 198
adaptation_reward 142.7805499267578





Iteration 199
adaptation_reward 142.9035186767578





Iteration 200
adaptation_reward 124.37760345458985





Iteration 201
adaptation_reward 153.69385101318358





Iteration 202
adaptation_reward 141.27465972900393





Iteration 203
adaptation_reward 169.20961242675784





Iteration 204
adaptation_reward 164.15261322021485





Iteration 205
adaptation_reward 141.84031677246094





Iteration 206
adaptation_reward 147.3411013793945





Iteration 207
adaptation_reward 123.78508422851561





Iteration 208
adaptation_reward 151.3322821044922





Iteration 209
adaptation_reward 150.3736212158203





Iteration 210
adaptation_reward 137.52212036132815





Iteration 211
adaptation_reward 156.87165374755858





Iteration 212
adaptation_reward 146.26910766601563





Iteration 213
adaptation_reward 150.00325378417972





Iteration 214
adaptation_reward 142.3878579711914





Iteration 215
adaptation_reward 145.81812622070314





Iteration 216
adaptation_reward 159.06810913085934





Iteration 217
adaptation_reward 151.41071960449221





Iteration 218
adaptation_reward 156.3450506591797





Iteration 219
adaptation_reward 163.73144134521485





Iteration 220
adaptation_reward 134.52774917602534





Iteration 221
adaptation_reward 159.0822589111328





Iteration 222
adaptation_reward 151.38710998535154





Iteration 223
adaptation_reward 146.38701202392576





Iteration 224
adaptation_reward 154.287197265625





Iteration 225
adaptation_reward 154.50495422363278





Iteration 226
adaptation_reward 147.7494387817383





Iteration 227
adaptation_reward 142.8959823608398





Iteration 228
adaptation_reward 151.56300567626954





Iteration 229
adaptation_reward 161.15469848632813





Iteration 230
adaptation_reward 143.10298706054687





Iteration 231
adaptation_reward 183.70361480712893





Iteration 232
adaptation_reward 148.4655429077148





Iteration 233
adaptation_reward 156.69342071533202





Iteration 234
adaptation_reward 163.63286682128907





Iteration 235
adaptation_reward 151.98829040527343





Iteration 236
adaptation_reward 162.68623535156252





Iteration 237
adaptation_reward 163.16104125976562





Iteration 238
adaptation_reward 140.95082336425781





Iteration 239
adaptation_reward 146.93634796142575





Iteration 240
adaptation_reward 155.5125048828125





Iteration 241
adaptation_reward 139.80081008911134





Iteration 242
adaptation_reward 141.05599273681642





Iteration 243
adaptation_reward 160.35296409606934





Iteration 244
adaptation_reward 154.09287811279302





Iteration 245
adaptation_reward 160.75592712402343





Iteration 246
adaptation_reward 156.80777893066409





Iteration 247
adaptation_reward 160.21907653808597





Iteration 248
adaptation_reward 145.6268603515625





Iteration 249
adaptation_reward 153.14061279296874





Iteration 250
adaptation_reward 161.16396972656247





Iteration 251
adaptation_reward 174.97597076416014





Iteration 252
adaptation_reward 129.29023071289063





Iteration 253
adaptation_reward 159.71306335449214





Iteration 254
adaptation_reward 156.6143997192383





Iteration 255
adaptation_reward 138.94787590026854





Iteration 256
adaptation_reward 140.1086343383789





Iteration 257
adaptation_reward 160.68875671386718





Iteration 258
adaptation_reward 157.9180520629883





Iteration 259
adaptation_reward 131.0152032470703





Iteration 260
adaptation_reward 161.23407012939452





Iteration 261
adaptation_reward 139.00652648925782





Iteration 262
adaptation_reward 158.52531951904297





Iteration 263
adaptation_reward 174.00380737304687





Iteration 264
adaptation_reward 161.88855804443364





Iteration 265
adaptation_reward 150.66029998779297





Iteration 266
adaptation_reward 154.4602914428711





Iteration 267
adaptation_reward 140.4063949584961





Iteration 268
adaptation_reward 148.39608032226562





Iteration 269
adaptation_reward 143.58118453979492





Iteration 270
adaptation_reward 146.07932678222656





Iteration 271
adaptation_reward 155.29214904785152





Iteration 272
adaptation_reward 154.03672103881837





Iteration 273
adaptation_reward 153.18362350463866





Iteration 274
adaptation_reward 151.95979553222656





Iteration 275
adaptation_reward 155.2203369140625





Iteration 276
adaptation_reward 149.58294219970702





Iteration 277
adaptation_reward 134.40002822875977





Iteration 278
adaptation_reward 154.60589904785155





Iteration 279
adaptation_reward 146.1709017944336





Iteration 280
adaptation_reward 153.31843170166013





Iteration 281
adaptation_reward 153.01593872070316





Iteration 282
adaptation_reward 148.1270901489258





Iteration 283
adaptation_reward 157.2264254760742





Iteration 284
adaptation_reward 175.57748901367188





Iteration 285
adaptation_reward 144.26090423583986





Iteration 286
adaptation_reward 166.17122589111327





Iteration 287
adaptation_reward 151.31627212524413





Iteration 288
adaptation_reward 156.9271664428711





Iteration 289
adaptation_reward 154.16096450805665





Iteration 290
adaptation_reward 153.01113563537595





Iteration 291
adaptation_reward 155.10679199218754





Iteration 292
adaptation_reward 160.29535888671876





Iteration 293
adaptation_reward 154.7941296386719





Iteration 294
adaptation_reward 163.816259765625





Iteration 295
adaptation_reward 167.83766113281249





Iteration 296
adaptation_reward 170.13136779785157





Iteration 297
adaptation_reward 158.50057220458982





Iteration 298
adaptation_reward 153.27674530029296





Iteration 299
adaptation_reward 162.37435668945315





Iteration 300
adaptation_reward 138.08816619873045




In [None]:
# Write baseline.pt and policy.pt using save()'s default path ("./").
metalearner.save()

In [None]:
import glob

from google.colab import files

# The TensorBoard run directory contains a timestamp, so the event files are looked
# up from the writer's log directory instead of hard-coding one run's path.
if metalearner.writer is not None:
    metalearner.writer.flush()  # make sure pending scalars are on disk first
    event_pattern = f"{metalearner.writer.get_logdir()}/events.out.tfevents.*"
    for event_file in sorted(glob.glob(event_pattern)):
        files.download(event_file)

# Checkpoints written by metalearner.save() with its default path.
files.download('baseline.pt')
files.download('policy.pt')