In [1]:
import gym
env = gym.make("LunarLander-v2", render_mode="human")
observation, info = env.reset(seed=42)
for _ in range(1000):
   action = env.action_space.sample()  # this is where you would insert your policy
   observation, reward, terminated, truncated, info = env.step(action)

   if terminated or truncated:
      observation, info = env.reset()

env.close()

  if not isinstance(terminated, (bool, np.bool8)):


In [2]:
import pickle
import os
import time
import gym

import numpy as np
import torch

from cs285.infrastructure import pytorch_util as ptu
from cs285.infrastructure import utils
from cs285.infrastructure.logger import Logger
from cs285.infrastructure.replay_buffer import ReplayBuffer
from cs285.policies.MLP_policy import MLPPolicySL
from cs285.policies.loaded_gaussian_policy import LoadedGaussianPolicy

In [3]:
MAX_NVIDEO = 2
MAX_VIDEO_LEN = 40  # we overwrite this in the code below

MJ_ENV_NAMES = ["Ant-v4", "Walker2d-v4", "HalfCheetah-v4", "Hopper-v4"]

In [4]:
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--expert_policy_file', '-epf', type=str, required=True)  # relative to where you're running this script from
parser.add_argument('--expert_data', '-ed', type=str, required=True) #relative to where you're running this script from
parser.add_argument('--env_name', '-env', type=str, help=f'choices: {", ".join(MJ_ENV_NAMES)}', required=True)
parser.add_argument('--exp_name', '-exp', type=str, default='pick an experiment name', required=True)
parser.add_argument('--do_dagger', action='store_true')
parser.add_argument('--ep_len', type=int)

parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000)  # number of gradient steps for training policy (per iter in n_iter)
parser.add_argument('--n_iter', '-n', type=int, default=1)

parser.add_argument('--batch_size', type=int, default=1000)  # training data collected (in the env) during each iteration
parser.add_argument('--eval_batch_size', type=int,
                    default=1000)  # eval data collected (in the env) for logging metrics
parser.add_argument('--train_batch_size', type=int,
                    default=100)  # number of sampled data points to be used per gradient/train step

parser.add_argument('--n_layers', type=int, default=2)  # depth, of policy to be learned
parser.add_argument('--size', type=int, default=64)  # width of each layer, of policy to be learned
parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)  # LR for supervised learning

parser.add_argument('--video_log_freq', type=int, default=5)
parser.add_argument('--scalar_log_freq', type=int, default=1)
parser.add_argument('--no_gpu', '-ngpu', action='store_true')
parser.add_argument('--which_gpu', type=int, default=0)
parser.add_argument('--max_replay_buffer_size', type=int, default=1000000)
parser.add_argument('--save_params', action='store_true')
parser.add_argument('--seed', type=int, default=1)

input = ["--expert_policy_file", 'cs285/policies/experts/Ant.pkl', "--expert_data", 'cs285/expert_data/expert_data_Ant-v4.pkl', 
         "--env_name", "Ant-v4", "--exp_name", "bc_ant"]
args = parser.parse_args(args=input)
params = vars(args)

In [5]:
seed = 40
env = gym.make(params['env_name'], render_mode=None)
env.reset(seed=seed)

# Maximum length for episodes
params['ep_len'] = params['ep_len'] or env.spec.max_episode_steps
MAX_VIDEO_LEN = params['ep_len']

assert isinstance(env.action_space, gym.spaces.Box), "Environment must be continuous"
# Observation and action sizes
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.shape[0]

# simulation timestep, will be used for video saving
if 'model' in dir(env):
    fps = 1/env.model.opt.timestep
else:
    fps = env.env.metadata['render_fps']


In [17]:
logger = Logger(params['logdir'])

# Set random seeds
seed = params['seed']
np.random.seed(seed)
torch.manual_seed(seed)
ptu.init_gpu(
    use_gpu=not params['no_gpu'],
    gpu_id=params['which_gpu']
)

# Set logger attributes
log_video = True
log_metrics = True


KeyError: 'logdir'

In [6]:
import abc
import itertools
from typing import Any
from torch import nn
from torch.nn import functional as F
from torch import optim

import numpy as np
import torch
from torch import distributions

from cs285.infrastructure import pytorch_util as ptu
from cs285.policies.base_policy import BasePolicy


def build_mlp(
        input_size: int,
        output_size: int,
        n_layers: int,
        size: int
) -> nn.Module:
    """
        Builds a feedforward neural network

        arguments:
            n_layers: number of hidden layers
            size: dimension of each hidden layer
            activation: activation of each hidden layer

            input_size: size of the input layer
            output_size: size of the output layer
            output_activation: activation of the output layer

        returns:
            MLP (nn.Module)
    """
    layers = []
    in_size = input_size
    for _ in range(n_layers):
        layers.append(nn.Linear(in_size, size))
        layers.append(nn.Tanh())
        in_size = size
    layers.append(nn.Linear(in_size, output_size))

    mlp = nn.Sequential(*layers)
    return mlp

In [7]:
class MLPPolicySL(BasePolicy, nn.Module, metaclass=abc.ABCMeta):
    """
    Defines an MLP for supervised learning which maps observations to continuous
    actions.

    Attributes
    ----------
    mean_net: nn.Sequential
        A neural network that outputs the mean for continuous actions
    logstd: nn.Parameter
        A separate parameter to learn the standard deviation of actions

    Methods
    -------
    forward:
        Runs a differentiable forwards pass through the network
    update:
        Trains the policy with a supervised learning objective
    """
    def __init__(self,
                 ac_dim,
                 ob_dim,
                 n_layers,
                 size,
                 learning_rate=1e-4,
                 training=True,
                 nn_baseline=False,
                 **kwargs
                 ):
        super().__init__(**kwargs)

        # init vars
        self.ac_dim = ac_dim
        self.ob_dim = ob_dim
        self.n_layers = n_layers
        self.size = size
        self.learning_rate = learning_rate
        self.training = training
        self.nn_baseline = nn_baseline

        self.mean_net = build_mlp(
            input_size=self.ob_dim,
            output_size=self.ac_dim,
            n_layers=self.n_layers, size=self.size,
        )
        self.mean_net.to(ptu.device)
        self.logstd = nn.Parameter(

            torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device)
        )
        self.logstd.to(ptu.device)
        self.optimizer = optim.Adam(
            itertools.chain([self.logstd], self.mean_net.parameters()),
            self.learning_rate
        )

    def save(self, filepath):
        """
        :param filepath: path to save MLP
        """
        torch.save(self.state_dict(), filepath)

    def forward(self, observation: torch.FloatTensor) -> Any:
        """
        Defines the forward pass of the network

        :param observation: observation(s) to query the policy
        :return:
            action: sampled action(s) from the policy
        """
        # TODO: implement the forward pass of the network.
        # You can return anything you want, but you should be able to differentiate
        # through it. For example, you can return a torch.FloatTensor. You can also
        # return more flexible objects, such as a
        # `torch.distributions.Distribution` object. It's up to you!

        action_mean = self.mean_net(observation)
        action_logstd = self.logstd
        action_dist = torch.distributions.normal.Normal(action_mean, torch.exp(action_logstd))
        
        return action_dist


    def update(self, observations, actions):
        """
        Updates/trains the policy

        :param observations: observation(s) to query the policy
        :param actions: actions we want the policy to imitate
        :return:
            dict: 'Training Loss': supervised learning loss
        """
        # TODO: update the policy and return the loss
        self.optimizer.zero_grad()
        loss = torch.sum((self.forward(observations).sample(actions.shape[0]) - actions)**2)
        loss.backward()
        self.optimizer.step()
        
        return {
            # You can add extra logging information here, but keep this line
            'Training Loss': ptu.to_numpy(loss),
        }

In [8]:
actor = MLPPolicySL(
    ac_dim,
    ob_dim,
    params['n_layers'],
    params['size'],
    learning_rate=params['learning_rate'],
)

In [30]:
replay_buffer = ReplayBuffer(params['max_replay_buffer_size'])

#######################
## LOAD EXPERT POLICY
#######################

print('Loading expert policy from...', params['expert_policy_file'])
expert_policy = LoadedGaussianPolicy(params['expert_policy_file'])
expert_policy.to(ptu.device)
print('Done restoring expert policy...')

Loading expert policy from... cs285/policies/experts/Ant.pkl
obs (1, 111) (1, 111)
Done restoring expert policy...


In [11]:
def sample_trajectory(env, policy, max_path_length, render=False):
    """Sample a rollout in the environment from a policy."""
    
    # initialize env for the beginning of a new rollout
    ob =  env.reset() # TODO: initial observation after resetting the env

    # init vars
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:

        # render image of the simulated env
        if render:
            if hasattr(env, 'sim'):
                img = env.sim.render(camera_name='track', height=500, width=500)[::-1]
            else:
                img = env.render(mode='single_rgb_array')
            image_obs.append(cv2.resize(img, dsize=(250, 250), interpolation=cv2.INTER_CUBIC))
    
        # TODO use the most recent ob to decide what to do
        ac = policy(ob).sample() # HINT: this is a numpy array
        ac = ac[0]

        # TODO: take that action and get reward and next ob
        next_ob, rew, done, _ = env.step(ac)
        
        # TODO rollout can end due to done, or due to max_path_length
        steps += 1
        rollout_done = int(done or steps >= max_path_length) # HINT: this is either 0 or 1
        
        # record result of taking that action
        obs.append(ob)
        acs.append(ac)
        rewards.append(rew)
        next_obs.append(next_ob)
        terminals.append(rollout_done)

        ob = next_ob # jump to next timestep

        # end the rollout if the rollout ended
        if rollout_done:
            break

    return {"observation" : np.array(obs, dtype=np.float32),
            "image_obs" : np.array(image_obs, dtype=np.uint8),
            "reward" : np.array(rewards, dtype=np.float32),
            "action" : np.array(acs, dtype=np.float32),
            "next_observation": np.array(next_obs, dtype=np.float32),
            "terminal": np.array(terminals, dtype=np.float32)}

In [12]:
sample_trajectory(env, actor, 100)

TypeError: linear(): argument 'input' (position 1) must be Tensor, not tuple

In [36]:
ob =  env.reset()
np.array(actor(torch.tensor(ob[0], dtype=torch.float32)).sample())

array([ 0.9715774 ,  0.7519275 ,  0.3866682 ,  1.4880048 , -0.40164468,
        0.6947728 , -0.13829237, -1.4387997 ], dtype=float32)

In [26]:
torch.tensor(ob[0], dtype=torch.double)

tensor([ 0.8238,  0.9941,  0.0979, -0.0219, -0.0408,  0.0161,  0.0706, -0.0384,
         0.0448, -0.0244,  0.0012, -0.0785,  0.0741, -0.0177,  0.0309, -0.0092,
        -0.0965, -0.1108,  0.0053,  0.0418, -0.0473, -0.1283,  0.0810,  0.0263,
        -0.1864, -0.1090,  0.1192], dtype=torch.float64)

In [37]:
ob =  env.reset()
ob

(array([ 7.96861620e-01,  9.94054782e-01,  6.09826866e-02, -5.65110720e-02,
        -7.03043401e-02, -7.21872158e-02, -1.40094501e-03, -9.58007719e-02,
        -9.68530448e-02,  6.37948411e-02, -4.38224707e-02, -9.88979949e-02,
        -2.74399558e-04, -7.05942358e-02, -4.69545353e-02, -5.99215471e-02,
        -1.72873846e-01,  5.28312850e-03,  2.43504421e-02, -2.30624374e-01,
        -6.31208015e-02,  5.48171028e-02,  5.99633659e-02, -1.12662131e-01,
         3.37050015e-02, -1.14437962e-01, -5.91980448e-02]),
 {})