In [1]:
import gym
from tqdm import tqdm
import numpy as np

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

In [3]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

In [4]:
env = gym.make("LunarLanderContinuous-v2")


<br>
We create an expert RL agent and let it learn to solve a task by interacting with the evironment.<br>


In [5]:
try:
    ppo_expert = PPO.load("ppo_expert")
except Exception as e:
    good_hyperparams = {
        "batch_size": 64,
        "gae_lambda": 0.98,
        "gamma": 0.999,
        "n_epochs": 4,
        "ent_coef": 0.01,
    }
    ppo_expert = PPO(MlpPolicy, env, verbose=True, **good_hyperparams)
    ppo_expert.learn(total_timesteps=1e6)
    ppo_expert.save("ppo_expert")


<br>
We also create a student RL agent, which will later be trained with the expert dataset<br>


In [6]:
good_hyperparams = {
    "batch_size": 64,
    "gae_lambda": 0.98,
    "gamma": 0.999,
    "n_epochs": 4,
    "ent_coef": 0.01,
}
ppo_student = PPO(MlpPolicy, env, verbose=True, **good_hyperparams)

Using cpu device
Wrapping the env in a DummyVecEnv.



<br>
We now let our expert interact with the environment (except we already have expert data) <br>
and store resultant expert observations and actions to build an expert dataset.<br>


In [7]:
num_interactions = int(1e5)

In [8]:
try:
    expert_numpy_dataset = np.load("expert_actions_and_observations.npz")
    expert_actions = expert_numpy_dataset["expert_actions"]
    expert_observations = expert_numpy_dataset["expert_observations"]
except Exception as e:
    expert_observations = np.empty((num_interactions, env.observation_space.shape[0]))
    expert_actions = np.empty((num_interactions, env.action_space.shape[0]))
    obs = env.reset()
    for i in tqdm(range(num_interactions)):
        action, _ = ppo_expert.predict(obs, deterministic=True)
        expert_observations[i] = obs
        expert_actions[i] = action
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
    np.savez_compressed(
        "expert_actions_and_observations",
        expert_actions=expert_actions,
        expert_observations=expert_observations,
    )


<br>
- To seamlessly use PyTorch in the training process, we subclass an `ExpertDataset` from PyTorch's base `Dataset`.<br>
- Note that we initialize the dataset with the previously generated expert observations and actions.<br>
- We further implement Python's `__getitem__` and `__len__` magic functions to allow PyTorch's dataset-handling to access arbitrary rows in the dataset and inform it about the length of the dataset.<br>
- For more information about PyTorch's datasets, you can read: https://pytorch.org/docs/stable/data.html.<br>


In [9]:
from torch.utils.data.dataset import Dataset, random_split
import numpy as np

In [10]:
class ExpertDataSet(Dataset):
    def __init__(self, expert_observations, expert_actions):
        self.observations = expert_observations
        self.actions = expert_actions
    def __getitem__(self, index):
        return (self.observations[index], self.actions[index])
    def __len__(self):
        return len(self.observations)


<br>
We now instantiate the `ExpertDataSet` and split it into training and test datasets.<br>


In [11]:
expert_dataset = ExpertDataSet(expert_observations, expert_actions)
train_size = int(0.8 * len(expert_dataset))
test_size = len(expert_dataset) - train_size
train_expert_dataset, test_expert_dataset = random_split(
    expert_dataset, [train_size, test_size]
)

In [12]:
print("test_expert_dataset: ", len(test_expert_dataset))
print("train_expert_dataset: ", len(train_expert_dataset))

test_expert_dataset:  20000
train_expert_dataset:  80000



<br>
NOTE: The supervised learning section of this code is adapted from: https://github.com/pytorch/examples/blob/master/mnist/main.py<br>
1. We extract the policy network of our RL student agent.<br>
2. We load the (labeled) expert dataset containing expert observations as inputs and expert<br>
actions as targets.<br>
3. We perform supervised learning, that is, we adjust the policy network's parameters such<br>
that given expert observations as inputs to the network, its outputs match the targets (expert<br>
actions).<br>
By training the policy network in this way the corresponding RL student agent is taught to behave<br>
like the expert agent that was used to created the expert dataset (Behavior Cloning).<br>


In [13]:
def pretrain_agent(
    batch_size=64,
    epochs=1000,
    gamma=0.7,
    learning_rate=1.0,
    log_interval=100,
    no_cuda=True,
    seed=1,
    test_batch_size=64,
):
    use_cuda = not no_cuda and torch.cuda.is_available()
    torch.manual_seed(seed)
    device = torch.device("cuda" if use_cuda else "cpu")
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    # Extract initial policy
    model = ppo_student.policy.to(device)
    def train(model, device, train_loader, optimizer):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            action_prediction = output[0].double()
            criterion = nn.MSELoss()
            loss = criterion(action_prediction, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                print(
                    "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                        epoch,
                        batch_idx * len(data),
                        len(train_loader.dataset),
                        100.0 * batch_idx / len(train_loader),
                        loss.item(),
                    )
                )
    def test(model, device, test_loader):
        model.eval()
        test_loss = 0
        with torch.no_grad():
            for data, target in test_loader:
                data, target = data.to(device), target.to(device)
                output = model(data)
                action_prediction = output[0].double()
                criterion = nn.MSELoss()
                test_loss = criterion(action_prediction, target)
        test_loss /= len(test_loader.dataset)
        print(f"Test set: Average loss: {test_loss:.4f}")

    # Here, we use PyTorch's `DataLoader` to our load previously created `ExpertDataset`s for training
    # and testing
    train_loader = torch.utils.data.DataLoader(
        dataset=train_expert_dataset, batch_size=batch_size, shuffle=True, **kwargs
    )
    test_loader = torch.utils.data.DataLoader(
        dataset=test_expert_dataset, batch_size=test_batch_size, shuffle=True, **kwargs,
    )

    # Define an Optimizer and a learning rate schedule.
    optimizer = optim.Adadelta(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=gamma)

    # Now we are finally ready to train the policy model.
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)
        scheduler.step()

    # Implant the trained policy network back into the RL student agent
    ppo_student.policy = model


<br>
Having defined the training procedure we can now run the pretraining!<br>


In [14]:
pretrain_agent(
    epochs=3,
    gamma=0.7,
    learning_rate=1.0,
    log_interval=100,
    no_cuda=True,
    seed=1,
    batch_size=64,
    test_batch_size=1000,
)
ppo_student.save("ppo_student")

Test set: Average loss: 0.0000
Test set: Average loss: 0.0000
Test set: Average loss: 0.0000



<br>
Finally, let us test how well our RL agent student learned to mimic the behavior of the expert<br>


In [15]:
num_interactions = 3000
obs = env.reset()
for i in range(num_interactions):
    action, _ = ppo_student.predict(obs, deterministic=True)
    expert_observations[i] = obs
    expert_actions[i] = action
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()