# End-to-End Learning | Practical Session Part II | Training Neural Network

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import gym.wrappers

from nets.imitator import Policy
from dataset import MarkovProcess
from early_stopping import EarlyStopping
from utils import make_Dirs, save_Result, get_EnvInfo
from nets.abstract import Abstract

# Global matplotlib defaults for every figure in this notebook:
# figure size (width, height) and base font size.
ratio = [16, 10]
plt.rcParams.update({"figure.figsize": ratio, "font.size": 22})

### Imitator Policy
We use a simple network for the imitator policy. Two consecutive convolutional layers, each followed by a ReLU activation and max pooling, extract features that are then flattened.
Afterwards come two fully connected hidden layers and a final output layer whose activation function is the tanh function.

![Tanh Function](resources/tanh.png)

As the loss function we use the squared L2 distance between the expert actions and the predicted actions:

$ l(x, y) = L = \{l_1, \dots, l_N\}^T, \quad l_n = (x_n - y_n)^2 $


In [None]:
class Policy(Abstract):
    """Convolutional imitator policy that maps an image state to an action.

    Architecture (see ``forward``): two 5x5 convolutions, each followed by
    ReLU and 2x2 max pooling, then three fully connected layers. The output
    passes through tanh, so every action component lies in (-1, 1).

    The flattening step hard-codes ``32 * 5 * 5`` features, which matches a
    single-channel 32x32 input (32 -> 28 -> 14 -> 10 -> 5 per spatial axis).
    """

    def __init__(
        self,
        s_dim,
        a_dim,
        opt_params=None,
        use_cuda=False,
    ):
        """Build the layers and hand them to the base-class initializer.

        Args:
            s_dim: State dimension, forwarded to ``Abstract.__init__`` and
                used by ``forward`` via ``self.s_dim``.
            a_dim: Action dimension, forwarded to ``Abstract.__init__`` and
                used by ``reset`` via ``self.a_dim``.
            opt_params: Settings forwarded to ``self.init`` (defined outside
                this class; presumably optimizer options -- TODO confirm).
                When ``None``, a fresh dict with the defaults below is used.
            use_cuda: Forwarded to ``Abstract.__init__``.
        """
        super().__init__(s_dim, a_dim, use_cuda)

        # Build the defaults per call instead of using a mutable default
        # argument, so one instance can never see changes made to the dict
        # by another instance (or by ``self.init``).
        if opt_params is None:
            opt_params = {
                "lr": 1e-3,
                "eps": 1e-8,
                "weight_decay": 0.0,
                "amsgrad": False,
                "tadam": False,
            }

        # NOTE: the registration order below determines the order of
        # ``self._modules`` that is passed to ``self.init``; keep it stable.
        self.loss = nn.MSELoss(reduction="sum")
        self.conv1 = nn.Conv2d(1, 16, kernel_size=5)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=5)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(32 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        # NOTE(review): the output size is hard-coded to 3 rather than
        # derived from ``a_dim`` -- confirm this is intended.
        self.fc3 = nn.Linear(84, 3)

        self.tanh = nn.Tanh()

        nns = self._modules.items()
        self.init("policy", nns, opt_params)

    def forward(self, x):
        """Return the tanh-squashed action predicted for state(s) ``x``.

        ``x`` is first passed through ``self.convert`` (defined outside this
        class; presumably converts/reshapes the input to a tensor according
        to ``s_dim`` -- TODO confirm).
        """
        state = self.convert(x, self.s_dim)

        h1 = self.pool(F.relu(self.conv1(state)))
        h2 = self.pool(F.relu(self.conv2(h1)))
        # Flatten to (batch, 800) for the fully connected layers.
        h3 = h2.view(-1, 32 * 5 * 5)
        h4 = F.relu(self.fc1(h3))
        h5 = F.relu(self.fc2(h4))
        action = self.tanh(self.fc3(h5))

        return action

    def criterion(self, a_imitator, a_exp):
        """Return the summed squared error between predicted and expert actions.

        The loss is ``nn.MSELoss(reduction="sum")``, so the result is a
        scalar summed over all elements, not a mean.
        """
        loss = self.loss(a_imitator, a_exp)
        return loss

    def reset(self):
        """Return an all-zero NumPy array of shape ``self.a_dim``."""
        return np.zeros(self.a_dim)

In [None]:
def epoch_policy(dataset, policy, n_epoch, mode):
    """Run one pass over ``dataset`` and return the mean per-batch loss.

    Args:
        dataset: Iterable yielding ``(state, expert_action)`` batches.
        policy: Module that is callable on a state batch and provides
            ``criterion(output, target)``, ``optimizer``, ``train()`` and
            ``eval()``.
        n_epoch: Epoch number; used only in the printed summary.
        mode: If it contains the substring ``"train"`` the policy is put
            in training mode and its parameters are updated; otherwise
            the policy is only evaluated.

    Returns:
        The average over batches of the (scalar) batch loss, as a float.

    Raises:
        ValueError: If ``dataset`` yields no batches.
    """
    if "train" in mode:
        policy.train()
    else:
        policy.eval()

    loss_sum = 0.0
    loss_num = 0
    # Track gradients only while training: during evaluation no backward
    # pass happens, so building the autograd graph would only waste memory.
    with torch.set_grad_enabled(policy.training):
        for so, a_ in dataset:
            output = policy(so)
            loss = policy.criterion(output, a_).mean()
            if policy.training:
                policy.optimizer.zero_grad()
                loss.backward()
                policy.optimizer.step()
            loss_sum += loss.item()
            loss_num += 1

    if loss_num == 0:
        # Fail with a clear message instead of an opaque ZeroDivisionError.
        raise ValueError(
            "epoch_policy received an empty dataset for mode {!r}".format(mode)
        )

    loss_sum /= loss_num
    print(
        "{}-th epoch {} of policy was end: \n\tloss = {}".format(
            n_epoch, mode, loss_sum
        )
    )
    return loss_sum

## Parameters

In [None]:
# Training hyperparameters.
BATCH_SIZE = 128
TRAIN_VALID = 0.95  # fraction of the dataset used for training; the rest is validation
EARLY_LENGTH = 5  # passed to EarlyStopping(length=...)
N_EPOCH = 50  # upper bound on the number of training epochs

# Expert data location and result directory (created right away).
DATA_NAME = "./expert/summary.csv"
METHOD = "Lecture_End_to_End_PracticeSession"
SAVE_DIR = f"./result/{METHOD}/"
make_Dirs(SAVE_DIR)

In [None]:
# Create the environment and query the state/action description plus the
# transform that is applied to the expert data.
env = gym.make("CarRacing-v0")
s_dim, a_dim, transform = get_EnvInfo(env)

# Seed PyTorch, NumPy and the environment.
torch.manual_seed(0)
np.random.seed(0)
env.seed(0)

# Load the expert dataset and split it randomly into a training part
# (TRAIN_VALID of the samples) and a validation part (the remainder).
dataset = MarkovProcess(DATA_NAME, transform=transform)
n_samples = len(dataset)
len_train = int(TRAIN_VALID * n_samples)
train_subset, valid_subset = torch.utils.data.random_split(
    dataset, [len_train, n_samples - len_train]
)

# Wrap both subsets in shuffling mini-batch loaders.
train_loader, valid_loader = (
    torch.utils.data.DataLoader(subset, batch_size=BATCH_SIZE, shuffle=True)
    for subset in (train_subset, valid_subset)
)

### Show Example Training Data
The state is resized from a 96x96x3 RGB image to a 32x32x1 grayscale image.

In [None]:
# Display only the first (state, action) sample of the expert dataset.
for batch_idx, sample in enumerate(dataset):
    so, a_ = sample
    # One print call with a newline separator reproduces the blank lines
    # around the state dump.
    print("The internal representation of the state: ", "\n", so, "\n", sep="\n")
    print("The executed action at this timestamp was: ", a_)
    # Move axis 0 to the end before plotting
    # (assumes the state is stored channel-first -- TODO confirm).
    image = so.permute(1, 2, 0)
    plt.imshow(image, cmap="gray")
    plt.show()
    break

### Training of the Network

In [None]:
# Imitator network and the early-stopping monitor.
policy = Policy(s_dim, a_dim)
stopper = EarlyStopping(length=EARLY_LENGTH)

# Per-epoch loss histories, filled during training.
train_loss_policy, valid_loss_policy = [], []

In [None]:
# Fit the policy to the expert dataset, one training pass and one
# validation pass per epoch.
print("Start learning policy!")
for n_epoch in range(1, N_EPOCH + 1):
    train_loss = epoch_policy(train_loader, policy, n_epoch, "train")
    train_loss_policy.append(train_loss)

    valid_loss = epoch_policy(valid_loader, policy, n_epoch, "valid")
    valid_loss_policy.append(valid_loss)

    # Stop as soon as the stopper flags the latest validation loss.
    if stopper(valid_loss):
        print("Early Stopping to avoid overfitting!")
        break
print("Finished learning policy!")

# Store the trained model, then shut the environment down.
policy.release(SAVE_DIR)
env.close()

### Plotting the Learning Curve

In [None]:
# Persist the loss histories, then draw both curves in one figure.
training_data = {"train": train_loss_policy, "valid": valid_loss_policy}
save_Result(SAVE_DIR, "loss_policy", training_data)

plt.clf()
plt.xlabel("Epoch")
plt.ylabel("MSE Loss")
for key, val in training_data.items():
    # x axis: zero-based index of the epoch at which the loss was recorded
    epoch_index = range(len(val))
    plt.plot(epoch_index, val, label=key)
plt.legend()
plt.show()