# **過擬合（Overfitting）**
在模型調校的過程中，我們需要同時觀察模型在訓練集與驗證集上的表現；若模型在驗證集上的表現明顯不如訓練集，其中一個可能的原因就是模型過擬合（overfitting）於訓練集。本份程式碼將介紹當過擬合情況產生時，可以在模型上採取哪些抑制手段。

## 本章節內容大綱
* ### [Regularization](#Regularization)
* ### [Early Stopping](#EarlyStopping)
* ### [Dropout](#Dropout)
* ### [Parameter Initialization](#ParameterInitialization)
* ### [Batch Normalization](#BatchNormalization)
-----------------

## 匯入套件

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# PyTorch 相關套件
import torch
import torch.nn as nn
import torch.nn.functional as F

## 創建資料集／載入資料集（Dataset Creating / Loading）

In [None]:
# Download the course data archive and unzip it into the working directory
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/DL/Data_part3.zip
!unzip -q Data_part3.zip

In [None]:
# Read the news training and test CSV files into DataFrames
train_df = pd.read_csv('./Data/News_train.csv')
test_df = pd.read_csv('./Data/News_test.csv')

In [None]:
# Preview the first rows of the training DataFrame
train_df.head()

In [None]:
# Features: every column except the last one; labels: the `y_category` column.
# NOTE(review): presumes `y_category` is the last column -- confirm against the CSV.
X_df = train_df.iloc[:, :-1].values
y_df = train_df.y_category.values

In [None]:
# Same extraction for the test set: all but the last column as features,
# the `y_category` column as labels.
# NOTE(review): presumes `y_category` is the last column -- confirm against the CSV.
X_test = test_df.iloc[:, :-1].values
y_test = test_df.y_category.values

## 資料前處理（Data Preprocessing）

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Feature scaling: fit the scaler on the training features, then apply the
# same fitted transform to the test features.
# NOTE(review): the scaler is fitted on all of X_df, i.e. before the
# train/validation split performed later, so validation rows contribute to
# the scaling statistics -- confirm this is acceptable.
sc = StandardScaler()
X_scale = sc.fit_transform(X_df, y_df)
X_test_scale = sc.transform(X_test)

In [None]:
# Split the scaled training data into train (80%) and validation (20%) sets,
# stratified on the labels and with a fixed random seed
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_scale, y_df,
                                                      test_size=0.2,
                                                      random_state=5566,
                                                      stratify=y_df)

In [None]:
# Show the shapes of the train/validation feature and label arrays
print(f'X_train shape: {X_train.shape}')
print(f'X_valid shape: {X_valid.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_valid shape: {y_valid.shape}')

In [None]:
# Wrap the numpy splits as TensorDatasets (float32 features, int64 labels)
train_ds = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                                          torch.tensor(y_train, dtype=torch.long))
valid_ds = torch.utils.data.TensorDataset(torch.tensor(X_valid, dtype=torch.float32),
                                          torch.tensor(y_valid, dtype=torch.long))
test_ds = torch.utils.data.TensorDataset(torch.tensor(X_test_scale, dtype=torch.float32),
                                         torch.tensor(y_test, dtype=torch.long))

# Build the DataLoaders; only the training loader shuffles its samples
BATCH_SIZE = 64
train_loader = torch.utils.data.DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE)

## 模型建置（Model Building）

In [None]:
NUM_CLASS = 11


def build_model(input_shape, num_class):
    """Return a freshly initialized MLP with two 64-unit Tanh hidden layers.

    The global torch RNG is re-seeded with 5566 before the layers are
    created, so repeated calls with the same arguments start from the same
    initial weights.

    Args:
        input_shape: Number of input features.
        num_class: Number of output units (one per class).

    Returns:
        An `nn.Sequential` of Linear -> Tanh -> Linear -> Tanh -> Linear.
    """
    hidden = 64
    torch.manual_seed(5566)
    layers = [
        nn.Linear(input_shape, hidden),
        nn.Tanh(),
        nn.Linear(hidden, hidden),
        nn.Tanh(),
        nn.Linear(hidden, num_class),
    ]
    return nn.Sequential(*layers)

In [None]:
# Build the baseline model with one input per feature column and print its layers
model = build_model(X_train.shape[1], NUM_CLASS)
print(model)

## 模型訓練（Model Training）

In [None]:
# NAdam optimizer over all model parameters, learning rate 0.001
optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss() # multi-class classification loss

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'device: {device}')
model = model.to(device)

In [None]:
def train_epoch(model, optimizer, loss_fn, train_dataloader, val_dataloader):
    """Train `model` for one pass over `train_dataloader`.

    Args:
        model: Network to optimize; it is switched to training mode.
        optimizer: Optimizer that updates the model parameters.
        loss_fn: Callable mapping (predictions, targets) to a scalar loss.
        train_dataloader: Iterable of (x, y) batches exposing `.dataset`.
        val_dataloader: Accepted but not used by this function.

    Returns:
        Tuple of (mean loss per batch, fraction of correctly classified
        samples).
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    for inputs, targets in tqdm(train_dataloader, leave=False):
        optimizer.zero_grad()  # clear gradients left from the previous step
        # Move the batch to the module-level `device`
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        batch_loss = loss_fn(logits, targets)
        batch_loss.backward()
        optimizer.step()
        loss_sum += batch_loss.item()
        # The predicted class is the index of the largest output
        hits = logits.argmax(dim=1) == targets
        n_correct += hits.sum().item()

    n_batches = len(train_dataloader)
    n_samples = len(train_dataloader.dataset)
    return loss_sum / n_batches, n_correct / n_samples

def test_epoch(model, loss_fn, val_dataloader):
    """Evaluate `model` for one pass over `val_dataloader`.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loss_fn: Callable mapping (predictions, targets) to a scalar loss.
        val_dataloader: Iterable of (x, y) batches exposing `.dataset`.

    Returns:
        Tuple of (mean loss per batch, fraction of correctly classified
        samples).
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    # Gradients are not needed for evaluation
    with torch.no_grad():
        for inputs, targets in val_dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            loss_sum += loss_fn(logits, targets).item()
            # The predicted class is the index of the largest output
            hits = logits.argmax(dim=1) == targets
            n_correct += hits.sum().item()

    n_batches = len(val_dataloader)
    n_samples = len(val_dataloader.dataset)
    return loss_sum / n_batches, n_correct / n_samples

def run(model, optimizer, loss_fn, train_loader, valid_loader, verbose=1, epochs=20):
    """Train and validate `model` for a number of epochs.

    Args:
        model: Network to train.
        optimizer: Optimizer that updates the model parameters.
        loss_fn: Callable mapping (predictions, targets) to a scalar loss.
        train_loader: Batches used for training.
        valid_loader: Batches used for validation after every epoch.
        verbose: When equal to 1, print the metrics of every epoch.
        epochs: Number of epochs to run. Defaults to 20, the value that
            used to be hard-coded.

    Returns:
        Tuple (train_loss_log, train_acc_log, val_loss_log, val_acc_log),
        each a list with one entry per epoch.
    """
    train_loss_log = []
    val_loss_log = []
    train_acc_log = []
    val_acc_log = []
    for epoch in tqdm(range(epochs)):
        avg_train_loss, avg_train_acc = train_epoch(model, optimizer, loss_fn, train_loader, valid_loader)
        avg_val_loss, avg_val_acc = test_epoch(model, loss_fn, valid_loader)
        train_loss_log.append(avg_train_loss)
        val_loss_log.append(avg_val_loss)
        train_acc_log.append(avg_train_acc)
        val_acc_log.append(avg_val_acc)
        if verbose == 1:
            print(f'Epoch: {epoch}, Train Loss: {avg_train_loss:.3f}, Val Loss: {avg_val_loss:.3f} | Train Acc: {avg_train_acc:.3f}, Val Acc: {avg_val_acc:.3f}')
    return train_loss_log, train_acc_log, val_loss_log, val_acc_log

In [None]:
# Train the baseline model and keep the per-epoch loss/accuracy logs
train_loss_log, train_acc_log, val_loss_log, val_acc_log = run(model, optimizer, loss_fn, train_loader, valid_loader)

## 模型評估（Model Evaluation）

In [None]:
# Plot the baseline training history: loss on the left, accuracy on the right
plt.figure(figsize=(15, 4))
plt.subplot(1, 2, 1)
plt.plot(range(len(train_loss_log)), train_loss_log, label='train_loss')
plt.plot(range(len(val_loss_log)), val_loss_log, label='valid_loss')
plt.xlabel('Epochs')
# The model is trained with the multi-class nn.CrossEntropyLoss, not a binary loss
plt.ylabel('Cross entropy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(len(train_acc_log)), train_acc_log, label='train_acc')
plt.plot(range(len(val_acc_log)), val_acc_log, label='valid_acc')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Evaluate the trained model on the test set and print its loss and accuracy
print('============================')
print('Testing data')
print('============================')
test_loss, test_acc = test_epoch(model, loss_fn, test_loader)
print(f'loss: {test_loss}')
print(f'acc: {test_acc}')

## 過擬合抑制策略

![](https://hackmd.io/_uploads/B1rmk5Ubp.png)


<a name="Regularization"></a>
* ## Regularization
<img src="https://hackmd.io/_uploads/B15Nk98ZT.png" width="50%" height="50%">

In [None]:
# L1 / L2 regularization penalty
def add_regularization(loss, model, l1_alpha=0, l2_alpha=0):
    """Return `loss` plus L1 and L2 penalties over all model parameters.

    The L1 term is the sum of absolute values of every parameter element
    and the L2 term is the sum of squared elements. The L1 term is computed
    element-wise on purpose: `torch.linalg.norm(p, ord=1)` on a 2-D weight
    is the matrix 1-norm (maximum absolute column sum), which is not the
    L1 penalty.

    Args:
        loss: Scalar loss tensor to add the penalties to.
        model: Module whose `parameters()` are penalized (weights and biases).
        l1_alpha: Weight of the L1 term; 0 disables it.
        l2_alpha: Weight of the L2 term; 0 disables it.

    Returns:
        The regularized loss tensor (`loss` itself when both weights are 0).
    """
    l1_alpha = float(l1_alpha)
    l2_alpha = float(l2_alpha)
    params = list(model.parameters())
    # Skip a term entirely when its weight is zero
    if l1_alpha:
        loss = loss + l1_alpha * sum(p.abs().sum() for p in params)
    if l2_alpha:
        loss = loss + l2_alpha * sum(p.pow(2).sum() for p in params)
    return loss

# Training epoch with L1 / L2 regularization added to the loss
def train_epoch(model, optimizer, loss_fn, train_dataloader, l1_alpha=0, l2_alpha=0):
    """Train `model` for one pass over `train_dataloader` with regularization.

    Args:
        model: Network to optimize; it is switched to training mode.
        optimizer: Optimizer that updates the model parameters.
        loss_fn: Callable mapping (predictions, targets) to a scalar loss.
        train_dataloader: Iterable of (x, y) batches exposing `.dataset`.
        l1_alpha: L1 weight forwarded to `add_regularization`.
        l2_alpha: L2 weight forwarded to `add_regularization`.

    Returns:
        Tuple of (mean regularized loss per batch, fraction of correctly
        classified samples).
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    for inputs, targets in tqdm(train_dataloader, leave=False):
        # Move the batch to the module-level `device`
        inputs = inputs.to(device)
        targets = targets.to(device)
        logits = model(inputs)
        data_loss = loss_fn(logits, targets)
        batch_loss = add_regularization(data_loss, model, l1_alpha, l2_alpha)
        optimizer.zero_grad()  # clear gradients left from the previous step
        batch_loss.backward()
        optimizer.step()

        loss_sum += batch_loss.item()
        # The predicted class is the index of the largest output
        hits = logits.argmax(dim=1) == targets
        n_correct += hits.sum().item()

    n_batches = len(train_dataloader)
    n_samples = len(train_dataloader.dataset)
    return loss_sum / n_batches, n_correct / n_samples

def test_epoch(model, loss_fn, val_dataloader, l1_alpha=0, l2_alpha=0):
    """Evaluate `model` for one pass over `val_dataloader` with regularization.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        loss_fn: Callable mapping (predictions, targets) to a scalar loss.
        val_dataloader: Iterable of (x, y) batches exposing `.dataset`.
        l1_alpha: L1 weight forwarded to `add_regularization`.
        l2_alpha: L2 weight forwarded to `add_regularization`.

    Returns:
        Tuple of (mean regularized loss per batch, fraction of correctly
        classified samples).
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    # Gradients are not needed for evaluation
    with torch.no_grad():
        for inputs, targets in val_dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            logits = model(inputs)
            data_loss = loss_fn(logits, targets)
            batch_loss = add_regularization(data_loss, model, l1_alpha, l2_alpha)
            loss_sum += batch_loss.item()
            # The predicted class is the index of the largest output
            hits = logits.argmax(dim=1) == targets
            n_correct += hits.sum().item()

    n_batches = len(val_dataloader)
    n_samples = len(val_dataloader.dataset)
    return loss_sum / n_batches, n_correct / n_samples

def run(epochs, model, optimizer, loss_fn, train_loader, valid_loader, l1_alpha, l2_alpha, verbose=1):
    """Train and validate `model` for `epochs` epochs with regularization.

    Args:
        epochs: Number of epochs to run.
        model: Network to train.
        optimizer: Optimizer that updates the model parameters.
        loss_fn: Callable mapping (predictions, targets) to a scalar loss.
        train_loader: Batches used for training.
        valid_loader: Batches used for validation after every epoch.
        l1_alpha: L1 regularization weight.
        l2_alpha: L2 regularization weight.
        verbose: When equal to 1, print the metrics of every epoch.

    Returns:
        Tuple (train_loss_log, train_acc_log, val_loss_log, val_acc_log),
        each a list with one entry per epoch.
    """
    train_losses, train_accs, val_losses, val_accs = [], [], [], []
    for epoch in tqdm(range(epochs)):
        avg_train_loss, avg_train_acc = train_epoch(
            model, optimizer, loss_fn, train_loader, l1_alpha, l2_alpha)
        avg_val_loss, avg_val_acc = test_epoch(
            model, loss_fn, valid_loader, l1_alpha, l2_alpha)
        train_losses.append(avg_train_loss)
        train_accs.append(avg_train_acc)
        val_losses.append(avg_val_loss)
        val_accs.append(avg_val_acc)
        if verbose == 1:
            print(f'Epoch: {epoch}, Train Loss: {avg_train_loss:.3f}, Val Loss: {avg_val_loss:.3f} | Train Acc: {avg_train_acc:.3f}, Val Acc: {avg_val_acc:.3f}')
    return train_losses, train_accs, val_losses, val_accs

In [None]:
# Regularizer settings to compare, as (l1_alpha, l2_alpha) pairs
l1_l2_list = [(0, 0), (1e-3, 0), (0, 1e-2), (1e-3, 1e-2)]

# Two lists recording the training results of each regularizer setting
train_loss_list = []
train_acc_list = []

# Two lists recording the validation results of each regularizer setting
valid_loss_list = []
valid_acc_list = []

# One list recording the test results of each regularizer setting
test_eval = []

# Train one model per regularizer setting
for l1_alpha, l2_alpha in tqdm(l1_l2_list):
    print('Training a model with regularizer L1: {}, L2: {}'
          .format(l1_alpha, l2_alpha))

    # Build a fresh model every time instead of continuing from the previous one
    model = build_model(X_train.shape[1], NUM_CLASS)
    model = model.to(device)
    optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    history = run(20, model, optimizer, loss_fn, train_loader, valid_loader,
                  l1_alpha, l2_alpha,
                  verbose=0)
    # Record the training history
    train_loss_list.append(history[0])
    train_acc_list.append(history[1])
    valid_loss_list.append(history[2])
    valid_acc_list.append(history[3])
    test_eval.append(test_epoch(model, loss_fn, test_loader,
                                l1_alpha, l2_alpha))
print('----------------- training done! -----------------')

In [None]:
# Visualize the training history of every (L1, L2) setting
plt.figure(figsize=(15, 7))

# Line handles collected from the loss panel; reused to build the legends
train_line = ()
valid_line = ()

# Left panel: training (solid) and validation (dashed) loss
plt.subplot(121)
for (l1, l2), loss, val_loss in zip(l1_l2_list, train_loss_list, valid_loss_list):
    train_l = plt.plot(
        range(len(loss)), loss,
        label=f'Training    L1: {l1}, L2: {l2}')
    valid_l = plt.plot(
        range(len(val_loss)), val_loss, '--',
        label=f'Validation L1: {l1}, L2: {l2}')

    train_line += tuple(train_l)
    valid_line += tuple(valid_l)
plt.title('Loss')

# Right panel: training (solid) and validation (dashed) accuracy
plt.subplot(122)
for (l1, l2), acc, val_acc in zip(l1_l2_list, train_acc_list, valid_acc_list):
    plt.plot(range(len(acc)), acc,
             label=f'Training    L1: {l1}, L2: {l2}')
    plt.plot(range(len(val_acc)), val_acc, '--',
             label=f'Validation L1: {l1}, L2: {l2}')
plt.title('Accuracy')

# Two legends next to the right panel: the first one is re-added as an
# artist so that the second plt.legend() call does not replace it
first_legend = plt.legend(handles=train_line,
                          bbox_to_anchor=(1.05, 1))

plt.gca().add_artist(first_legend)
plt.legend(handles=valid_line,
           bbox_to_anchor=(1.05, 0.8))
plt.show()

In [None]:
# Report the test loss and accuracy of every (L1, L2) setting
for setting, (loss_value, acc_value) in zip(l1_l2_list, test_eval):
    print('============================')
    print(f'(l1, l2) = {setting}')
    print('============================')
    print(f'loss: {loss_value}')
    print(f'acc: {acc_value}\n')

<a name="EarlyStopping"></a>
* ## Early Stopping

In [None]:
# Fresh model, optimizer and loss for the early-stopping experiment
model = build_model(X_train.shape[1], NUM_CLASS)
model = model.to(device)
optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()

In [None]:
# Training loop with optional early stopping
def run(epochs, model, optimizer, loss_fn,
        train_loader, valid_loader,
        l1_alpha=0, l2_alpha=0,
        early_stop=True,
        n_patience=5,
        verbose=1):
    """Train and validate `model`, optionally stopping early.

    Args:
        epochs: Maximum number of epochs to run.
        model: Network to train.
        optimizer: Optimizer that updates the model parameters.
        loss_fn: Callable mapping (predictions, targets) to a scalar loss.
        train_loader: Batches used for training.
        valid_loader: Batches used for validation after every epoch.
        l1_alpha: L1 regularization weight.
        l2_alpha: L2 regularization weight.
        early_stop: When true, stop once the validation loss has failed to
            improve for `n_patience` consecutive epochs.
        n_patience: Number of consecutive non-improving epochs tolerated.
        verbose: When equal to 1, print the metrics of every epoch.

    Returns:
        Tuple (train_loss_log, train_acc_log, val_loss_log, val_acc_log),
        each a list with one entry per completed epoch.
    """
    train_losses, train_accs, val_losses, val_accs = [], [], [], []
    # Early-stopping state: lowest validation loss so far and the number of
    # consecutive epochs without improvement
    lowest_val_loss = float('inf')
    stale_epochs = 0

    for epoch in tqdm(range(epochs)):
        avg_train_loss, avg_train_acc = train_epoch(
            model, optimizer, loss_fn, train_loader, l1_alpha, l2_alpha)
        avg_val_loss, avg_val_acc = test_epoch(
            model, loss_fn, valid_loader, l1_alpha, l2_alpha)
        train_losses.append(avg_train_loss)
        train_accs.append(avg_train_acc)
        val_losses.append(avg_val_loss)
        val_accs.append(avg_val_acc)
        if verbose == 1:
            print(f'Epoch: {epoch}, Train Loss: {avg_train_loss:.3f}, Val Loss: {avg_val_loss:.3f} | Train Acc: {avg_train_acc:.3f}, Val Acc: {avg_val_acc:.3f}')
        if not early_stop:
            continue
        # Early-stopping check
        improved = avg_val_loss < lowest_val_loss
        if improved:
            lowest_val_loss = avg_val_loss
        stale_epochs = 0 if improved else stale_epochs + 1
        if stale_epochs >= n_patience:
            print(f'Early stopping at epoch {epoch}')
            break
    return train_losses, train_accs, val_losses, val_accs

In [None]:
# Train for up to 20 epochs without regularization; early stopping is on by default
train_loss, train_acc, valid_loss, valid_acc = run(20, model, optimizer, loss_fn, train_loader, valid_loader,
                                                   l1_alpha=0, l2_alpha=0)

In [None]:
# Plot the early-stopping run: loss on the left, accuracy on the right
plt.figure(figsize=(15, 4))
plt.subplot(1, 2, 1)
plt.plot(range(len(train_loss)), train_loss, label='train_loss')
plt.plot(range(len(valid_loss)), valid_loss, label='valid_loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(range(len(train_acc)), train_acc, label='train_acc')
plt.plot(range(len(valid_acc)), valid_acc, label='valid_acc')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
# Evaluate the early-stopped model on the test set and print its loss and accuracy
print('============================')
print('Testing data')
print('============================')
test_loss, test_acc = test_epoch(model, loss_fn, test_loader)
print(f'loss: {test_loss}')
print(f'acc : {test_acc}')

<a name="Dropout"></a>
* ## Dropout
![](https://hackmd.io/_uploads/HJePycUba.png)


In [None]:
NUM_CLASS = 11


def build_model_dropout(input_shape, num_class, droprate):
    """Return the two-hidden-layer Tanh MLP with dropout after each hidden Linear.

    The global torch RNG is re-seeded with 5566 before the layers are
    created, so repeated calls with the same arguments start from the same
    initial weights.

    Args:
        input_shape: Number of input features.
        num_class: Number of output units (one per class).
        droprate: Drop probability passed to both `nn.Dropout` layers.

    Returns:
        An `nn.Sequential` of (Linear -> Dropout -> Tanh) x 2 -> Linear.
    """
    hidden = 64
    torch.manual_seed(5566)
    layers = [
        nn.Linear(input_shape, hidden),
        nn.Dropout(droprate),
        nn.Tanh(),
        nn.Linear(hidden, hidden),
        nn.Dropout(droprate),
        nn.Tanh(),
        nn.Linear(hidden, num_class),
    ]
    return nn.Sequential(*layers)

In [None]:
# Dropout rates to compare
dropout_rates = [0, 0.1, 0.3, 0.5]

# Two lists recording the training results of each dropout rate
train_loss_list = []
train_acc_list = []

# Two lists recording the validation results of each dropout rate
valid_loss_list = []
valid_acc_list = []

# One list recording the test results of each dropout rate
test_eval = []

# Train one model per dropout rate
for drop_r in dropout_rates:
    print('Training a model with dropout rate: {}'
          .format(drop_r))

    # Build a fresh model every time instead of continuing from the previous one
    model = build_model_dropout(X_train.shape[1],
                                NUM_CLASS,
                                drop_r)
    model = model.to(device)
    optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()

    # Use the same training settings for every run
    history = run(20, model, optimizer, loss_fn, train_loader, valid_loader,
                  early_stop=False,
                  verbose=1)

    # Record the training history
    train_loss_list.append(history[0])
    train_acc_list.append(history[1])
    valid_loss_list.append(history[2])
    valid_acc_list.append(history[3])
    test_eval.append(test_epoch(model, loss_fn, test_loader))
print('----------------- training done! -----------------')

In [None]:
# Visualize the training history of every dropout rate
plt.figure(figsize=(15, 7))

# Line handles collected from the loss panel; reused to build the legends
train_line = ()
valid_line = ()

# Left panel: training (solid) and validation (dashed) loss
plt.subplot(121)
for rate, loss, val_loss in zip(dropout_rates, train_loss_list, valid_loss_list):
    train_l = plt.plot(
        range(len(loss)), loss,
        label=f'Training    dropout rate:{rate}')
    valid_l = plt.plot(
        range(len(val_loss)), val_loss, '--',
        label=f'Validation dropout rate:{rate}')

    train_line += tuple(train_l)
    valid_line += tuple(valid_l)
plt.title('Loss')

# Right panel: training (solid) and validation (dashed) accuracy
plt.subplot(122)
for rate, acc, val_acc in zip(dropout_rates, train_acc_list, valid_acc_list):
    plt.plot(range(len(acc)), acc,
             label=f'Training    dropout rate:{rate}')
    plt.plot(range(len(val_acc)), val_acc, '--',
             label=f'Validation dropout rate:{rate}')
plt.title('Accuracy')

# Two legends next to the right panel: the first one is re-added as an
# artist so that the second plt.legend() call does not replace it
first_legend = plt.legend(handles=train_line,
                          bbox_to_anchor=(1.05, 1))

plt.gca().add_artist(first_legend)
plt.legend(handles=valid_line,
           bbox_to_anchor=(1.05, 0.8))
plt.show()

In [None]:
# Report the test loss and accuracy of every dropout rate
for rate, (loss_value, acc_value) in zip(dropout_rates, test_eval):
    print('============================')
    print(f'dropout_rate = {rate}')
    print('============================')
    print(f'loss: {loss_value}')
    print(f'acc: {acc_value}\n')

<a name="ParameterInitialization"></a>
* ## Parameter Initialization
torch.nn.init: https://pytorch.org/docs/stable/nn.init.html

In [None]:
import functools
import math

def lecun_normal_(tensor: torch.Tensor) -> torch.Tensor:
    """Fill `tensor` in place with LeCun-normal values and return it.

    Values are drawn from a normal distribution with mean 0 and standard
    deviation sqrt(1 / fan_in), where fan_in is taken from the last
    dimension of `tensor` (the input dimension of an `nn.Linear` weight).

    Args:
        tensor: Weight tensor to initialize in place.

    Returns:
        The same tensor object, after initialization.
    """
    # Assumes the input dimension of the weights is the last one
    input_size = tensor.shape[-1]
    std = math.sqrt(1 / input_size)
    with torch.no_grad():
        # normal_(mean, std): the distribution must be centred on 0, not on -std
        return tensor.normal_(0.0, std)

# Apply a layer-type-specific initialization
def weights_init(m, init_fn):
    """Initialize an `nn.Linear` module; leave any other module untouched.

    Args:
        m: Module visited by `nn.Module.apply`.
        init_fn: In-place initializer called on the Linear weight.
    """
    if not isinstance(m, nn.Linear):
        return
    init_fn(m.weight)
    # Linear layers built with bias=False have no bias tensor to reset
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)

def build_model_init(input_shape, num_class, init_fn):
    """Return the two-hidden-layer Tanh MLP initialized with `init_fn`.

    The global torch RNG is re-seeded with 5566 before the layers are
    created; afterwards `weights_init` is applied to every sub-module with
    the given initializer.

    Args:
        input_shape: Number of input features.
        num_class: Number of output units (one per class).
        init_fn: In-place weight initializer forwarded to `weights_init`.

    Returns:
        An `nn.Sequential` of Linear -> Tanh -> Linear -> Tanh -> Linear.
    """
    hidden = 64
    torch.manual_seed(5566)
    net = nn.Sequential(
        nn.Linear(input_shape, hidden),
        nn.Tanh(),
        nn.Linear(hidden, hidden),
        nn.Tanh(),
        nn.Linear(hidden, num_class),
    )
    initializer = functools.partial(weights_init, init_fn=init_fn)
    net.apply(initializer)
    return net

In [None]:
# Initializers to compare
init_l = [
    torch.nn.init.xavier_normal_, # glorot init
    torch.nn.init.kaiming_normal_, # he init
    lecun_normal_,
    torch.nn.init.normal_,
    torch.nn.init.trunc_normal_,
]

# Two lists recording the training results of each initializer
train_loss_list = []
train_acc_list = []

# Two lists recording the validation results of each initializer
valid_loss_list = []
valid_acc_list = []

# One list recording the test results of each initializer
test_eval = []

# Train one model per initializer
for init in init_l:
    # Print the function name rather than its repr (which includes a memory address)
    print(f'Training model, init = {init.__name__}')

    # Build a fresh model every time instead of continuing from the previous one
    model = build_model_init(X_train.shape[1],
                             NUM_CLASS,
                             init)
    model = model.to(device)
    optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    history = run(20, model, optimizer, loss_fn, train_loader, valid_loader,
                  early_stop=False,
                  verbose=0)

    # Record the training history
    train_loss_list.append(history[0])
    train_acc_list.append(history[1])
    valid_loss_list.append(history[2])
    valid_acc_list.append(history[3])
    test_eval.append(test_epoch(model, loss_fn, test_loader))
print('----------------- training done! -----------------')

In [None]:
# Visualize the training history of every initializer
plt.figure(figsize=(15, 7))

# Line handles collected from the loss panel; reused to build the legends
train_line = ()
valid_line = ()

# Left panel: training (solid) and validation (dashed) loss.
# Labels use the initializer's function name; the plain repr would put
# "<function ... at 0x...>" into the legend.
plt.subplot(121)
for initializer, loss, val_loss in zip(init_l, train_loss_list, valid_loss_list):
    train_l = plt.plot(
        range(len(loss)), loss,
        label=f'Training    init: {initializer.__name__}')
    valid_l = plt.plot(
        range(len(val_loss)), val_loss, '--',
        label=f'Validation init: {initializer.__name__}')

    train_line += tuple(train_l)
    valid_line += tuple(valid_l)
plt.title('Loss')

# Right panel: training (solid) and validation (dashed) accuracy
plt.subplot(122)
for initializer, acc, val_acc in zip(init_l, train_acc_list, valid_acc_list):
    plt.plot(range(len(acc)), acc,
             label=f'Training    init: {initializer.__name__}')
    plt.plot(range(len(val_acc)), val_acc, '--',
             label=f'Validation init: {initializer.__name__}')
plt.title('Accuracy')

# Two legends next to the right panel: the first one is re-added as an
# artist so that the second plt.legend() call does not replace it
first_legend = plt.legend(handles=train_line,
                          bbox_to_anchor=(1.05, 1))

plt.gca().add_artist(first_legend)
plt.legend(handles=valid_line,
           bbox_to_anchor=(1.05, 0.75))
plt.show()

In [None]:
# Report the test loss and accuracy of every initializer, identified by its
# function name instead of its repr (which includes a memory address)
for initializer, (loss_value, acc_value) in zip(init_l, test_eval):
    print('============================')
    print(f'initializer = {initializer.__name__}')
    print('============================')
    print(f'loss: {loss_value}')
    print(f'acc: {acc_value}\n')

<a name="BatchNormalization"></a>
* ## Batch Normalization

In [None]:
class LinearBN(nn.Module):
    """Linear layer followed by optional batch normalization and Tanh.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        bn: When true, an `nn.BatchNorm1d` sits between the linear layer
            and the activation; otherwise an `nn.Identity` is used.
    """

    def __init__(self, in_features, out_features, bn=True):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        self.bn = nn.BatchNorm1d(out_features) if bn else nn.Identity()
        self.act = nn.Tanh()

    def forward(self, x):
        """Apply linear -> (batch norm | identity) -> tanh to `x`."""
        projected = self.linear(x)
        normalized = self.bn(projected)
        return self.act(normalized)


def build_model_bn(input_shape, num_class, bn=True):
    """Return a two-hidden-layer MLP built from `LinearBN` blocks.

    The global torch RNG is re-seeded with 5566 before the layers are
    created, so repeated calls with the same arguments start from the same
    initial weights.

    Args:
        input_shape: Number of input features.
        num_class: Number of output units (one per class).
        bn: Forwarded to both `LinearBN` blocks to toggle batch normalization.

    Returns:
        An `nn.Sequential` of LinearBN -> LinearBN -> Linear.
    """
    hidden = 64
    torch.manual_seed(5566)
    blocks = [
        LinearBN(input_shape, hidden, bn),
        LinearBN(hidden, hidden, bn),
        nn.Linear(hidden, num_class),
    ]
    return nn.Sequential(*blocks)

In [None]:
# Settings to compare: without and with batch normalization
BN = [False, True]

# Two lists recording the training results with/without BatchNormalization
train_loss_list = []
train_acc_list = []

# Two lists recording the validation results with/without BatchNormalization
valid_loss_list = []
valid_acc_list = []

# One list recording the test results with/without BatchNormalization
test_eval = []

# Train one model per BatchNormalization setting
for bn in BN:
    print('Training a model with BatchNormalization: {}'
          .format(str(bn)))

    # Build a fresh model every time instead of continuing from the previous one
    model = build_model_bn(X_train.shape[1], NUM_CLASS, bn)
    model = model.to(device)
    optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    history = run(20, model, optimizer, loss_fn, train_loader, valid_loader,
                  early_stop=False,
                  verbose=0)

    # Record the training history
    train_loss_list.append(history[0])
    train_acc_list.append(history[1])
    valid_loss_list.append(history[2])
    valid_acc_list.append(history[3])
    test_eval.append(test_epoch(model, loss_fn, test_loader))
print('----------------- training done! -----------------')

In [None]:
# Visualize the training history with and without batch normalization
plt.figure(figsize=(15, 7))

# Line handles collected from the loss panel; reused to build the legends
train_line = ()
valid_line = ()

# Left panel: training (solid) and validation (dashed) loss
plt.subplot(121)
for use_bn, loss, val_loss in zip(BN, train_loss_list, valid_loss_list):
    train_l = plt.plot(
        range(len(loss)), loss,
        label=f'Training    BatchNormalization:{use_bn}')
    valid_l = plt.plot(
        range(len(val_loss)), val_loss, '--',
        label=f'Validation BatchNormalization:{use_bn}')

    train_line += tuple(train_l)
    valid_line += tuple(valid_l)
plt.title('Loss')

# Right panel: training (solid) and validation (dashed) accuracy
plt.subplot(122)
for use_bn, acc, val_acc in zip(BN, train_acc_list, valid_acc_list):
    plt.plot(range(len(acc)), acc,
             label=f'Training    BatchNormalization:{use_bn}')
    plt.plot(range(len(val_acc)), val_acc, '--',
             label=f'Validation BatchNormalization:{use_bn}')
plt.title('Accuracy')

# Two legends next to the right panel: the first one is re-added as an
# artist so that the second plt.legend() call does not replace it
first_legend = plt.legend(handles=train_line,
                          bbox_to_anchor=(1.05, 1))

plt.gca().add_artist(first_legend)
plt.legend(handles=valid_line,
           bbox_to_anchor=(1.05, 0.75))
plt.show()

In [None]:
# Report the test loss and accuracy with and without batch normalization
for use_bn, (loss_value, acc_value) in zip(BN, test_eval):
    print('============================')
    print(f'BatchNormalization = {use_bn}')
    print('============================')
    print(f'loss: {loss_value}')
    print(f'acc: {acc_value}\n')

---
### Quiz
請試著利用 Data/pkgo_train.csv 做多元分類問題，預測五個種類的 pokemon，並使用 Data/pkgo_test.csv 驗證結果。

若出現 Overfitting 的情況，嘗試使用以上抑制 Overfitting 的方法調整訓練模型的策略。