# **常見訓練設定**
此份程式碼將介紹在訓練過程中，可用來調整訓練設定或記錄訓練結果的函式。

## 本章節內容大綱
* ### EarlyStopping（已於 part3/2_Overfitting.ipynb 介紹）
* ### [ModelCheckpoint](#ModelCheckpoint)
* ### [LearningRateScheduler](#LearningRateSchedular)
* ### [CSVLogger](#CSVLogger)

## 匯入套件

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# PyTorch 相關套件
import torch
import torch.nn as nn
import torch.nn.functional as F

# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device}')

## 創建資料集／載入資料集（Dataset Creating / Loading）

In [None]:
# Download the course data archive and unpack it into the working directory
# (IPython shell escapes; -q keeps both commands quiet).
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/DL/Data_part3.zip
!unzip -q Data_part3.zip

In [None]:
# Load the news training and test tables from the ./Data directory.
train_df = pd.read_csv('./Data/News_train.csv')
test_df = pd.read_csv('./Data/News_test.csv')

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Features: every column except the last one; labels: the y_category column.
# NOTE(review): this assumes y_category is the last column of the CSV — confirm.
y_df = train_df['y_category'].to_numpy()
X_df = train_df.iloc[:, :-1].to_numpy()

In [None]:
# Same feature/label separation for the test table.
# NOTE(review): this assumes y_category is the last column of the CSV — confirm.
y_test = test_df['y_category'].to_numpy()
X_test = test_df.iloc[:, :-1].to_numpy()

## 資料前處理（Data Preprocessing）

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Feature scaling: the scaler learns its statistics from the training table
# only and the same fitted scaler is then applied to the test features.
# NOTE(review): the fit happens before the train/validation split further
# down, so validation rows contribute to the statistics — confirm this is
# acceptable for the course material.
sc = StandardScaler()
X_scale = sc.fit_transform(X_df)  # labels are not needed to fit the scaler
X_test_scale = sc.transform(X_test)

In [None]:
# train, valid/test dataset split
from sklearn.model_selection import train_test_split
# Hold out 20% of the scaled rows for validation; the split is stratified on
# the labels and uses a fixed seed so it is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_scale,
    y_df,
    stratify=y_df,
    test_size=0.2,
    random_state=5566,
)

In [None]:
# Report the shapes of the arrays produced by the train/validation split.
print(f'X_train shape: {X_train.shape}')
print(f'X_valid shape: {X_valid.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'y_valid shape: {y_valid.shape}')

In [None]:
# Build the datasets and the data loaders.
def _as_tensor_dataset(features, labels):
    """Pair float32 feature tensors with int64 label tensors in a TensorDataset."""
    return torch.utils.data.TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.long),
    )


train_ds = _as_tensor_dataset(X_train, y_train)
valid_ds = _as_tensor_dataset(X_valid, y_valid)
test_ds = _as_tensor_dataset(X_test_scale, y_test)

BATCH_SIZE = 64
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = torch.utils.data.DataLoader(train_ds, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=BATCH_SIZE)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=BATCH_SIZE)

## 模型建置（Model Building）

In [None]:
NUM_CLASS = 11  # passed to build_model as the width of the output layer


def build_model(input_shape, num_class):
    """Create the three-layer tanh MLP used throughout this notebook.

    Args:
        input_shape: Number of input features per sample.
        num_class: Number of output units of the final linear layer.

    Returns:
        An ``nn.Sequential`` of Linear(input_shape, 64) -> Tanh ->
        Linear(64, 64) -> Tanh -> Linear(64, num_class).
    """
    # Re-seed the global RNG on every call so each freshly built model starts
    # from the same initial weights.
    torch.manual_seed(5566)
    hidden_units = 64
    layers = [
        nn.Linear(input_shape, hidden_units),
        nn.Tanh(),
        nn.Linear(hidden_units, hidden_units),
        nn.Tanh(),
        nn.Linear(hidden_units, num_class),
    ]
    return nn.Sequential(*layers)

<a name="ModelCheckpoint"></a>
* ## ModelCheckpoint

In [None]:
# Fresh network sized to the training features, moved to the selected device.
model = build_model(X_train.shape[1], NUM_CLASS).to(device)

loss_fn = nn.CrossEntropyLoss()  # multi-class classification loss
optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)

In [None]:
def train_epoch(model, optimizer, loss_fn, train_dataloader, val_dataloader=None):
    """Train ``model`` for one full pass over ``train_dataloader``.

    Args:
        model: Network to optimise; it is switched to training mode here.
        optimizer: Optimizer whose ``zero_grad``/``step`` run once per batch.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            loss tensor.
        train_dataloader: Iterable of ``(x, y)`` batches that also supports
            ``len()`` and exposes a ``dataset`` attribute.
        val_dataloader: Unused. It is kept, now optional, only so existing
            five-argument calls keep working.

    Returns:
        Tuple ``(avg_train_loss, avg_train_acc)``: the mean of the per-batch
        losses, and the number of correct predictions divided by
        ``len(train_dataloader.dataset)``.
    """
    model.train()
    total_train_loss = 0
    total_train_correct = 0
    for x, y in tqdm(train_dataloader, leave=False):
        optimizer.zero_grad()  # clear gradients left from the previous batch
        x, y = x.to(device), y.to(device)  # move the batch to the compute device
        y_pred = model(x)  # forward pass
        loss = loss_fn(y_pred, y)  # batch loss
        loss.backward()  # back-propagate to compute gradients
        optimizer.step()  # update the model parameters
        total_train_loss += loss.item()
        # argmax over dim 1 gives the predicted class index; count matches
        # against the labels.
        total_train_correct += ((y_pred.argmax(dim=1) == y).sum().item())

    # Loss is averaged over batches, accuracy over individual samples.
    avg_train_loss = total_train_loss / len(train_dataloader)
    avg_train_acc = total_train_correct / len(train_dataloader.dataset)

    return avg_train_loss, avg_train_acc

def test_epoch(model, loss_fn, val_dataloader):
    """Evaluate ``model`` on every batch of ``val_dataloader`` without training.

    Args:
        model: Network to evaluate; it is switched to evaluation mode here.
        loss_fn: Callable ``loss_fn(prediction, target)`` returning a scalar
            loss tensor.
        val_dataloader: Iterable of ``(x, y)`` batches that also supports
            ``len()`` and exposes a ``dataset`` attribute.

    Returns:
        Tuple ``(avg_val_loss, avg_val_acc)``: the mean of the per-batch
        losses, and the number of correct predictions divided by
        ``len(val_dataloader.dataset)``.
    """
    model.eval()
    loss_sum = 0
    hit_count = 0
    # No gradients are needed for evaluation, so skip tracking them.
    with torch.no_grad():
        for features, labels in val_dataloader:
            features = features.to(device)
            labels = labels.to(device)
            outputs = model(features)
            loss_sum += loss_fn(outputs, labels).item()
            # The predicted class is the index of the largest output per row.
            predicted = outputs.argmax(dim=1)
            hit_count += (predicted == labels).sum().item()

    batch_count = len(val_dataloader)
    sample_count = len(val_dataloader.dataset)
    return loss_sum / batch_count, hit_count / sample_count

BEST_MODEL_PATH = './Data/best.pth'  # weights with the lowest validation loss so far
LAST_MODEL_PATH = './Data/last.pth'  # weights after the most recent epoch


def run(model, optimizer, loss_fn, train_loader, valid_loader, verbose=1,
        epochs=20):
    """Train and validate ``model`` for ``epochs`` epochs with checkpointing.

    After every epoch the model's ``state_dict`` is written to
    ``LAST_MODEL_PATH``; whenever the validation loss improves on the best
    value seen so far it is also written to ``BEST_MODEL_PATH``.

    Args:
        model: Network to train.
        optimizer: Optimizer forwarded to ``train_epoch``.
        loss_fn: Loss callable forwarded to ``train_epoch``/``test_epoch``.
        train_loader: Loader used for the training pass.
        valid_loader: Loader used for the validation pass.
        verbose: When equal to 1, print one metrics line per epoch.
        epochs: Number of epochs to run. Defaults to 20.

    Returns:
        Tuple ``(train_loss_log, train_acc_log, val_loss_log, val_acc_log)``
        of per-epoch lists.
    """
    train_loss_log = []
    val_loss_log = []
    train_acc_log = []
    val_acc_log = []
    best_val_loss = np.inf

    for epoch in tqdm(range(epochs)):
        avg_train_loss, avg_train_acc = train_epoch(model, optimizer, loss_fn, train_loader, valid_loader)
        avg_val_loss, avg_val_acc = test_epoch(model, loss_fn, valid_loader)
        train_loss_log.append(avg_train_loss)
        val_loss_log.append(avg_val_loss)
        train_acc_log.append(avg_train_acc)
        val_acc_log.append(avg_val_acc)
        if verbose == 1:
            print(f'Epoch: {epoch}, Train Loss: {avg_train_loss:.3f}, Val Loss: {avg_val_loss:.3f} | Train Acc: {avg_train_acc:.3f}, Val Acc: {avg_val_acc:.3f}')
        # Model checkpoint: keep the best weights separately from the latest.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), BEST_MODEL_PATH)
        torch.save(model.state_dict(), LAST_MODEL_PATH)
    return train_loss_log, train_acc_log, val_loss_log, val_acc_log

In [None]:
# Train with the checkpointing loop; the returned history logs are discarded.
_ = run(model, optimizer, loss_fn, train_loader, valid_loader)

<a name="LearningRateSchedular"></a>
* ## LearningRateScheduler
torch.optim.lr_scheduler 官方文件: https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate

In [None]:
class CustomLRScheduler:
    """Piecewise-constant learning-rate schedule driven by the epoch index.

    With the default arguments the learning rate is 0.001 while
    ``epoch < 10``, 0.0001 while ``epoch < 15`` and 0.00001 afterwards.

    Args:
        optimizer: Object exposing ``param_groups``, an iterable of dicts
            whose ``'lr'`` entry is overwritten by :meth:`step`.
        boundaries: Strictly increasing epoch thresholds separating the
            intervals.
        lrs: Learning rate for each interval; must hold exactly
            ``len(boundaries) + 1`` values.

    Raises:
        ValueError: If the lengths of ``lrs`` and ``boundaries`` disagree or
            ``boundaries`` is not strictly increasing.
    """

    def __init__(self, optimizer, boundaries=(10, 15),
                 lrs=(0.001, 0.0001, 0.00001)):
        boundaries = tuple(boundaries)
        lrs = tuple(lrs)
        if len(lrs) != len(boundaries) + 1:
            raise ValueError(
                'lrs must contain exactly len(boundaries) + 1 values')
        if any(lo >= hi for lo, hi in zip(boundaries, boundaries[1:])):
            raise ValueError('boundaries must be strictly increasing')
        self.optimizer = optimizer
        self.boundaries = boundaries
        self.lrs = lrs

    def _lr_for_epoch(self, epoch):
        """Return the learning rate of the interval that contains ``epoch``."""
        for boundary, lr in zip(self.boundaries, self.lrs):
            if epoch < boundary:
                return lr
        # Past the last threshold: use the final learning rate.
        return self.lrs[-1]

    def step(self, epoch):
        """Write the learning rate for ``epoch`` into every parameter group."""
        lr = self._lr_for_epoch(epoch)
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = lr

def run(model, optimizer, loss_fn, train_loader, valid_loader,
        scheduler=None,
        verbose=1,
        epochs=20):
    """Train and validate ``model`` with checkpointing and an optional scheduler.

    After every epoch the model's ``state_dict`` is written to
    ``LAST_MODEL_PATH`` and, when the validation loss improves on the best
    value seen so far, also to ``BEST_MODEL_PATH``. The scheduler, if any, is
    stepped once at the end of each epoch.

    Args:
        model: Network to train.
        optimizer: Optimizer forwarded to ``train_epoch``.
        loss_fn: Loss callable forwarded to ``train_epoch``/``test_epoch``.
        train_loader: Loader used for the training pass.
        valid_loader: Loader used for the validation pass.
        scheduler: ``None`` for a fixed learning rate, or a scheduler
            instance. ``CustomLRScheduler`` is stepped with the epoch index,
            ``ReduceLROnPlateau`` with the validation loss, and any other
            scheduler with a plain ``step()``.
        verbose: When equal to 1, print one metrics line per epoch.
        epochs: Number of epochs to run. Defaults to 20.

    Returns:
        Tuple ``(train_loss_log, train_acc_log, val_loss_log, val_acc_log)``
        of per-epoch lists.
    """
    train_loss_log = []
    val_loss_log = []
    train_acc_log = []
    val_acc_log = []
    best_val_loss = np.inf

    for epoch in tqdm(range(epochs)):
        avg_train_loss, avg_train_acc = train_epoch(model, optimizer, loss_fn, train_loader, valid_loader)
        avg_val_loss, avg_val_acc = test_epoch(model, loss_fn, valid_loader)
        train_loss_log.append(avg_train_loss)
        val_loss_log.append(avg_val_loss)
        train_acc_log.append(avg_train_acc)
        val_acc_log.append(avg_val_acc)
        if verbose == 1:
            print(f'Epoch: {epoch}, Train Loss: {avg_train_loss:.3f}, Val Loss: {avg_val_loss:.3f} | Train Acc: {avg_train_acc:.3f}, Val Acc: {avg_val_acc:.3f}')
        # Model checkpoint: keep the best weights separately from the latest.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), BEST_MODEL_PATH)
        torch.save(model.state_dict(), LAST_MODEL_PATH)
        # Advance the learning-rate schedule. isinstance (rather than an exact
        # type comparison) also accepts subclasses, and the two special cases
        # are tested first because their step() needs an argument.
        if scheduler is not None:
            if isinstance(scheduler, CustomLRScheduler):
                scheduler.step(epoch)
            elif isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                scheduler.step(avg_val_loss)
            else:
                # Any other scheduler used to be silently ignored.
                scheduler.step()
    return train_loss_log, train_acc_log, val_loss_log, val_acc_log

In [None]:
# PyTorch 版本的 ReduceLROnPlateau 設定範例（是否進步的指標於呼叫 scheduler.step(val_loss) 時傳入）
# rlp = torch.optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer,
#     mode='min',  # 指標（此處為 val_loss）越小代表越進步
#     factor=0.1,  # 以 factor 的倍數調整 learning rate
#     patience=5)  # 經過 patience 次沒有進步調整 learning rate

In [None]:
# Training curves, one entry per learning-rate scheduler configuration.
train_loss_list = []
train_acc_list = []

# Validation curves, in the same order as the training curves.
valid_loss_list = []
valid_acc_list = []

# Scheduler classes to compare; None means a fixed learning rate.
callback_l = {
    'None': None,
    'CustomLRScheduler': CustomLRScheduler,
    'ReduceLROnPlateau': torch.optim.lr_scheduler.ReduceLROnPlateau,
}
for scheduler_type in callback_l.values():
    print('Training a model with callbacks: {}'
          .format(scheduler_type))
    # Every configuration starts from a freshly built model and optimizer.
    model = build_model(X_train.shape[1], NUM_CLASS).to(device)
    optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss()
    # Each scheduler class is instantiated with its default settings.
    scheduler = None if scheduler_type is None else scheduler_type(optimizer)

    history = run(model, optimizer, loss_fn, train_loader, valid_loader, scheduler, verbose=0)

    # run() returns (train_loss, train_acc, val_loss, val_acc); store each
    # curve in its matching list.
    curve_lists = (train_loss_list, train_acc_list, valid_loss_list, valid_acc_list)
    for curve_list, curve in zip(curve_lists, history):
        curve_list.append(curve)
print('----------------- training done! -----------------')

In [None]:
# Visualise the training process: loss on the left, accuracy on the right.
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

# Line handles from the loss panel, reused below to build the two legends.
train_line = []
valid_line = []

# Loss curves: solid = training, dashed = validation.
for k, cb in enumerate(callback_l):
    loss = train_loss_list[k]
    val_loss = valid_loss_list[k]
    train_line += loss_ax.plot(
        range(len(loss)), loss,
        label=f'Training    callback:{cb}')
    valid_line += loss_ax.plot(
        range(len(val_loss)), val_loss, '--',
        label=f'Validation callback:{cb}')
loss_ax.set_title('Loss')

# Accuracy curves, drawn in the same order so the default colours match the
# loss panel.
for k, cb in enumerate(callback_l):
    acc = train_acc_list[k]
    val_acc = valid_acc_list[k]
    acc_ax.plot(range(len(acc)), acc,
                label=f'Training    callback:{cb}')
    acc_ax.plot(range(len(val_acc)), val_acc, '--',
                label=f'Validation callback:{cb}')
acc_ax.set_title('Accuracy')

# Creating a second legend replaces the first on the same Axes, so the first
# is re-attached as a plain artist to keep both visible.
first_legend = acc_ax.legend(handles=train_line,
                             bbox_to_anchor=(1.05, 1))
acc_ax.add_artist(first_legend)
acc_ax.legend(handles=valid_line,
              bbox_to_anchor=(1.05, 0.7))

<a name="CSVLogger"></a>
* ## CSVLogger

In [None]:
# Train one more model without any scheduler and keep its history.
model = build_model(X_train.shape[1], NUM_CLASS).to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.NAdam(model.parameters(), lr=0.001)
history = run(model, optimizer, loss_fn, train_loader, valid_loader,
              verbose=0, scheduler=None)

In [None]:
# 將訓練過程記錄下來
import pandas as pd
# Gather the four per-epoch curves returned by run() into one table, write it
# to logs.csv without the index column, then display the table.
df = pd.DataFrame({
    'train_loss': history[0],
    'train_acc': history[1],
    'valid_loss': history[2],
    'valid_acc': history[3],
})
df.to_csv('logs.csv', index=False)
df

---
wandb（補充教材）: https://docs.wandb.ai/v/zh-hans/quickstart