# **模型訓練（迴歸問題）**
此份程式碼會講解針對迴歸型任務在模型訓練上需要注意的細節。

## 本章節內容大綱
* ### [創建資料集／載入資料集（Dataset Creating/ Loading）](#DatasetCreating/Loading)
* ### [資料前處理（Data Preprocessing）](#DataPreprocessing)
* ### [模型建置（Model Building）](#ModelBuilding)
* ### [模型訓練（Model Training）](#ModelTraining)
* ### [模型評估（Model Evaluation）](#ModelEvaluation)
---

## 匯入套件

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

# PyTorch 相關套件
import torch
import torch.nn as nn

<a name="DatasetCreating/Loading"></a>
## 創建資料集／載入資料集（Dataset Creating / Loading）

In [None]:
# Download the dataset archive and extract it; -q keeps both commands quiet.
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/DL/Data_part2.zip
!unzip -q Data_part2.zip

In [None]:
# Read the training and test CSV files into two DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./Data/FilmRating_train.csv',
                     './Data/FilmRating_test.csv')
)

In [None]:
# Show the first five rows as a quick sanity check of the loaded columns.
train_df.head(n=5)

* #### 電影評價資料集
資料集總共 2612 筆，
欄位包括預算 (budget)、電影類型 (genres)、關鍵字詞 (keywords)、知名度 (popularity)、製作公司 (production_companies)、國家 (production_countries)、收入 (revenue)、時長 (runtime)、卡司 (cast)、導演 (director)、距離發布時間 (n_days)、評分 (score)，多項欄位是以 leave-one-out encoding 方式轉換數值。


In [None]:
# Target: the `score` column; features: every column except the last one.
y_df = train_df['score'].to_numpy()
X_df = train_df.iloc[:, :-1].to_numpy()

In [None]:
# Same extraction for the test set: `score` is the target, and every column
# except the last one is a feature.
y_test = test_df['score'].to_numpy()
X_test = test_df.iloc[:, :-1].to_numpy()

<a name="DataPreprocessing"></a>
## 資料前處理（Data Preprocessing）

* ### 資料正規化（Data Normalization）
    - 避免模型過度關注數值範圍較大的特徵（降低特徵尺度差異造成的影響）
    - 避免更新方向偏離，較容易收斂

對於測試資料，需使用「訓練資料」的統計量去做轉換，避免改變兩組資料間的分布關係
![](https://hackmd.io/_uploads/S1m3KtLZp.png)


In [None]:
# Min-max normalization: rescale each feature of X_df to [0, 1] and apply the
# very same per-feature statistics to the test set.
# NOTE(review): X_df still contains the rows that are later split off as the
# validation set, so those rows also contribute to these statistics.
feature_min = X_df.min(axis=0)
feature_range = X_df.max(axis=0) - feature_min
# A constant column has zero range; divide by 1 instead so it maps to 0
# rather than NaN (0 / 0). MinMaxScaler treats zero ranges the same way.
feature_range = np.where(feature_range == 0, 1, feature_range)

X_scale = (X_df - feature_min) / feature_range
X_test_scale = (X_test - feature_min) / feature_range

# Alternative: the scikit-learn equivalent
# from sklearn.preprocessing import MinMaxScaler
# sc = MinMaxScaler(feature_range=(0, 1))
# X_scale = sc.fit_transform(X_df)
# X_test_scale = sc.transform(X_test)

# Standardization (zero mean, unit variance) instead of min-max scaling
# X_scale = (X_df-X_df.mean(axis=0)) / (X_df.std(axis=0))
# X_test_scale = (X_test-X_df.mean(axis=0)) / (X_df.std(axis=0))

# Alternative: the scikit-learn equivalent
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_scale = sc.fit_transform(X_df)
# X_test_scale = sc.transform(X_test)

* ### 資料切分（Data Splitting）

In [None]:
# Split the scaled training data into training and validation subsets.
from sklearn.model_selection import train_test_split

# 20% of the rows go to validation; the fixed seed keeps the split repeatable.
split_kwargs = {'test_size': 0.2, 'random_state': 5566}
X_train, X_valid, y_train, y_valid = train_test_split(X_scale, y_df,
                                                      **split_kwargs)

In [None]:
# Report the array shapes of both splits, one per line.
print(
    f'X_train shape: {X_train.shape}',
    f'X_valid shape: {X_valid.shape}',
    f'y_train shape: {y_train.shape}',
    f'y_valid shape: {y_valid.shape}',
    sep='\n',
)

In [None]:
# Wrap the NumPy splits as float32 tensors and serve them in mini-batches.
from torch.utils.data import TensorDataset, DataLoader

BATCH_SIZE = 16


def _as_tensor_dataset(features, targets):
    """Pair features with targets reshaped to one column, both as float32."""
    return TensorDataset(
        torch.from_numpy(features).float(),
        torch.from_numpy(targets).unsqueeze(1).float(),
    )


train_dataset = _as_tensor_dataset(X_train, y_train)
valid_dataset = _as_tensor_dataset(X_valid, y_valid)

# Only the training batches are shuffled; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [None]:
# Fetch a single training batch (if there is one) and print its tensor shapes.
first_batch = next(iter(train_loader), None)
if first_batch is not None:
    x, y = first_batch
    print(x.shape, y.shape)

<a name="ModelBuilding"></a>
## 模型建置（Model Building）

In [None]:
# Seed the RNG before the layers are created so weight initialization repeats.
torch.manual_seed(5566)

# Fully connected regressor: inputs -> 64 -> 32 -> 1, sigmoid between layers.
n_features = X_train.shape[1]
model = nn.Sequential(*[
    nn.Linear(n_features, 64),
    nn.Sigmoid(),
    nn.Linear(64, 32),
    nn.Sigmoid(),
    nn.Linear(32, 1),
])

print(model)

<a name="ModelTraining"></a>
## 模型訓練（Model Training）

* ### 優化器 (optimizer)、損失函數 (loss function)

In [None]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device}')

# Move the model to the selected device before creating the optimizer.
model = model.to(device)

# Mean squared error as the regression loss, optimized with RMSprop.
loss_fn = nn.MSELoss()
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)

In [None]:
def train_epoch(model, optimizer, loss_fn, train_dataloader, val_dataloader,
                device=None):
    """Train ``model`` for one epoch, then evaluate it on the validation data.

    Args:
        model: Network to optimize; its parameters are updated in place.
        optimizer: Optimizer used to update the model parameters.
        loss_fn: Callable ``loss_fn(y_pred, y)`` returning a scalar loss tensor.
        train_dataloader: Sized iterable of ``(x, y)`` training batches.
        val_dataloader: Sized iterable of ``(x, y)`` validation batches.
        device: Device every batch is moved to. When omitted, the device of
            the model's first parameter is used (CPU if the model has no
            parameters), so the function does not rely on a global variable.

    Returns:
        Tuple ``(train_loss, val_loss)``; each value is the mean of the
        per-batch losses over the corresponding loader.

    Raises:
        ZeroDivisionError: If either loader has length 0.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = (first_param.device if first_param is not None
                  else torch.device('cpu'))

    # Training pass
    model.train()
    total_train_loss = 0
    for x, y in tqdm(train_dataloader, leave=False):
        x, y = x.to(device), y.to(device)  # move the batch to the target device
        y_pred = model(x)  # forward pass
        loss = loss_fn(y_pred, y)  # compute the loss
        optimizer.zero_grad()  # clear gradients left from the previous step
        loss.backward()  # backpropagate to compute gradients
        optimizer.step()  # update the model parameters

        total_train_loss += loss.item()

    # Validation pass
    model.eval()
    total_val_loss = 0
    # Gradients are not needed for evaluation, so skip tracking them.
    with torch.no_grad():
        for x, y in val_dataloader:
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            total_val_loss += loss.item()

    return total_train_loss / len(train_dataloader), total_val_loss / len(val_dataloader)

In [None]:
# Train for 20 epochs and keep the per-epoch losses for plotting later.
train_loss_log, val_loss_log = [], []
for epoch in tqdm(range(20)):
    train_loss, val_loss = train_epoch(
        model=model,
        optimizer=optimizer,
        loss_fn=loss_fn,
        train_dataloader=train_loader,
        val_dataloader=valid_loader,
    )
    train_loss_log.append(train_loss)
    val_loss_log.append(val_loss)
    print(f'Epoch: {epoch}, Train Loss: {train_loss:.3f}, Val Loss: {val_loss:.3f}')

<a name="ModelEvaluation"></a>
## 模型評估（Model Evaluation）

* ### 視覺化訓練過程的評估指標 （Visualization）

In [None]:
# Plot the per-epoch training and validation loss on a log-scaled y axis.
plt.figure(figsize=(15, 4))
plt.yscale('log')
for curve_label, history in (('train_loss', train_loss_log),
                             ('valid_loss', val_loss_log)):
    plt.plot(range(len(history)), history, label=curve_label)

plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

* ### 模型預測（Model Predictions）

In [None]:
# Run the model over the whole validation loader and gather the predictions
# together with their targets as NumPy arrays.
model.eval()

pred_batches, true_batches = [], []
with torch.no_grad():  # inference only, no gradient tracking
    for x, y in valid_loader:
        x, y = x.to(device), y.to(device)
        # Bring results back to the CPU so they can be converted to NumPy.
        pred_batches.append(model(x).cpu())
        true_batches.append(y.cpu())

y_pred = torch.cat(pred_batches).numpy()
y_true = torch.cat(true_batches).numpy()

* ### 視覺化結果

In [None]:
# Plot predictions, targets and their difference for every validation sample.
# The targets are taken from y_true, which was gathered from the same batches
# as y_pred, so the curves stay aligned regardless of the loader's ordering.
sample_idx = range(len(y_pred))
pred_values = y_pred[:, 0]  # drop the trailing singleton column
true_values = y_true[:, 0]

plt.figure(figsize=(15, 4))
plt.plot(sample_idx, pred_values, label='prediction')
plt.plot(sample_idx, true_values, label='groundtruth')
plt.plot(sample_idx, pred_values - true_values, label='difference')

plt.legend()
plt.xlabel('Samples')
plt.ylabel('Values')
plt.show()