末端層
* 不追加函數作用在末端層 -> 末端層上面的每個神經元輸出值域[$-\infty$, $+\infty$]$ -> 處理回歸問題
* 追加Sigmoid(二元分類)或Softmax(多元分類)函數作用在末端層 -> 末端層上面的每個神經元輸出值域[$0$, $1$]$ -> 處理分類問題

# 本筆記目的：

1. 理解Linear Layer的輸入/輸出資料大小。
2. 能簡單的利用Linear Layer來建立並訓練Multi-layer perceptron。

---

### 測試Dense Layer I/O, 並以Dense Layer建立模型
* Sequential model: Logistic Regression
* Sequential model: Softmax Regression
* 練習：建立Multilayer Perceptron模型，並且丟簡單資料進去做訓練

---

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd
import sklearn
import os

import torch

sns.set()
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
from torch.nn import Sequential
from torch.nn import Linear, Conv2d, MaxPool2d
from torch.nn import Sigmoid, Softmax, ReLU  # activation function

from torch.optim import SGD

from torch.utils.data import DataLoader, TensorDataset

from torch.nn import CrossEntropyLoss

---

## A. 測試Dense Layer I/O, 並以Dense Layer建立模型。

### Sequential model: Logistic Regression

In [4]:
rand_data = np.random.normal(0, 1, (5, 3))  # [BS=5, num_features=3]
rand_data = torch.Tensor(rand_data)  # 常態分佈的亂數資料當input;
# 5個樣本，每個樣本有3個特徵

In [6]:
isinstance(torch.nn.Linear(3, 1), torch.nn.Module)

True

In [159]:
# 定義模型
print(f"Input shape= {rand_data.shape}")
model = Sequential(
    Linear(3, 1),  # [BS, 3] -> [BS, 1]
    Sigmoid(),  # [BS, 1] -> [BS, 1]
)  # 預測y=1的機率

out = model(rand_data)
print(out)
print(f"Output shape= {out.shape}")
# 應該會回傳5個小於1的數值。分別為各樣本的預測機率

Input shape= torch.Size([5, 3])
tensor([[0.3231],
        [0.2389],
        [0.3234],
        [0.3298],
        [0.3060]], grad_fn=<SigmoidBackward0>)
Output shape= torch.Size([5, 1])


---

### Sequential model: Softmax Regression

In [160]:
rand_data = np.random.normal(0, 1, (5, 3))  # [BS=5, num_features=3]
rand_data = torch.Tensor(rand_data)  # 常態分佈的亂數資料當input,
# 5個樣本，每個樣本有3個特徵

In [161]:
# 定義模型
model = Sequential(
    Linear(3, 3),  # [BS=5, num_features=3] -> [BS=5, num_features=3]
    Softmax(dim=-1),  # [BS=5, num_features=3] -> [BS=5, num_features=3]
)

model(rand_data).sum(
    axis=-1
)  # 驗證Softmax輸出：P_A+P_B+P_C=1 ([BS, num_probabilities=3] -> [BS,])

tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000], grad_fn=<SumBackward1>)

---

### 練習：建立Multilayer Perceptron的模型，並且將$X_{new}$, $y_{new}$丟進去做訓練。

In [162]:
X = np.zeros((90, 3))
for j in range(0, 30):
    X[j, :] = 0.0
for j in range(30, 60):
    X[j, :] = 1.0
for j in range(60, 90):
    X[j, :] = 2.0

y = X[:, 0].astype(np.int64)

In [163]:
# Data standarization
from sklearn.preprocessing import StandardScaler

scaler = (
    StandardScaler()
)  # (1)估算每個資料欄位的mean, std (2)把N(mean, std^2)變成N(0, 1)
# (2)怎麼做呢? X'= (X - mean) / std

scaler = scaler.fit(X)
X_new = scaler.transform(X)

# One-hot encoding
y_new = np.eye(3)[y]

X_new = X_new.astype(np.float32)
y_new = y_new.astype(np.float32)

In [None]:
# === 建立模型 ===

# 練習於此

In [165]:
assert isinstance(Sequential(), torch.nn.Module)
assert isinstance(torch.nn.Linear(1, 1), torch.nn.Module)

In [235]:
def train_step(
    dataloader,
    model: torch.nn.Module,
    loss_fn,
    optimizer,
    verbose_every: int = 999,
) -> None:
    """訓練一個epoch。"""
    for iteration, (batch_x, batch_y) in enumerate(dataloader):

        #########################
        ## 練習: 添加正傳遞與倒傳遞
        pred_y = model(batch_x)
        loss_value = loss_fn(pred_y, batch_y)

        # print(
        #     f"right before backward: {model[0]} has grad= {model[0].weight.grad is not None}",
        # )
        # loss_value.backward()
        # print(
        #     f"right after backward: {model[0]} has grad= {model[0].weight.grad is not None}",
        # )
        #########################

        if iteration + 1 % verbose_every == 0:
            loss = loss_value.item()
            print("loss={:.4f}".format(iteration, loss))

        optimizer.step()  # 梯度更新只要執行 step()即可。這個步驟會將每個權重
        optimizer.zero_grad()


def test_step(
    dataloader,
    model,
    loss_fn,
):
    """結束一個epoch的訓練後，測試模型表現。"""
    size = len(dataloader.dataset)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for iteration, (batch_x, batch_y) in enumerate(dataloader):
            pred_y = model(batch_x)

            test_loss += loss_fn(pred_y, batch_y).item()
            correct += (pred_y.argmax(axis=1) == batch_y).type(torch.float).sum().item()

    test_loss /= size
    correct /= size

    print("test_loss={:.4f}, accuracy={:.2f}".format(test_loss, correct))

In [None]:
# 載入資料成為DataLoader
X_new, y_new = torch.tensor(X_new), torch.tensor(y_new)
y = torch.tensor(y).to(torch.int64)
td = TensorDataset(X_new, y)
dl = DataLoader(td, batch_size=32, shuffle=True)

# model = Sequential(
#     Linear(3, num_units),
#     ReLU(),
#     Linear(num_units, num_units),  # R_+30
#     ReLU(),
#     Linear(num_units, num_units),  # R_+15
#     ReLU(),
#     Linear(num_units, num_units),  # R_-45
#     ReLU(),
#     Linear(num_units, num_units),  # R_+10
#     ReLU(),
#     Linear(num_units, 3),
#     # Softmax(),
# )

num_units: int = 100
num_hidden_layers: int = 6
ActivationClass: torch.nn.ReLU | torch.nn.Softmax = torch.nn.ReLU
model = Sequential(
    Linear(3, num_units),
    ActivationClass(),
    *[
        Linear(num_units, num_units),
        ActivationClass(),
    ]
    * num_hidden_layers,
    Linear(num_units, 3),
)

# 申論題: 短而肥或長而瘦, 哪個建模策略比較好? (模型權重數量要差不多)

# 宣告模型訓練設定
num_epochs = 200
learning_rate = 0.2

# 定義優化器, Loss函數
ce_loss = CrossEntropyLoss()
opt = SGD(
    model.parameters(),
    lr=learning_rate,
)

# 訓練模型
for j in range(num_epochs):
    train_step(dl, model, ce_loss, opt)
    test_step(dl, model, ce_loss)

  X_new, y_new = torch.tensor(X_new), torch.tensor(y_new)
  y = torch.tensor(y).to(torch.int64)


right before backward: Linear(in_features=3, out_features=100, bias=True) has grad= False
right after backward: Linear(in_features=3, out_features=100, bias=True) has grad= True
right before backward: Linear(in_features=3, out_features=100, bias=True) has grad= False
right after backward: Linear(in_features=3, out_features=100, bias=True) has grad= True
right before backward: Linear(in_features=3, out_features=100, bias=True) has grad= False
right after backward: Linear(in_features=3, out_features=100, bias=True) has grad= True
test_loss=0.0366, accuracy=0.33
right before backward: Linear(in_features=3, out_features=100, bias=True) has grad= False
right after backward: Linear(in_features=3, out_features=100, bias=True) has grad= True
right before backward: Linear(in_features=3, out_features=100, bias=True) has grad= False
right after backward: Linear(in_features=3, out_features=100, bias=True) has grad= True
right before backward: Linear(in_features=3, out_features=100, bias=True) has 

In [None]:
assert hasattr(model, "__getitem__")
assert hasattr(model, "__iter__")

In [236]:
for name, param in model.named_parameters():
    print(name, param.shape)

0.weight torch.Size([100, 3])
0.bias torch.Size([100])
2.weight torch.Size([100, 100])
2.bias torch.Size([100])
14.weight torch.Size([3, 100])
14.bias torch.Size([3])
