- training process
    - defining neural network architectures
    - handling data
    - specifying loss function: a measure of fitness
    - training the model
- monitoring
- save and load

## Linear Regression 的解析解法

$$
l^{(i)}(\mathbf{w}, b)=\frac{1}{2}\left(\hat{y}^{(i)}-y^{(i)}\right)^{2}
$$
$$
L(\mathbf{w}, b)=\frac{1}{n} \sum_{i=1}^{n} l^{(i)}(\mathbf{w}, b)=\frac{1}{n} \sum_{i=1}^{n} \frac{1}{2}\left(\mathbf{w}^{\top} \mathbf{x}^{(i)}+b-y^{(i)}\right)^{2}
$$
$$
\mathbf{w}^{*}, b^{*}=\underset{\mathbf{w}, b}{\operatorname{argmin}} L(\mathbf{w}, b)
$$
因为线性回归的形式比较简单, 是凸优化问题, 其损失函数是凸函数 (当 $\mathbf{X}^{\top} \mathbf{X}$ 可逆时是严格凸函数, 有唯一的全局最优解). 通过令梯度为 0, 我们可以求得参数 $\mathbf{w}$ 的最优解 (bias 可以通过给 $\mathbf{x}$ 扩展一个恒为 1 的维度融入 $\mathbf{w}$ 参数中), 使损失函数最小.

$$
\mathbf{w}^{*}=\left(\mathbf{X}^{\top} \mathbf{X}\right)^{-1} \mathbf{X}^{\top} \mathbf{y}
$$

## 梯度下降解法

当我们面对高维和非凸损失函数的时候, 我们还可以用梯度下降的方法, 有效地训练我们的模型. 在凸损失面上, 只要学习率合适, 梯度下降算法最终能使我们到达全局最优点; 而对于非凸损失面来说, 我们通常也能到达相对较好的局部最优点.

$$
(\mathbf{w}, b) \leftarrow(\mathbf{w}, b)-\frac{\eta}{|\mathcal{B}|} \sum_{i \in \mathcal{B}} \partial_{(\mathbf{w}, b)} l^{(i)}(\mathbf{w}, b)
$$

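## Linear Regression 的 Squared Loss 对应的噪声假设

假设观测值带有均值为 0 的高斯噪声: $y=\mathbf{w}^{\top} \mathbf{x}+b+\epsilon$, 其中 $\epsilon \sim \mathcal{N}\left(0, \sigma^{2}\right)$. 此时对 $\mathbf{w}, b$ 做极大似然估计等价于最小化 Squared Loss, 因为负对数似然为

$$
-\log P(\mathbf{y} \mid \mathbf{X})=\sum_{i=1}^{n}\left[\frac{1}{2} \log \left(2 \pi \sigma^{2}\right)+\frac{1}{2 \sigma^{2}}\left(y^{(i)}-\mathbf{w}^{\top} \mathbf{x}^{(i)}-b\right)^{2}\right]
$$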

## Linear Regression 具体实现

In [49]:
import torch
import numpy as np
import sys; sys.path.insert(0, '../')
import d2l_utils
%matplotlib inline

In [50]:
# Seed every random number generator this notebook draws from so that
# repeated runs produce the same numbers.
torch.manual_seed(0)
# NumPy's global generator is used as well: data_set() shuffles with
# np.random.shuffle and the NumPy demo initialises W with np.random.randn.
np.random.seed(0)
# torch.cuda.manual_seed_all(0)

torch.__version__               # PyTorch version
# torch.version.cuda              # Corresponding CUDA version
# torch.backends.cudnn.version()  # Corresponding cuDNN version
# torch.cuda.get_device_name(0)   # GPU type

'1.3.1'

In [51]:
# Ground-truth parameters handed to the synthetic data generator.
# NOTE(review): presumably y = X @ W_true + b_true + noise -- confirm
# against d2l_utils.synthetic_data.
W_true = np.array([2, -3.4])
b_true = 4.2
X, y = d2l_utils.synthetic_data(W_true, b_true, num_examples=1000)
X.shape, y.shape

((1000, 2), (1000,))

In [52]:
def data_set(bs, features=None, labels=None):
    """Yield shuffled mini-batches of ``(features, labels)``.

    Args:
        bs: Number of examples per batch; must be a positive integer.
        features: Array indexed by example along its first axis.
            Defaults to the notebook-level ``X``.
        labels: Array aligned with ``features`` along its first axis.
            Defaults to the notebook-level ``y``.

    Yields:
        ``(features[batch], labels[batch])`` pairs, each holding exactly
        ``bs`` examples. The ``len(features) % bs`` left-over examples of
        a pass are skipped so that every batch has the same size.

    Raises:
        ValueError: If ``bs`` is not positive.
    """
    if features is None:
        features = X
    if labels is None:
        labels = y
    if bs <= 0:
        raise ValueError('bs must be a positive integer, got %r' % (bs,))

    data_size = len(features)
    # A fresh random order is drawn every time the generator is started.
    index = list(range(data_size))
    np.random.shuffle(index)
    # Stepping by bs and stopping at data_size - bs visits only full batches.
    for start in range(0, data_size - bs + 1, bs):
        batch_index = index[start:start + bs]
        yield features[batch_index], labels[batch_index]

In [53]:
# Peek at the first mini-batch only, to check what data_set yields.
for feats, labels in data_set(bs=10): 
    print(feats, '\n', labels) 
    break

[[ 0.47159115 -0.67109709]
 [ 1.87359844 -0.62738453]
 [ 1.30957152  1.38793651]
 [-1.29053027  0.60308994]
 [ 1.20951341  0.60867067]
 [ 1.73368678 -0.39315388]
 [-0.81228793  1.57981897]
 [-0.07403614  0.17417899]
 [-1.08244985  1.32969859]
 [ 1.28990722 -2.62680009]] 
 [ 7.42199552 10.09935919  2.11825675 -0.43454829  4.54963239  9.00763023
 -2.79516574  3.45236455 -2.4811429  15.69762414]


### numpy 实现 解析解

In [54]:
# Prepend a column of ones so the bias is learned as the first weight.
# The row count follows X instead of being hard-coded.
X_ = np.concatenate([np.ones((len(X), 1)), X], axis=1)

In [55]:
# Solve the normal equations (X_^T X_) w = X_^T y directly rather than
# forming the explicit inverse, which is slower and less accurate.
# The result is ordered [bias, w_1, w_2] because the ones column comes first.
np.linalg.solve(X_.T @ X_, X_.T @ y)

array([ 4.20027501,  2.00031696, -3.39964813])

### numpy 实现 梯度下降

In [56]:
def linear_regression(feats, W, b):
    """Return the linear model prediction ``feats @ W + b``."""
    weighted_sum = np.matmul(feats, W)
    return weighted_sum + b

def square_loss(y_hat, label):
    """Return half the sum of squared differences between ``y_hat`` and ``label``."""
    residual = y_hat - label
    squared_error = np.sum(np.square(residual))
    return squared_error / 2

def cal_grad(X, y_hat, label):
    """Return ``X.T @ (y_hat - label)``, the un-normalised gradient term.

    The result is summed over the examples; callers divide by the batch
    size themselves.
    """
    residual = y_hat - label
    return np.matmul(X.T, residual)

In [57]:
# Mini-batch gradient descent with hand-written NumPy gradients.
epoch = 50
bs = 48
lr = 1e-2

# Weights start from random normal values, the bias from zero.
W = np.random.randn(2)
b = np.zeros(1)

for _ in range(epoch):
    for X_batch, y_batch in data_set(bs):
        y_hat_batch = linear_regression(X_batch, W, b)
        # Both gradients come from the same forward pass. Using a vector of
        # ones as the "feature" gives the bias gradient (sum of residuals).
        grad_W = cal_grad(X_batch, y_hat_batch, y_batch)
        grad_b = cal_grad(np.ones(len(X_batch)), y_hat_batch, y_batch)
        W -= lr*grad_W/bs
        b -= lr*grad_b/bs
    # Debug aid: print('loss: %s' % square_loss(y_hat_batch, y_batch))

b, W

(array([4.20001938]), array([ 2.00001581, -3.39960359]))

### pytorch 自动求导 实现

In [58]:
# Trainable parameters tracked by autograd: a (2, 1) weight matrix drawn
# from a standard normal and a zero bias.
W = torch.randn(2, 1, dtype=torch.float32, requires_grad=True)
b = torch.zeros((1,), dtype=torch.float32, requires_grad=True)

# Training hyper-parameters.
epoch, bs, lr = 50, 48, 1e-2

# Mean squared error, i.e. lambda y1, y2: ((y1 - y2) ** 2).mean()
loss = torch.nn.MSELoss()

def linreg(inputs, W, b):
    """Return ``inputs @ W + b`` for 2-D ``inputs`` and ``W``."""
    projected = inputs.mm(W)
    return projected + b

# Mini-batch SGD in which autograd supplies the gradients.
for _ in range(epoch):
    for X_batch, labels in data_set(bs):
        X_batch = torch.tensor(X_batch, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.float32)

        output = linreg(X_batch, W, b)
        # output is (batch, 1); flatten it to the label shape so MSELoss
        # compares element-wise instead of broadcasting.
        l = loss(output.reshape(labels.shape), labels)

        l.backward()
        # Update in place under no_grad so the step itself is not recorded
        # by autograd. This replaces writing through `.data`, which skips
        # autograd's correctness checks.
        with torch.no_grad():
            W -= lr * W.grad
            b -= lr * b.grad
        # Gradients accumulate across backward() calls, so reset them.
        W.grad.zero_()
        b.grad.zero_()
    # Debug aid: print('loss: %s' % l.item())
b, W

(tensor([4.2003], requires_grad=True),
 tensor([[ 2.0003],
         [-3.3997]], requires_grad=True))

### pytorch API 实现

In [59]:
# Regenerate the synthetic data and wrap it in the torch.utils.data pipeline.
# NOTE(review): `data_set` below rebinds the name of the generator function
# defined earlier in this notebook, so the earlier training cells cannot be
# re-run after this cell without redefining that function.
W_true = np.array([2, -3.4])
b_true = 4.2
X, y = d2l_utils.synthetic_data(W_true, b_true, num_examples=1000)

batch_size = 10
features_tensor = torch.tensor(X, dtype=torch.float32)
labels_tensor = torch.tensor(y, dtype=torch.float32)
data_set = torch.utils.data.TensorDataset(features_tensor, labels_tensor)
# Reshuffle the examples on every pass over the loader.
data_iter = torch.utils.data.DataLoader(data_set, batch_size=batch_size, shuffle=True)

In [60]:
# Print just the first mini-batch produced by data_iter, then stop.
for i in data_iter:
    print(i)
    break

[tensor([[ 1.4374,  1.4527],
        [ 1.7666,  1.8294],
        [ 0.2987,  1.4632],
        [-0.7965, -0.2847],
        [ 0.2974, -0.7641],
        [ 2.2987, -1.1603],
        [ 0.9579, -0.1704],
        [ 0.7150,  1.6190],
        [ 1.7622,  1.0498],
        [-0.6944, -1.8239]]), tensor([ 2.1394,  1.5071, -0.1910,  3.5566,  7.3934, 12.7346,  6.6952,  0.1188,
         4.1637,  9.0244])]


In [61]:
# NOTE(review): neither `w` nor this `b` is read by the cells visible here,
# and torch.empty leaves their values uninitialised -- confirm they are
# still needed.
w = torch.empty(2, 1, dtype=torch.float32, requires_grad=True)
b = torch.empty((1,), dtype=torch.float32, requires_grad=True)

class LinearNet(torch.nn.Module):
    """Linear regression model: one fully connected layer, 2 inputs -> 1 output."""

    def __init__(self):
        super().__init__()
        layer = torch.nn.Linear(in_features=2, out_features=1, bias=True)
        # Replace the default initialisation: small Gaussian weights
        # (std 0.1) and a zero bias.
        torch.nn.init.normal_(layer.weight, mean=0., std=0.1)
        torch.nn.init.constant_(layer.bias, val=0.)
        self.linear = layer

    def forward(self, x):
        """Apply the linear layer to ``x``."""
        prediction = self.linear(x)
        return prediction

# Instantiate the model and print its module summary.
net = LinearNet()
print(net)

LinearNet(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)


In [62]:
# Show each parameter tensor registered on the model.
for param in net.parameters():
    print(param)

Parameter containing:
tensor([[0.1488, 0.0219]], requires_grad=True)
Parameter containing:
tensor([0.], requires_grad=True)


In [45]:
# Mean-squared-error criterion.
loss = torch.nn.MSELoss()
print(loss)

# Plain SGD over every parameter of `net`.
optimizer = torch.optim.SGD(params=net.parameters(), lr=0.03)
print(optimizer)

# Per-group learning rates are also possible. A group that does not set its
# own 'lr' falls back to the top-level default:
# optimizer = torch.optim.SGD([
#     {'params': net.subnet1.parameters()},              # uses lr=0.03
#     {'params': net.subnet2.parameters(), 'lr': 0.01},
# ], lr=0.03)

MSELoss()
SGD (
Parameter Group 0
    dampening: 0
    lr: 0.03
    momentum: 0
    nesterov: False
    weight_decay: 0
)


In [46]:
# Train `net` with the DataLoader / optimizer API.
num_epochs = 5
for epoch in range(1, num_epochs + 1):
    # The batch names differ from the notebook-level X / y on purpose, so
    # the full dataset arrays are not overwritten by the last mini-batch.
    for X_batch, y_batch in data_iter:
        optimizer.zero_grad()  # reset gradients; equivalent to net.zero_grad() here
        output = net(X_batch)
        # Reshape the labels to the (batch, 1) output shape to avoid broadcasting.
        l = loss(output, y_batch.view(output.shape))
        l.backward()
        optimizer.step()
    # Reports the loss of the final mini-batch of the epoch.
    print('epoch %d, loss: %f' % (epoch, l.item()))
net.linear.weight.data, net.linear.bias.data

epoch 1, loss: 0.000258
epoch 2, loss: 0.000158
epoch 3, loss: 0.000093
epoch 4, loss: 0.000099
epoch 5, loss: 0.000117


(tensor([[ 2.0001, -3.4003]]), tensor([4.1999]))

In [47]:
net = LinearNet()  # fresh, re-initialized model
print(net)

# Separate optimizers so that weight decay applies to the weights only,
# not to the bias.
optimizer_w = torch.optim.SGD([net.linear.weight], lr=0.03, weight_decay=0.01)
optimizer_b = torch.optim.SGD([net.linear.bias], lr=0.03)

num_epochs = 5
optimizers = [optimizer_w, optimizer_b]
for epoch in range(1, num_epochs+1):
    # The batch names differ from the notebook-level X / y on purpose, so
    # the full dataset arrays are not overwritten by the last mini-batch.
    for X_batch, y_batch in data_iter:
        for opti in optimizers:
            opti.zero_grad()
        y_hat = net(X_batch)
        # Reshape the labels to the (batch, 1) output shape to avoid broadcasting.
        l = loss(y_hat, y_batch.view(y_hat.shape))
        l.backward()

        for opti in optimizers:
            opti.step()
    # Reports the loss of the final mini-batch of the epoch.
    print('epoch %d, loss: %f' % (epoch, l.item()))
net.linear.weight.data, net.linear.bias.data

LinearNet(
  (linear): Linear(in_features=2, out_features=1, bias=True)
)
epoch 1, loss: 0.000667
epoch 2, loss: 0.000659
epoch 3, loss: 0.000376
epoch 4, loss: 0.000209
epoch 5, loss: 0.000356


(tensor([[ 1.9910, -3.3827]]), tensor([4.2025]))