In [1]:
import numpy as np
import torch
import pandas as pd
from torch.utils import data
from torch import nn
import matplotlib.pyplot as plt


## 步骤

读入数据集，预处理

1. `NA` 值的处理
2. 连续值标准化（减去均值，除以标准差）
3. 离散值转为热独码
4. 将数据集转为 `data` 格式
5. 构造 $K$ 折交叉检验的分割函数

构造训练过程、训练函数

1. 定义网络结构
2. 构造对数均方误差函数
3. 构造 $K$ 折交叉训练的函数

开始训练

1. 设置超参数
2. 开始训练
3. 若训练效果不好，返回第 1 步，重调超参数
4. 最终在整个数据集上训练这个网络
5. 传入测试集



## 预处理

In [2]:
train_data = pd.read_csv("./dataset/train.csv")
test_data = pd.read_csv("./dataset/test.csv")

In [3]:
# .iloc 取一个表格的一部分
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000


In [5]:
# print(test_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])

     Id  MSSubClass MSZoning  LotFrontage  YrSold SaleType SaleCondition
0  1461          20       RH         80.0    2010       WD        Normal
1  1462          20       RL         81.0    2010       WD        Normal
2  1463          60       RL         74.0    2010       WD        Normal
3  1464          60       RL         78.0    2010       WD        Normal


In [4]:
# pd.concat 将两个数据拼接起来
# pd.concat([train_data.iloc[:, :-1], test_data], axis=0)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [7]:
# 删除第一列：id
all_data = pd.concat([train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]], axis=0)

In [11]:
# 查看各列的数据格式
train_data.dtypes.iloc[:6]

Id               int64
MSSubClass       int64
MSZoning        object
LotFrontage    float64
LotArea          int64
Street          object
dtype: object

In [12]:
# 提取所有连续型（数值型）变量
numeric_features = all_data.dtypes[all_data.dtypes != 'object'].index

In [16]:
# 计算平均数、标准差
type(all_data["MSSubClass"]), all_data["MSSubClass"].mean(), all_data["MSSubClass"].std()

(pandas.core.series.Series, 57.1377183967112, 42.51762782915043)

In [17]:
all_data[numeric_features] = all_data[numeric_features].apply(
    lambda column: (column - column.mean()) / (column.std()),
    axis='index' # 沿 index (y) 轴应用函数，即对各列分别应用
)

In [18]:
# 所有均值已经置 0，因此将 NA 用平均值代替，只需要用 0 代替即可
all_data[numeric_features] = all_data[numeric_features].fillna(0)

In [20]:
# 处理离散值。通过 get_dummies 函数将离散值转换为独热码
# 缺省 columns 参数，pandas 会自动将独热码转换为
all_data = pd.get_dummies(all_data, dummy_na=True)

In [21]:
all_data.shape # 总特征有 79 个增加到 331 个

(2919, 331)

将数据转化为 Tensor

In [24]:
num_train = train_data.shape[0]

train_tensor = torch.tensor(all_data[:num_train].values, dtype=torch.float32)
test_tensor = torch.tensor(all_data[num_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)

train_tensor.shape, test_tensor.shape, train_labels.shape

(torch.Size([1460, 331]), torch.Size([1459, 331]), torch.Size([1460, 1]))

构造 $k$ 折交叉训练的分割函数

In [28]:
a = torch.zeros(0, 3) # 定义 0x3 张量，这样定义的好处是接下来可以和 n x 3 张量结合
b = torch.ones(4, 3)

a, torch.cat([a, b], dim=0)

(tensor([], size=(0, 3)),
 tensor([[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]))

In [40]:
def get_k_fold_data(k: int, i: int, x: torch.Tensor, y: torch.Tensor):

    assert x.shape[0] == y.shape[0], \
        "The number of rows of x and y must be the same!"

    x_train = torch.zeros(0, x.shape[1])
    y_train = torch.zeros(0, y.shape[1])

    fold_size = x.shape[0] // k
    
    for j in range(k):
        if j == 1:
            x_test = x[fold_size * j : fold_size * (j + 1), :]
            y_test = y[fold_size * j : fold_size * (j + 1), :]
        else:
            x_train = torch.cat([x_train, x[fold_size * j : fold_size * (j + 1), :]], axis=0)
            y_train = torch.cat([y_train, y[fold_size * j : fold_size * (j + 1), :]], axis=0)
    return x_train, y_train, x_test, y_test

## 构造训练函数

In [22]:
in_features = all_data.shape[1]
in_features

331

产生空白网络

In [23]:
def get_net() -> nn.Sequential:
    """
    获得一个新的网络，用于 K 折交叉验证
    """
    net = nn.Sequential(
        nn.Linear(in_features, 64),
        nn.ReLU(),
        nn.Linear(64, 1)
    )

    def init_weight(m: nn.Module):
        if type(m) == nn.Linear:
            nn.init.normal_(m.weight, std=0.01)
            nn.init.zeros_(m.bias)
    
    net.apply(init_weight)

    return net

构造对数均方误差函数

In [30]:
loss = nn.MSELoss()

def log_rmse(net, features, labels):
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse

In [33]:
log_rmse(lambda x: x , torch.tensor(2), torch.tensor(1))

tensor(0.6931)

构造根据给定的训练集、测试集的训练函数

In [47]:
def train(net: nn.Sequential, x_train, y_train, x_test, y_test,
    loss, lr, epochs, batch_size, weight_decay):
    """
    根据给定训练集、测试集训练的函数
    默认的优化器为Adam
    """
    train_loss, test_loss = [], []
    train_iter = data.DataLoader(data.TensorDataset(x_train, y_train), batch_size=batch_size, shuffle=True)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)

    for epoch in range(epochs):
        for x, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(x), y)
            l.backward()
            optimizer.step()
        
        # 计算损失值
        train_loss.append(log_rmse(net, x_train, y_train))
        if x_test is not None:
            test_loss.append(log_rmse(net, x_test, y_test))
        
    return train_loss, test_loss

构造 $K$ 折交叉训练的函数

In [49]:
from d2l import torch as d2l

def k_fold_train(train_tensor, train_labels,
    k: int, lr: float, epochs: int, batch_size: int, weight_decay: int):
    """
    开始 K 折交叉训练
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        net = get_net()
        train_loss, test_loss = train(net,
            *get_k_fold_data(k, i, train_tensor, train_labels),
            loss=loss, lr=lr, epochs=epochs, batch_size=batch_size, weight_decay=weight_decay)
        train_l_sum += train_loss[-1]
        valid_l_sum += test_loss[-1]
        # if i == 0:
        #     d2l.plot(list(range(1, epochs + 1)), [train_loss, test_loss],
        #                 xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
        #                 legend=['train', 'valid'], yscale='log')

        print(f"Fold {i + 1}, train log rmse {float(train_loss[-1])}, validation log rmse {float(test_loss[-1])}")
    return train_l_sum / k, valid_l_sum / k


In [51]:
k, num_epochs, lr, weight_decay, batch_size = 5, 400, 10, 20, 256
train_l, valid_l = k_fold_train(train_tensor, train_labels, k, lr, num_epochs, batch_size, weight_decay)
print(f"训练平均：{float(train_l)}，验证平均：{float(valid_l)}")

Fold 1, train log rmse 0.32655221223831177, validation log rmse 0.6385515928268433
Fold 2, train log rmse 0.06955071538686752, validation log rmse 0.659576952457428
Fold 3, train log rmse 0.058088816702365875, validation log rmse 0.16610398888587952
Fold 4, train log rmse 0.1016976609826088, validation log rmse 0.16441212594509125
Fold 5, train log rmse 0.3626170754432678, validation log rmse 0.1733943074941635
训练平均：0.1837013065814972，验证平均：0.3604077994823456
