In [1]:
import numpy as np
import pandas as pd

import torch 
from torch import nn
from torch.utils import data

from d2l import torch as d2l

In [8]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 读取和处理数据集

In [None]:
# Load the three CSV tables from the local dataset folder.
sample_submission = pd.read_csv('./dataset/sample_submission.csv')
train_data = pd.read_csv('./dataset/train.csv')
test_data = pd.read_csv('./dataset/test.csv')

# Cell output: the (rows, columns) shape of each table, in loading order.
tuple(frame.shape for frame in (sample_submission, train_data, test_data))

In [None]:
# Preview the first 5 rows of the training set
train_data.head()

In [None]:
# Print a summary of the training DataFrame
train_data.info()

In [None]:
# Stack the training columns (label 'Sold Price' excluded) on top of the test
# columns (first column excluded), then discard the columns that are not used
# as model inputs.
all_features = (
    pd.concat([
        train_data.loc[:, train_data.columns != 'Sold Price'],
        test_data.iloc[:, 1:],
    ])
    .drop(columns=['Id', 'State', 'Summary', 'Address'])
)
all_features.info()

In [None]:
# Preprocessing

# Every column whose dtype is not `object` is treated as a numeric feature.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index

# Standardize each numeric column: subtract its mean and divide by its standard
# deviation (statistics are taken over the train and test rows together).
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))

# After standardization every column has mean 0, so filling missing values
# with 0 is equivalent to mean imputation.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# NOTE: the date columns ('Listed On', 'Last Sold On') used to be parsed with
# pd.to_datetime here, but they are dropped by the column selection below, so
# the parsing was dead work that could only fail on a malformed date. It has
# been removed; the resulting `all_features` is unchanged.

# Keep the numeric features plus `Type`, the categorical column with few categories.
features = list(numeric_features)
features.append('Type')
all_features = all_features[features]

# One-hot encode `Type`; dummy_na=True adds an extra indicator column for NaN.
all_features = pd.get_dummies(all_features, dummy_na=True)

# Cast everything to float32 so the table can be turned into a float tensor.
all_features = all_features.astype('float32')

# Preview the first 5 rows
all_features.head()

In [14]:
# Convert the preprocessed table and the labels into float32 tensors.
# The first n_train rows of all_features are the training rows; the rest are test rows.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features.iloc[:n_train].to_numpy(), dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].to_numpy(), dtype=torch.float32)
# Labels are shaped as a single column, (n_train, 1).
train_labels = torch.tensor(train_data[['Sold Price']].to_numpy(), dtype=torch.float32)

# Number of input features fed to the network
in_features = train_features.size(1)

# 模型和损失函数

In [15]:
# Loss function: mean squared error
loss = nn.MSELoss()

def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Predictions below 1 are raised to 1 before the logarithm so the log stays
    numerically stable. The result is a 0-dim tensor.
    """
    bounded_preds = net(features).clamp(min=1, max=float('inf'))
    squared_log_error = loss(bounded_preds.log(), labels.log())
    return squared_log_error.sqrt()

In [16]:
# Model
def get_net():
    """Build a fresh MLP: in_features -> 1024 -> 512 -> 128 -> 1 with ReLU between layers.

    Reads the module-level `in_features` for the input width.
    """
    widths = [in_features, 1024, 512, 128]
    layers = []
    # Hidden layers are created in order, each followed by a ReLU.
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
        layers.append(nn.Linear(fan_in, fan_out))
        layers.append(nn.ReLU())
    # Final projection to a single output, with no activation.
    layers.append(nn.Linear(widths[-1], 1))
    return nn.Sequential(*layers)

# 训练

In [17]:
def load_array(data_arrays, batch_size, is_train=True):
    """Wrap a tuple of tensors in a PyTorch DataLoader.

    Batches are shuffled only when `is_train` is true.
    """
    tensor_dataset = data.TensorDataset(*data_arrays)
    loader = data.DataLoader(
        dataset=tensor_dataset,
        batch_size=batch_size,
        shuffle=is_train,
    )
    return loader

In [18]:
def train(net,
          train_features, train_labels, test_features, test_labels,
          num_epochs, lr, weight_decay, batch_size):
    """Fit `net` with Adam on the MSE loss and record the log-RMSE after each epoch.

    Args:
        net: model to optimize; expected to already live on the global `device`.
        train_features, train_labels: tensors used for optimization and for the
            per-epoch training metric.
        test_features, test_labels: held-out tensors for the per-epoch
            validation metric; pass `None` for both to skip validation.
        num_epochs: number of passes over the training data.
        lr, weight_decay: Adam hyperparameters.
        batch_size: mini-batch size of the shuffled training loader.

    Returns:
        (train_ls, test_ls): lists with one Python float per epoch. `test_ls`
        is empty when `test_labels` is None.
    """
    # Move the full tensors once so both the mini-batches and the per-epoch
    # evaluation run on the same device as the model.
    train_features = train_features.to(device)
    train_labels = train_labels.to(device)
    if test_labels is not None:
        test_features = test_features.to(device)
        test_labels = test_labels.to(device)

    train_ls, test_ls = [], []
    train_iter = load_array((train_features, train_labels), batch_size)
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=lr, weight_decay=weight_decay)

    for epoch in range(num_epochs):
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()

        # Metrics only: disable autograd and store plain floats. Keeping the
        # raw tensors would retain one autograd graph per epoch and hand
        # grad-requiring (possibly CUDA) tensors to the plotting code.
        with torch.no_grad():
            train_ls.append(float(log_rmse(net, train_features, train_labels)))
            if test_labels is not None:
                test_ls.append(float(log_rmse(net, test_features, test_labels)))

    return train_ls, test_ls


In [19]:
# Build the training/validation split for one fold of k-fold cross-validation
def get_k_fold_data(k, i, X, y):
    """Return (X_train, y_train, X_valid, y_valid) for fold `i` out of `k`.

    Rows are split into `k` contiguous folds of `X.shape[0] // k` rows each.
    The last fold also takes the `X.shape[0] % k` leftover rows, so every
    sample belongs to exactly one validation fold and none is silently dropped.

    Args:
        k: number of folds, must be greater than 1.
        i: index of the fold used for validation, `0 <= i < k`.
        X: feature tensor with samples along dimension 0 (at least 2-D).
        y: label tensor with samples along dimension 0.
    """
    assert k > 1
    assert 0 <= i < k, f'fold index i must be in [0, {k}), got {i}'

    n_samples = X.shape[0]
    fold_size = n_samples // k
    start = i * fold_size
    # The final fold extends to the end of the data to absorb the remainder.
    stop = n_samples if i == k - 1 else start + fold_size

    X_valid, y_valid = X[start:stop, :], y[start:stop]
    # Everything before and after the validation slice is training data.
    X_train = torch.cat([X[:start, :], X[stop:, :]], 0)
    y_train = torch.cat([y[:start], y[stop:]], 0)

    return X_train, y_train, X_valid, y_valid

# k-fold cross-validation
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    """Train a fresh network on each of the k folds and average the final losses.

    The learning curves of the first fold are plotted, and the final
    train/validation log-RMSE of every fold is printed.

    Returns:
        (mean final training loss, mean final validation loss) over the k folds.
    """
    train_l_sum = 0
    valid_l_sum = 0
    for fold in range(k):
        fold_data = get_k_fold_data(k, fold, X_train, y_train)
        net = get_net().to(device)
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        final_train, final_valid = train_ls[-1], valid_ls[-1]
        train_l_sum += final_train
        valid_l_sum += final_valid
        # Only the first fold's curves are drawn.
        if fold == 0:
            epochs_axis = list(range(1, num_epochs + 1))
            d2l.plot(epochs_axis, [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'fold {fold + 1}, train log rmse {float(final_train):f}, '
              f'valid log rmse {float(final_valid):f}')
    return train_l_sum / k, valid_l_sum / k

In [None]:
# Move the training tensors to the selected device before cross-validation.
train_features = train_features.to(device)
train_labels = train_labels.to(device)

# Hyperparameters for cross-validation (reused later for the final fit).
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 0.01, 0.1, 256

# k_fold returns exactly two values (average train / validation loss);
# unpacking a third `model` value raised ValueError.
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')

# 预测

In [None]:
# Train on the full training set and predict on the test features
def train_and_pred(train_features, test_feature, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    """Fit a fresh network on all training data and return test predictions.

    Args:
        train_features, train_labels: tensors used for training.
        test_feature: feature tensor to predict on.
        test_data: unused here; kept so existing callers keep working.
        num_epochs, lr, weight_decay, batch_size: forwarded to `train`.

    Returns:
        NumPy array with the network's predictions for `test_feature`.
    """
    net = get_net().to(device)
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    # Plain floats plot correctly whether `train` returns tensors or numbers.
    train_curve = [float(value) for value in train_ls]
    d2l.plot(np.arange(1, num_epochs + 1), [train_curve], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse {train_curve[-1]:f}')
    # Predict on the argument that was passed in (previously the global
    # `test_features` was read by mistake); no gradients are needed here.
    with torch.no_grad():
        preds = net(test_feature.to(device)).cpu().numpy()
    return preds

In [None]:
# Make sure every tensor used for the final fit lives on the selected device.
test_features, train_features, train_labels = (
    tensor.to(device)
    for tensor in (test_features, train_features, train_labels)
)
# Fit on all training rows and predict the test set, reusing the
# hyperparameters set in the cross-validation cell.
preds = train_and_pred(
    train_features, test_features, train_labels, test_data,
    num_epochs, lr, weight_decay, batch_size,
)