In [1]:
import torch
from torch import nn
from d2l import torch as d2l
import pandas as pd

数据预处理

In [2]:
# Load the labelled training rows, the unlabelled test rows and the sample
# submission file from the local dataset directory.
train_data = pd.read_csv('california-house-prices/train.csv')
test_x = pd.read_csv('california-house-prices/test.csv')
test_y = pd.read_csv('california-house-prices/sample_submission.csv')

# Training features: columns 3..39 of train.csv, restricted to non-object
# (numeric) columns and standardised column-wise to zero mean / unit std.
train_x = train_data.iloc[:, 3:40]
train_x_numeric_features = train_x.dtypes[train_x.dtypes != 'object'].index
train_x = train_x[train_x_numeric_features]
train_x = train_x.apply(
    lambda x : (x - x.mean()) / x.std()
)
# After standardisation the column mean is 0, so filling NaN with 0 imputes
# missing values with the mean.
train_x = train_x.fillna(0)
# Column 2 of train.csv is used as the regression target.
train_y = train_data.iloc[:, 2]

# Test features get the same treatment, using the test set's own statistics.
# NOTE(review): the columns kept here are *all* numeric columns of test.csv,
# whereas the training features come from columns 3:40 of train.csv -- confirm
# both end up with the same columns before feeding test_x to the network.
test_x_numeric_features = test_x.dtypes[test_x.dtypes != 'object'].index
test_x = test_x[test_x_numeric_features]
test_x = test_x.apply(
    lambda x : (x - x.mean()) / x.std()
)
# Fill the missing values of the *test* frame; assigning train_x.fillna(0) here
# would throw the test features away and replace them with the training set.
test_x = test_x.fillna(0)
# Column 1 of the sample submission is taken as the placeholder test target.
test_y = test_y.iloc[:, 1]

# Convert everything to float32 tensors for PyTorch.
train_x = torch.tensor(train_x.values, dtype = torch.float32)
train_y = torch.tensor(train_y.values, dtype = torch.float32)
test_x = torch.tensor(test_x.values, dtype = torch.float32)
test_y = torch.tensor(test_y.values, dtype = torch.float32)

In [3]:
# Mean-squared-error criterion; log_rmse applies it to log-transformed values.
loss = nn.MSELoss()

# One network input per feature column and a single scalar output.
num_inputs = train_x.shape[1]
num_outputs = 1

def log_rmse(y_hat, y):
    """Return the root-mean-squared error between log(y_hat) and log(y).

    Predictions are clamped to the range [1, inf) before the logarithm is
    taken; the targets ``y`` are passed to the logarithm unchanged. The squared
    error is computed by the module-level ``loss`` criterion.
    """
    floored_preds = y_hat.clamp(min=1, max=float('inf'))
    log_preds = floored_preds.log()
    log_targets = y.log()
    return loss(log_preds, log_targets).sqrt()

def get_net():
    """Build a new model: one linear layer wrapped in ``nn.Sequential``.

    The layer maps ``num_inputs`` features to ``num_outputs`` values, both read
    from module-level variables.
    """
    linear_layer = nn.Linear(in_features=num_inputs, out_features=num_outputs)
    return nn.Sequential(linear_layer)

def init_weights(m):
    """Re-initialise a linear layer in place; intended for ``Module.apply``.

    Weights are drawn from a normal distribution with mean 0 and std 1, and the
    bias (when the layer has one) is set to zero. Modules that are not
    ``nn.Linear`` -- e.g. the enclosing ``nn.Sequential`` -- are left untouched.

    Args:
        m: a module visited by ``Module.apply``.
    """
    # isinstance (rather than comparing type(m) with the nn.Module base class)
    # so that nn.Linear layers actually match and get re-initialised.
    if isinstance(m, nn.Linear):
        nn.init.normal_(m.weight, mean = 0, std = 1)
        # Layers created with bias=False have m.bias set to None.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Build the model and place it on the GPU when CUDA is available, otherwise
# keep it on the CPU so the cell does not fail on machines without a GPU.
net = get_net()
net = net.to(torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

交叉验证

In [4]:
def get_k_fold_data(k, i, train_x, train_y):
    """Split the data into ``k`` contiguous folds and return the ``i``-th split.

    Fold ``j`` covers rows ``[j * fold_size, (j + 1) * fold_size)`` with
    ``fold_size = n // k``; the last fold additionally absorbs the ``n % k``
    leftover rows so that every row is used in exactly one fold.

    Args:
        k: number of folds, must be greater than 1.
        i: index of the fold used for validation, ``0 <= i < k``.
        train_x: feature tensor, indexed by row along dimension 0.
        train_y: target tensor with the same number of rows as ``train_x``.

    Returns:
        ``(cross_train_x, cross_train_y, cross_validate_x, cross_validate_y)``
        where the training tensors are the concatenation (in order) of all
        folds except fold ``i``.
    """

    assert k > 1
    # An out-of-range i would otherwise yield None as validation data.
    assert 0 <= i < k

    num_samples = train_x.shape[0]
    fold_size = num_samples // k

    train_x_parts, train_y_parts = [], []
    cross_validate_x, cross_validate_y = None, None

    for j in range(k):

        start = j * fold_size
        # The last fold runs to the end so the n % k remainder is not dropped.
        stop = num_samples if j == k - 1 else start + fold_size
        x_part, y_part = train_x[start:stop], train_y[start:stop]

        if j == i:
            cross_validate_x, cross_validate_y = x_part, y_part
        else:
            train_x_parts.append(x_part)
            train_y_parts.append(y_part)

    # Concatenate once instead of growing the tensors inside the loop.
    cross_train_x = torch.cat(train_x_parts, dim = 0)
    cross_train_y = torch.cat(train_y_parts, dim = 0)

    return cross_train_x, cross_train_y, cross_validate_x, cross_validate_y

In [5]:
def train(epochs, train_iter, lr, weight_decay):
    """Re-initialise the global ``net`` and fit it with Adam on ``log_rmse``.

    Args:
        epochs: number of passes over ``train_iter``.
        train_iter: iterable yielding ``(x, y)`` mini-batches.
        lr: learning rate passed to Adam.
        weight_decay: weight-decay coefficient passed to Adam.

    Returns:
        The mean per-batch ``log_rmse`` of the last epoch, or ``nan`` when no
        batch was processed (``epochs == 0`` or an empty iterator).

    NOTE(review): ``log_rmse`` clamps predictions below 1, and clamped elements
    receive zero gradient. If the freshly initialised net predicts below 1 for
    every sample, no parameter gets a gradient signal -- confirm that training
    actually makes progress.
    """

    # Move batches to wherever the model lives instead of assuming a GPU.
    device = next(net.parameters()).device

    net.apply(init_weights)
    updater = torch.optim.Adam(net.parameters(), lr = lr, weight_decay = weight_decay)

    # Defined up front so that epochs == 0 returns a value instead of raising.
    current_loss = float('nan')

    for epoch in range(epochs):

        # metric[0]: sum of batch losses, metric[1]: number of loss values.
        metric = d2l.Accumulator(2)

        for x, y in train_iter:

            x = x.to(device)
            y = y.to(device)

            updater.zero_grad()
            y_hat = net(x)
            # Reshape the targets to the prediction shape to avoid broadcasting.
            l = log_rmse(y_hat, y.reshape(y_hat.shape))
            l.sum().backward()
            updater.step()

            metric.add(l.sum(), l.numel())

        # Guard against an empty iterator (no loss values accumulated).
        if metric[1]:
            current_loss = metric[0] / metric[1]

    return current_loss

def test(test_iter):
    """Evaluate the global ``net`` on ``test_iter`` without tracking gradients.

    Args:
        test_iter: iterable yielding ``(x, y)`` mini-batches.

    Returns:
        The mean per-batch ``log_rmse`` over the iterator, or ``nan`` when the
        iterator yields no batch.
    """

    # Move batches to wherever the model lives instead of assuming a GPU.
    device = next(net.parameters()).device

    # metric[0]: sum of batch losses, metric[1]: number of loss values.
    metric = d2l.Accumulator(2)

    with torch.no_grad():
        for x, y in test_iter:

            x = x.to(device)
            y = y.to(device)

            y_hat = net(x)
            # Reshape the targets to the prediction shape to avoid broadcasting.
            l = log_rmse(y_hat, y.reshape(y_hat.shape))

            metric.add(l.sum(), l.numel())

    # An empty iterator would otherwise divide by zero.
    if not metric[1]:
        return float('nan')

    return metric[0] / metric[1]

In [6]:
def k_fold(k, train_x, train_y, cross_batch_size, epochs, lr, weight_decay):
    """Run k-fold cross-validation and return the averaged losses.

    For each fold index the data is split with ``get_k_fold_data``, wrapped in
    data iterators via ``d2l.load_array``, fitted with ``train`` and scored on
    the held-out fold with ``test``.

    Returns:
        ``(mean_train_loss, mean_validation_loss)`` averaged over the k folds.
    """
    train_total = 0.0
    valid_total = 0.0

    for fold in range(k):
        fold_train_x, fold_train_y, fold_valid_x, fold_valid_y = get_k_fold_data(
            k, fold, train_x, train_y
        )

        train_loader = d2l.load_array(
            (fold_train_x, fold_train_y), batch_size=cross_batch_size
        )
        valid_loader = d2l.load_array(
            (fold_valid_x, fold_valid_y), batch_size=cross_batch_size, is_train=False
        )

        # Order matters: train() refits the shared net, test() then scores it.
        train_total = train_total + train(epochs, train_loader, lr, weight_decay)
        valid_total = valid_total + test(valid_loader)

    return train_total / k, valid_total / k

In [None]:
# Hyper-parameters for the cross-validation run.
cross_batch_size = 256
batch_size = 256  # not used by the call below
epochs = 500
lr = 1
weight_decay = 0.0

# 5-fold cross-validation; the cell displays (mean train loss, mean validation loss).
k_fold(
    k=5,
    train_x=train_x,
    train_y=train_y,
    cross_batch_size=cross_batch_size,
    epochs=epochs,
    lr=lr,
    weight_decay=weight_decay,
)