In [None]:
# Turn off the Jedi-based completer; per the original note, this is what makes
# automatic code completion work in a Kaggle Notebook.
%config Completer.use_jedi = False

In [None]:
%matplotlib inline

import os

import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Device configuration: query CUDA availability once, report it, and derive the device from it
cuda_ok = torch.cuda.is_available()
print(f"GPU available: {cuda_ok}")
device = torch.device('cuda') if cuda_ok else torch.device('cpu')

# Load Data

In [None]:
# Read the labeled (train.csv) and submission (test.csv) tables from the Kaggle input folder
base_path = '/kaggle/input/home-data-for-ml-course'
labeled_data, submit_data = (
    pd.read_csv(os.path.join(base_path, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# pandas treats the literal text "None" as a missing value when parsing a CSV.
# In the MasVnrType column "None" is a genuine category, so every NA in that
# column is turned back into the string 'None' in both tables.
for frame in (labeled_data, submit_data):
    frame['MasVnrType'] = frame['MasVnrType'].fillna('None')

In [None]:
# Sanity check: print the (rows, columns) shape of both tables ...
print(f'{labeled_data.shape = }')
print(f'{submit_data.shape = }')
# ... and peek at the first 4 rows, restricted to the first four and last three columns.
print(labeled_data.iloc[:4, [0, 1,2, 3, -3, -2, -1]])

In [None]:
# Pull the target column (last column of the labeled table) and the submission
# ids (first column of the submission table) out of the raw tables.
labels = labeled_data.iloc[:, -1]
submit_idxs = submit_data.iloc[:, 0]

# Stack the feature columns of both tables row-wise so they are preprocessed together.
all_features = pd.concat(
    [
        labeled_data.iloc[:, 1:-1],    # labeled rows: drop the leading ID column and the trailing label
        submit_data.iloc[:, 1:],       # submission rows: drop the leading ID column
    ],
    axis=0,
)

In [None]:
# Data Preprocessing

# 1. Preprocess Numeric Features
# Pick numeric columns by dtype family instead of `dtype != 'object'`, so text columns
# are never selected here regardless of which string dtype pandas inferred for them.
numeric_features = all_features.select_dtypes(include='number').columns
# 1.1 Standardization: rescale each numeric column to zero mean and unit standard deviation
all_features[numeric_features] = all_features[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
# 1.2 Mean imputation: after standardization every column mean is 0, so filling NaN with 0
#     is mean imputation (this also covers the 0/0 NaN produced by a constant column)
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# 2. Preprocess Non-numeric Features
# One-hot encode the remaining columns. pandas emits the indicator columns as bool; leaving
# them next to the numeric columns gives a mixed-dtype table that cannot be converted to a
# torch.Tensor, so the whole table is cast to float32.
all_features = pd.get_dummies(all_features, dummy_na=True).astype(np.float32)   # dummy_na=True: treat NA as a valid category and create an indicator column for it

In [None]:
# Print the (rows, columns) shape of the combined feature table
print(f'{all_features.shape = }')

In [None]:
# Transforms to torch.Tensor
# The label and id columns are read straight from the source DataFrames rather than from
# the `labels` / `submit_idxs` names, because this cell rebinds those names to tensors.
# Reading from the DataFrames keeps the cell re-runnable: on a second run the old code
# would hit `Tensor.values`, which is a method and has no `.reshape`.
labeled_size = labeled_data.shape[0]
# Rows [0, labeled_size) of the combined table are the labeled rows; the rest are submission rows.
labeled_features = torch.tensor(all_features.iloc[:labeled_size].values, dtype=torch.float32)
submit_features = torch.tensor(all_features.iloc[labeled_size:].values, dtype=torch.float32)
# Both id and label vectors are stored as column vectors of shape (N, 1).
submit_idxs = torch.tensor(submit_data.iloc[:, 0].values.reshape(-1, 1), dtype=torch.int32)
labels = torch.tensor(labeled_data.iloc[:, -1].values.reshape(-1, 1), dtype=torch.float32)

In [None]:
# Shapes of the labeled feature/label tensors, then of the submission feature tensor
print(labeled_features.size(), labels.size())
print(submit_features.size())

In [None]:
# Hold-out split in file order (no shuffling): the first 80% of the labeled rows are used
# for training, the remaining 20% for testing.
train_size = int(len(labeled_data) * 0.8)
train_features, test_features = labeled_features[:train_size], labeled_features[train_size:]
train_labels, test_labels = labels[:train_size], labels[train_size:]

In [None]:
# Shapes of the training split, then of the testing split (features, labels)
print(train_features.size(), train_labels.size())
print(test_features.size(), test_labels.size())

In [None]:
# Mini-batch iterators. The training and full-labeled loaders reshuffle every epoch;
# the testing loader keeps a fixed order.
batch_size = 64
train_iter = DataLoader(
    dataset=TensorDataset(train_features, train_labels),
    batch_size=batch_size,
    shuffle=True,
)
test_iter = DataLoader(
    dataset=TensorDataset(test_features, test_labels),
    batch_size=batch_size,
    shuffle=False,
)
labeled_iter = DataLoader(
    dataset=TensorDataset(labeled_features, labels),
    batch_size=batch_size,
    shuffle=True,
)

# Design Model Architecture

In [None]:
# Model hyper-parameters: layer widths are taken from the tensors' second dimension
inputs_num = labeled_features.size(1)
outputs_num = labels.size(1)
hidden_size1, dropout1 = 2048, 0.2

In [None]:
# Model architecture: one hidden layer (Linear -> ReLU -> Dropout -> Linear)
mlp = nn.Sequential(
    nn.Linear(in_features=inputs_num, out_features=hidden_size1),
    nn.ReLU(),
    nn.Dropout(p=dropout1),
    nn.Linear(in_features=hidden_size1, out_features=outputs_num),
)

# Train Model

In [None]:
# Training hyper-parameters, criterion and optimizer
num_epochs, lr, weight_decay = 110, 1e-1, 4e-1
net = mlp                 # `net` is another name for the same module object, not a copy
loss = nn.MSELoss()       # training criterion: mean squared error on the raw targets
optimizer = torch.optim.Adam(params=net.parameters(), lr=lr, weight_decay=weight_decay)

In [None]:
# Define log RMSE
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Predictions are clamped to a minimum of 1 before the logarithm so that
    non-positive outputs cannot produce NaN/-inf (a clamped prediction
    contributes log(1) = 0 on the prediction side).

    The mean squared error is computed with ``F.mse_loss`` rather than the
    notebook-level ``loss`` object, so this metric stays an RMSE even if the
    training criterion is changed. It runs under ``torch.no_grad()`` because
    the value is only reported, never back-propagated.

    Args:
        net: callable mapping ``features`` to predictions.
        features: input tensor passed to ``net``.
        labels: target tensor; its logarithm is taken as-is, without clamping.

    Returns:
        The log-RMSE as a Python float.
    """
    with torch.no_grad():
        clipped_preds = torch.clamp(net(features), min=1.0)
        mse = F.mse_loss(torch.log(clipped_preds), torch.log(labels))
    return torch.sqrt(mse).item()

In [None]:
# Start Training
def evaluate_by_test_dataset(net, test_iter):
    """Compute the log-RMSE of ``net`` over every sample yielded by ``test_iter``.

    The squared log-error is summed over all batches and the square root is
    taken once at the end. This is the RMSE of the whole dataset; averaging
    per-batch RMSE values gives a smaller, batch-size-dependent number.

    Predictions are clamped to a minimum of 1 before the logarithm so that
    non-positive outputs cannot produce NaN/-inf.

    Args:
        net: module to evaluate; it is switched to evaluation mode and left there.
        test_iter: iterable of ``(X, y)`` batches.

    Returns:
        A 0-dim float32 tensor holding the log-RMSE (NaN when ``test_iter``
        yields no samples).
    """
    net.eval()    # Set model to evaluating mode
    sq_log_err_sum, sample_count = 0.0, 0
    with torch.no_grad():
        for X, y in test_iter:
            log_diff = torch.log(torch.clamp(net(X), min=1.0)) - torch.log(y)
            sq_log_err_sum += (log_diff ** 2).sum().item()
            sample_count += y.numel()
    if sample_count == 0:
        # Nothing was evaluated: report NaN rather than raising on 0 / 0.
        return torch.tensor(float('nan'), dtype=torch.float32)
    return torch.tensor((sq_log_err_sum / sample_count) ** 0.5, dtype=torch.float32)

def train_epoch(net, train_iter, loss, optimizer):
    """Run one optimisation pass over ``train_iter`` and report the epoch's log-RMSE.

    For every batch the criterion ``loss`` is minimised with one optimizer
    step. After the step, the batch is passed through the network again (still
    in training mode) to accumulate the squared log-error. That second pass
    runs under ``torch.no_grad()`` since it is only used for reporting.

    The squared log-error is summed over all batches and the square root is
    taken once at the end, so the result is the RMSE over all samples seen in
    the epoch rather than an average of per-batch RMSE values.

    Args:
        net: module to train; it is switched to training mode.
        train_iter: iterable of ``(X, y)`` batches.
        loss: criterion called as ``loss(prediction, target)``.
        optimizer: optimizer holding ``net``'s parameters.

    Returns:
        A 0-dim float32 tensor holding the epoch's log-RMSE (NaN when
        ``train_iter`` yields no samples).
    """
    net.train()    # Set model to training mode
    sq_log_err_sum, sample_count = 0.0, 0
    for X, y in train_iter:
        l = loss(net(X), y)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        # Metric on the just-updated weights; predictions are clamped to >= 1 before the log.
        with torch.no_grad():
            log_diff = torch.log(torch.clamp(net(X), min=1.0)) - torch.log(y)
        sq_log_err_sum += (log_diff ** 2).sum().item()
        sample_count += y.numel()
    if sample_count == 0:
        # Nothing was trained on: report NaN rather than raising on 0 / 0.
        return torch.tensor(float('nan'), dtype=torch.float32)
    return torch.tensor((sq_log_err_sum / sample_count) ** 0.5, dtype=torch.float32)

def train_in_training_dataset(net, train_iter, test_iter, loss, optimizer, num_epochs):
    """Train for ``num_epochs`` epochs, evaluating on ``test_iter`` after each one.

    Every 5th epoch a progress line with the current train and test metric is
    printed.

    Returns:
        ``(train_loss_trace, test_loss_trace)``: two lists with one entry per
        epoch, holding the values returned by ``train_epoch`` and
        ``evaluate_by_test_dataset`` respectively.
    """
    history = {'train': [], 'test': []}
    for epoch in range(num_epochs):
        train_loss = train_epoch(net, train_iter, loss, optimizer)
        test_loss = evaluate_by_test_dataset(net, test_iter)
        history['train'].append(train_loss)
        history['test'].append(test_loss)

        if (epoch + 1) % 5 != 0:
            continue    # only report every 5th epoch
        print(f"epoch {epoch + 1:0>{len(str(num_epochs))}}/{num_epochs}: ", end="")
        print(f"train_loss {train_loss:>10.8f}, test_loss {test_loss:>10.8f}")
    return history['train'], history['test']

In [None]:
# Fit on the training split while tracking the metric on the held-out testing split
train_loss_trace, test_loss_trace = train_in_training_dataset(
    net=net,
    train_iter=train_iter,
    test_iter=test_iter,
    loss=loss,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

In [None]:
# Plot the per-epoch training and testing metric on one set of axes
epoch_xs = np.arange(num_epochs) + 1    # epochs are numbered from 1
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(epoch_xs, train_loss_trace, 'bo-', label='Training Loss')
ax.plot(epoch_xs, test_loss_trace, 'ro-', label='Testing Loss')
ax.set_title('Training and Testing Loss over Epochs')
ax.set_xlabel('Epoch')
ax.set_ylabel('Log RMSE Loss')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Train in total labeled dataset
def train_in_labeled_dataset(net, labeled_iter, loss, optimizer, num_epochs):
    """Train on ``labeled_iter`` for ``num_epochs`` epochs without any evaluation pass.

    Every 5th epoch a progress line with the current train metric is printed.

    Returns:
        A list with one entry per epoch, holding the value returned by
        ``train_epoch``.
    """
    epoch_losses = []
    for epoch in range(num_epochs):
        train_loss = train_epoch(net, labeled_iter, loss, optimizer)
        epoch_losses.append(train_loss)

        if (epoch + 1) % 5 != 0:
            continue    # only report every 5th epoch
        print(f"epoch {epoch + 1:0>{len(str(num_epochs))}}/{num_epochs}: ", end="")
        print(f"train_loss {train_loss:>10.8f}")
    return epoch_losses

def reset_weights(m):
    """Call ``m.reset_parameters()`` when ``m`` has such an attribute; otherwise do nothing.

    Intended to be passed to ``Module.apply`` so it is invoked on each submodule.
    """
    if not hasattr(m, 'reset_parameters'):
        return
    m.reset_parameters()

# Start the final fit from scratch: re-initialise every layer that exposes
# `reset_parameters`, and build a fresh optimizer so that Adam's running moment
# estimates and step counts from the earlier training run are not carried over
# onto the newly initialised weights.
net.apply(reset_weights)
optimizer = torch.optim.Adam(net.parameters(), lr=lr, weight_decay=weight_decay)
train_loss_trace = train_in_labeled_dataset(net, labeled_iter, loss, optimizer, num_epochs)

# Generate Submission

In [None]:
# Model prediction
def predict(net, submit_features):
    """Return ``net(submit_features)`` computed in evaluation mode with gradients disabled.

    ``net`` is switched to evaluation mode and is left in that mode.
    """
    net.eval()
    with torch.no_grad():
        return net(submit_features)

# Predict on the submission features and force a column-vector layout (N, 1)
prediction_rst = torch.reshape(predict(net, submit_features), (-1, 1))
print(submit_idxs.size(), prediction_rst.size())

In [None]:
# Flatten ids and predictions to 1-D arrays and write them as a two-column CSV without the index
df = pd.DataFrame(
    data={
        'Id': submit_idxs.cpu().numpy().ravel(),
        'SalePrice': prediction_rst.cpu().numpy().ravel(),
    }
)
df.to_csv(path_or_buf='submission.csv', index=False)