In [42]:
import pandas as pd
from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import torch
import torch.nn.functional as F
from torch import Tensor
from torch.nn import Linear, Module, ModuleList

from torch_frame.data import Dataset, DataLoader
from torch_frame import TensorFrame, stype
from torch_frame.nn.conv import TabTransformerConv
from torch_frame.nn.encoder import (
    EmbeddingEncoder,
    LinearEncoder,
    StypeWiseFeatureEncoder,
)

def prepare_dataset(df):
    # For categorical features, replace NA with "Missing"
    categorical_columns = df.select_dtypes(include=['object']).columns
    for key in categorical_columns:
        df[key] = df[key].astype('category').cat.add_categories("Missing").fillna("Missing")
    # categorical_columns = df.select_dtypes(include=['category']).columns
    return df

In [18]:
train_file_path = "data/house-prices/train.csv"
dataset_df = pd.read_csv(train_file_path)
dataset_df = dataset_df.drop('Id', axis=1)
target_column = 'SalePrice'
dataset_df = prepare_dataset(dataset_df)
col_to_stype = {key: stype.categorical for key in dataset_df.select_dtypes(include=['category']).columns.to_list()}
col_to_stype.update({key: stype.numerical for key in dataset_df.select_dtypes(exclude=['category']).columns.to_list()})
dataset = Dataset(dataset_df, col_to_stype=col_to_stype, target_col=target_column)
dataset.materialize()

dataset.tensor_frame

TensorFrame(
  num_cols=79,
  num_rows=1460,
  categorical (43): ['Alley', 'BldgType', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual', 'CentralAir', 'Condition1', 'Condition2', 'Electrical', 'ExterCond', 'ExterQual', 'Exterior1st', 'Exterior2nd', 'Fence', 'FireplaceQu', 'Foundation', 'Functional', 'GarageCond', 'GarageFinish', 'GarageQual', 'GarageType', 'Heating', 'HeatingQC', 'HouseStyle', 'KitchenQual', 'LandContour', 'LandSlope', 'LotConfig', 'LotShape', 'MSZoning', 'MasVnrType', 'MiscFeature', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofMatl', 'RoofStyle', 'SaleCondition', 'SaleType', 'Street', 'Utilities'],
  numerical (36): ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtFullBath', 'BsmtHalfBath', 'BsmtUnfSF', 'EnclosedPorch', 'Fireplaces', 'FullBath', 'GarageArea', 'GarageCars', 'GarageYrBlt', 'GrLivArea', 'HalfBath', 'KitchenAbvGr', 'LotArea', 'LotFrontage', 'LowQualFinSF', 'MSSubClass', 'MasVnrArea', 'MiscVal', '

In [43]:
class ExampleTransformer(Module):
    def __init__(
        self,
        channels, out_channels, num_layers, num_heads,
        col_stats, col_names_dict,
    ):
        super().__init__()
        self.encoder = StypeWiseFeatureEncoder(
            out_channels=channels,
            col_stats=col_stats,
            col_names_dict=col_names_dict,
            stype_encoder_dict={
                stype.categorical: EmbeddingEncoder(),
                stype.numerical: LinearEncoder()
            },
        )
        self.convs = ModuleList([
            TabTransformerConv(
                channels=channels,
                num_heads=num_heads,
            ) for _ in range(num_layers)
        ])
        self.decoder = Linear(channels, out_channels)

    def forward(self, tf: TensorFrame) -> Tensor:
        x, _ = self.encoder(tf)
        for conv in self.convs:
            x = conv(x)
        out = self.decoder(x.mean(dim=1))
        return out

In [47]:
stype_encoder_dict = {
    stype.categorical: EmbeddingEncoder(),
    stype.numerical: LinearEncoder(),
}

device = 'cuda:5'

from torch_frame.nn.models.ft_transformer import FTTransformer

# model = FTTransformer(
#     channels=16,
#     out_channels=1,
#     num_layers=2,
#     col_stats=dataset.col_stats,
#     col_names_dict=dataset.tensor_frame.col_names_dict,
#     stype_encoder_dict=stype_encoder_dict,
# ).to(device)

model = ExampleTransformer(
    channels=32,
    out_channels=1,
    num_layers=2,
    num_heads=8,
    col_stats=dataset.col_stats,
    col_names_dict=dataset.tensor_frame.col_names_dict,
).to(device)

train_loader = DataLoader(dataset.tensor_frame, batch_size=128,
                          shuffle=True)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-6)

for epoch in range(500):
    print('===Epoch {}==='.format(epoch))
    total_loss = 0
    total_bsz = 0
    for tf in train_loader:
        tf = tf.to(device)
        pred = model.forward(tf)
        loss = F.mse_loss(pred, tf.y)
        bsz = len(tf.y)
        total_loss += loss.item() * bsz
        total_bsz += bsz
        optimizer.zero_grad()
        loss.backward()
    print('average loss = {}'.format(total_loss / total_bsz))

===Epoch 0===
average loss = 39039305767.27671
===Epoch 1===
average loss = 39039304487.97808
===Epoch 2===
average loss = 39039305469.89589
===Epoch 3===
average loss = 39039304673.139725
===Epoch 4===
average loss = 39039304431.86849
===Epoch 5===
average loss = 39039304246.70685
===Epoch 6===
average loss = 39039305138.84931
===Epoch 7===
average loss = 39039305251.0685
===Epoch 8===
average loss = 39039305385.73151
===Epoch 9===
average loss = 39039305015.40822
===Epoch 10===
average loss = 39039304555.309586
===Epoch 11===
average loss = 39039304465.53425
===Epoch 12===
average loss = 39039303982.99178
===Epoch 13===
average loss = 39039303926.882195
===Epoch 14===
average loss = 39039305290.34521
===Epoch 15===
average loss = 39039303921.27123
===Epoch 16===
average loss = 39039305800.94247
===Epoch 17===
average loss = 39039304925.63287
===Epoch 18===
average loss = 39039305004.1863
===Epoch 19===
average loss = 39039305991.715065
===Epoch 20===
average loss = 39039304454.31233


KeyboardInterrupt: 

In [37]:
model.forward(dataset.tensor_frame.to(device))

tensor([[-0.3355],
        [ 0.2470],
        [ 0.1111],
        ...,
        [-0.3686],
        [ 0.2081],
        [-0.1134]], device='cuda:5', grad_fn=<AddmmBackward0>)

In [25]:
test_file_path = "data/house-prices/test.csv"
test_data = pd.read_csv(test_file_path)
ids = test_data.pop('Id')

test_data = prepare_dataset(test_data)
col_to_stype = {key: stype.categorical for key in test_data.select_dtypes(include=['category']).columns.to_list()}
col_to_stype.update({key: stype.numerical for key in test_data.select_dtypes(exclude=['category']).columns.to_list()})
test_dataset = Dataset(test_data, col_to_stype=col_to_stype)
test_dataset.materialize()

model(test_dataset.tensor_frame.to(device))

tensor([[-0.0792],
        [-0.2398],
        [-0.0740],
        ...,
        [-0.6281],
        [-0.3431],
        [-0.5273]], device='cuda:5', grad_fn=<AddmmBackward0>)