In [None]:
# Name of the row-identifier column; it is carried through the pipeline and
# written to the submission file, but never used as a model feature.
ID_ITEM = "id"
# Name of the target column.
LABEL_ITEM = "loan_paid_back"
# Columns skipped when the feature set is built.
IGNORE_ITEMS = [ID_ITEM, LABEL_ITEM]
# Doubles as (a) the distinct-value threshold below which a numeric column is
# treated as categorical and (b) the number of bins used for the remaining
# numeric columns.
VARIANCE_BINS = 24

# Model hyper-parameters passed to FeatureInteractionPredictor.
DIM_MODEL = 1
DIM_FEEDFORWARD = 1
NUM_LAYERS = 0

# Training hyper-parameters. The training loop runs for
# VAL_PER_STEP * NUM_VAL_CYCLES optimisation steps in total.
VAL_PER_STEP = 1024
NUM_VAL_CYCLES = 32
BATCH_SIZE = 512
LEARNING_RATE = 1e-5
WEIGHT_DECAY = 1e-2
DROPOUT = 0

In [None]:
# 数据预处理
import pandas as pd
import numpy as np
from collections.abc import Callable
from tqdm import tqdm

# Raw value types that can be standardised as continuous features.
ContinuousFeatureType = int | float
# Any raw value type a feature column may hold.
FeatureType = str | bool | ContinuousFeatureType


def create_feature_discretizer(
    feature: list[FeatureType],
    num_bins: int | None = None,
) -> tuple[Callable[[FeatureType], int], int]:
    """
    Build a function that maps raw feature values to integer indices.

    String features, and features with fewer than ``num_bins`` distinct
    values, are encoded with a lookup table. Every other feature is clipped to
    its 5th-95th percentile range and rounded into ``num_bins`` equal-width
    bins.

    Args:
        feature: Raw values of one feature column (must be non-empty).
        num_bins: Cardinality threshold and number of bins for high-cardinality
            numeric features. Defaults to the module-level ``VARIANCE_BINS``.

    Returns:
        - The discretizer: takes a raw value and returns its integer index.
        - The number of distinct indices the discretizer can produce.

    Raises:
        IndexError: If ``feature`` is empty.
        KeyError: Raised by a lookup-table discretizer when it is called with
            a value that did not appear in ``feature``.
    """
    bins = VARIANCE_BINS if num_bins is None else num_bins

    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # value -> index assignment is reproducible across interpreter runs
    # (iteration order of a set of strings depends on hash randomisation).
    distinct_values = dict.fromkeys(feature)

    if isinstance(feature[0], str) or len(distinct_values) < bins:
        # Categorical or low-cardinality numeric feature: direct lookup.
        value_to_index = {value: idx for idx, value in enumerate(distinct_values)}
        return (lambda token, mapping=value_to_index: mapping[token]), len(value_to_index)

    # High-cardinality numeric feature: bin on the 5%-95% percentile range.
    values = np.array(feature)
    lower_bound, upper_bound = np.percentile(values, [5, 95])
    # The epsilon keeps the division finite when both percentiles coincide.
    value_range = upper_bound - lower_bound + 1e-8

    def discretize(value, lower_bound=lower_bound, value_range=value_range, bins=bins) -> int:
        # Clip to [0, 1] so outliers land in the first or last bin, then round
        # to the nearest bin index.
        ratio = min(max((value - lower_bound) / value_range, 0), 1)
        return int(ratio * (bins - 1) + 0.5)

    return discretize, bins


def create_feature_standardizer(feature: list[ContinuousFeatureType]) -> tuple[Callable[[ContinuousFeatureType], float], Callable[[float], float]]:
    """
    Build a z-score transform and its inverse for one numeric feature.

    The mean and (population) standard deviation are computed from
    ``feature``. When the standard deviation is below 1e-8 the feature is
    treated as constant: the forward transform always yields 0.0 and the
    inverse always yields the mean.

    Args:
        feature: Numeric values of one feature column.

    Returns:
        - The standardizer: maps a raw value to ``(value - mean) / std``.
        - The inverse: maps a standardized value back to ``value * std + mean``.
    """
    samples = np.array(feature)
    center = np.mean(samples)
    spread = np.std(samples)

    if spread < 1e-8:
        # Degenerate (constant) feature: avoid dividing by ~zero.
        def constant_forward(_):
            return 0.0

        def constant_backward(_):
            return center

        return constant_forward, constant_backward

    def forward(value):
        return (value - center) / spread

    def backward(value):
        return value * spread + center

    return forward, backward

# Load the labelled training table and the unlabelled test table.
raw_data = pd.read_csv("/kaggle/input/playground-series-s5e11/train.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s5e11/test.csv")

# Fit one discretizer per feature column on the training data and record how
# many indices each one can emit.
feature_discretizers_and_bins = [
    (feature_name, create_feature_discretizer(list(feature)))
    for feature_name, feature in raw_data.items()
    if feature_name not in IGNORE_ITEMS
]
feature_discretizers = {feature_type: discretizer for feature_type, (discretizer, bin_count) in feature_discretizers_and_bins}
feature_bin_counts = {feature_type: bin_count for feature_type, (discretizer, bin_count) in feature_discretizers_and_bins}

# Fit a standardizer for every continuous column, i.e. every non-string column
# with at least VARIANCE_BINS distinct values. Each column is materialised
# only once.
feature_standardizers = {}
for feature_name, feature in raw_data.items():
    if feature_name in IGNORE_ITEMS:
        continue
    feature_values = list(feature)
    if isinstance(feature_values[0], str) or len(set(feature_values)) < VARIANCE_BINS:
        continue
    feature_standardizers[feature_name] = create_feature_standardizer(feature_values)[0]

# Order the feature columns: categorical-only columns first, continuous
# columns last, alphabetical within each group.
sorted_columns = sorted([
    feature_name
    for feature_name in raw_data
    if feature_name not in IGNORE_ITEMS
], key=lambda x: (x in feature_standardizers, x))

# Kept so that any other cell referencing these names keeps working. They are
# deliberately NOT applied to the stored labels: training uses
# binary_cross_entropy_with_logits, whose targets must stay in [0, 1], and a
# z-scored label would fall outside that range.
label_standardizer, label_inverser = create_feature_standardizer(raw_data[LABEL_ITEM])

# Convert both tables into lists of
# (id, discretized features, standardized features, label) tuples.
dataset = []
test_dataset = []
for source_data, target_dataset in [(raw_data, dataset), (test_data, test_dataset)]:
    has_label = LABEL_ITEM in source_data
    num_rows = len(source_data)

    # Transform column by column on plain Python lists; indexing a pandas
    # Series once per cell is far slower.
    item_ids = source_data[ID_ITEM].tolist()
    if has_label:
        labels = [float(label) for label in source_data[LABEL_ITEM].tolist()]
    else:
        # The test table has no label column; NaN is used as a placeholder.
        labels = [float("nan")] * num_rows

    # Every feature is discretized.
    discretized_columns = [
        [feature_discretizers[feature_name](value) for value in source_data[feature_name].tolist()]
        for feature_name in sorted_columns
    ]
    # Only continuous features are additionally standardized.
    standardized_columns = [
        [feature_standardizers[feature_name](value) for value in source_data[feature_name].tolist()]
        for feature_name in sorted_columns
        if feature_name in feature_standardizers
    ]

    for row_idx in tqdm(range(num_rows)):
        target_dataset.append((
            item_ids[row_idx],
            [column[row_idx] for column in discretized_columns],
            [column[row_idx] for column in standardized_columns],
            labels[row_idx],
        ))

# Split the labelled data 90% / 10% into training and validation sets
# (in file order, without shuffling).
split_point = int(len(dataset) * 0.9)
train_dataset = dataset[:split_point]
val_dataset = dataset[split_point:]


In [None]:
# 模型的定义
import torch
from torch import nn
from torch.nn import functional as F


class SwiGLU(nn.Module):
    """
    Residual gated feed-forward layer.

    The input is batch-normalised, projected to a gate half and a value half,
    combined as ``value * silu(gate)``, projected back to ``dim_model`` and
    added to the input after multiplication by a learnable scalar. The scalar
    starts at zero, so a freshly constructed layer returns its input unchanged.
    """

    def __init__(self, dim_model: int, dim_feedforward: int, dropout: float = 0.):
        super().__init__()
        # linear1 produces the gate and value halves with a single projection.
        self.linear1 = nn.Linear(dim_model, 2 * dim_feedforward)
        self.linear2 = nn.Linear(dim_feedforward, dim_model)
        self.scale = nn.Parameter(torch.zeros(1))
        self.norm = nn.BatchNorm1d(dim_model)
        self.dropout = nn.Dropout(dropout)

        # Xavier-uniform weights and zero biases for both projections.
        nn.init.xavier_uniform_(self.linear1.weight)
        nn.init.zeros_(self.linear1.bias)
        nn.init.xavier_uniform_(self.linear2.weight)
        nn.init.zeros_(self.linear2.bias)

    def forward(self, x: torch.Tensor):
        """Return ``x`` plus the scaled gated feed-forward update of ``x``."""
        normalized = self.norm(x)
        gate, value = torch.chunk(self.linear1(normalized), 2, dim=-1)
        gated = self.dropout(value * F.silu(gate))
        update = self.linear2(gated)
        return x + update * self.scale


class FeatureInteractionPredictor(nn.Module):
    """
    Predict one logit per row from discretized and continuous features.

    Each discretized feature is embedded; the embeddings are combined as a
    weighted sum, passed through the SwiGLU interaction layers and projected
    to a scalar that is multiplied by a learnable output scale. A linear
    projection of the continuous features is added on top.
    """

    def __init__(
        self,
        categorical_cardinalities: list[int],  # number of indices per discretized feature
        num_continuous: int,                   # number of continuous features
        dim_model: int,                        # hidden / embedding width
        dim_feedforward: int,                  # inner width of each SwiGLU layer
        num_layers: int,                       # number of SwiGLU layers
        dropout: float = 0.                    # dropout rate inside the SwiGLU layers
    ):
        super().__init__()
        self.dim_model = dim_model

        # Per-feature weight of each embedding in the sum. These start at ONE
        # (not zero): with `output_scale` and the predictor bias also starting
        # at zero, zero weights would make every gradient in the embedding
        # branch exactly zero and the branch could never begin to train.
        # The branch still contributes nothing at initialisation because
        # `output_scale` is zero.
        self.embedding_weights = nn.Parameter(torch.ones(len(categorical_cardinalities)))

        # Linear contribution of the continuous features.
        self.continuous_projection = nn.Linear(num_continuous, 1)

        # One embedding table per discretized feature.
        self.feature_embeddings = nn.ModuleList(
            nn.Embedding(num_categories, dim_model)
            for num_categories in categorical_cardinalities
        )

        # Stack of SwiGLU layers applied to the summed embeddings.
        self.interaction_layers = nn.ModuleList(
            SwiGLU(dim_model, dim_feedforward, dropout)
            for _ in range(num_layers)
        )

        # Maps the hidden representation to a scalar.
        self.predictor = nn.Linear(dim_model, 1)

        # Learnable multiplier on the embedding-branch output; starts at zero.
        self.output_scale = nn.Parameter(torch.zeros(1))

        # Zero biases, Xavier-uniform weights.
        nn.init.zeros_(self.predictor.bias)
        nn.init.zeros_(self.continuous_projection.bias)
        for module in [self.predictor, self.continuous_projection, *self.feature_embeddings]:
            nn.init.xavier_uniform_(module.weight)

    def forward(self, categorical_features: torch.LongTensor, continuous_features: torch.Tensor) -> torch.Tensor:
        """
        Compute one logit per row.

        Args:
            categorical_features: 2-D index tensor, one row per sample and one
                column per embedding table (the code iterates over
                ``categorical_features.T``).
            continuous_features: Float tensor whose last dimension is
                ``num_continuous``.

        Returns:
            Tensor with one value per row.
        """
        # Weighted sum of the per-feature embeddings.
        x = torch.zeros(categorical_features.size(0), self.dim_model, device=categorical_features.device)
        for feature_idx, feature_column in enumerate(categorical_features.T):
            embedding = self.feature_embeddings[feature_idx](feature_column)
            weight = self.embedding_weights[feature_idx]
            x = x + embedding * weight

        # Feature-interaction layers.
        for interaction_layer in self.interaction_layers:
            x = interaction_layer(x)

        # Project to a scalar and apply the learnable output scale.
        x = self.predictor(x).squeeze(-1) * self.output_scale

        # Add the linear contribution of the continuous features.
        x = x + self.continuous_projection(continuous_features).squeeze(-1)
        return x


In [None]:
# 数据集和 collate_fn
import json
from typing import Any
import pandas as pd
from torch.utils.data import Dataset


# 定义一个数据集
class FeatureInteractionDataset(Dataset):
    """Map-style dataset that serves rows from an in-memory list unchanged."""

    def __init__(self, dataset: list[tuple[int, list[int], list[float], float]]):
        """Keep a reference to ``dataset``; the list is not copied."""
        super().__init__()
        self.dataset = dataset

    def __len__(self) -> int:
        """Return the number of stored rows."""
        return len(self.dataset)

    def __getitem__(self, idx: int) -> tuple[int, list[int], list[float], float]:
        """Return the row stored at position ``idx``."""
        row = self.dataset[idx]
        return row


def collate_fn(batch: list[tuple[int, list[int], list[float], float]]):
    """
    Turn a list of row tuples into a list of per-field tensors.

    The rows are transposed so that each field becomes one tensor covering
    the whole batch. Floating-point tensors are cast to float32; all other
    tensors keep the dtype chosen by ``torch.tensor``.
    """
    collated = []
    for field_values in zip(*batch):
        tensor = torch.tensor(field_values)
        if tensor.is_floating_point():
            tensor = tensor.to(dtype=torch.float32)
        collated.append(tensor)
    return collated


In [None]:
# 训练循环
import torch
from tqdm import tqdm
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.amp import GradScaler, autocast
from torch.utils.tensorboard import SummaryWriter

# Use the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Build the model, the optimizer and the gradient scaler for mixed precision.
model = FeatureInteractionPredictor(
    [feature_bin_counts[feature_name] for feature_name in sorted_columns],
    len(feature_standardizers),
    DIM_MODEL, DIM_FEEDFORWARD, NUM_LAYERS, DROPOUT
).to(device)
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
scaler = GradScaler(device)

# Build the train / validation / test loaders. Only the training loader is
# shuffled; the evaluation loaders keep file order.
train_loader, val_loader, test_loader = [
    DataLoader(
        FeatureInteractionDataset(dataset),
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=collate_fn,
    )
    for dataset, batch_size, shuffle in [
        (train_dataset, BATCH_SIZE, True),
        (val_dataset, BATCH_SIZE * 2, False),
        (test_dataset, BATCH_SIZE * 2, False),
    ]
]

# Without this guard an empty training set would make the loop below spin
# forever, because the step counter could never advance.
if len(train_loader) == 0:
    raise ValueError("train_loader is empty; nothing to train on")

# Training loop: run exactly `total_steps` optimisation steps, cycling through
# the training data as many times as needed.
total_steps = VAL_PER_STEP * NUM_VAL_CYCLES
current_steps = 0
progress_bar = tqdm(desc="Train", total=total_steps)
model.train()
while current_steps < total_steps:
    for batch in train_loader:
        # Drop the id column (index 0); the last remaining tensor is the label.
        batch = [x.to(device) for x in batch[1:]]

        # Clear gradients left over from the previous step; otherwise
        # backward() keeps adding onto them.
        optimizer.zero_grad(set_to_none=True)
        with autocast(device, dtype=torch.float16):
            logits = model(*batch[:-1])
            loss = F.binary_cross_entropy_with_logits(logits, batch[-1])
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm threshold applies to the true
        # gradients.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        # Update the step counter and the progress bar.
        current_steps += 1
        progress_bar.update()

        # Stop mid-epoch once the step budget is used up.
        if current_steps >= total_steps:
            break
progress_bar.close()

In [None]:
# 跑一遍结果
from tqdm import tqdm
from torch.nn import functional as F
from torch.utils.data import DataLoader

# Switch to evaluation mode (affects dropout / batch-norm layers).
model.eval()
with open("/kaggle/working/submission.csv", "w") as f:
    # CSV header: both column names followed by a newline, so the first data
    # row starts on its own line.
    f.write(f"{ID_ITEM},{LABEL_ITEM}\n")
    with torch.inference_mode():
        for batch in tqdm(test_loader):
            # Field 0 holds the row ids; the last field is the placeholder
            # label, which the model does not take as input.
            data_indices = batch[0].tolist()
            batch = [x.to(device) for x in batch[1:-1]]
            # Convert logits to probabilities.
            outputs = torch.sigmoid(model(*batch))
            f.writelines(
                f"{data_idx},{output}\n"
                for data_idx, output in zip(data_indices, outputs.cpu().tolist())
            )