In [4]:
import itertools
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from collections import OrderedDict, namedtuple, defaultdict
import os

def get_metrics(loader, model):
    pred, target = [], []  # 初始化预测结果和真实标签的列表
    model.eval()

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device).float(), y.to(device).float()
            y_pred = model(x)
            pred.append(y_pred.cpu().numpy())
            target.append(y.cpu().numpy())

    target = np.concatenate(target)
    pred = np.concatenate(pred)

    logloss = log_loss(target, pred)  # 计算 log loss
    auc = roc_auc_score(target, pred)  # 计算 AUC
    return logloss, auc

class MLP(nn.Module):
    def __init__(self, input_dim, mlp_layers, dropout=0.5, output_layer=False):
        super(MLP, self).__init__()
        layers = list()
        for i in mlp_layers:
            layers.append(torch.nn.Linear(input_dim, i))
            layers.append(torch.nn.BatchNorm1d(i))
            layers.append(torch.nn.ReLU())
            layers.append(torch.nn.Dropout(p=dropout))
            input_dim = i

        if output_layer:
            layers.append(torch.nn.Linear(input_dim, 1))
        self.mlp = torch.nn.Sequential(*layers)
        self._init_weight_()

    def _init_weight_(self):
        for m in self.mlp:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, x):
        return self.mlp(x)


class SENETLayer(nn.Module):
    def __init__(self, filed_size, reduction_ratio):
        super(SENETLayer, self).__init__()
        self.reduction_size = max(1, filed_size // reduction_ratio)
        self.excitation = nn.Sequential(
            nn.Linear(filed_size, self.reduction_size, bias=False),
            nn.ReLU(),
            nn.Linear(self.reduction_size, filed_size, bias=False),
            nn.ReLU(),
        )

    def forward(self, inputs):
        Z = torch.mean(inputs, dim=2)
        A = self.excitation(Z)
        V = torch.mul(inputs, torch.unsqueeze(A, dim=2))
        return V


class BilinearInteraction(nn.Module):
    def __init__(self, filed_size, embedding_size, bilinear_type='each'):
        super(BilinearInteraction, self).__init__()
        self.bilinear_type = bilinear_type
        self.bilinear = nn.ModuleList()
        if self.bilinear_type == 'all':
            self.bilinear = nn.Linear(embedding_size, embedding_size, bias=False)
        elif self.bilinear_type == 'each':
            for _ in range(filed_size):
                self.bilinear.append(nn.Linear(embedding_size, embedding_size, bias=False))
        elif self.bilinear_type == 'interaction':
            for _, _ in itertools.combinations(range(filed_size), 2):
                self.bilinear.append(nn.Linear(embedding_size, embedding_size, bias=False))

    def forward(self, inputs):
        inputs = torch.split(inputs, 1, dim=1)
        if self.bilinear_type == 'all':
            p = [torch.mul(self.bilinear(v_i), v_j) for v_i, v_j in itertools.combinations(inputs, 2)]
        elif self.bilinear_type == 'each':
            p = [torch.mul(self.bilinear[i](inputs[i]), inputs[j]) for i, j in
                 itertools.combinations(range(len(inputs)), 2)]
        elif self.bilinear_type == 'interaction':
            p = [torch.mul(bilinear(v[0]), v[1]) for v, bilinear in
                 zip(itertools.combinations(inputs, 2), self.bilinear)]
        return torch.cat(p, dim=1)



class CrossNet(nn.Module):
    def __init__(self, input_dim, layer_num):
        super(CrossNet, self).__init__()
        self.layer_num = layer_num
        self.w = torch.nn.ModuleList([
            torch.nn.Linear(input_dim, 1, bias=False) for _ in range(layer_num)
        ])
        self.b = torch.nn.ParameterList([torch.nn.Parameter(
            torch.zeros((input_dim,))) for _ in range(layer_num)])

    def forward(self, x):
        x0 = x
        for i in range(self.layer_num):
            xw = self.w[i](x)
            x = x0 * xw + self.b[i] + x
        return x

class MSB_CMNet(nn.Module):
    def __init__(self, feat_size, embedding_size, linear_feature_columns, dnn_feature_columns,
                 mlp_layers=(256, 128), num_heads=2,reduction_ratio=1,  cross_num=3, drop_rate=0.5):
        super(MSB_CMNet, self).__init__()
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(drop_rate)
        self.sparse_feature_columns = list(filter(lambda x: x[1] == 'sparse', dnn_feature_columns))
        self.embedding_dic = nn.ModuleDict({
            feat[0]: nn.Embedding(feat_size[feat[0]], embedding_size, sparse=False) for feat in
            self.sparse_feature_columns
        })
        self.dense_feature_columns = list(filter(lambda x: x[1] == 'dense', dnn_feature_columns))

        self.feature_index = defaultdict(int)
        # 为特征名称建立索引位置（整数）的映射关系，并从 0 开始递增，确保不同特征拥有不同的索引位置。
        start = 0
        for feat in feat_size:
            self.feature_index[feat] = start
            start += 1

        # 引入多头注意力机制
        self.multi_head_attention = nn.MultiheadAttention(embedding_size, num_heads)
        self.field_size = len(self.embedding_dic)
        self.SE = SENETLayer(self.field_size, reduction_ratio)
        self.Bilinear = BilinearInteraction(self.field_size, embedding_size)
        dim = self.field_size * (self.field_size - 1) * embedding_size + len(self.dense_feature_columns)
        self.mlp = MLP(dim, mlp_layers)
        self.crossnet = CrossNet(dim, cross_num)
        self.dnn_linear = nn.Linear(dim + mlp_layers[-1], 1, bias=False)

    def forward(self, X):
        sparse_embedding = [
            self.embedding_dic[feat[0]](X[:, self.feature_index[feat[0]]].long()).reshape(X.shape[0], 1, -1)
            for feat in self.sparse_feature_columns]
        sparse_input = torch.cat(sparse_embedding, dim=1)
        multi_head_output, _ = self.multi_head_attention(sparse_input, sparse_input, sparse_input)
        dense_values = [X[:, self.feature_index[feat[0]]].reshape(-1, 1) for feat in self.dense_feature_columns]
        dense_input = torch.cat(dense_values, dim=1)
        senet_output = self.SE(sparse_input)
        senet_bilinear_out = self.Bilinear(senet_output)
        bilinear_out = self.Bilinear(multi_head_output)
        mlp_crossnet_input = torch.flatten(torch.cat((senet_bilinear_out, bilinear_out), dim=1), start_dim=1)
        final_mlp_crossnet_input = torch.cat((mlp_crossnet_input, dense_input), dim=1)
        dnn_out = self.mlp(final_mlp_crossnet_input)
        cross_out = self.crossnet(final_mlp_crossnet_input)
        final_out = torch.cat((cross_out, dnn_out), dim=-1)
        final_logit = self.dnn_linear(final_out)
        y_pred = torch.sigmoid(final_logit)
        return y_pred


if __name__ == '__main__':

    batch_size = 1024
    lr = 1e-3
    wd = 1e-5
    epoches = 12
    seed = 2022
    embedding_size = 20
    #device = 'cuda:0'
    device = 'cpu'
    # pd.set_option('display.max_rows', None)
    data = pd.read_csv('vehicle_data_model_50w_3.csv')
    dense_feature = ['driver_auth_success_days', 'cargo_search_cnt_3', 'cargo_search_cnt_7', 'scan_cargo_cnt_3',
                     'scan_cargo_cnt_7', 'click_cargo_cnt_3_x', 'click_cargo_cnt_7', 'call_cnt_3_driver',
                     'call_cnt_7_driver',
                     'shipper_auth_success_days', 'exposure_cargo_cnt_3', 'exposure_cnt_3', 'click_cargo_cnt_3_y',
                     'click_cnt_3', 'cargo_weight', 'vector_regular_subscribe_line',
                     'vector_regular_cargo_line_all', 'vector_regular_cargo_truck_type_all',
                     'vector_regular_cargo_truck_length_all', 'vector_regular_cargo_line_30',
                     'vector_regular_cargo_truck_type_30', 'vector_regular_cargo_truck_length_30']

    sparse_feature = data.drop(columns=['label'] + dense_feature).columns.tolist()
    # print(len(sparse_feature))
    print('数据维度:', data.shape, '标签比例:', data['label'].value_counts())
    data[sparse_feature] = data[sparse_feature].astype('uint8')
    target = ['label']

    feat_sizes = {}  # 初始化一个空字典 feat_sizes。
    feat_sizes_dense = {feat: 1 for feat in dense_feature}
    # 对每个稀疏特征创建一个键值对，键为特征名称，值为该特征在数据中唯一取值的数量（即不同的类别个数）。
    feat_sizes_sparse = {feat: len(data[feat].unique()) for feat in sparse_feature}
    # 将稠密特征和稀疏特征的维度大小更新到 feat_sizes 字典中，得到包含所有特征维度大小信息的字典 feat_sizes。
    feat_sizes.update(feat_sizes_dense)
    feat_sizes.update(feat_sizes_sparse)

    # 定义fixlen_feature_columns，包含了所有特征的名称和类型。
    fixlen_feature_columns = [(feat, 'sparse') for feat in sparse_feature] + [(feat, 'dense') for feat in dense_feature]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    # 数据集划分
    train_val, test = train_test_split(data, test_size=0.1, random_state=seed)
    train, validation = train_test_split(train_val, test_size=0.1, random_state=seed)

    # 每个数据集中 0 和 1 标签的数量
    train_label_summary = train['label'].value_counts()
    validation_label_summary = validation['label'].value_counts()
    test_label_summary = test['label'].value_counts()

    # 输出结果
    print("训练集标签汇总:\n", train_label_summary)
    print("验证集标签汇总:\n", validation_label_summary)
    print("测试集标签汇总:\n", test_label_summary)


    # DataLoader准备
    def create_data_loader(df, batch_size):
        labels = pd.DataFrame(df['label'])
        features = df.drop(columns=['label'])
        tensor_data = TensorDataset(torch.from_numpy(np.array(features)), torch.from_numpy(np.array(labels)))
        return DataLoader(tensor_data, shuffle=True, batch_size=batch_size)


    train_loader = create_data_loader(train, batch_size)
    validation_loader = create_data_loader(validation, batch_size)
    test_loader = create_data_loader(test, batch_size)

    # 模型初始化
    #device = 'cuda:0'
    device='cpu'
    model = MSB_CMNet(feat_sizes, embedding_size, linear_feature_columns, dnn_feature_columns).to(device)
    loss_func = nn.BCELoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

    # 训练循环
    for epoch in range(epoches):
        total_loss_epoch = 0.0
        total_tmp = 0
        model.train()
        for index, (x, y) in enumerate(train_loader):
            x, y = x.to(device).float(), y.to(device).float()
            y_pred = model(x)

            optimizer.zero_grad()
            loss = loss_func(y_pred, y)
            loss.backward()
            optimizer.step()
            total_loss_epoch += loss.item()
            total_tmp += 1

        # 验证集评估
        validation_loss, validation_auc = get_metrics(validation_loader, model)
        print(
            f'Epoch {epoch}/{epoches}, Train_loss: {total_loss_epoch / total_tmp:.4f}, Validation_loss: {validation_loss:.4f},Validation AUC: {validation_auc:.4f}')

    # 测试集评估
    test_loss, final_test_auc = get_metrics(test_loader, model)
    print(f'Test_loss:{test_loss:.4f},Final Test AUC: {final_test_auc:.4f}')

数据维度: (500000, 40) 标签比例: label
0    467495
1     32505
Name: count, dtype: int64
训练集标签汇总:
 label
0    378764
1     26236
Name: count, dtype: int64
验证集标签汇总:
 label
0    42034
1     2966
Name: count, dtype: int64
测试集标签汇总:
 label
0    46697
1     3303
Name: count, dtype: int64
Epoch 0/12, Train_loss: 0.2283, Validation_loss: 0.2174,Validation AUC: 0.7543
Epoch 1/12, Train_loss: 0.2136, Validation_loss: 0.2130,Validation AUC: 0.7702
Epoch 2/12, Train_loss: 0.2084, Validation_loss: 0.2095,Validation AUC: 0.7810
Epoch 3/12, Train_loss: 0.2038, Validation_loss: 0.2063,Validation AUC: 0.7899
Epoch 4/12, Train_loss: 0.1999, Validation_loss: 0.2078,Validation AUC: 0.7971
Epoch 5/12, Train_loss: 0.1957, Validation_loss: 0.2030,Validation AUC: 0.8000
Epoch 6/12, Train_loss: 0.1916, Validation_loss: 0.2019,Validation AUC: 0.8009
Epoch 7/12, Train_loss: 0.1876, Validation_loss: 0.2002,Validation AUC: 0.8078
Epoch 8/12, Train_loss: 0.1836, Validation_loss: 0.2010,Validation AUC: 0.8106
Epoch 9/12, Tr