In [1]:
import itertools
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from collections import OrderedDict, namedtuple, defaultdict


def get_auc(loader, model):
    pred, target = [], []
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device).float(), y.to(device).float()
            y_hat = model(x)
            pred += list(y_hat.cpu().numpy())
            target += list(y.cpu().numpy())
    auc = roc_auc_score(target, pred)
    return auc


class DNN(nn.Module):
    def __init__(self, inputs_dim, hidden_units, dropout_rate, ):
        super(DNN, self).__init__()
        self.inputs_dim = inputs_dim
        self.hidden_units = hidden_units
        self.dropout = nn.Dropout(dropout_rate)

        self.hidden_units = [inputs_dim] + list(self.hidden_units)
        self.linear = nn.ModuleList([
            nn.Linear(self.hidden_units[i], self.hidden_units[i + 1]) for i in range(len(self.hidden_units) - 1)
        ])
        for name, tensor in self.linear.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=0.0001)

        self.activation = nn.ReLU()

    def forward(self, X):
        inputs = X
        for i in range(len(self.linear)):
            fc = self.linear[i](inputs)
            fc = self.activation(fc)
            fc = self.dropout(fc)
            inputs = fc
        return inputs

class Wide_Deep(nn.Module):
    def __init__(self, feat_size, embedding_size, linear_feature_columns, dnn_feature_columns,
                 use_attention=True, attention_factor=8, l2_reg=0.00001, drop_rate=0.9, dnn_hidden_units=(256, 128)):
        super(Wide_Deep, self).__init__()
        self.sparse_feature_columns = list(filter(lambda x: x[1] == 'sparse', dnn_feature_columns))
        self.embedding_dic = nn.ModuleDict({
            feat[0]: nn.Embedding(feat_size[feat[0]], embedding_size, sparse=False) for feat in
            self.sparse_feature_columns
        })
        self.dense_feature_columns = list(filter(lambda x: x[1] == 'dense', dnn_feature_columns))

        self.feature_index = defaultdict(int)
        start = 0
        for feat in feat_size:
            self.feature_index[feat] = start
            start += 1

        self.dnn = DNN(len(self.dense_feature_columns) + embedding_size * len(self.embedding_dic), dnn_hidden_units,
                       0.5)

        self.dnn_linear = nn.Linear(dnn_hidden_units[-1] , 1, bias=False)

        dnn_hidden_units = [len(feat_size), 1]
        self.linear = nn.ModuleList([
            nn.Linear(dnn_hidden_units[i], dnn_hidden_units[i + 1]) for i in range(len(dnn_hidden_units) - 1)
        ])
        for name, tensor in self.linear.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=0.00001)

        self.out = nn.Sigmoid()
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, X):

        # wide
        logit = X
        for i in range(len(self.linear)):
            fc = self.linear[i](logit)
            fc = self.act(fc)
            fc = self.dropout(fc)
            logit = fc

        # deep
        sparse_embedding = [
            self.embedding_dic[feat[0]](X[:, self.feature_index[feat[0]]].long()).reshape(X.shape[0], 1, -1)
            for feat in self.sparse_feature_columns]
        sparse_input = torch.cat(sparse_embedding, dim=1)
        sparse_input = torch.flatten(sparse_input, start_dim=1)
        dense_values = [X[:, self.feature_index[feat[0]]].reshape(-1, 1) for feat in self.dense_feature_columns]
        dense_input = torch.cat(dense_values, dim=1)
        dnn_input = torch.cat((sparse_input, dense_input), dim=1)
        dnn_out = self.dnn(dnn_input)
        dnn_logit = self.dnn_linear(dnn_out)
        logit += dnn_logit

        y_pred = torch.sigmoid(logit)
        return y_pred

if __name__ == '__main__':

    import itertools
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from collections import OrderedDict, namedtuple, defaultdict


def get_auc(loader, model):
    pred, target = [], []
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device).float(), y.to(device).float()
            y_hat = model(x)
            pred += list(y_hat.cpu().numpy())
            target += list(y.cpu().numpy())
    auc = roc_auc_score(target, pred)
    return auc


class DNN(nn.Module):
    def __init__(self, inputs_dim, hidden_units, dropout_rate, ):
        super(DNN, self).__init__()
        self.inputs_dim = inputs_dim
        self.hidden_units = hidden_units
        self.dropout = nn.Dropout(dropout_rate)

        self.hidden_units = [inputs_dim] + list(self.hidden_units)
        self.linear = nn.ModuleList([
            nn.Linear(self.hidden_units[i], self.hidden_units[i + 1]) for i in range(len(self.hidden_units) - 1)
        ])
        for name, tensor in self.linear.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=0.0001)

        self.activation = nn.ReLU()

    def forward(self, X):
        inputs = X
        for i in range(len(self.linear)):
            fc = self.linear[i](inputs)
            fc = self.activation(fc)
            fc = self.dropout(fc)
            inputs = fc
        return inputs

class Wide_Deep(nn.Module):
    def __init__(self, feat_size, embedding_size, linear_feature_columns, dnn_feature_columns,
                 use_attention=True, attention_factor=8, l2_reg=0.00001, drop_rate=0.5, dnn_hidden_units=(256, 128)):
        super(Wide_Deep, self).__init__()
        self.sparse_feature_columns = list(filter(lambda x: x[1] == 'sparse', dnn_feature_columns))
        self.embedding_dic = nn.ModuleDict({
            feat[0]: nn.Embedding(feat_size[feat[0]], embedding_size, sparse=False) for feat in
            self.sparse_feature_columns
        })
        self.dense_feature_columns = list(filter(lambda x: x[1] == 'dense', dnn_feature_columns))

        self.feature_index = defaultdict(int)
        start = 0
        for feat in feat_size:
            self.feature_index[feat] = start
            start += 1

        self.dnn = DNN(len(self.dense_feature_columns) + embedding_size * len(self.embedding_dic), dnn_hidden_units,
                       0.5)

        self.dnn_linear = nn.Linear(dnn_hidden_units[-1] , 1, bias=False)

        dnn_hidden_units = [len(feat_size), 1]
        self.linear = nn.ModuleList([
            nn.Linear(dnn_hidden_units[i], dnn_hidden_units[i + 1]) for i in range(len(dnn_hidden_units) - 1)
        ])
        for name, tensor in self.linear.named_parameters():
            if 'weight' in name:
                nn.init.normal_(tensor, mean=0, std=0.00001)

        self.out = nn.Sigmoid()
        self.act = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, X):

        # wide
        logit = X
        for i in range(len(self.linear)):
            fc = self.linear[i](logit)
            fc = self.act(fc)
            fc = self.dropout(fc)
            logit = fc

        # deep
        sparse_embedding = [
            self.embedding_dic[feat[0]](X[:, self.feature_index[feat[0]]].long()).reshape(X.shape[0], 1, -1)
            for feat in self.sparse_feature_columns]
        sparse_input = torch.cat(sparse_embedding, dim=1)
        sparse_input = torch.flatten(sparse_input, start_dim=1)
        dense_values = [X[:, self.feature_index[feat[0]]].reshape(-1, 1) for feat in self.dense_feature_columns]
        dense_input = torch.cat(dense_values, dim=1)
        dnn_input = torch.cat((sparse_input, dense_input), dim=1)
        dnn_out = self.dnn(dnn_input)
        dnn_logit = self.dnn_linear(dnn_out)
        logit += dnn_logit

        y_pred = torch.sigmoid(logit)
        return y_pred

if __name__ == '__main__':
    
    batch_size =1024
    lr = 1e-3
    wd = 1e-5
    epoches = 50
    seed = 2022
    embedding_size =10
    device = 'cuda:0'
    # pd.set_option('display.max_rows', None)  # 显示数据中所有的列
    data = pd.read_csv('vehicle_data_model_50w_3.csv')
    # data = pd.read_csv('vehicle_data_model_30w_2.csv')
    dense_feature= ['driver_auth_success_days','cargo_search_cnt_3','cargo_search_cnt_7','scan_cargo_cnt_3','scan_cargo_cnt_7','click_cargo_cnt_3_x','click_cargo_cnt_7','call_cnt_3_driver','call_cnt_7_driver',
               'shipper_auth_success_days','exposure_cargo_cnt_3','exposure_cnt_3','click_cargo_cnt_3_y','click_cnt_3','cargo_weight','vector_regular_subscribe_line',
                       'vector_regular_cargo_line_all','vector_regular_cargo_truck_type_all','vector_regular_cargo_truck_length_all','vector_regular_cargo_line_30',
                       'vector_regular_cargo_truck_type_30','vector_regular_cargo_truck_length_30']


    # 假设你的数据集中包含'label'列，并且dense_feature已经定义
    sparse_feature  = data.drop(columns=['label'] + dense_feature).columns.tolist()
    print(len(sparse_feature))
    pd.options.display.max_rows = None  # 显示所有列
    data[sparse_feature]=data[sparse_feature].astype('uint8')
    target = ['label']

    feat_sizes = {}  # 初始化一个空字典 feat_sizes。
    feat_sizes_dense = {feat: 1 for feat in dense_feature}#这里将稠密特征的维度大小设置为1，因为这些特征不需要经过 Embedding 层，直接作为输入。
    # 对每个稀疏特征创建一个键值对，键为特征名称，值为该特征在数据中唯一取值的数量（即不同的类别个数）。
    feat_sizes_sparse = {feat: len(data[feat].unique()) for feat in sparse_feature}
    # 将稠密特征和稀疏特征的维度大小更新到 feat_sizes 字典中，得到包含所有特征维度大小信息的字典 feat_sizes。
    feat_sizes.update(feat_sizes_dense)
    feat_sizes.update(feat_sizes_sparse)
    # for feat in sparse_feature:
    #     lbe = LabelEncoder()  #使用LabelEncoder类对每个稀疏特征进行编码,将每个特征的字符串类型的值映射成整数编号
    #     data[feat] = lbe.fit_transform(data[feat])


    # 定义fixlen_feature_columns，包含了所有特征的名称和类型（sparse或dense）。
    fixlen_feature_columns = [(feat, 'sparse') for feat in sparse_feature] + [(feat, 'dense') for feat in dense_feature]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    # train, test = train_test_split(data, test_size=0.2, random_state=seed)

   

        # 数据集划分
    train, test = train_test_split(data, test_size=0.2, random_state=seed)
    validation, test = train_test_split(test, test_size=0.5, random_state=seed)

    # DataLoader准备
    def create_data_loader(df, batch_size):
        labels = pd.DataFrame(df['label'])
        features = df.drop(columns=['label'])
        tensor_data = TensorDataset(torch.from_numpy(np.array(features)), torch.from_numpy(np.array(labels)))
        return DataLoader(tensor_data, shuffle=True, batch_size=batch_size)


    train_loader = create_data_loader(train, batch_size)
    validation_loader = create_data_loader(validation, batch_size)
    test_loader = create_data_loader(test, batch_size)

    # 模型初始化
    # model = FiBiNET(feat_sizes, embedding_size, linear_feature_columns, dnn_feature_columns)
    device = 'cuda:0'
    model = Wide_Deep(feat_sizes, embedding_size, linear_feature_columns, dnn_feature_columns).to(device)
    loss_func = nn.BCELoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    # 早停策略参数
    early_stopping_threshold = 0.80
    best_validation_auc = 0
    
    # 定义一个函数来计算给定数据加载器上的平均损失和AUC
    def evaluate(loader):
        model.eval()
        total_loss = 0.0
        total_samples = 0
        with torch.no_grad():
            for x, y in loader:
                x, y = x.to(device).float(), y.to(device).float()
                y_hat = model(x)
                loss = loss_func(y_hat, y)
                total_loss += loss.item() * x.size(0)
                total_samples += x.size(0)
        avg_loss = total_loss / total_samples
        auc = get_auc(loader, model)
        return avg_loss, auc


    # 训练循环
    for epoch in range(epoches):
        total_loss_epoch = 0.0
        total_tmp = 0
        model.train()
        for index, (x, y) in enumerate(train_loader):
            x, y = x.to(device).float(), y.to(device).float()
            y_hat = model(x)

            optimizer.zero_grad()
            loss = loss_func(y_hat, y)
            loss.backward()
            optimizer.step()
            total_loss_epoch += loss.item()
            total_tmp += 1

        # 验证集评估
        validation_loss, validation_auc = evaluate(validation_loader)
        print(
            f'Epoch {epoch}/{epoches}, Train Loss: {total_loss_epoch / total_tmp:.4f}, Validation_loss: {validation_loss:.4f},Validation AUC: {validation_auc:.4f}')

        # 更新最佳验证集AUC
        if validation_auc > best_validation_auc:
            best_validation_auc = validation_auc

        # 早停判断
        if validation_auc >= early_stopping_threshold:
            print(f'Early stopping triggered at epoch {epoch}, Validation AUC: {validation_auc:.4f}')
            break

    # 测试集评估
    test_loss, final_test_auc = evaluate(test_loader)
    print(f'Test_loss:{test_loss:.4f},Final Test AUC: {final_test_auc:.4f}')

17
Epoch 0/50, Train Loss: 0.2570, Validation_loss: 0.2355,Validation AUC: 0.6310
Epoch 1/50, Train Loss: 0.2359, Validation_loss: 0.2263,Validation AUC: 0.7023
Epoch 2/50, Train Loss: 0.2286, Validation_loss: 0.2224,Validation AUC: 0.7347
Epoch 3/50, Train Loss: 0.2252, Validation_loss: 0.2217,Validation AUC: 0.7425
Epoch 4/50, Train Loss: 0.2231, Validation_loss: 0.2194,Validation AUC: 0.7447
Epoch 5/50, Train Loss: 0.2217, Validation_loss: 0.2164,Validation AUC: 0.7511
Epoch 6/50, Train Loss: 0.2200, Validation_loss: 0.2152,Validation AUC: 0.7566
Epoch 7/50, Train Loss: 0.2186, Validation_loss: 0.2144,Validation AUC: 0.7587
Epoch 8/50, Train Loss: 0.2169, Validation_loss: 0.2133,Validation AUC: 0.7631
Epoch 9/50, Train Loss: 0.2162, Validation_loss: 0.2132,Validation AUC: 0.7648
Epoch 10/50, Train Loss: 0.2150, Validation_loss: 0.2126,Validation AUC: 0.7672
Epoch 11/50, Train Loss: 0.2142, Validation_loss: 0.2117,Validation AUC: 0.7687
Epoch 12/50, Train Loss: 0.2129, Validation_los