### 导入数据

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Root directory that holds the raw CSV files.
data_path = '/home/suibe/dev_sjl/毕业论文/data/'
ad_features = pd.read_csv(data_path + 'ad_feature.csv')
# Fix the profile's column names in a single pass: `userid` becomes the
# shared join key `user_id`, and the trailing space of the raw
# `new_user_class_level ` header is dropped.
user_profile = pd.read_csv(data_path + 'user_profile.csv')
user_profile = user_profile.rename(columns={
    'userid': 'user_id',
    'new_user_class_level ': 'new_user_class_level',
})

In [3]:
# Load the sample table and the behaviour log from `data_path`.
sample, sample_behavior_log = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('sample.csv', 'sample_behavior_log.csv')
)

In [4]:
# Convert `time_stamp` to datetimes, then derive whole seconds since the Unix
# epoch. Dividing the offset from the epoch by a one-second Timedelta (instead
# of casting the datetimes to int64 and assuming nanoseconds) yields the same
# value whatever resolution the datetime column has, and it avoids the
# datetime -> int64 cast that some pandas versions warn about.
sample_behavior_log['time_stamp'] = pd.to_datetime(sample_behavior_log['time_stamp'])
sample_behavior_log['unix_time'] = (
    (sample_behavior_log['time_stamp'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)

  sample_behavior_log['unix_time'] = sample_behavior_log['time_stamp'].astype('int64') // 10**9


In [5]:
# Encode the behaviour type column `btag` as integer codes with a LabelEncoder.
# NOTE(review): the codes start at 0, and 0 is also the padding value used for
# the history sequences in mydata.__getitem__ -- confirm whether padding should
# be distinguishable from the first behaviour type.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(sample_behavior_log['btag'])
sample_behavior_log['btag'] = le.transform(sample_behavior_log['btag'])
sample_behavior_log

Unnamed: 0,user,time_stamp,btag,cate,brand,unix_time
0,3,2017-05-06 00:49:31,3,4284,41299,1494031771
1,3,2017-05-06 00:46:27,3,4284,342498,1494031587
2,3,2017-04-30 09:48:11,3,6511,374258,1493545691
3,3,2017-04-30 09:48:27,3,6511,374258,1493545707
4,3,2017-04-30 09:48:29,3,6511,374258,1493545709
...,...,...,...,...,...,...
7512014,1141718,2017-05-03 16:57:57,3,4282,106054,1493830677
7512015,1141718,2017-05-03 16:49:03,3,4520,143597,1493830143
7512016,1141718,2017-05-04 06:25:00,3,6300,143597,1493879100
7512017,1141718,2017-05-03 16:44:37,3,6427,3014,1493829877


In [6]:
# Standardise the Unix timestamps: subtract the column mean, divide by the
# column standard deviation.
unix_time_col = sample_behavior_log['unix_time']
sample_behavior_log['standard_time'] = (unix_time_col - unix_time_col.mean()) / unix_time_col.std()
# The same standardisation for `price` is currently disabled:
# ad_features['price'] = (ad_features['price'] - ad_features['price'].mean()) / ad_features['price'].std()

In [7]:
# Keep at most 100 behaviour records per user. Rows are ordered by user and,
# within a user, by descending time_stamp, so the kept rows are the latest ones.
top100_behavior = (
    sample_behavior_log
    .sort_values(by=['user', 'time_stamp'], ascending=[True, False])
    .groupby('user')
    .head(100)
)


def _history_frame(source_col, hist_col):
    """Collect `source_col` of `top100_behavior` into one list per user.

    Returns a DataFrame with the columns `user_id` and `hist_col`.
    """
    per_user = top100_behavior.groupby('user')[source_col].apply(list).reset_index()
    return per_user.rename(columns={source_col: hist_col, 'user': 'user_id'})


# Behaviour-type sequence
btag_hist = _history_frame('btag', 'btag_hist')
# Category sequence
cate_hist = _history_frame('cate', 'cate_hist')
# Brand sequence
brand_hist = _history_frame('brand', 'brand_hist')
# Standardised-timestamp sequence
time_hist = _history_frame('standard_time', 'time_hist')

In [8]:
# Attach the ad attributes, the user profile and the four behaviour-history
# sequences to every sample row. Left joins keep samples that have no match.
# NOTE: this cell reassigns `sample` from itself, so run it once per load of
# `sample`; a second run would merge onto the already merged frame.
sample = pd.merge(sample, ad_features, on='adgroup_id', how='left')
sample = pd.merge(sample, user_profile, on='user_id', how='left')
for hist_frame in (btag_hist, cate_hist, brand_hist, time_hist):
    sample = pd.merge(sample, hist_frame, on='user_id', how='left')

# Users without any behaviour record leave the left joins with NaN in the
# *_hist columns. Give them an empty list so these columns stay list-valued:
# a blanket fillna(0) would store the scalar 0 there, and the len() call in
# mydata.pad_sequence fails on a scalar.
for hist_col in ('btag_hist', 'cate_hist', 'brand_hist', 'time_hist'):
    sample[hist_col] = sample[hist_col].apply(
        lambda seq: seq if isinstance(seq, list) else []
    )

# Remaining gaps (scalar ad / profile attributes) are filled with 0.
sample = sample.fillna(0)

### 构建dataset

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

In [10]:
class mydata(Dataset):
    """Row-wise dataset over a merged sample DataFrame.

    Each item is ``(user_features, ad_features,
    (btag_hist, cate_hist, brand_hist, time_hist), label)``.
    """

    def __init__(self, df, max_seq_len, user_feature_cols, ad_feature_cols, label_col):
        """
        :param df: source DataFrame; it must contain the feature columns, the
            label column and the columns ``btag_hist``, ``cate_hist``,
            ``brand_hist`` and ``time_hist``
        :param max_seq_len: length every history sequence is cut / padded to
        :param user_feature_cols: list of user feature column names
        :param ad_feature_cols: list of ad feature column names
        :param label_col: name of the label column
        """
        self.df = df
        self.max_seq_len = max_seq_len
        self.user_feature_cols = user_feature_cols
        self.ad_feature_cols = ad_feature_cols
        self.label_col = label_col

    def __len__(self):
        """Number of rows in the underlying DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the tensors for the row at position ``idx``.

        :return: ``(user_features, ad_features, histories, label)`` where the
            feature tensors are float, ``histories`` is the tuple
            ``(btag_hist, cate_hist, brand_hist, time_hist)`` of tensors of
            length ``max_seq_len`` and ``label`` is a float scalar tensor
        """
        row = self.df.iloc[idx]

        # A row that also holds list-valued cells is an object-dtype Series, so
        # convert the selected values to a float32 array explicitly instead of
        # handing the Series itself to torch.tensor.
        user_features = torch.tensor(
            row[self.user_feature_cols].to_numpy(dtype='float32'), dtype=torch.float
        )
        ad_features = torch.tensor(
            row[self.ad_feature_cols].to_numpy(dtype='float32'), dtype=torch.float
        )

        # History sequences, cut / padded to max_seq_len.
        btag_hist = self.pad_sequence(row['btag_hist'], self.max_seq_len, padding_value=0, dtype=torch.int64)
        cate_hist = self.pad_sequence(row['cate_hist'], self.max_seq_len, padding_value=0, dtype=torch.int64)
        brand_hist = self.pad_sequence(row['brand_hist'], self.max_seq_len, padding_value=0, dtype=torch.int64)
        # The time history holds standardised (fractional) timestamps; an
        # integer dtype would truncate them, so it is kept as float.
        time_hist = self.pad_sequence(row['time_hist'], self.max_seq_len, padding_value=0.0, dtype=torch.float)

        label = torch.tensor(float(row[self.label_col]), dtype=torch.float)

        return user_features, ad_features, (btag_hist, cate_hist, brand_hist, time_hist), label

    def pad_sequence(self, sequence, max_len, padding_value=0, dtype=torch.int64):
        """
        Cut or right-pad ``sequence`` to exactly ``max_len`` entries.

        :param sequence: input sequence (list); a non-iterable value such as a
            NaN / 0 placeholder for a missing history is treated as empty
        :param max_len: target length
        :param padding_value: value appended to short sequences (default: 0)
        :param dtype: dtype of the returned tensor (default: torch.int64)
        :return: 1-D tensor of length ``max_len``
        """
        try:
            items = list(sequence)
        except TypeError:
            # Scalar placeholder instead of a list: there is no history.
            items = []

        items = items[:max_len]
        items = items + [padding_value] * (max_len - len(items))
        return torch.tensor(items, dtype=dtype)


In [11]:
# User feature columns
user_feature_cols = [
    'user_id',
    'cms_segid',
    'cms_group_id',
    'final_gender_code',
    'age_level',
    'pvalue_level',
    'shopping_level',
    'occupation',
    'new_user_class_level',
]
# Ad feature columns
ad_feature_cols = [
    'adgroup_id',
    'cate_id',
    'campaign_id',
    'brand',
    'customer',
]
# Label column
label_col = 'clk'


In [12]:
# Every behaviour sequence is cut / padded to this many entries.
max_seq_len = 100
dataset = mydata(
    df=sample,
    max_seq_len=max_seq_len,
    user_feature_cols=user_feature_cols,
    ad_feature_cols=ad_feature_cols,
    label_col=label_col,
)

In [13]:
# Wrap the dataset in a shuffling DataLoader that yields mini-batches.
batch_size = 256
dataloader = DataLoader(
    dataset=dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Pull a single mini-batch so the cells below have tensors to experiment with.
# Note that this rebinds `ad_features` and the `*_hist` names, which held
# DataFrames in the preprocessing cells, to tensors.
batch = next(iter(dataloader))
user_features, ad_features, (btag_hist, cate_hist, brand_hist, time_hist), label = batch

### 创建DeepFM模型

In [14]:
# Size each embedding table from the largest value found in *any* column that
# is looked up in it, not only the id column: the user table is indexed with
# every column of user_feature_cols and the ad table with every column of
# ad_feature_cols, and nn.Embedding rejects indices >= its number of rows.
user_feature_dim = int(sample[user_feature_cols].max().max()) + 1
ad_feature_dim = int(sample[ad_feature_cols].max().max()) + 1
embedding_dim = 256

In [None]:
import torch.nn as nn
# Stand-alone embedding tables (user first, then ad) for prototyping the lookups.
user_embedding, ad_embedding = (
    nn.Embedding(num_embeddings=table_rows, embedding_dim=embedding_dim)
    for table_rows in (user_feature_dim, ad_feature_dim)
)

In [None]:
# Look up one embedding vector per feature value of the sampled batch; the
# float feature tensors are cast to int64 indices first.
user_embed = user_embedding(user_features.to(torch.long))
ad_embed = ad_embedding(ad_features.to(torch.long))

In [None]:
# FM second-order interaction, computed separately for the user embeddings and
# the ad embeddings: 0.5 * ((sum over dim 1)^2 - sum over dim 1 of squares).
user_sum_square_embed = user_embed.sum(dim=1).pow(2)
user_square_sum_embed = user_embed.pow(2).sum(dim=1)
fm_user_interaction = 0.5 * (user_sum_square_embed - user_square_sum_embed)

ad_sum_square_embed = ad_embed.sum(dim=1).pow(2)
ad_square_sum_embed = ad_embed.pow(2).sum(dim=1)
fm_ad_interaction = 0.5 * (ad_sum_square_embed - ad_square_sum_embed)

# Combine both sides by element-wise addition.
fm_interaction = fm_user_interaction + fm_ad_interaction

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class DeepFM(nn.Module):
    """
    DeepFM model (PyTorch).

    All user fields share one embedding table and all ad fields share another.
    The FM part contributes a first-order term (one scalar per side) and a
    second-order interaction vector; the deep part is an MLP over the pooled
    user and ad embeddings. ``forward`` returns raw logits: no sigmoid is
    applied inside the model.
    """
    def __init__(self, user_feature_dim, ad_feature_dim, embedding_dim, hidden_dims, dropout_rate):
        """
        :param user_feature_dim: number of rows of the user embedding tables
            (must exceed the largest user feature index)
        :param ad_feature_dim: number of rows of the ad embedding tables
            (must exceed the largest ad feature index)
        :param embedding_dim: embedding size
        :param hidden_dims: list of hidden layer widths of the DNN
        :param dropout_rate: dropout probability applied after every hidden layer
        """
        super(DeepFM, self).__init__()
        # Separate embedding tables for user features and ad features.
        self.user_embedding = nn.Embedding(user_feature_dim, embedding_dim)
        self.ad_embedding = nn.Embedding(ad_feature_dim, embedding_dim)

        # FM first-order (linear) weights: one scalar per feature index.
        self.fm_first_order_user = nn.Embedding(user_feature_dim, 1)
        self.fm_first_order_ad = nn.Embedding(ad_feature_dim, 1)

        # Deep part: MLP over [pooled user embedding, pooled ad embedding].
        all_dims = [2 * embedding_dim] + list(hidden_dims)
        layers = []
        for in_dim, out_dim in zip(all_dims[:-1], all_dims[1:]):
            layers.append(nn.Linear(in_dim, out_dim))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))
        self.dnn = nn.Sequential(*layers)

        # Output layer over [DNN output, 2 first-order scalars, FM interaction].
        # all_dims[-1] equals hidden_dims[-1] and also covers an empty hidden_dims.
        self.output_layer = nn.Linear(all_dims[-1] + embedding_dim + 2, 1)

    def forward(self, user_features, ad_features):
        """
        :param user_features: user feature indices, shape (batch, n_user_fields)
            or (batch,); any numeric dtype, cast to int64 internally
        :param ad_features: ad feature indices, shape (batch, n_ad_fields)
            or (batch,); any numeric dtype, cast to int64 internally
        :return: logits of shape (batch, 1)
        """
        user_features = user_features.long()
        ad_features = ad_features.long()
        # Treat a single id per sample as a one-field input.
        if user_features.dim() == 1:
            user_features = user_features.unsqueeze(1)
        if ad_features.dim() == 1:
            ad_features = ad_features.unsqueeze(1)

        user_embed = self.user_embedding(user_features)  # (batch, n_user_fields, dim)
        ad_embed = self.ad_embedding(ad_features)        # (batch, n_ad_fields, dim)

        # FM first-order term: sum the per-field scalars on each side so the
        # result is (batch, 2) regardless of the number of fields, which is the
        # width the output layer reserves for it.
        fm_first_order = torch.cat(
            [
                self.fm_first_order_user(user_features).sum(dim=1),
                self.fm_first_order_ad(ad_features).sum(dim=1),
            ],
            dim=1,
        )

        # FM second-order term, computed separately per side:
        # 0.5 * ((sum of field embeddings)^2 - sum of squared field embeddings).
        user_sum_square_embed = torch.pow(user_embed.sum(dim=1), 2)
        user_square_sum_embed = torch.pow(user_embed, 2).sum(dim=1)
        fm_user_interaction = 0.5 * (user_sum_square_embed - user_square_sum_embed)

        ad_sum_square_embed = torch.pow(ad_embed.sum(dim=1), 2)
        ad_square_sum_embed = torch.pow(ad_embed, 2).sum(dim=1)
        fm_ad_interaction = 0.5 * (ad_sum_square_embed - ad_square_sum_embed)

        fm_interaction = fm_user_interaction + fm_ad_interaction  # (batch, dim)

        # The two sides have different field counts, so their 3-D embeddings
        # cannot be concatenated along the last axis. Mean-pool the fields of
        # each side first; the DNN input is then (batch, 2 * dim) as the first
        # Linear layer expects.
        dnn_input = torch.cat([user_embed.mean(dim=1), ad_embed.mean(dim=1)], dim=-1)
        dnn_output = self.dnn(dnn_input)

        # Concatenate DNN output, first-order term and second-order term.
        feature_output = torch.cat([dnn_output, fm_first_order, fm_interaction], dim=-1)
        output = self.output_layer(feature_output)
        return output


In [16]:
# Hyper-parameters.
# Each embedding table is sized from the largest value over all columns that
# index it (every user / ad feature column is looked up in the shared table).
user_feature_dim = int(sample[user_feature_cols].max().max()) + 1
ad_feature_dim = int(sample[ad_feature_cols].max().max()) + 1
embedding_dim = 256
dnn_hidden_units = [256, 128]
learning_rate = 0.001
batch_size = 1024
epochs = 10
# Rebuild the loader so the batch size set above is the one actually used for
# training (the loader created earlier in the notebook was built with 256).
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Instantiate the model and move it to the device before the optimizer is
# built, so the optimizer is constructed over the parameters it will update.
# NOTE(review): the out-of-memory error recorded for this cell reports 0 bytes
# allocated by PyTorch and about 530 MiB free, i.e. the GPU memory was held
# outside this process when the cell ran.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DeepFM(user_feature_dim, ad_feature_dim, embedding_dim, dnn_hidden_units, dropout_rate=0.1)
model.to(device)
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Loss: DeepFM.forward ends in a plain Linear layer and returns unbounded
# logits, so use the loss that applies the sigmoid itself; BCELoss requires
# inputs that are already probabilities in [0, 1].
criterion = torch.nn.BCEWithLogitsLoss()
model

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.09 GiB (GPU 0; 44.34 GiB total capacity; 0 bytes already allocated; 530.69 MiB free; 0 bytes reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
from sklearn.metrics import roc_auc_score
# Model training
log_every = 500  # number of steps between two loss / AUC reports
for epoch in range(epochs):
    model.train()  # switch to training mode
    running_loss = 0.0
    all_labels = []
    all_outputs = []

    for step, batch in enumerate(dataloader):
        # Unpack the batch and move the tensors the model needs to the device.
        user_features, ad_features, (btag_hist, cate_hist, brand_hist, time_hist), label = batch
        user_features, ad_features, label = user_features.to(device), ad_features.to(device), label.to(device)

        # Forward pass. The model returns (batch, 1) while the collated labels
        # are (batch,); flatten both so the loss receives tensors of equal size.
        output = model(user_features, ad_features).view(-1)
        label = label.view(-1).float()
        loss = criterion(output, label)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate the loss of the current reporting window.
        running_loss += loss.item()

        # Collect scores and labels of the window for the AUC.
        all_outputs.extend(output.detach().cpu().tolist())
        all_labels.extend(label.detach().cpu().tolist())

        # Report the mean loss and the AUC every `log_every` steps.
        if (step + 1) % log_every == 0:
            avg_loss = running_loss / log_every
            # roc_auc_score raises when only one class is present in the window.
            if len(set(all_labels)) > 1:
                auc = roc_auc_score(all_labels, all_outputs)
            else:
                auc = float('nan')

            print(f"Epoch [{epoch+1}/{epochs}], Step [{step+1}], Loss: {avg_loss:.4f}, AUC: {auc:.4f}")

            # Reset the window accumulators.
            running_loss = 0.0
            all_labels = []
            all_outputs = []