### 导入数据

In [1]:
import pandas as pd
import numpy as np

In [28]:
data_path = '/home/suibe/dev_sjl/毕业论文/data/'
ad_features = pd.read_csv(data_path + 'ad_feature.csv')
user_profile = pd.read_csv(data_path + 'user_profile.csv')
user_profile.rename(columns={'userid' : 'user_id'}, inplace=True)
user_profile.rename(columns={'new_user_class_level ': 'new_user_class_level'}, inplace=True)

In [3]:
sample = pd.read_csv(data_path + 'sample.csv')
sample_behavior_log = pd.read_csv(data_path + 'sample_behavior_log.csv')

In [4]:
sample_behavior_log['time_stamp'] = pd.to_datetime(sample_behavior_log['time_stamp'])
sample_behavior_log['unix_time'] = sample_behavior_log['time_stamp'].astype('int64') // 10**9

  sample_behavior_log['unix_time'] = sample_behavior_log['time_stamp'].astype('int64') // 10**9


In [5]:
# 对btag进行LabelEncoder编码
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
sample_behavior_log['btag'] = le.fit_transform(sample_behavior_log['btag'])
sample_behavior_log

Unnamed: 0,user,time_stamp,btag,cate,brand,unix_time
0,3,2017-05-06 00:49:31,3,4284,41299,1494031771
1,3,2017-05-06 00:46:27,3,4284,342498,1494031587
2,3,2017-04-30 09:48:11,3,6511,374258,1493545691
3,3,2017-04-30 09:48:27,3,6511,374258,1493545707
4,3,2017-04-30 09:48:29,3,6511,374258,1493545709
...,...,...,...,...,...,...
7512014,1141718,2017-05-03 16:57:57,3,4282,106054,1493830677
7512015,1141718,2017-05-03 16:49:03,3,4520,143597,1493830143
7512016,1141718,2017-05-04 06:25:00,3,6300,143597,1493879100
7512017,1141718,2017-05-03 16:44:37,3,6427,3014,1493829877


In [6]:
# 将时间戳归一化
# 示例代码
sample_behavior_log['standard_time'] = (sample_behavior_log['unix_time'] - sample_behavior_log['unix_time'].mean()) / sample_behavior_log['unix_time'].std()

In [7]:
# 所有的序列取前100个
top100_behavior = sample_behavior_log.sort_values(by=['user', 'time_stamp'], ascending=[True, False]).groupby('user').head(100)
#  btag序列
btag_hist = top100_behavior.groupby('user')['btag'].apply(list).reset_index().rename(columns={'btag': 'btag_hist','user':'user_id'})
# cate序列
cate_hist = top100_behavior.groupby('user')['cate'].apply(list).reset_index().rename(columns={'cate': 'cate_hist','user':'user_id'})
# brand序列
brand_hist = top100_behavior.groupby('user')['brand'].apply(list).reset_index().rename(columns={'brand': 'brand_hist','user':'user_id'})
# 时间戳序列
time_hist = top100_behavior.groupby('user')['standard_time'].apply(list).reset_index().rename(columns={'standard_time': 'time_hist','user':'user_id'})

In [8]:
sample = pd.merge(sample, ad_features, on = 'adgroup_id', how = 'left')
sample = pd.merge(sample, user_profile, on = 'user_id', how = 'left')
sample = pd.merge(sample, btag_hist, on = 'user_id', how = 'left')
sample = pd.merge(sample, cate_hist, on = 'user_id', how = 'left')
sample = pd.merge(sample, brand_hist, on = 'user_id', how = 'left')
sample = pd.merge(sample, time_hist, on = 'user_id', how = 'left')
sample.fillna(0, inplace=True)

### 构建dataset

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader

In [10]:
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 648047 entries, 0 to 648046
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   user_id               648047 non-null  int64  
 1   time_stamp            648047 non-null  object 
 2   adgroup_id            648047 non-null  int64  
 3   pid                   648047 non-null  object 
 4   clk                   648047 non-null  int64  
 5   cate_id               648047 non-null  int64  
 6   campaign_id           648047 non-null  int64  
 7   customer              648047 non-null  int64  
 8   brand                 648047 non-null  float64
 9   price                 648047 non-null  float64
 10  cms_segid             648047 non-null  float64
 11  cms_group_id          648047 non-null  float64
 12  final_gender_code     648047 non-null  float64
 13  age_level             648047 non-null  float64
 14  pvalue_level          648047 non-null  float64
 15  

In [21]:
class mydata(Dataset):
    def __init__(self, df, max_seq_len, user_feature_cols, ad_feature_cols, label_col):
        """
        :param df: 原始 DataFrame
        :param max_seq_len: 最大序列长度，用于 padding 序列
        :param user_feature_cols: 用户特征列名列表
        :param ad_feature_cols: 广告特征列名列表
        # :param seq_feature_col: 序列特征列名
        :param label_col: 标签列名
        """
        self.df = df
        self.max_seq_len = max_seq_len
        self.user_feature_cols = user_feature_cols
        self.ad_feature_cols = ad_feature_cols
        self.label_col = label_col

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        
        # 用户特征
        user_features = torch.tensor(row[self.user_feature_cols], dtype=torch.float)
        
        # 广告特征
        ad_features = torch.tensor(row[self.ad_feature_cols], dtype=torch.float)
        
        # 序列特征 (需要 padding)
        btag_hist = self.pad_sequence(row['btag_hist'], self.max_seq_len, padding_value=0, dtype=torch.int64)
        cate_hist = self.pad_sequence(row['cate_hist'], self.max_seq_len, padding_value=0, dtype=torch.int64)
        brand_hist = self.pad_sequence(row['brand_hist'], self.max_seq_len, padding_value=0, dtype=torch.int64)
        time_hist = self.pad_sequence(row['time_hist'], self.max_seq_len, padding_value=0.0, dtype=torch.int64)
        
        # 标签
        label = torch.tensor(row[self.label_col], dtype=torch.float)
        #
        
        return user_features, ad_features, (btag_hist, cate_hist,brand_hist,time_hist), label

    def pad_sequence(self, sequence, max_len, padding_value, dtype):
        """
        对输入序列进行padding。
        - sequence: 输入的序列 (list)
        - max_len: 需要padding的最大长度
        - padding_value: 用于填充的值 (默认: 0)
        - dtype: 转换后的数据类型 (默认: torch.int64)
        """
        seq_len = len(sequence)
        # 如果序列长度超过 max_len，进行截断
        if seq_len > max_len:
            sequence = sequence[:max_len]
        else:
            # 否则进行 padding
            sequence = sequence + [padding_value] * (max_len - seq_len)

        # 转换为Tensor
        padded_seq = torch.tensor(sequence, dtype=dtype)
        return padded_seq


In [30]:
# 用户特征列
user_feature_cols = ['user_id','cms_segid', 'cms_group_id' ,'final_gender_code',
                      'age_level', 'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level']
# 广告特征列
ad_feature_cols = ['adgroup_id', 'cate_id', 'campaign_id', 'brand', 'customer', 'price']
# 标签列
label_col = 'clk'


In [31]:
max_seq_len = 100  # 假设最大序列长度为5
dataset = mydata(sample, max_seq_len, user_feature_cols, ad_feature_cols, label_col)

In [32]:
# 构建 DataLoader
batch_size = 256
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [33]:
for batch in dataloader:
    user_features, ad_features,  (btag_hist, cate_hist,brand_hist,time_hist),label = batch
    print("用户特征:", user_features.size())
    print("广告特征:", ad_features.size())
    print("序列特征:", btag_hist.size(), cate_hist.size(), brand_hist.size(), time_hist.size())
    print("标签:", label.size())

  padded_seq = torch.tensor(sequence, dtype=dtype)


用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) t

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7f46800ea880>>
Traceback (most recent call last):
  File "/home/suibe/anaconda3/envs/dev_sjl/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 790, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
  File "/home/suibe/anaconda3/envs/dev_sjl/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 790, in <setcomp>
    active_threads = {thread.ident for thread in threading.enumerate()}
  File "/home/suibe/anaconda3/envs/dev_sjl/lib/python3.8/threading.py", line 1047, in ident
    @property
KeyboardInterrupt: 


用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100])
标签: torch.Size([256])
用户特征: torch.Size([256, 9])
广告特征: torch.Size([256, 6])
序列特征: torch.Size([256, 100]) torch.Size([256, 100]) torch.Size([256, 100]) t