In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)
from tqdm import tqdm
tqdm.pandas(desc='pandas bar')
import pickle
from sklearn.preprocessing import LabelEncoder
import time

# 读数据
item读取有些麻烦, 需要按行来读. 其他数据正常使用pandas来读.

In [2]:
data_path = r'./data_for_ctr_predict/'
user = pd.read_table(data_path+'user_info.txt', sep='\t', names=['user_id', 'user_device', 'user_system', 'user_province', 'user_city', 'user_age', 'user_gender'])
#item = pd.read_table(data_path+'doc_info.txt', sep='\t', names=['item_id', 'item_title', 'item_time', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords'])
train = pd.read_table(data_path+'train_data.txt', sep='\t', names=['user_id', 'item_id', 'time', 'network', 'refresh', 'position', 'label', 'duration'])
test = pd.read_table(data_path+'test_data.txt', sep='\t', names=['id', 'user_id', 'item_id', 'time', 'network', 'refresh'])

In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [2]:
data_path = r'./data_for_ctr_predict/'
with open(data_path+'doc_info.txt', 'r') as f:
    lines = f.readlines()

item = []
for line in tqdm(lines):
    unit = line.split('\t')
    if len(unit)<7:
        for _ in range(7-len(unit)):
            unit.append(np.nan)
    item.append(unit)
#item = np.array(item)

100%|██████████| 633391/633391 [00:02<00:00, 269548.33it/s]


In [3]:
item = pd.DataFrame(item, columns=['item_id', 'item_title', 'item_time', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords'])
item['item_id'] = item['item_id'].astype('int64')

# 检查数据

## 检查是否存在冷启动
验证了一下, 训练集和测试集内都不存在冷启动问题

In [7]:
len(set(train['item_id'].unique())-set(item['item_id'].unique())), len(set(train['user_id'].unique())-set(user['user_id'].unique()))

(0, 0)

In [8]:
len(set(test['item_id'].unique())-set(item['item_id'].unique())), len(set(test['user_id'].unique())-set(user['user_id'].unique()))

(0, 0)

## 检查空缺值

In [9]:
user.isnull().sum() / user.shape[0]

user_id          0.000000
user_device      0.044036
user_system      0.044077
user_province    0.059972
user_city        0.062450
user_age         0.038931
user_gender      0.038074
dtype: float64

In [10]:
user['user_age'].fillna('A_0_24:0.25,A_25_29:0.25,A_30_39:0.25,A_40+:0.25', inplace=True)
user['user_gender'].fillna('female:0.5,male:0.5', inplace=True)

In [11]:
item.isnull().sum() / item.shape[0]

item_id          0.000000
item_title       0.000000
item_time        0.000382
item_picture     0.000382
item_cluster1    0.000444
item_cluster2    0.000445
item_keywords    0.017337
dtype: float64

In [12]:
train.isnull().sum() / train.shape[0]

user_id     0.0
item_id     0.0
time        0.0
network     0.0
refresh     0.0
position    0.0
label       0.0
duration    0.0
dtype: float64

In [13]:
test.isnull().sum() / test.shape[0]

id         0.0
user_id    0.0
item_id    0.0
time       0.0
network    0.0
refresh    0.0
dtype: float64

# 处理数据

## 处理user特征

In [14]:
user.head(10)

Unnamed: 0,user_id,user_device,user_system,user_province,user_city,user_age,user_gender
0,1000372820,TAS-AN00,Android,广东,广州,"A_0_24:0.404616,A_25_29:0.059027,A_30_39:0.516...","female:0.051339,male:0.948661"
1,1000652892,PACM00,Android,河北,唐山,"A_0_24:0.615458,A_25_29:0.086233,A_30_39:0.141...","female:0.280295,male:0.719705"
2,1000908852,MI6X,Android,上海,上海,"A_0_24:0.123255,A_25_29:0.208225,A_30_39:0.298...","female:0.000000,male:1.000000"
3,1001168798,iPhone11,IOS,,,"A_0_24:0.436296,A_25_29:0.489370,A_30_39:0.061...","female:0.870710,male:0.129290"
4,1001305614,M2103K19C,Android,江苏,苏州,"A_0_24:0.006632,A_25_29:0.043408,A_30_39:0.350...","female:0.000000,male:1.000000"
5,1001309700,LYA-AL10,Android,黑龙江,哈尔滨,"A_0_24:0.413768,A_25_29:0.437501,A_30_39:0.141...","female:0.000000,male:1.000000"
6,1001339860,RedmiNote7Pro,Android,天津,天津,"A_0_24:0.067328,A_25_29:0.029283,A_30_39:0.303...","female:0.155445,male:0.844555"
7,1001384888,M2007J22C,Android,河北,石家庄,"A_0_24:0.008414,A_25_29:0.027505,A_30_39:0.161...","female:0.000000,male:1.000000"
8,100142658,JEF-AN20,Android,重庆,重庆,"A_0_24:0.160670,A_25_29:0.695923,A_30_39:0.109...","female:1.000000,male:0.000000"
9,1001439274,HMA-AL00,Android,山东,淄博,"A_0_24:0.000172,A_25_29:0.007701,A_30_39:0.778...","female:0.000000,male:1.000000"


In [15]:
def get_multi_hot(x, k):
    str_list = x.split(',')
    res = [float(s.split(':')[1]) for s in str_list]
    for _ in range(k-len(res)):
        print('加0')
        res.append(0)
    if len(res) == 16:
        res = np.array(res)
        res = res[[0,4,8,12]]
        res = list(res)
    if (k == 2) & (len(res) == 4):
        res = np.array(res)
        res = res[[0,2]]
        res = list(res)
    if len(res) != k:
        print(x)
    return res

In [16]:
# 将age和gender都处理成列表的形式
user['new_age'] = user['user_age'].progress_apply(lambda x: get_multi_hot(x, 4))
user['new_gender'] = user['user_gender'].progress_apply(lambda x: get_multi_hot(x, 2))

pandas bar: 100%|██████████| 1538384/1538384 [00:07<00:00, 211960.07it/s]
pandas bar:  15%|█▌        | 235351/1538384 [00:00<00:03, 405506.10it/s]

加0


pandas bar:  23%|██▎       | 357838/1538384 [00:00<00:02, 402642.38it/s]

加0
加0


pandas bar:  44%|████▍     | 681403/1538384 [00:02<00:02, 368601.66it/s]

加0
加0
加0


pandas bar:  55%|█████▌    | 846717/1538384 [00:02<00:01, 402051.29it/s]

加0
加0


pandas bar:  63%|██████▎   | 970315/1538384 [00:03<00:02, 209992.33it/s]

加0


pandas bar:  74%|███████▎  | 1132470/1538384 [00:03<00:01, 332213.53it/s]

加0


pandas bar:  84%|████████▍ | 1297936/1538384 [00:04<00:00, 391428.41it/s]

加0
加0
加0
加0


pandas bar:  95%|█████████▌| 1463764/1538384 [00:04<00:00, 408372.74it/s]

加0
加0


pandas bar: 100%|██████████| 1538384/1538384 [00:05<00:00, 294983.17it/s]

加0





In [17]:
user.drop(['user_age', 'user_gender'], axis=1, inplace=True)
user.rename(columns={'new_age': 'user_age', 'new_gender': 'user_gender'}, inplace=True)

In [18]:
user[['user_device', 'user_system', 'user_province', 'user_city']] = user[['user_device', 'user_system', 'user_province', 'user_city']].fillna('nan')

In [19]:
user.isnull().sum()

user_id          0
user_device      0
user_system      0
user_province    0
user_city        0
user_age         0
user_gender      0
dtype: int64

In [20]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
user[['user_device', 'user_system', 'user_province', 'user_city']] = user[['user_device', 'user_system', 'user_province', 'user_city']].apply(encoder.fit_transform)

In [21]:
uid_dict = dict(zip(user.user_id, user.index))

In [22]:
user['new_id'] = user.index
user.drop(['user_id'], axis=1, inplace=True)
user.rename(columns={'new_id': 'user_id'}, inplace=True)
user = user[['user_id', 'user_device', 'user_system', 'user_province', 'user_city', 'user_gender', 'user_age']]

In [23]:
user[['user_id', 'user_device', 'user_system', 'user_province', 'user_city']].nunique()

user_id          1538384
user_device         3096
user_system            3
user_province        328
user_city            768
dtype: int64

## 处理item特征

In [24]:
item.head(10)

Unnamed: 0,item_id,item_title,item_time,item_picture,item_cluster1,item_cluster2,item_keywords
0,349635709,"拿到c1驾照后,实习期扣分了会怎样?扣12分驾照会吊销么?",1572519971000,9,汽车,汽车/用车,"上班族:8.469502,买车:8.137443,二手车:9.022247,副页:11.21..."
1,361653323,"疫情谣言粉碎机丨接种新冠疫苗后用麻药或致死?盘点最新疫情谣言,别被忽悠了",1624522285000,1,健康,健康/疾病防护治疗及西医用药,"医生:14.760494,吸烟:16.474872,板蓝根:15.597788,板蓝根^^熏..."
2,426732705,"实拍本田飞度:空间真大,8万出头工薪族可选,但内饰能忍?",1610808303000,9,汽车,汽车/买车,"155n:8.979802,polo:7.951116,中控台:5.954278,中网:7...."
3,430221183,搭载135kw电机比亚迪秦plus纯电动版外观更精致,1612581556000,2,汽车,汽车/买车,"etc:12.055207,代表:8.878175,内饰:5.342025,刀片:9.453..."
4,441756326,【提车作业】不顾他人眼光帕萨特phev俘获30老男人浪子心,1618825835000,23,汽车,汽车/买车,"丰田凯美瑞:12.772149,充电器:8.394001,品牌:8.436843,城市:7...."
5,443485341,"魏延有反骨之心都能重用,赵云忠心为什么却不被重用?",1619484501000,4,历史,历史/中国史,"三国:8.979797,五虎将:13.072728,人才:7.532783,保镖:6.811..."
6,447124796,"高考志愿|14个省份新高考录取有变化,这些专业傻傻分不清楚,填志愿看仔细",1624506618000,1,教育,教育/高考,"兴趣:11.558689,家长:11.350382,就业:12.176434,考生:11.5..."
7,448023100,三国煮酒论英雄,1624601704000,1,文化艺术,文化艺术/读书,"历史:9.184261,故事:9.405677\n"
8,452701283,懂我所需—吉利帝豪,1622081779000,19,汽车,汽车/买车,"ec7:14.355725,交互:10.203005,人机交互:10.540608,人机交互..."
9,452933213,"美丽的非洲长河?可以从这条河中,人的形象与命运?",1622121296000,3,旅游,旅游/旅游攻略,"东非:8.944922,东非高原:11.758570,两河:9.025832,伊兹:7.49..."


In [25]:
def convert_time(x):
    try:
        return int(x)
    except:
        return int(1563404932000)

item['item_time'] = item['item_time'].apply(convert_time)

In [26]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
item['item_time'].astype('int64').describe()

count          633391.000
mean    1624972374821.468
std        1291307341.395
min     1563404932000.000
25%     1624678967000.000
50%     1624975577000.000
75%     1625301111000.000
max     1625672120000.000
Name: item_time, dtype: float64

In [27]:
time_dict = dict(zip(item.item_id, item.item_time))

2019-7-18--2021-7-7

In [28]:
item = item[['item_id', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']]

In [29]:
encoder = LabelEncoder()
item[['item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']] = item[['item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']].fillna('nan')
item[['item_picture', 'item_cluster1', 'item_cluster2']] = item[['item_picture', 'item_cluster1', 'item_cluster2']].apply(encoder.fit_transform)

In [30]:
item.isnull().sum()

item_id          0
item_picture     0
item_cluster1    0
item_cluster2    0
item_keywords    0
dtype: int64

### 处理keywords

In [31]:
# 处理关键字信息
def chuli_k(x, prob=False):
    if x == 'nan':
        return x
    else:
        try:
            y = x.split(',')
        except:
            print(x)
        z1 = [] # 存概率
        z2 = [] # 存keyword
        for kw in y: 
            if len(kw.split(':'))==2:
                # 还要去掉^^
                word = kw.split(':')[0].replace('^^','')
                if word != '':
                    z2.append(word)
                    z1.append(kw.split(':')[1])
            else:
                if ':' in kw:
                    # 异形^^:^^火力^^小队:15.721525 处理这种情况
                    word = kw.split(':',1)[1].split(':')[0].replace('^^','')
                    if word != '':
                        z2.append(word)
                        z1.append(kw.split(':',1)[1].split(':')[1])
        if prob:
            return z1
        else:
            return z2
item['keywords'] = item['item_keywords'].progress_apply(lambda x: chuli_k(x))
item['keywords_p'] = item['item_keywords'].progress_apply(lambda x: chuli_k(x, prob=True))

pandas bar: 100%|██████████| 633391/633391 [00:18<00:00, 33636.62it/s]
pandas bar: 100%|██████████| 633391/633391 [00:19<00:00, 31723.68it/s]


In [32]:
# 先得到keywords
keywords = []
def get_k(x):
    if x != 'nan':
        keywords.extend(x)
item['keywords'].progress_apply(lambda x: get_k(x))
keywords = set(keywords)

# 对keywords进行label_encode
# 处理关键字信息，并得到encode的字典,label_encode时间太久了，用dict来做
keywords = list(keywords)
ind = list(range(1,len(keywords)+1))
keys = dict(zip(keywords,ind))
def chuli_kk(x):
    if x == 'nan':
        return x
    else:
        z = []
        for i in range(len(x)):
            z.append(keys[x[i]])
        return z
item['keywords'] = item['keywords'].progress_apply(lambda x: chuli_kk(x))

pandas bar: 100%|██████████| 633391/633391 [00:01<00:00, 616896.47it/s]
pandas bar: 100%|██████████| 633391/633391 [00:10<00:00, 62006.21it/s]


In [33]:
iid_dict = dict(zip(item.item_id, item.index))

In [34]:
item['new_id'] = item.index
item.drop(['item_id'], axis=1, inplace=True)
item.rename(columns={'new_id': 'item_id'}, inplace=True)
item = item[['item_id', 'item_picture', 'item_cluster1', 'item_cluster2', 'keywords', 'keywords_p']]

## 处理训练集和测试集

In [35]:
train.head(1)

Unnamed: 0,user_id,item_id,time,network,refresh,position,label,duration
0,1000014754,463510256,1624843756147,5,0,16,0,0


In [36]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189766959 entries, 0 to 189766958
Data columns (total 8 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   item_id   int64
 2   time      int64
 3   network   int64
 4   refresh   int64
 5   position  int64
 6   label     int64
 7   duration  int64
dtypes: int64(8)
memory usage: 11.3 GB


In [37]:
train_inter = train.sort_values('time', ascending=True)

In [38]:
groups = train_inter.groupby('user_id') # 先排序后分组

In [39]:
# 获得用户历史行为序列
train_set = []
userId2Behavior = {}

for userId, hist in tqdm(groups):
    hist_list = hist['item_id'].tolist()
    label_list = hist['label'].tolist()
    duration_list = hist['duration'].tolist()
    network_list = hist['network'].tolist()
    refresh_list = hist['refresh'].tolist()
    position_list = hist['position'].tolist()
    time_list = hist['time'].tolist()

    positive_list = [] # 正样本列表

    length = len(hist_list)

    if 1 not in label_list:
        for i in range(length):
            # [userId, itemId, time, network, refresh, position, duration, behaviorId, label]
            train_set.append([userId, hist_list[i], time_list[i], network_list[i], refresh_list[i], \
                position_list[i], duration_list[i], '', label_list[i]])
        continue
    else:
        start = label_list.index(1)

    for i in range(start + 1):
        train_set.append([userId, hist_list[i], time_list[i], network_list[i], refresh_list[i], \
             position_list[i], duration_list[i], '', label_list[i]])
    
    if start == length - 1: # 只有一个正样本，且正样本后没有负样本, 该正样本作为测试集的历史行为
        userId2Behavior[userId] = [hist_list[start]]
        continue
    positive_list.append(hist_list[start])

    # 用户的历史交互序列可以分为多个序列
    for i in range(start + 1, length):
        positive_list_subset = positive_list
        if len(positive_list) > 50:
            positive_list_subset = positive_list[-50:]
        train_set.append([userId, hist_list[i], time_list[i], network_list[i], refresh_list[i],\
             position_list[i], duration_list[i], ','.join(str(v) for v in positive_list_subset), label_list[i]]) # [user_id, item_id, behavior, duration]
        if label_list[i] == 1:
            positive_list.append(hist_list[i])
        if i == (length - 1):
            userId2Behavior[userId] = positive_list

 76%|███████▌  | 1126439/1478694 [29:51<07:40, 764.79it/s]

In [None]:
train_data = pd.DataFrame(train_set, columns=['user_id', 'item_id', 'time', 'network', 'refresh', 'position', 'duration', 'behavior_id', 'label'])
train_data

Unnamed: 0,user_id,item_id,time,network,refresh,position,duration,behavior_id,label
0,17340,462907578,1624623340452,2,0,16,364,[],1
1,17340,462077126,1624623340452,2,0,12,0,"[462907578, 462317087, 462821612, 462464730, 4...",0
2,17340,462317087,1624623340452,2,0,14,19,"[462907578, 462317087, 462821612, 462464730, 4...",1
3,17340,462243474,1624623340452,2,0,6,0,"[462907578, 462317087, 462821612, 462464730, 4...",0
4,17340,462821612,1624623340452,2,0,13,6,"[462907578, 462317087, 462821612, 462464730, 4...",1
...,...,...,...,...,...,...,...,...,...
189766751,2447273764,466036199,1625587022525,2,1,12,0,[],0
189766752,2447273838,466809988,1625587199715,2,1,7,0,[],0
189766753,2447273852,466817295,1625587139950,5,0,7,0,[],0
189766754,2447273854,466083315,1625587158131,2,0,8,0,[],0


## 获得包含用户历史交互记录的测试集

In [None]:
# 构造测试集
cnt = 0
test_set = []
for index, data in tqdm(test.iterrows()):
    userId = data[1]
    docId = data[2]
    network = data[3]
    refresh = data[4]
    if userId in userId2Behavior.keys(): # 新用户，历史行为列表置为 [0]
        positive_list = userId2Behavior[userId]
        if len(positive_list) > 50:
            positive_list = positive_list[-50:]
        test_set.append([userId, docId, network, refresh, ','.join(str(v) for v in positive_list)]) # [userId, itemId, behavior]
    else:
        cnt += 1
        test_set.append([userId, docId, network, refresh, '']) # [userId, itemId, behavior]

50000it [00:02, 19554.45it/s]


In [None]:
test_data = pd.DataFrame(test_set, columns=['user_id', 'item_id', 'network', 'refresh', 'behavior_id'])

In [None]:
test_data

Unnamed: 0,user_id,item_id,network,refresh,behavior_id
0,1375690406,466953548,1625670551352,2,"[466276727, 465516352, 466090289, 466165951, 4..."
1,2440720232,466998093,1625666340961,2,"[464879818, 465304020, 465430261, 465397240, 4..."
2,2215148410,466562014,1625633299623,2,"[464481328, 464384437, 464877227, 465028809, 4..."
3,1439698458,466141989,1625613272489,2,"[462678614, 462625941, 461772388, 462814538, 4..."
4,1443858466,466335797,1625654253069,5,[]
...,...,...,...,...,...
49995,1482434596,466823351,1625614181006,2,"[464639557, 465022489, 464890967, 465064102, 4..."
49996,1403405680,466329372,1625656090576,5,"[464523794, 464953384, 464942014, 464727307, 4..."
49997,1433009930,466837941,1625646622637,5,"[466122617, 466153779, 465997946, 466256953, 4..."
49998,2226316954,467221118,1625670079349,2,"[463304767, 463492789, 463531117, 462765057, 4..."


- 训练集的结束时间为2021.6.24 22:56:53, 训练集的结束时间为2021.7.6 23:59:59
- 测试集的开始时间为2021.7.7 00:00:04, 测试集的结束时间为2021.7.7 23:41:22
- 验证集截取7.6一天的数据

In [None]:
with open(r'./train_online.pkl', 'wb') as f:
    pickle.dump(train_data[['user_id', 'item_id', 'network', 'refresh', 'behavior_id', 'label']].to_numpy(), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [None]:
validation = train_data.loc[train_data['time']>1625500811000]
validation.shape

(15191983, 9)

In [None]:
train_data = train_data.loc[set(train_data.index) - set(validation.index)]
train_data.shape

(174574773, 9)

In [None]:
train_data = train_data[['user_id', 'item_id', 'network', 'refresh', 'behavior_id', 'label']]
validation = validation[['user_id', 'item_id', 'network', 'refresh', 'behavior_id', 'label']]
test = test_data[['user_id', 'item_id', 'network', 'refresh', 'behavior_id']]

In [None]:
train_data

Unnamed: 0,user_id,item_id,network,refresh,behavior_id,label
0,17340,462907578,2,0,[],1
1,17340,462077126,2,0,"[462907578, 462317087, 462821612, 462464730, 4...",0
2,17340,462317087,2,0,"[462907578, 462317087, 462821612, 462464730, 4...",1
3,17340,462243474,2,0,"[462907578, 462317087, 462821612, 462464730, 4...",0
4,17340,462821612,2,0,"[462907578, 462317087, 462821612, 462464730, 4...",1
...,...,...,...,...,...,...
189697619,2447156438,466158873,2,1,[],0
189697620,2447156438,465785748,2,1,[],0
189697621,2447156438,466329097,2,1,[],0
189697622,2447156438,466320209,2,1,[],0


In [None]:
validation

Unnamed: 0,user_id,item_id,network,refresh,behavior_id,label
484,17340,466446910,5,0,"[465148736, 464686134, 464707540, 464993414, 4...",0
485,17340,465871960,5,2,"[465148736, 464686134, 464707540, 464993414, 4...",1
486,17340,466335882,5,2,"[464686134, 464707540, 464993414, 464622349, 4...",0
487,17340,466562014,5,2,"[464686134, 464707540, 464993414, 464622349, 4...",0
488,17340,466412000,5,2,"[464686134, 464707540, 464993414, 464622349, 4...",0
...,...,...,...,...,...,...
189766751,2447273764,466036199,2,1,[],0
189766752,2447273838,466809988,2,1,[],0
189766753,2447273852,466817295,5,0,[],0
189766754,2447273854,466083315,2,0,[],0


In [None]:
test

Unnamed: 0,user_id,item_id,network,refresh,behavior_id
0,1375690406,466953548,1625670551352,2,"[466276727, 465516352, 466090289, 466165951, 4..."
1,2440720232,466998093,1625666340961,2,"[464879818, 465304020, 465430261, 465397240, 4..."
2,2215148410,466562014,1625633299623,2,"[464481328, 464384437, 464877227, 465028809, 4..."
3,1439698458,466141989,1625613272489,2,"[462678614, 462625941, 461772388, 462814538, 4..."
4,1443858466,466335797,1625654253069,5,[]
...,...,...,...,...,...
49995,1482434596,466823351,1625614181006,2,"[464639557, 465022489, 464890967, 465064102, 4..."
49996,1403405680,466329372,1625656090576,5,"[464523794, 464953384, 464942014, 464727307, 4..."
49997,1433009930,466837941,1625646622637,5,"[466122617, 466153779, 465997946, 466256953, 4..."
49998,2226316954,467221118,1625670079349,2,"[463304767, 463492789, 463531117, 462765057, 4..."


# 存储数据

In [None]:
with open(r'./train.pkl', 'wb') as f:
    pickle.dump(train_data.to_numpy(), f, pickle.HIGHEST_PROTOCOL)
with open(r'./test.pkl', 'wb') as f:
    pickle.dump(test.to_numpy(), f, pickle.HIGHEST_PROTOCOL)
with open(r'./validation.pkl', 'wb') as f:
    pickle.dump(validation.to_numpy(), f, pickle.HIGHEST_PROTOCOL)
with open(r'./user_feature.pkl', 'wb') as f:
    pickle.dump(user, f, pickle.HIGHEST_PROTOCOL)
with open(r'./item_feature.pkl', 'wb') as f:
    pickle.dump(item, f, pickle.HIGHEST_PROTOCOL)
with open(r'./id_dict.pkl', 'wb') as f:
    pickle.dump((uid_dict, iid_dict), f, pickle.HIGHEST_PROTOCOL)

In [None]:
import json
info_dict = {'feat_field': {}, 'vocabulary_size':{}, 'feat_type':{'dense':[], 'sparse':[], 'multi-hot':[]}, 'sample_num': train.shape[0], 'user_num': user.shape[0], 'item_num': item.shape[0]}

user_feats = list(user.columns)
user_feats.remove('user_age')
user_feats.remove('user_gender')
for feat in user_feats:
    info_dict['vocabulary_size'][feat] = user[feat].nunique()
info_dict['vocabulary_size']['user_age'] = 4
info_dict['vocabulary_size']['user_gender'] = 2

for feat in list(item.columns):
    info_dict['vocabulary_size'][feat] = item[feat].nunique()

for feat in ['network', 'refresh']:
    info_dict['vocabulary_size'][feat] = train[feat].nunique()
info_dict['vocabulary_size']['network'] = 6
info_dict['vocabulary_size']['day'] = 7
info_dict['vocabulary_size']['hour'] = 24
info_dict['feat_type']['sparse'] = user_feats + list(item.columns) + ['network', 'refresh']
info_dict['feat_type']['multi-hot'] = ['user_age', 'user_gender']
info_dict['feat_field']['user'] = list(user.columns)
info_dict['feat_field']['item'] = list(item.columns)
info_dict['feat_field']['inter'] = list(train.columns)

with open(r'./info.json', 'w') as f:
    json.dump(info_dict, f)

In [38]:
dict(user.nunique())

{'user_id': 1538384,
 'user_device': 3096,
 'user_system': 3,
 'user_province': 328,
 'user_city': 768}

## 处理线下训练集的行为序列

In [4]:
import pickle
with open('./train.pkl', 'rb') as f:
    train = pickle.load(f)
    f.close()

In [None]:
import numpy as np
with open('./train.pkl', 'wb') as f:
    pickle.dump(np.delete(train, 4, axis=1), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [2]:
behaviors =  train[:, 4]
behaviors

array(['', '462907578', '462907578', ..., '', '', ''], dtype=object)

In [5]:
len(behaviors)

174574959

In [2]:
def fix_behavior_length(x, length):
    if len(x) >= length:
        return x[(-1 * length):]
    for i in range(len(x), length):
        x.append(0)
    return x

In [6]:
import numpy as np
from tqdm import tqdm

behavior_data = []
for index, b in enumerate(tqdm(behaviors)):
    if b == '':
        behavior_data.append(np.zeros(5))
    else:
        behavior_data.append(fix_behavior_length(list(map(lambda x: int(x), b.split(','))), 5))

100%|██████████| 174574959/174574959 [23:13<00:00, 125311.10it/s]


In [8]:
len(behavior_data)

174574959

In [10]:
behavior_data[:2]

[array([0., 0., 0., 0., 0.]), [462907578, 0, 0, 0, 0]]

In [9]:
with open('./behavior_train.pkl', 'wb') as f:
    pickle.dump(behavior_data, f, pickle.HIGHEST_PROTOCOL)
    f.close()

## 处理线下验证集的行为序列

In [1]:
import pickle
with open('./validation.pkl', 'rb') as f:
    validation = pickle.load(f)
    f.close()

In [3]:
import numpy as np
with open('./validation.pkl', 'wb') as f:
    pickle.dump(np.delete(validation, 4, axis=1), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [2]:
behaviors = validation[:, 4]
behaviors

array(['464686134,465381362,464622349,464707540,464993414,465165905,465115022,465396605,464711095,465149242,465306246,465509731,465245341,465598264,465603407,465415741,465387988,465707919,464931373,464829824,464902342,465670760,465426190,464938309,465184479,465547883,465209282,464945580,465534532,465418906,464935673,465702944,465790363,465827093,465930516,465321111,465579899,465958102,466029824,466257253,465940297,465973671,466369280,465941692,465827727,466016538,466260424,466277913,466179098,465796340',
       '464686134,465381362,464622349,464707540,464993414,465165905,465115022,465396605,464711095,465149242,465306246,465509731,465245341,465598264,465603407,465415741,465387988,465707919,464931373,464829824,464902342,465670760,465426190,464938309,465184479,465547883,465209282,464945580,465534532,465418906,464935673,465702944,465790363,465827093,465930516,465321111,465579899,465958102,466029824,466257253,465940297,465973671,466369280,465941692,465827727,466016538,466260424,466277913,46

In [3]:
len(behaviors)

15192000

In [4]:
def fix_behavior_length(x, length):
    if len(x) >= length:
        return x[(-1 * length):]
    for i in range(len(x), length):
        x.append(0)
    return x

In [5]:
import numpy as np
from tqdm import tqdm

behavior_data = []
for index, b in enumerate(tqdm(behaviors)):
    if b == '':
        behavior_data.append(np.zeros(5))
    else:
        behavior_data.append(fix_behavior_length(list(map(lambda x: int(x), b.split(','))), 5))

100%|██████████| 15192000/15192000 [02:29<00:00, 101770.74it/s]


In [6]:
behavior_data

[[466016538, 466260424, 466277913, 466179098, 465796340],
 [466016538, 466260424, 466277913, 466179098, 465796340],
 [466016538, 466260424, 466277913, 466179098, 465796340],
 [466016538, 466260424, 466277913, 466179098, 465796340],
 [466016538, 466260424, 466277913, 466179098, 465796340],
 [466260424, 466277913, 466179098, 465796340, 465871960],
 [466260424, 466277913, 466179098, 465796340, 465871960],
 [466277913, 466179098, 465796340, 465871960, 466368404],
 [466277913, 466179098, 465796340, 465871960, 466368404],
 [466277913, 466179098, 465796340, 465871960, 466368404],
 [466179098, 465796340, 465871960, 466368404, 466126343],
 [466179098, 465796340, 465871960, 466368404, 466126343],
 [466179098, 465796340, 465871960, 466368404, 466126343],
 [466179098, 465796340, 465871960, 466368404, 466126343],
 [465796340, 465871960, 466368404, 466126343, 466423704],
 [465796340, 465871960, 466368404, 466126343, 466423704],
 [465871960, 466368404, 466126343, 466423704, 466320999],
 [465871960, 4

In [7]:
len(behavior_data)

15192000

In [8]:
with open('./behavior_validation.pkl', 'wb') as f:
    pickle.dump(behavior_data, f, pickle.HIGHEST_PROTOCOL)
    f.close()

## 处理线上训练集行为序列

In [1]:
import pickle
with open('./train_online.pkl', 'rb') as f:
    train = pickle.load(f)
    f.close()

In [19]:
with open('./train_online.pkl', 'wb') as f:
    pickle.dump(np.delete(train, 4, axis=1), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [2]:
behaviors = train[:, 4]
behaviors

array(['', '462907578', '462907578', ..., '', '', ''], dtype=object)

In [3]:
len(behaviors)

189766959

In [4]:
def fix_behavior_length(x, length):
    if len(x) >= length:
        return x[(-1 * length):]
    for i in range(len(x), length):
        x.append(0)
    return x

In [5]:
import numpy as np
from tqdm import tqdm

behavior_data = []
for index, b in enumerate(tqdm(behaviors)):
    if b == '':
        behavior_data.append(np.zeros(5))
    else:
        behavior_data.append(fix_behavior_length(list(map(lambda x: int(x), b.split(','))), 5))

100%|██████████| 189766959/189766959 [27:11<00:00, 116299.68it/s]


In [6]:
behavior_data

[array([0., 0., 0., 0., 0.]),
 [462907578, 0, 0, 0, 0],
 [462907578, 0, 0, 0, 0],
 [462907578, 462317087, 0, 0, 0],
 [462907578, 462317087, 0, 0, 0],
 [462907578, 462317087, 462821612, 0, 0],
 [462907578, 462317087, 462821612, 462464730, 0],
 [462907578, 462317087, 462821612, 462464730, 0],
 [462907578, 462317087, 462821612, 462464730, 0],
 [462907578, 462317087, 462821612, 462464730, 0],
 [462907578, 462317087, 462821612, 462464730, 0],
 [462907578, 462317087, 462821612, 462464730, 0],
 [462907578, 462317087, 462821612, 462464730, 462490156],
 [462907578, 462317087, 462821612, 462464730, 462490156],
 [462907578, 462317087, 462821612, 462464730, 462490156],
 [462907578, 462317087, 462821612, 462464730, 462490156],
 [462317087, 462821612, 462464730, 462490156, 462098172],
 [462821612, 462464730, 462490156, 462098172, 462489596],
 [462464730, 462490156, 462098172, 462489596, 462103758],
 [462490156, 462098172, 462489596, 462103758, 463137667],
 [462098172, 462489596, 462103758, 463137667

In [7]:
len(behavior_data)

189766959

In [8]:
with open('./behavior_train_online.pkl', 'wb') as f:
    pickle.dump(behavior_data, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [17]:
np.delete(train, 4, axis=1)

array([[17340, 462907578, 2, 0, 1],
       [17340, 462077126, 2, 0, 0],
       [17340, 462317087, 2, 0, 1],
       ...,
       [2447273852, 466817295, 5, 0, 0],
       [2447273854, 466083315, 2, 0, 0],
       [2447273874, 466838519, 2, 0, 0]], dtype=object)

## 处理线上测试集行为序列

In [9]:
import pickle
with open('./test.pkl', 'rb') as f:
    test = pickle.load(f)
    f.close()

In [20]:
with open('./test.pkl', 'wb') as f:
    pickle.dump(np.delete(test, 4, axis=1), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [10]:
behaviors = test[:, 4]
behaviors

array(['466276727,465516352,466090289,466165951,466211051,466304869,466277352,466242880,466160502,465776408,466070242,466152627,466152552,466161038,466161040,466363761,465976281,466024365,465964472,465725391,465708664,466084207,466283471,466215970,465822476,466074235,466152716,466036630,466160820,466156822,466444257,465719059,466343892,466404226,466470014,466560412,466560430,466402615,466560505,466463891,466396939,466497105,466639094,466639171,466831193,466876421,466791095,466563726,466798141,466269036',
       '464879818,465304020,465397240,465430261,465313977,465304330,465320086,465114804,465418359,465214682,465268915,464757598,465295015,464800850,465429593,465490828,464597827,464719505,465425217,465652204,465855563,466084738,466318936,466201336,465897698,466348526,466189109,466315782,466282707,466337457,466005010,465935004,466332734,466085577,466276907,466170589,465931138,465794257,466017667,465817755,466063811,466089283,465958923,466284129,465881308,466746869,466499412,466595466,46

In [11]:
len(behaviors)

50000

In [12]:
def fix_behavior_length(x, length):
    if len(x) >= length:
        return x[(-1 * length):]
    for i in range(len(x), length):
        x.append(0)
    return x

In [13]:
import numpy as np
from tqdm import tqdm

behavior_data = []
for index, b in enumerate(tqdm(behaviors)):
    if b == '':
        behavior_data.append(np.zeros(5))
    else:
        behavior_data.append(fix_behavior_length(list(map(lambda x: int(x), b.split(','))), 5))

100%|██████████| 50000/50000 [00:00<00:00, 109746.53it/s]


In [14]:
behavior_data

[[466876421, 466791095, 466563726, 466798141, 466269036],
 [466746869, 466499412, 466595466, 466271836, 466673213],
 [465768168, 466412000, 466434630, 466416050, 466541231],
 [465459411, 465870698, 465877686, 466044246, 466105862],
 array([0., 0., 0., 0., 0.]),
 [464017885, 465140484, 465087157, 465272680, 466195551],
 [465128171, 465206949, 465849204, 466345446, 466364311],
 [464066222, 464539291, 464482997, 464584356, 465399704],
 [466265132, 466390209, 466266191, 466283317, 466465644],
 [466070635, 466592349, 466724367, 466625994, 466625994],
 [466033721, 466267731, 466141989, 465753089, 465781204],
 [466347594, 466608596, 465967167, 466754868, 466783111],
 [463473302, 464244162, 464821092, 465822412, 466004979],
 [465119564, 465349981, 465307226, 0, 0],
 [465701770, 465765707, 465849204, 466230162, 465670778],
 [466749855, 466606169, 466403924, 466705984, 466848382],
 [466207328, 466368573, 466464709, 466096919, 466557292],
 [463017414, 464172111, 464402095, 466007382, 466339713],


In [15]:
len(behavior_data)

50000

In [16]:
with open('./behavior_test.pkl', 'wb') as f:
    pickle.dump(behavior_data, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [None]:
import pickle
with o