In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)
from tqdm import tqdm
tqdm.pandas(desc='pandas bar')
import pickle
from sklearn.preprocessing import LabelEncoder
import time

# 读数据
item读取有些麻烦, 需要按行来读. 其他数据正常使用pandas来读.

In [2]:
data_path = r'./data_for_ctr_predict/'
user = pd.read_table(data_path+'user_info.txt', sep='\t', names=['user_id', 'user_device', 'user_system', 'user_province', 'user_city', 'user_age', 'user_gender'])
#item = pd.read_table(data_path+'doc_info.txt', sep='\t', names=['item_id', 'item_title', 'item_time', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords'])
train = pd.read_table(data_path+'train_data.txt', sep='\t', names=['user_id', 'item_id', 'time', 'network', 'refresh', 'position', 'label', 'duration'])
test = pd.read_table(data_path+'test_data.txt', sep='\t', names=['id', 'user_id', 'item_id', 'time', 'network', 'refresh'])

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [4]:
data_path = r'./data_for_ctr_predict/'
with open(data_path+'doc_info.txt', 'r') as f:
    lines = f.readlines()

item = []
for line in tqdm(lines):
    unit = line.split('\t')
    if len(unit)<7:
        for _ in range(7-len(unit)):
            unit.append(np.nan)
    item.append(unit)
#item = np.array(item)

100%|██████████| 633391/633391 [00:02<00:00, 278138.74it/s]


In [5]:
item = pd.DataFrame(item, columns=['item_id', 'item_title', 'item_time', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords'])
item['item_id'] = item['item_id'].astype('int64')

# 检查数据

## 检查是否存在冷启动
验证了一下, 训练集和测试集内都不存在冷启动问题

In [6]:
len(set(train['item_id'].unique())-set(item['item_id'].unique())), len(set(train['user_id'].unique())-set(user['user_id'].unique()))

(0, 0)

In [7]:
len(set(test['item_id'].unique())-set(item['item_id'].unique())), len(set(test['user_id'].unique())-set(user['user_id'].unique()))

(0, 0)

## 检查空缺值

In [8]:
user.isnull().sum() / user.shape[0]

user_id          0.000000
user_device      0.044036
user_system      0.044077
user_province    0.059972
user_city        0.062450
user_age         0.038931
user_gender      0.038074
dtype: float64

In [9]:
user['user_age'].fillna('A_0_24:0.25,A_25_29:0.25,A_30_39:0.25,A_40+:0.25', inplace=True)
user['user_gender'].fillna('female:0.5,male:0.5', inplace=True)

In [10]:
item.isnull().sum() / item.shape[0]

item_id          0.000000
item_title       0.000000
item_time        0.000382
item_picture     0.000382
item_cluster1    0.000444
item_cluster2    0.000445
item_keywords    0.017337
dtype: float64

In [11]:
train.isnull().sum() / train.shape[0]

user_id     0.0
item_id     0.0
time        0.0
network     0.0
refresh     0.0
position    0.0
label       0.0
duration    0.0
dtype: float64

In [12]:
test.isnull().sum() / test.shape[0]

id         0.0
user_id    0.0
item_id    0.0
time       0.0
network    0.0
refresh    0.0
dtype: float64

# 处理数据

## 处理user特征

In [13]:
user.head(10)

Unnamed: 0,user_id,user_device,user_system,user_province,user_city,user_age,user_gender
0,1000372820,TAS-AN00,Android,广东,广州,"A_0_24:0.404616,A_25_29:0.059027,A_30_39:0.516...","female:0.051339,male:0.948661"
1,1000652892,PACM00,Android,河北,唐山,"A_0_24:0.615458,A_25_29:0.086233,A_30_39:0.141...","female:0.280295,male:0.719705"
2,1000908852,MI6X,Android,上海,上海,"A_0_24:0.123255,A_25_29:0.208225,A_30_39:0.298...","female:0.000000,male:1.000000"
3,1001168798,iPhone11,IOS,,,"A_0_24:0.436296,A_25_29:0.489370,A_30_39:0.061...","female:0.870710,male:0.129290"
4,1001305614,M2103K19C,Android,江苏,苏州,"A_0_24:0.006632,A_25_29:0.043408,A_30_39:0.350...","female:0.000000,male:1.000000"
5,1001309700,LYA-AL10,Android,黑龙江,哈尔滨,"A_0_24:0.413768,A_25_29:0.437501,A_30_39:0.141...","female:0.000000,male:1.000000"
6,1001339860,RedmiNote7Pro,Android,天津,天津,"A_0_24:0.067328,A_25_29:0.029283,A_30_39:0.303...","female:0.155445,male:0.844555"
7,1001384888,M2007J22C,Android,河北,石家庄,"A_0_24:0.008414,A_25_29:0.027505,A_30_39:0.161...","female:0.000000,male:1.000000"
8,100142658,JEF-AN20,Android,重庆,重庆,"A_0_24:0.160670,A_25_29:0.695923,A_30_39:0.109...","female:1.000000,male:0.000000"
9,1001439274,HMA-AL00,Android,山东,淄博,"A_0_24:0.000172,A_25_29:0.007701,A_30_39:0.778...","female:0.000000,male:1.000000"


In [14]:
def get_multi_hot(x, k):
    str_list = x.split(',')
    res = [float(s.split(':')[1]) for s in str_list]
    for _ in range(k-len(res)):
        print('加0')
        res.append(0)
    if len(res) == 16:
        res = np.array(res)
        res = res[[0,4,8,12]]
        res = list(res)
    if (k == 2) & (len(res) == 4):
        res = np.array(res)
        res = res[[0,2]]
        res = list(res)
    if len(res) != k:
        print(x)
    return res

In [15]:
# 将age和gender都处理成列表的形式
user['new_age'] = user['user_age'].progress_apply(lambda x: get_multi_hot(x, 4))
user['new_gender'] = user['user_gender'].progress_apply(lambda x: get_multi_hot(x, 2))

pandas bar: 100%|██████████| 1538384/1538384 [00:06<00:00, 223436.03it/s]
pandas bar:  15%|█▌        | 234516/1538384 [00:00<00:03, 404714.12it/s]

加0


pandas bar:  23%|██▎       | 357896/1538384 [00:00<00:02, 403481.17it/s]

加0
加0


pandas bar:  45%|████▍     | 687693/1538384 [00:02<00:02, 375377.42it/s]

加0
加0
加0


pandas bar:  53%|█████▎    | 813255/1538384 [00:02<00:01, 402893.08it/s]

加0
加0


pandas bar:  64%|██████▎   | 979612/1538384 [00:03<00:02, 241751.12it/s]

加0


pandas bar:  74%|███████▍  | 1145760/1538384 [00:03<00:01, 352757.10it/s]

加0


pandas bar:  85%|████████▌ | 1312477/1538384 [00:04<00:00, 400702.95it/s]

加0
加0
加0
加0


pandas bar:  94%|█████████▎| 1438637/1538384 [00:04<00:00, 413802.47it/s]

加0
加0


pandas bar: 100%|██████████| 1538384/1538384 [00:05<00:00, 299311.07it/s]

加0





In [16]:
user.drop(['user_age', 'user_gender'], axis=1, inplace=True)
user.rename(columns={'new_age': 'user_age', 'new_gender': 'user_gender'}, inplace=True)

In [17]:
user[['user_device', 'user_system', 'user_province', 'user_city']] = user[['user_device', 'user_system', 'user_province', 'user_city']].fillna('nan')

In [18]:
user.isnull().sum()

user_id          0
user_device      0
user_system      0
user_province    0
user_city        0
user_age         0
user_gender      0
dtype: int64

In [19]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
user[['user_device', 'user_system', 'user_province', 'user_city']] = user[['user_device', 'user_system', 'user_province', 'user_city']].apply(encoder.fit_transform)

In [20]:
uid_dict = dict(zip(user.user_id, user.index))

In [21]:
user['new_id'] = user.index
user.drop(['user_id'], axis=1, inplace=True)
user.rename(columns={'new_id': 'user_id'}, inplace=True)
user = user[['user_id', 'user_device', 'user_system', 'user_province', 'user_city', 'user_gender', 'user_age']]

In [22]:
user[['user_id', 'user_device', 'user_system', 'user_province', 'user_city']].nunique()

user_id          1538384
user_device         3096
user_system            3
user_province        328
user_city            768
dtype: int64

## 处理item特征

In [23]:
item.head(10)

Unnamed: 0,item_id,item_title,item_time,item_picture,item_cluster1,item_cluster2,item_keywords
0,349635709,"拿到c1驾照后,实习期扣分了会怎样?扣12分驾照会吊销么?",1572519971000,9,汽车,汽车/用车,"上班族:8.469502,买车:8.137443,二手车:9.022247,副页:11.21..."
1,361653323,"疫情谣言粉碎机丨接种新冠疫苗后用麻药或致死?盘点最新疫情谣言,别被忽悠了",1624522285000,1,健康,健康/疾病防护治疗及西医用药,"医生:14.760494,吸烟:16.474872,板蓝根:15.597788,板蓝根^^熏..."
2,426732705,"实拍本田飞度:空间真大,8万出头工薪族可选,但内饰能忍?",1610808303000,9,汽车,汽车/买车,"155n:8.979802,polo:7.951116,中控台:5.954278,中网:7...."
3,430221183,搭载135kw电机比亚迪秦plus纯电动版外观更精致,1612581556000,2,汽车,汽车/买车,"etc:12.055207,代表:8.878175,内饰:5.342025,刀片:9.453..."
4,441756326,【提车作业】不顾他人眼光帕萨特phev俘获30老男人浪子心,1618825835000,23,汽车,汽车/买车,"丰田凯美瑞:12.772149,充电器:8.394001,品牌:8.436843,城市:7...."
5,443485341,"魏延有反骨之心都能重用,赵云忠心为什么却不被重用?",1619484501000,4,历史,历史/中国史,"三国:8.979797,五虎将:13.072728,人才:7.532783,保镖:6.811..."
6,447124796,"高考志愿|14个省份新高考录取有变化,这些专业傻傻分不清楚,填志愿看仔细",1624506618000,1,教育,教育/高考,"兴趣:11.558689,家长:11.350382,就业:12.176434,考生:11.5..."
7,448023100,三国煮酒论英雄,1624601704000,1,文化艺术,文化艺术/读书,"历史:9.184261,故事:9.405677\n"
8,452701283,懂我所需—吉利帝豪,1622081779000,19,汽车,汽车/买车,"ec7:14.355725,交互:10.203005,人机交互:10.540608,人机交互..."
9,452933213,"美丽的非洲长河?可以从这条河中,人的形象与命运?",1622121296000,3,旅游,旅游/旅游攻略,"东非:8.944922,东非高原:11.758570,两河:9.025832,伊兹:7.49..."


In [24]:
def convert_time(x):
    try:
        return int(x)
    except:
        return int(1563404932000)

item['item_time'] = item['item_time'].apply(convert_time)

In [25]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
item['item_time'].astype('int64').describe()

count          633391.000
mean    1624972374821.468
std        1291307341.395
min     1563404932000.000
25%     1624678967000.000
50%     1624975577000.000
75%     1625301111000.000
max     1625672120000.000
Name: item_time, dtype: float64

In [26]:
time_dict = dict(zip(item.item_id, item.item_time))

2019-7-18--2021-7-7

In [27]:
item = item[['item_id', 'item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']]

In [28]:
encoder = LabelEncoder()
item[['item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']] = item[['item_picture', 'item_cluster1', 'item_cluster2', 'item_keywords']].fillna('nan')
item[['item_picture', 'item_cluster1', 'item_cluster2']] = item[['item_picture', 'item_cluster1', 'item_cluster2']].apply(encoder.fit_transform)

In [29]:
item.isnull().sum()

item_id          0
item_picture     0
item_cluster1    0
item_cluster2    0
item_keywords    0
dtype: int64

### 处理keywords

In [30]:
# 处理关键字信息
def chuli_k(x, prob=False):
    if x == 'nan':
        return x
    else:
        try:
            y = x.split(',')
        except:
            print(x)
        z1 = [] # 存概率
        z2 = [] # 存keyword
        for kw in y: 
            if len(kw.split(':'))==2:
                # 还要去掉^^
                word = kw.split(':')[0].replace('^^','')
                if word != '':
                    z2.append(word)
                    z1.append(kw.split(':')[1])
            else:
                if ':' in kw:
                    # 异形^^:^^火力^^小队:15.721525 处理这种情况
                    word = kw.split(':',1)[1].split(':')[0].replace('^^','')
                    if word != '':
                        z2.append(word)
                        z1.append(kw.split(':',1)[1].split(':')[1])
        if prob:
            return z1
        else:
            return z2
item['keywords'] = item['item_keywords'].progress_apply(lambda x: chuli_k(x))
item['keywords_p'] = item['item_keywords'].progress_apply(lambda x: chuli_k(x, prob=True))

pandas bar: 100%|██████████| 633391/633391 [00:18<00:00, 33786.05it/s]
pandas bar: 100%|██████████| 633391/633391 [00:20<00:00, 31630.37it/s]


In [31]:
# 先得到keywords
keywords = []
def get_k(x):
    if x != 'nan':
        keywords.extend(x)
item['keywords'].progress_apply(lambda x: get_k(x))
keywords = set(keywords)

# 对keywords进行label_encode
# 处理关键字信息，并得到encode的字典,label_encode时间太久了，用dict来做
keywords = list(keywords)
ind = list(range(1,len(keywords)+1))
keys = dict(zip(keywords,ind))
def chuli_kk(x):
    if x == 'nan':
        return x
    else:
        z = []
        for i in range(len(x)):
            z.append(keys[x[i]])
        return z
item['keywords'] = item['keywords'].progress_apply(lambda x: chuli_kk(x))

pandas bar: 100%|██████████| 633391/633391 [00:00<00:00, 703649.71it/s]
pandas bar: 100%|██████████| 633391/633391 [00:10<00:00, 62927.82it/s]


In [32]:
iid_dict = dict(zip(item.item_id, item.index))

In [33]:
item['new_id'] = item.index
item.drop(['item_id'], axis=1, inplace=True)
item.rename(columns={'new_id': 'item_id'}, inplace=True)
item = item[['item_id', 'item_picture', 'item_cluster1', 'item_cluster2', 'keywords', 'keywords_p']]

## 处理训练集和测试集

In [34]:
train.head(1)

Unnamed: 0,user_id,item_id,time,network,refresh,position,label,duration
0,1000014754,463510256,1624843756147,5,0,16,0,0


In [35]:
train_inter = train.sort_values('time', ascending=True)

In [36]:
groups = train_inter.groupby('user_id') # 先排序后分组

In [37]:
# 获得用户历史行为序列
train_set = []
userId2Behavior = {}

for userId, hist in tqdm(groups):
    hist_list = hist['item_id'].tolist()
    label_list = hist['label'].tolist()
    network_list = hist['network'].tolist()
    refresh_list = hist['refresh'].tolist()
    time_list = hist['time'].tolist()

    positive_list = [] # 正样本列表

    length = len(hist_list)

    if 1 not in label_list:
        for i in range(length):
            # [userId, itemId, time, network, refresh, behaviorId, label]
            train_set.append([userId, hist_list[i], time_list[i], network_list[i], refresh_list[i], [], label_list[i]])
        continue
    else:
        start = label_list.index(1)

    for i in range(start + 1):
        train_set.append([userId, hist_list[i], time_list[i], network_list[i], refresh_list[i], [], label_list[i]])
    
    if start == length - 1: # 只有一个正样本，且正样本后没有负样本, 该正样本作为测试集的历史行为
        userId2Behavior[userId] = [hist_list[start]]
        continue
    positive_list.append(hist_list[start])

    # 用户的历史交互序列可以分为多个序列
    for i in range(start + 1, length):
        positive_list_subset = positive_list
        if len(positive_list) > 20:
            positive_list_subset = positive_list[-20:]
        train_set.append([userId, hist_list[i], time_list[i], network_list[i], refresh_list[i], np.array(positive_list_subset).astype(int), label_list[i]])
        if label_list[i] == 1:
            positive_list.append(hist_list[i])
        if i == (length - 1):
            userId2Behavior[userId] = positive_list

100%|██████████| 1478694/1478694 [20:48<00:00, 1184.83it/s]


In [38]:
train_data = pd.DataFrame(train_set, columns=['user_id', 'item_id', 'time', 'network', 'refresh', 'behavior_id', 'label'])
train_data

Unnamed: 0,user_id,item_id,time,network,refresh,behavior_id,label
0,17340,462907578,1624623340452,2,0,[],1
1,17340,462077126,1624623340452,2,0,[462907578],0
2,17340,462317087,1624623340452,2,0,[462907578],1
3,17340,462243474,1624623340452,2,0,"[462907578, 462317087]",0
4,17340,462821612,1624623340452,2,0,"[462907578, 462317087]",1
...,...,...,...,...,...,...,...
189766954,2447273764,466036199,1625587022525,2,1,[],0
189766955,2447273838,466809988,1625587199715,2,1,[],0
189766956,2447273852,466817295,1625587139950,5,0,[],0
189766957,2447273854,466083315,1625587158131,2,0,[],0


## 获得包含用户历史交互记录的测试集

In [39]:
# 构造测试集
cnt = 0
test_set = []
for index, data in tqdm(test.iterrows()):
    userId = data[1]
    docId = data[2]
    network = data[3]
    refresh = data[4]
    if userId in userId2Behavior.keys(): # 新用户，历史行为列表置为 [0]
        positive_list = userId2Behavior[userId]
        if len(positive_list) > 20:
            positive_list = positive_list[-20:]
        test_set.append([userId, docId, network, refresh, np.array(positive_list).astype(int)]) # [userId, itemId, behavior]
    else:
        cnt += 1
        test_set.append([userId, docId, network, refresh, []]) # [userId, itemId, behavior]

50000it [00:02, 17548.63it/s]


In [42]:
cnt

2367

In [40]:
test_data = pd.DataFrame(test_set, columns=['user_id', 'item_id', 'network', 'refresh', 'behavior_id'])

In [41]:
test_data

Unnamed: 0,user_id,item_id,network,refresh,behavior_id
0,1375690406,466953548,1625670551352,2,"[466444257, 465719059, 466343892, 466404226, 4..."
1,2440720232,466998093,1625666340961,2,"[466005010, 465935004, 466332734, 466085577, 4..."
2,2215148410,466562014,1625633299623,2,"[465418363, 465958871, 465783413, 465767991, 4..."
3,1439698458,466141989,1625613272489,2,"[464543500, 464880043, 464704649, 464338931, 4..."
4,1443858466,466335797,1625654253069,5,[]
...,...,...,...,...,...
49995,1482434596,466823351,1625614181006,2,"[465676688, 466095142, 465649220, 465871960, 4..."
49996,1403405680,466329372,1625656090576,5,"[465112373, 465064102, 465663478, 465575523, 4..."
49997,1433009930,466837941,1625646622637,5,"[466004179, 465914995, 466596644, 465767169, 4..."
49998,2226316954,467221118,1625670079349,2,"[463473330, 463587415, 463465317, 464011363, 4..."


- 训练集的结束时间为2021.6.24 22:56:53, 训练集的结束时间为2021.7.6 23:59:59
- 测试集的开始时间为2021.7.7 00:00:04, 测试集的结束时间为2021.7.7 23:41:22
- 验证集截取7.6一天的数据

## 存储数据

In [43]:
# 构建线上训练集
with open(r'./train_online.pkl', 'wb') as f:
    pickle.dump((train_data[['user_id', 'item_id', 'network', 'refresh', 'label']].to_numpy()).astype(int), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [50]:
# 构建线上行为序列
with open('./behavior_train_online_all.pkl', 'wb') as f:
    pickle.dump(train_data['behavior_id'].values, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [51]:
# 构建线上测试集
with open('./test.pkl', 'wb') as f:
    pickle.dump(test[['user_id', 'item_id', 'network', 'refresh']].values.astype(int), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [53]:
# 构建线上行为序列
with open('./behavior_test_all.pkl', 'wb') as f:
    pickle.dump(test_data['behavior_id'].values, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [54]:
# 构建线下验证集
validation = train_data.loc[train_data['time']>1625500811000]
with open('./validation.pkl', 'wb') as f:
    pickle.dump(validation[['user_id', 'item_id', 'network', 'refresh', 'label']].values.astype(int), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [55]:
# 构建线下验证行为序列
behavior_validation = validation['behavior_id'].values
with open('./behavior_validation_all.pkl', 'wb') as f:
    pickle.dump(behavior_validation, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [None]:
train_data_offline = train_data.loc[set(train_data.index) - set(validation.index)]

In [58]:
# 构建线下训练集
with open('./train.pkl', 'wb') as f:
    pickle.dump(train_data.loc[set(train_data.index) - set(validation.index)][['user_id', 'item_id', 'network', 'refresh', 'label']].values.astype(int), f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [59]:
# 构建线下训练行为序列
with open('./behavior_train_all.pkl', 'wb') as f:
    pickle.dump(train_data.loc[set(train_data.index) - set(validation.index)]['behavior_id'].values, f, pickle.HIGHEST_PROTOCOL)
    f.close()

# 存储数据

In [56]:
with open(r'./user_feature.pkl', 'wb') as f:
    pickle.dump(user, f, pickle.HIGHEST_PROTOCOL)
with open(r'./item_feature.pkl', 'wb') as f:
    pickle.dump(item, f, pickle.HIGHEST_PROTOCOL)
with open(r'./id_dict.pkl', 'wb') as f:
    pickle.dump((uid_dict, iid_dict), f, pickle.HIGHEST_PROTOCOL)

In [None]:
import json
info_dict = {'feat_field': {}, 'vocabulary_size':{}, 'feat_type':{'dense':[], 'sparse':[], 'multi-hot':[]}, 'sample_num': train.shape[0], 'user_num': user.shape[0], 'item_num': item.shape[0]}

user_feats = list(user.columns)
user_feats.remove('user_age')
user_feats.remove('user_gender')
for feat in user_feats:
    info_dict['vocabulary_size'][feat] = user[feat].nunique()
info_dict['vocabulary_size']['user_age'] = 4
info_dict['vocabulary_size']['user_gender'] = 2

for feat in list(item.columns):
    info_dict['vocabulary_size'][feat] = item[feat].nunique()

for feat in ['network', 'refresh']:
    info_dict['vocabulary_size'][feat] = train[feat].nunique()
info_dict['vocabulary_size']['network'] = 6
info_dict['vocabulary_size']['day'] = 7
info_dict['vocabulary_size']['hour'] = 24
info_dict['feat_type']['sparse'] = user_feats + list(item.columns) + ['network', 'refresh']
info_dict['feat_type']['multi-hot'] = ['user_age', 'user_gender']
info_dict['feat_field']['user'] = list(user.columns)
info_dict['feat_field']['item'] = list(item.columns)
info_dict['feat_field']['inter'] = list(train.columns)

with open(r'./info.json', 'w') as f:
    json.dump(info_dict, f)

In [38]:
dict(user.nunique())

{'user_id': 1538384,
 'user_device': 3096,
 'user_system': 3,
 'user_province': 328,
 'user_city': 768}

## 将行为序列处理为定长

In [2]:
length = 15

In [3]:
import pickle
import numpy as np

# 将用户行为序列处理为 length 长度,
def fix_behavior_length(x, length):
    x = list(x)
    if len(x) >= length:
        return x[(-1 * length):]
    for i in range(len(x), length):
        x.append(0)
    return x

In [4]:
# 处理线下训练行为序列
with open('./behavior_train_all.pkl', 'rb') as f:
    behavior_train = pickle.load(f)
    f.close()

In [5]:
behavior_train

array([list([]), array([462907578]), array([462907578]), ..., list([]),
       list([]), list([])], dtype=object)

In [8]:
behavior_train = list(map(lambda x: fix_behavior_length(x, length), behavior_train))

In [12]:
behavior_train = np.array(behavior_train).astype(int)

In [13]:
behavior_train

array([[        0,         0,         0, ...,         0,         0,
                0],
       [462907578,         0,         0, ...,         0,         0,
                0],
       [462907578,         0,         0, ...,         0,         0,
                0],
       ...,
       [        0,         0,         0, ...,         0,         0,
                0],
       [        0,         0,         0, ...,         0,         0,
                0],
       [        0,         0,         0, ...,         0,         0,
                0]])

In [14]:
with open('./behavior_train.pkl', 'wb') as f:
    pickle.dump(behavior_train, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [4]:
# 处理线下验证集
with open('./behavior_validation_all.pkl', 'rb') as f:
    behavior_validation = pickle.load(f)
    f.close()

In [5]:
behavior_validation = list(map(lambda x: fix_behavior_length(x, length), behavior_validation))

In [7]:
behavior_validation = np.array(behavior_validation).astype(int)

In [8]:
with open('./behavior_validation.pkl', 'wb') as f:
    pickle.dump(behavior_validation, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [9]:
# 处理线上训练集
with open('./behavior_train_online_all.pkl', 'rb') as f:
    behavior_train_online = pickle.load(f)
    f.close()

In [10]:
behavior_train_online = list(map(lambda x: fix_behavior_length(x, length), behavior_train_online))

In [11]:
behavior_train_online = np.array(behavior_train_online).astype(int)

In [None]:
behavior_train_online

In [12]:
with open('./behavior_train_online.pkl', 'wb') as f:
    pickle.dump(behavior_train_online, f, pickle.HIGHEST_PROTOCOL)
    f.close()

In [None]:
# 处理线上测试行为序列
with open('./behavior_test_all.pkl', 'rb') as f:
    behavior_test = pickle.load(f)
    f.close()

In [None]:
behavior_test = list(map(lambda x: fix_behavior_length(x, length), behavior_test))

In [None]:
behavior_test = np.array(behavior_test).astype(int)

In [None]:
with open('./behavior_test.pkl', 'wb') as f:
    pickle.dump(behavior_test, f, pickle.HIGHEST_PROTOCOL)
    f.close()