In [1]:
import pandas as pd
from tqdm import tqdm
import os
import warnings

# Show every column and every row when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully qualified option name. The bare 'precision' pattern is matched
# against all registered options and becomes ambiguous (OptionError) on pandas
# versions that also register 'styler.format.precision'.
pd.set_option('display.precision', 10)

# Suppress all Python warnings for the remainder of the session.
warnings.filterwarnings('ignore')

In [2]:
# Index of the latest phase; phases 0..current_phase (inclusive) are processed.
current_phase = 9
phases = list(range(current_phase + 1))

In [32]:
# click_list collects every click log (train and test) of every phase;
# test_qtime_list collects the per-phase query rows that have to be predicted.
click_list = []
test_qtime_list = []

for i in tqdm(phases):
    # Train click log of this phase (the CSV files carry no header row).
    train_click_path = '../data/underexpose_train/underexpose_train_click-{}.csv'.format(i)
    df_click_train = pd.read_csv(train_click_path, header=None)
    df_click_train.columns = ['user_id', 'item_id', 'time']
    df_click_train['phase'] = i
    click_list.append(df_click_train)

    # Test click log of this phase; appended right after the train log so the
    # order inside click_list stays train, test, train, test, ...
    test_click_path = '../data/underexpose_test/underexpose_test_click-{}/underexpose_test_click-{}.csv'.format(i, i)
    df_click_test = pd.read_csv(test_click_path, header=None)
    df_click_test.columns = ['user_id', 'item_id', 'time']
    df_click_test['phase'] = i
    click_list.append(df_click_test)

    # Query rows of this phase: one (user_id, query_time) pair per row. The
    # item is unknown, so item_id is filled with the placeholder -1.
    test_qtime_path = '../data/underexpose_test/underexpose_test_click-{}/underexpose_test_qtime-{}.csv'.format(i, i)
    df_qtime_test = pd.read_csv(test_qtime_path, header=None)
    df_qtime_test.columns = ['user_id', 'query_time']
    df_qtime_test['item_id'] = -1
    df_qtime_test['phase'] = i
    test_qtime_list.append(df_qtime_test)

100%|██████████| 10/10 [00:01<00:00,  8.88it/s]


In [33]:
# Stack all click logs, order them by user and then by click time, and
# replace the per-file row labels with a fresh 0..n-1 index.
df_click = (
    pd.concat(click_list)
    .sort_values(by=['user_id', 'time'])
    .reset_index(drop=True)
)

In [34]:
# Preview the first rows of the combined click log.
df_click.head()

Unnamed: 0,user_id,item_id,time,phase
0,1,78142,0.9837416195,0
1,1,26646,0.9837566561,0
2,1,89568,0.9837634375,0
3,1,76240,0.9837704328,0
4,1,87533,0.9837895071,0


In [35]:
# Stack the per-phase query rows, order them by user and then by query time,
# and renumber the rows from 0.
df_test_qtime = (
    pd.concat(test_qtime_list)
    .sort_values(by=['user_id', 'query_time'])
    .reset_index(drop=True)
)
df_test_qtime.head()

Unnamed: 0,user_id,query_time,item_id,phase
0,1,0.9839420823,-1,1
1,2,0.9840615264,-1,2
2,3,0.9841167499,-1,3
3,7,0.984279972,-1,7
4,8,0.9842500685,-1,8


In [48]:
# For every test phase, collect its distinct user ids as a sorted list.
df_ = (
    df_test_qtime.groupby(['phase'])['user_id']
    .apply(lambda ids: sorted(set(ids)))
    .reset_index()
)

# Map phase -> sorted list of that phase's test user ids.
phase_testusers_dict = {
    phase_key: user_ids
    for phase_key, user_ids in zip(df_['phase'], df_['user_id'])
}

# Report the number of test users per phase. Phases are visited in the order
# in which they first appear in df_test_qtime, not in numeric order.
for i in tqdm(df_test_qtime['phase'].unique()):
    n_users = len(phase_testusers_dict[i])
    print('phase {} 测试集用户数 {}'.format(i, n_users))

100%|██████████| 10/10 [00:00<00:00, 21509.25it/s]

phase 1 测试集用户数 1726
phase 2 测试集用户数 1690
phase 3 测试集用户数 1675
phase 7 测试集用户数 1797
phase 8 测试集用户数 1818
phase 9 测试集用户数 1752
phase 0 测试集用户数 1663
phase 4 测试集用户数 1708
phase 5 测试集用户数 1798
phase 6 测试集用户数 1821





In [60]:
# Split the click log of every (phase, user_id) group into history and target:
#   * groups with a test query: keep the clicks that precede the query time
#     (and verify that nothing at or after the query time is present);
#   * all other groups ("train users"): hold out the last click as the label
#     and keep everything before it as history.
click_list = []
train_qtime_list = []

# (phase, user_id) -> query_time, built once so the loop does a constant-time
# dictionary lookup instead of scanning df_test_qtime for every group.
# drop_duplicates keeps the first row per key, which is the row the previous
# per-group filter selected with `.values[0]`.
test_qtime_lookup = (
    df_test_qtime.drop_duplicates(['phase', 'user_id'])
    .set_index(['phase', 'user_id'])['query_time']
    .to_dict()
)

# All clicks, grouped by phase and user.
groups = df_click.groupby(['phase', 'user_id'])

for (phase, user_id), g in tqdm(groups):
    group_key = (phase, user_id)

    if group_key in test_qtime_lookup:
        # Test user: this phase has a query row for this user.
        qtime = test_qtime_lookup[group_key]
        # History is every click strictly before the query time.
        test_click = g[g['time'] < qtime]
        # The test logs are expected to contain no click at or after the query
        # time; fail loudly if that assumption is violated.
        assert test_click.shape[0] == g.shape[0], (
            'phase {} user {}: found clicks at or after query_time {}'.format(
                phase, user_id, qtime))
        click_list.append(test_click)
    else:
        # Train user: the last click of the group becomes the label ...
        train_qtime = g.tail(1)
        train_qtime_list.append(train_qtime)

        # ... and everything before it stays in the history (this is an empty
        # frame when the group has a single click).
        train_click = g.iloc[:-1]
        click_list.append(train_click)

100%|██████████| 192600/192600 [01:13<00:00, 2614.62it/s]


In [62]:
# Rebuild df_click from the per-group history frames collected in click_list.
# NOTE(review): this overwrites the frame the split loop reads from, so
# re-running the split cell after this one operates on already-trimmed data.
df_click = pd.concat(click_list, sort=False)

In [71]:
# Stack the held-out last clicks, treat their click time as the query time,
# and arrange the columns in the same order as df_test_qtime.
df_train_qtime = pd.concat(train_qtime_list)
df_train_qtime = df_train_qtime.rename(columns={'time': 'query_time'})
df_train_qtime = df_train_qtime[['user_id', 'query_time', 'item_id', 'phase']]

# Number of distinct users that contribute at least one training label.
print('训练集用户数', df_train_qtime['user_id'].nunique())

训练集用户数 35057


In [72]:
# Preview the first training label rows.
df_train_qtime.head()

Unnamed: 0,user_id,query_time,item_id,phase
24,1,0.9839419315,69359,0
214,2,0.9838837214,58621,0
524,4,0.9838849522,90818,0
612,7,0.9839401177,23436,0
753,9,0.9838944402,114268,0


In [73]:
# The query-time table contains the test-set query rows together with the
# training label rows (df_train_qtime). The training rows come first.
df_qtime = pd.concat([df_train_qtime, df_test_qtime], sort=False)

In [76]:
# to_pickle does not create missing directories, so make sure the output
# directory exists before writing.
os.makedirs('../user_data/data', exist_ok=True)

# Query rows: test-set queries plus the held-out last click of train users.
df_qtime.to_pickle('../user_data/data/qtime.pkl')

# Click history: for users with a test query, their clicks before the query
# time; for all other users, every click before their held-out last one.
df_click.to_pickle('../user_data/data/click.pkl')

In [81]:
# Preview the first rows of the combined query-time table.
df_qtime.head()

Unnamed: 0,user_id,query_time,item_id,phase
24,1,0.9839419315,69359,0
214,2,0.9838837214,58621,0
524,4,0.9838849522,90818,0
612,7,0.9839401177,23436,0
753,9,0.9838944402,114268,0
