In [11]:
import json
import os
import random
from bisect import bisect_left
from collections import Counter

import pandas as pd

In [12]:
# Minimum interaction count passed to dataset_filtering(): users and items
# below it are removed from the training clicks.
core = 10
# Input log for the training period (read with pd.read_csv).
path_train_read = './KuaiRand/log_standard_4_08_to_4_21_1k.csv'
# Output: shuffled click-only training records with per-record history.
path_train = 'train_data.json'
# Output: click and non-click records of the training period with history.
path_all_data_pred = 'train_data_pred.json'

# Output: shuffled click-only validation records with per-record history.
path_validation = 'validation_data.json'
# Input log for the validation period (read with pd.read_csv).
path_validation_read = './KuaiRand/log_standard_4_22_to_5_08_1k.csv'

In [13]:
# train data func
def _count_users_and_items(interaction):
    """Return (user_counts, item_counts): number of records per user id and per item id."""
    user_counts = Counter()
    item_counts = Counter()
    for record in interaction:
        user_counts[record[0]] += 1
        item_counts[record[1]] += 1
    return user_counts, item_counts


def _print_dataset_stats(title, user_counts, item_counts, num_interactions):
    """Print user/item/interaction totals and the sparsity of the user-item matrix."""
    print(title)
    num_users = len(user_counts)
    num_items = len(item_counts)
    if num_users and num_items:
        sparsity = 100 - num_interactions * 100.0 / num_users / num_items
    else:
        # An empty dataset has no user-item matrix; avoid dividing by zero.
        sparsity = float('nan')
    print('  User:', num_users, 'Item:', num_items, 'Interaction:', num_interactions, 'Sparsity:', sparsity, '%')


def dataset_filtering(interaction, core):
    """Iteratively drop cold users and items (k-core style filtering).

    Records whose user or item appears in fewer than ``core`` records are
    removed. Removing records can push other users/items below the threshold,
    so the pass is repeated until every remaining user and item has at least
    ``core`` records, or nothing is left.

    Args:
        interaction: list of records whose first two fields are
            (user_id, item_id); the remaining fields are carried through.
        core: minimum number of records a user and an item must have.

    Returns:
        The surviving records as a list of tuples, in their original order.
        When nothing needs to be removed the input list itself is returned.
        An empty list is returned when the input is empty or when filtering
        removes everything.
    """
    user_counts, item_counts = _count_users_and_items(interaction)
    _print_dataset_stats('# Original training dataset', user_counts, item_counts, len(interaction))
    print('Fitering (core = ' + str(core) + ') ... ', end = 'number of remained interactions: ')

    # Stop as soon as the data is exhausted: there is no "smallest count" to
    # inspect on an empty dataset.
    while interaction:
        cold_users = {user_id for user_id, count in user_counts.items() if count < core}
        cold_items = {item_id for item_id, count in item_counts.items() if count < core}
        if not cold_users and not cold_items:
            break
        interaction = [
            tuple(record)
            for record in interaction
            if record[0] not in cold_users and record[1] not in cold_items
        ]
        # Recount on the reduced data; this pass may have created new cold ids.
        user_counts, item_counts = _count_users_and_items(interaction)
        print(len(interaction), end=' ')
    print()
    _print_dataset_stats('# Filtered training dataset', user_counts, item_counts, len(interaction))

    return interaction

In [14]:
# train data func
def index_encoding(interaction):
    """Re-number user and item ids as dense, zero-based integers.

    Distinct user ids and distinct item ids are each sorted ascending and
    numbered 0, 1, 2, ... in that order.

    Args:
        interaction: list of 9-field records
            (user_id, item_id, time_ms, click, like, follow, comment, forward, longview).

    Returns:
        (encoded, user_id2num, item_id2num): the records with user_id and
        item_id replaced by their new numbers, plus the two id -> number dicts.
    """
    distinct_users = set()
    distinct_items = set()
    for (raw_user, raw_item, _ts, _click, _like, _follow, _comment, _forward, _longview) in interaction:
        distinct_users.add(raw_user)
        distinct_items.add(raw_item)

    # Number the ids by their sorted position.
    user_id2num = {raw_user: position for position, raw_user in enumerate(sorted(distinct_users))}
    item_id2num = {raw_item: position for position, raw_item in enumerate(sorted(distinct_items))}

    encoded = [
        (user_id2num[raw_user], item_id2num[raw_item], ts, click, like, follow, comment, forward, longview)
        for (raw_user, raw_item, ts, click, like, follow, comment, forward, longview) in interaction
    ]
    return encoded, user_id2num, item_id2num

In [15]:
# train & validation & test func
def user_action_list_making(Interaction_train, user_id2num):
    """Attach to every record the user's item history that precedes the record's item.

    For each user, all (item_id, time_ms) pairs are ordered by time_ms
    (ties keep input order). A record's history is the list of item ids that
    come before the FIRST occurrence of the record's own item_id in that
    ordering.

    NOTE(review): because the cut is made at the first occurrence, a repeated
    interaction with the same item gets the same history as the first one,
    regardless of its own timestamp.

    Args:
        Interaction_train: list of 9-field records whose user_id is already a
            zero-based index (it is used to index a list).
        user_id2num: only its length is used, as the number of users.

    Returns:
        A list of 10-field tuples: the original 9 fields plus the history list.
    """
    user_num = len(user_id2num)
    user_action_list = [[] for _ in range(user_num)]
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        user_action_list[user_id].append((item_id, time_ms))

    print('user_action_list init success, waiting filter...')

    # Sort each user's history once (instead of once per record) and remember
    # where every item first appears, so each record's history is one slice.
    ordered_items = []
    first_position = []
    for history in user_action_list:
        history.sort(key=lambda x: x[1])     # time_stamp rank, small->big
        items = [p1 for (p1, _) in history]
        positions = {}
        for position, p1 in enumerate(items):
            positions.setdefault(p1, position)
        ordered_items.append(items)
        first_position.append(positions)

    Interaction_train_with_action = []
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        cut = first_position[user_id][item_id]
        user_real_action = ordered_items[user_id][:cut]
        Interaction_train_with_action.append((user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action))
    return Interaction_train_with_action

In [16]:
# train & validation & test func
# click action_list; non-click sample's action filter with 'if (time_ms < ts)'
def user_action_list_making_with_all_sample(Interaction_train, user_id2num, is_click = True):
    """Attach to every record the items the user interacted with strictly earlier.

    The per-user history is built from clicked records only when ``is_click``
    is true, and from all records otherwise. Every record, clicked or not,
    then receives the item ids of history entries whose timestamp is strictly
    smaller than the record's own ``time_ms``, ordered by time.

    Args:
        Interaction_train: list of 9-field records whose user_id is already a
            zero-based index (it is used to index a list).
        user_id2num: only its length is used, as the number of users.
        is_click: restrict the history to records with ``click == 1``.

    Returns:
        A list of 10-field tuples: the original 9 fields plus the history list.
    """
    user_num = len(user_id2num)
    user_action_list = [[] for _ in range(user_num)]
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        if not is_click or click == 1:
            user_action_list[user_id].append((item_id, time_ms))
    print("len(user_action_list) = ", len(user_action_list))
    print('user_action_list init success, waiting filter...')

    # Sort each user's history once and keep items and timestamps in parallel
    # lists, so the "strictly earlier" prefix can be found by binary search.
    ordered_items = []
    ordered_times = []
    for history in user_action_list:
        history.sort(key=lambda x: x[1])     # time_stamp rank, small->big
        ordered_items.append([p1 for (p1, _) in history])
        ordered_times.append([ts for (_, ts) in history])

    Interaction_train_with_action = []
    for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview) in Interaction_train:
        # bisect_left returns how many timestamps are < time_ms.
        cut = bisect_left(ordered_times[user_id], time_ms)
        user_real_action = ordered_items[user_id][:cut]
        Interaction_train_with_action.append((user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action))
    return Interaction_train_with_action

In [17]:
# validation & test func
def filter_test_or_validation_data(data, user_id2num, item_id2num):
    """Keep only records whose user and item are both known, and re-encode their ids.

    Args:
        data: list of 9-field records carrying raw user and item ids.
        user_id2num: raw user id -> encoded number.
        item_id2num: raw item id -> encoded number.

    Returns:
        A new list of 9-field tuples with encoded ids; records with an
        unknown user or item are dropped.
    """
    return [
        (user_id2num[raw_user], item_id2num[raw_item], ts, click, like, follow, comment, forward, longview)
        for (raw_user, raw_item, ts, click, like, follow, comment, forward, longview) in data
        if raw_user in user_id2num and raw_item in item_id2num
    ]

# write json
def write_data(path, data):
    """Serialize ``data`` as JSON and write it to ``path`` as a single line.

    Args:
        path: destination file; an existing file is overwritten.
        data: any JSON-serializable object.

    Raises:
        TypeError: if ``data`` is not JSON-serializable. The destination file
            is left untouched in that case.
    """
    # Serialize before opening, so a failure cannot leave a truncated file.
    jsObj = json.dumps(data)
    # The context manager closes the handle even if the write fails.
    with open(path, 'w') as f:
        f.write(jsObj)

## Directory
##### 1 generate &nbsp;&nbsp; 'train_data.json' &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; for mmoe train (date 4.08-4.21)
##### 2 generate &nbsp;&nbsp; 'train_data_pred.json' &nbsp;&nbsp; for mmoe pred  (date 4.08-4.21)
##### 3 generate &nbsp;&nbsp; 'validation_data.json' &nbsp;&nbsp;&nbsp; for mmoe validation  (date 4.22-5.08)

----------------------------------------------------------------------------------------------------------------

## 1 generate &nbsp;&nbsp; 'train_data.json' &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; for mmoe train (date 4.08-4.21)

In [18]:
# 1 load data
# Read only the columns listed in `cols` from the training-period log.
data_train = []  # NOTE(review): not referenced again in the visible cells
# origin = pd.read_csv(path_train_read)
cols = ['user_id', 'video_id', 'tab', 'time_ms',
        'is_click', 'is_like', 'is_follow', 'is_comment', 'is_forward', 'long_view']
raw_data_train = pd.read_csv(path_train_read, usecols=cols)
# Display the last rows as a quick look at the loaded frame.
raw_data_train.tail()

Unnamed: 0,user_id,video_id,time_ms,is_click,is_like,is_follow,is_comment,is_forward,long_view,tab
5055979,999,694615,1650552207405,0,0,0,0,0,0,1
5055980,999,1240364,1650552207405,0,0,0,0,0,0,1
5055981,999,2514654,1650552207405,1,0,0,0,0,1,1
5055982,999,4214495,1650552339920,0,0,0,0,0,0,1
5055983,999,2897178,1650552339920,1,0,0,0,0,1,1


In [19]:
# 2 click sample
# Keep only the rows that were clicked (is_click == 1).
clicked_mask = raw_data_train['is_click'] == 1
raw_data_train_with_click = raw_data_train[clicked_mask]
raw_data_train_with_click.head(10)

Unnamed: 0,user_id,video_id,time_ms,is_click,is_like,is_follow,is_comment,is_forward,long_view,tab
4,0,2528540,1649467982289,1,0,0,0,0,0,0
23,0,4067506,1649477390308,1,0,0,0,0,1,1
37,0,3556496,1649673423739,1,0,0,0,0,1,0
126,0,909,1649673704264,1,0,0,0,0,1,1
131,0,2407158,1649673704264,1,0,0,0,0,0,1
158,0,434186,1649675512388,1,0,0,0,0,1,1
213,0,420001,1649676359584,1,0,0,0,0,0,0
254,0,3998230,1649676395781,1,0,0,0,0,0,0
292,0,3322810,1649793902235,1,0,0,0,0,0,1
296,0,182968,1649793923341,1,0,0,0,0,1,1


In [20]:
# 3 add list
# Convert the clicked rows into plain tuples:
# (user_id, video_id, time_ms, is_click, is_like, is_follow, is_comment, is_forward, long_view)
num_click_rows = len(raw_data_train_with_click)
print("row=", num_click_rows)

# itertuples() iterates the selected columns directly: it avoids building one
# Series per row (as iterrows() does) and keeps each column's own dtype.
train_data_click = [
    (user_id, video_id, int(time_ms), int(is_click), int(is_like), int(is_follow), int(is_comment), int(is_forward), int(long_view))
    for user_id, video_id, time_ms, is_click, is_like, is_follow, is_comment, is_forward, long_view
    in raw_data_train_with_click[
        ['user_id', 'video_id', 'time_ms', 'is_click', 'is_like', 'is_follow', 'is_comment', 'is_forward', 'long_view']
    ].itertuples(index=False, name=None)
]

row= 1917934


In [21]:
# 4 train data filter: frequency
# Repeatedly drop users and items with fewer than `core` click records.
train_data_click = dataset_filtering(train_data_click, core)

# Original training dataset
  User: 983 Item: 877897 Interaction: 1917934 Sparsity: 99.77775265814853 %
Fitering (core = 10) ... number of remained interactions: 474721 474623 474515 
# Filtered training dataset
  User: 960 Item: 25701 Interaction: 474515 Sparsity: 98.07678122122356 %


In [22]:
# 5 train data index_encoding
# Replace raw user/item ids with dense zero-based numbers; the two mapping
# dicts are reused below to encode the prediction and validation data.
train_data_click, user_id2num, item_id2num = index_encoding(train_data_click)
print(train_data_click[0:10])

[(0, 52, 1649673704264, 1, 0, 0, 0, 0, 1), (0, 14331, 1649673704264, 1, 0, 0, 0, 0, 0), (0, 12833, 1649936855141, 1, 0, 0, 0, 0, 1), (0, 17144, 1649936894064, 1, 0, 0, 0, 0, 0), (0, 845, 1649936975456, 1, 0, 0, 0, 0, 1), (0, 7479, 1649937088780, 1, 0, 0, 0, 0, 1), (0, 17137, 1649937088780, 1, 0, 0, 0, 0, 1), (0, 17041, 1650330704859, 1, 0, 0, 0, 0, 1), (0, 15726, 1650338460613, 1, 0, 0, 0, 0, 1), (0, 12890, 1650338617363, 1, 0, 0, 0, 0, 1)]


In [23]:
# 6 make action_list
# Append to every click record the user's time-ordered item history that
# precedes the record's item (10th field of each output tuple).
train_data_click_with_action = user_action_list_making(train_data_click, user_id2num)
print(train_data_click_with_action[0:10])

user_action_list init success, waiting filter...
[(0, 52, 1649673704264, 1, 0, 0, 0, 0, 1, []), (0, 14331, 1649673704264, 1, 0, 0, 0, 0, 0, [52]), (0, 12833, 1649936855141, 1, 0, 0, 0, 0, 1, [52, 14331]), (0, 17144, 1649936894064, 1, 0, 0, 0, 0, 0, [52, 14331, 12833]), (0, 845, 1649936975456, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144]), (0, 7479, 1649937088780, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845]), (0, 17137, 1649937088780, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479]), (0, 17041, 1650330704859, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137]), (0, 15726, 1650338460613, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041]), (0, 12890, 1650338617363, 1, 0, 0, 0, 0, 1, [52, 14331, 12833, 17144, 845, 7479, 17137, 17041, 15726])]


In [24]:
# 7 final train data && shuffle
# Shuffle a copy with a fixed seed: the order is the same after every
# "Restart & Run All", and re-running this cell does not reshuffle data that
# was already shuffled.
SHUFFLE_SEED_TRAIN = 2022
final_data_train = list(train_data_click_with_action)
random.Random(SHUFFLE_SEED_TRAIN).shuffle(final_data_train)

# time_ms is kept in every record (third field).
print(final_data_train[0:3])

[(410, 25133, 1650412901031, 1, 0, 0, 0, 0, 1, [23227, 15020, 14573, 4921, 3006, 20282, 6840, 17535, 3154, 22139, 18712, 18919, 23183, 21592, 11571, 9107, 14452, 2625, 12920, 9931, 21960, 13289, 15309, 17067, 10024, 864, 17768, 5359, 18847, 22413, 15614, 4566, 21399, 1897, 21705, 15413, 22287, 4132, 14712, 13061, 1918, 23511, 888, 24286, 11534, 16814, 18270, 2764, 19190, 14255, 24962, 15505, 9461, 20096, 8186, 18890, 7694, 8005, 2183, 24888, 12754, 544, 9595, 16879, 13783, 22108, 10187, 2739, 6228, 24758, 14930, 21517, 22724, 17508, 1579, 22295, 23299, 1163, 4728, 20248, 11420, 15707, 20648, 3397, 19110, 13158, 951, 14745, 11468, 15120, 16255, 17275, 1843, 11337, 4515, 8545, 21854, 9881, 20836, 9075, 3696, 18510, 15761, 8277, 4781, 14104, 10576, 17523, 11846, 15195, 13755, 24371, 13095, 15844, 8355, 19129, 368, 20784, 6508, 18399, 13026, 16872, 12863, 8495, 11330, 22966, 20226, 24753, 21841, 23411, 115, 12102, 15001, 10440, 6457, 12333, 1989, 17159, 24969, 10195, 202, 3995, 25491, 3496

In [25]:
# Number of final training records (the recorded run matches the filtered click count).
print("fianl tain data, row", len(final_data_train))

fianl tain data, row 474515


In [26]:
# 8 write train data
# Dump the shuffled training records to path_train as a single JSON document.
write_data(path_train, final_data_train)

In [27]:
# 9 other: read the written file back and confirm the number of records
with open(path_train) as f:
    # write_data() emits the whole document on one line.
    line = f.readline()
data = json.loads(line)
user_num = len(data)  # despite the name, this is the record count
print("confirm data, num=",user_num)
data[:2]

confirm data, num= 474515


[[410,
  25133,
  1650412901031,
  1,
  0,
  0,
  0,
  0,
  1,
  [23227,
   15020,
   14573,
   4921,
   3006,
   20282,
   6840,
   17535,
   3154,
   22139,
   18712,
   18919,
   23183,
   21592,
   11571,
   9107,
   14452,
   2625,
   12920,
   9931,
   21960,
   13289,
   15309,
   17067,
   10024,
   864,
   17768,
   5359,
   18847,
   22413,
   15614,
   4566,
   21399,
   1897,
   21705,
   15413,
   22287,
   4132,
   14712,
   13061,
   1918,
   23511,
   888,
   24286,
   11534,
   16814,
   18270,
   2764,
   19190,
   14255,
   24962,
   15505,
   9461,
   20096,
   8186,
   18890,
   7694,
   8005,
   2183,
   24888,
   12754,
   544,
   9595,
   16879,
   13783,
   22108,
   10187,
   2739,
   6228,
   24758,
   14930,
   21517,
   22724,
   17508,
   1579,
   22295,
   23299,
   1163,
   4728,
   20248,
   11420,
   15707,
   20648,
   3397,
   19110,
   13158,
   951,
   14745,
   11468,
   15120,
   16255,
   17275,
   1843,
   11337,
   4515,
   8545,
   21854,
   

In [28]:
# 10 other: distribution
# count user action_list length distribution
# history length -> number of records with that length; the history is the
# last field of every record. Counter replaces the try/bare-except counting.
dic_action_list_length = Counter(len(user_real_action) for (*_, user_real_action) in data)

# Same counts, keyed in ascending length order (used for plotting).
sorted_keys = sorted(dic_action_list_length)
sorted_dict = {key: dic_action_list_length[key] for key in sorted_keys}

In [29]:
# Split the length -> count mapping into parallel x / y lists for plotting.
x1 = list(sorted_dict.keys())
y1 = list(sorted_dict.values())
len(x1)
len(y1)

2703

In [30]:
import plotly.offline as py                    # offline charting; same role as plotly.plotly, plus offline support
py.init_notebook_mode(connected=True)          # extra initialization required before plotting offline
import plotly.graph_objects as go              # plotly's low-level figure objects

# x: history length, y: number of training records with that length.
trace = go.Scatter(
     x = x1,
     y = y1,
     showlegend=True
)
# Use a dedicated name: `data` holds the training records loaded earlier and
# must not be overwritten, otherwise re-running the distribution cell fails.
train_length_traces = [trace]

py.iplot(train_length_traces)

In [31]:
# Inclusive upper bound and label of each history-length bucket, ascending.
TRAIN_LENGTH_BUCKETS = [
    (100, '0-100'),
    (200, '100-200'),
    (300, '200-300'),
    (400, '300-400'),
    (500, '400-500'),
    (1000, '500-1000'),
    (2000, '1000-2000'),
    (2500, '2000-2500'),
    (3000, '2500-3000'),
]
# Lengths above the last bound are reported here instead of being dropped silently.
TRAIN_OVERFLOW_LABEL = '3000+'

# bucket label -> number of records whose history length falls in that bucket
dic_table = {}
for key, value in dic_action_list_length.items():
    # The first bucket whose upper bound covers the length wins.
    label = next((name for upper, name in TRAIN_LENGTH_BUCKETS if key <= upper), TRAIN_OVERFLOW_LABEL)
    dic_table[label] = dic_table.get(label, 0) + value

for key, value in dic_table.items():
    print("ation_list length:", key, "log num:", value)

ation_list length: 1000-2000 log num: 45432
ation_list length: 300-400 log num: 50755
ation_list length: 0-100 log num: 90524
ation_list length: 400-500 log num: 40718
ation_list length: 200-300 log num: 61435
ation_list length: 100-200 log num: 75354
ation_list length: 500-1000 log num: 107543
ation_list length: 2000-2500 log num: 2420
ation_list length: 2500-3000 log num: 334


## 2 generate &nbsp;&nbsp; 'train_data_pred.json' &nbsp;&nbsp; for mmoe pred  (date 4.08-4.21)

In [32]:
# 1 click 1 & 0
# 2 get data
# Use every row of the training log, clicked or not. `all_data` is another
# name for the same DataFrame object, not a copy.
all_data = raw_data_train
print("len(all_data) = ", len(all_data))
all_data.head()

len(all_data) =  5055984


Unnamed: 0,user_id,video_id,time_ms,is_click,is_like,is_follow,is_comment,is_forward,long_view,tab
0,0,4354972,1649467982289,0,0,0,0,0,0,0
1,0,1329429,1649467982289,0,0,0,0,0,0,0
2,0,346081,1649467982289,0,0,0,0,0,0,0
3,0,2058916,1649467982289,0,0,0,0,0,0,0
4,0,2528540,1649467982289,1,0,0,0,0,0,0


In [33]:
# 3 add list
# && filter data with user_id2num & item_id2num
# && replace with user_id2num[row['user_id']]
num_all_rows = len(all_data)
print("row =", num_all_rows)

print("prepare generating train_data_all....")
# Raw ids that survived the click-based filtering and encoding above.
set_user_id = set(user_id2num)
set_item_id = set(item_id2num)

print("len(set_user_id) = ", len(set_user_id), ", len(set_item_id) = ", len(set_item_id))

# Keep rows whose user and item are both known, with ids replaced by their
# encoded numbers. itertuples() avoids building one Series per row (as
# iterrows() does), which matters for a log with millions of rows.
train_data_all = [
    (user_id2num[user_id], item_id2num[video_id], int(time_ms), int(is_click), int(is_like), int(is_follow), int(is_comment), int(is_forward), int(long_view))
    for user_id, video_id, time_ms, is_click, is_like, is_follow, is_comment, is_forward, long_view
    in all_data[
        ['user_id', 'video_id', 'time_ms', 'is_click', 'is_like', 'is_follow', 'is_comment', 'is_forward', 'long_view']
    ].itertuples(index=False, name=None)
    if user_id in set_user_id and video_id in set_item_id
]
print ("len(train_data_all) = ", len(train_data_all))
print ("train_data_all[0:5] = ", train_data_all[0:5])

row = 5055984
prepare generating train_data_all....
len(set_user_id) =  960 , len(set_item_id) =  25701
len(train_data_all) =  843819
train_data_all[0:5] =  [(0, 25588, 1649467982289, 0, 0, 0, 0, 0, 0), (0, 7695, 1649477382190, 0, 0, 0, 0, 0, 0), (0, 17787, 1649673604040, 0, 0, 0, 0, 0, 0), (0, 21144, 1649673604040, 0, 0, 0, 0, 0, 0), (0, 465, 1649673604040, 0, 0, 0, 0, 0, 0)]


In [34]:
# 4 5
# 6 make action_list
# With is_click = True the history is built from clicked rows only, but every
# row (clicked or not) receives the items clicked strictly before its time_ms.
print("len(user_id2num) = ", len(user_id2num))
train_data_all_with_action = user_action_list_making_with_all_sample(train_data_all, user_id2num, is_click = True)
print(train_data_all_with_action[0:10])

len(user_id2num) =  960
len(user_action_list) =  960
user_action_list init success, waiting filter...
[(0, 25588, 1649467982289, 0, 0, 0, 0, 0, 0, []), (0, 7695, 1649477382190, 0, 0, 0, 0, 0, 0, []), (0, 17787, 1649673604040, 0, 0, 0, 0, 0, 0, []), (0, 21144, 1649673604040, 0, 0, 0, 0, 0, 0, []), (0, 465, 1649673604040, 0, 0, 0, 0, 0, 0, []), (0, 465, 1649673611472, 0, 0, 0, 0, 0, 0, []), (0, 11950, 1649673695911, 0, 0, 0, 0, 0, 0, []), (0, 20279, 1649673695911, 0, 0, 0, 0, 0, 0, []), (0, 52, 1649673704264, 1, 0, 0, 0, 0, 1, []), (0, 22040, 1649673704264, 0, 0, 0, 0, 0, 0, [])]


In [35]:
# 7 final prediction data (not shuffled)
final_all_data_pred = []  # placeholder; rebound below

# # del time_ms
# for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action) in train_data_all_with_action:
#     final_all_data_pred.append((user_id, item_id, click, like, follow, comment, forward, longview, user_real_action))

# keep time_ms
# Same list object as train_data_all_with_action (no copy is made).
final_all_data_pred = train_data_all_with_action
print(final_all_data_pred[0:5])

[(0, 25588, 1649467982289, 0, 0, 0, 0, 0, 0, []), (0, 7695, 1649477382190, 0, 0, 0, 0, 0, 0, []), (0, 17787, 1649673604040, 0, 0, 0, 0, 0, 0, []), (0, 21144, 1649673604040, 0, 0, 0, 0, 0, 0, []), (0, 465, 1649673604040, 0, 0, 0, 0, 0, 0, [])]


In [36]:
# Number of records that will be written to the prediction file.
print("final_all_data_pred, row", len(final_all_data_pred))

final_all_data_pred, row 843819


In [37]:
# 8 write prediction data
# Dump the click and non-click records to path_all_data_pred as one JSON document.
write_data(path_all_data_pred, final_all_data_pred)

In [38]:
# 9 other: read the written file back and confirm the number of records
with open(path_all_data_pred) as f:
    # write_data() emits the whole document on one line.
    line = f.readline()
data_pred = json.loads(line)
user_num = len(data_pred)  # despite the name, this is the record count
print("confirm data, num=",user_num)
data_pred[:3]

confirm data, num= 843819


[[0, 25588, 1649467982289, 0, 0, 0, 0, 0, 0, []],
 [0, 7695, 1649477382190, 0, 0, 0, 0, 0, 0, []],
 [0, 17787, 1649673604040, 0, 0, 0, 0, 0, 0, []]]

## 3 generate &nbsp;&nbsp; 'validation_data.json' &nbsp;&nbsp;&nbsp; for mmoe validation  (date 4.22-5.08)

In [39]:
# 1 load data
test_train = []  # NOTE(review): not referenced again in the visible cells
# Same columns as the training load, plus 'date', which is used below to
# split the rows into validation and test.
cols_test = ['user_id', 'video_id', 'tab', 'time_ms',
        'is_click', 'is_like', 'is_follow', 'is_comment', 'is_forward', 'long_view', 'date']
raw_data_test = pd.read_csv(path_validation_read, usecols=cols_test)

In [40]:
# 2 click sample
# Keep only the rows that were clicked (is_click == 1).
clicked_mask_test = raw_data_test['is_click'] == 1
raw_data_test_with_click = raw_data_test[clicked_mask_test]
raw_data_test_with_click

Unnamed: 0,user_id,video_id,date,time_ms,is_click,is_like,is_follow,is_comment,is_forward,long_view,tab
9,0,1033272,20220422,1650585311434,1,0,0,0,0,1,1
19,0,2187381,20220422,1650585713727,1,0,0,0,0,1,1
24,0,1870951,20220422,1650585713727,1,0,0,0,0,1,1
25,0,1818365,20220422,1650585713727,1,0,0,0,0,0,1
28,0,2185068,20220422,1650585762241,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
6657053,999,1248551,20220508,1652020974013,1,0,0,0,0,0,1
6657055,999,2879149,20220508,1652021025602,1,0,0,0,0,0,1
6657057,999,2922688,20220508,1652021025602,1,0,0,0,0,1,1
6657059,999,4164425,20220508,1652021025602,1,0,0,0,0,1,1


In [41]:
# 3 add list: (user_id, item_id, time_ms, click, like, follow, comment, forward, longview)
# Containers filled by the next cell, which splits the clicked rows by date.
validation_data_click = []
test_data_click = []

In [42]:
# Split the clicked rows by date: rows dated before 20220510 go to validation,
# the rest to test. In the recorded run every row is below the cutoff, so
# test_data_click stays empty.
# Both lists are rebuilt here so that re-running this cell does not append the
# same rows a second time.
validation_data_click = []
test_data_click = []

# itertuples() avoids building one Series per row (as iterrows() does).
for user_id, video_id, date, time_ms, is_click, is_like, is_follow, is_comment, is_forward, long_view in raw_data_test_with_click[
    ['user_id', 'video_id', 'date', 'time_ms', 'is_click', 'is_like', 'is_follow', 'is_comment', 'is_forward', 'long_view']
].itertuples(index=False, name=None):
    record = (user_id, video_id, int(time_ms), int(is_click), int(is_like), int(is_follow), int(is_comment), int(is_forward), int(long_view))
    if date < 20220510:  # dataset=2022.04.22~2022.05.08
        validation_data_click.append(record)
    else:
        test_data_click.append(record)
print("origin, len(validation_data_click)=", len(validation_data_click), ", len(test_data_click)=", len(test_data_click))

origin, len(validation_data_click)= 2511906 , len(test_data_click)= 0


In [43]:
# 4 filter test && validation data: use user_id2num && item_id2num
# Drop validation clicks whose user or item is missing from the training
# mappings, and replace the raw ids with their encoded numbers.
validation_data_click = filter_test_or_validation_data(validation_data_click, user_id2num, item_id2num)
# test_data_click = filter_test_or_validation_data(test_data_click, user_id2num, item_id2num)

print("after filter, len(validation_data_click)=", len(validation_data_click), ", len(test_data_click)=", len(test_data_click))

after filter, len(validation_data_click)= 102588 , len(test_data_click)= 0


In [44]:
# 5
# 6 make action_list
# The history of each record is built from the validation clicks passed in
# here only; training-period clicks are not part of it.
validation_data_click_with_action = user_action_list_making(validation_data_click, user_id2num)
# test_data_click_with_action = user_action_list_making(test_data_click, user_id2num)

print(validation_data_click_with_action[0:10])
# print(test_data_click_with_action[0:10])

user_action_list init success, waiting filter...
[(0, 12994, 1650585713727, 1, 0, 0, 0, 0, 1, []), (0, 10907, 1650585713727, 1, 0, 0, 0, 0, 0, [12994]), (0, 10907, 1650586368362, 1, 0, 0, 0, 0, 1, [12994]), (0, 20944, 1650586905098, 1, 0, 0, 0, 0, 1, [12994, 10907, 10907]), (0, 10653, 1650587557154, 1, 0, 0, 0, 0, 1, [12994, 10907, 10907, 20944]), (0, 3494, 1650587844085, 1, 0, 0, 0, 0, 1, [12994, 10907, 10907, 20944, 10653]), (0, 6676, 1650588581753, 1, 0, 0, 0, 0, 1, [12994, 10907, 10907, 20944, 10653, 3494]), (0, 10292, 1650876188430, 1, 0, 0, 0, 0, 1, [12994, 10907, 10907, 20944, 10653, 3494, 6676]), (0, 3593, 1650973491300, 1, 0, 0, 0, 0, 1, [12994, 10907, 10907, 20944, 10653, 3494, 6676, 10292]), (0, 24281, 1650973661453, 1, 0, 0, 0, 0, 1, [12994, 10907, 10907, 20944, 10653, 3494, 6676, 10292, 3593])]


In [45]:
# 7 final validation data: shuffle
# Shuffle a copy with a fixed seed: the order is the same after every
# "Restart & Run All", and re-running this cell does not reshuffle data that
# was already shuffled. No test split is produced in this notebook.
SHUFFLE_SEED_VALIDATION = 2022
final_data_validation = list(validation_data_click_with_action)
random.Random(SHUFFLE_SEED_VALIDATION).shuffle(final_data_validation)

# time_ms is kept in every record (third field).
print(final_data_validation[0:5])

[(508, 1918, 1651051177101, 1, 0, 0, 0, 0, 1, [18290, 18826, 7359, 1048, 22445, 18609, 16994, 17587, 518, 11297, 17400, 17348, 16328, 9855, 8222, 9300, 19226, 20839, 4828, 21705, 22784, 8163, 23461, 18105, 17211, 16644, 6701, 12452, 16051, 23394, 17820, 20682, 5242, 1851, 13463, 25096, 14119, 21782, 15094, 14607, 8687, 10558, 1170, 12571, 4700, 21068, 9606, 6871, 24750, 11463, 15063, 18160, 2910, 1579, 15550, 17185, 15641, 18435, 22785, 24949, 7843, 15167, 4772, 22912, 6047, 12367, 5052, 6231, 9100, 18227, 12395, 20512, 6429, 10788, 24265, 3884, 12532, 11701, 5910, 9771, 8784, 14174, 7621]), (156, 4777, 1651984118795, 1, 0, 0, 0, 0, 1, [2023, 19561, 23145, 8308, 21146, 2418, 18105, 11130, 24090, 20652, 17731, 20354, 14207, 18634, 5170, 68, 14923, 14483, 15559, 23176, 24380, 21492, 17820, 5229, 10130, 20515, 24984, 862, 13764, 20677, 3495, 16718, 361, 7859, 14941, 1462, 14889, 17870, 21202, 21392, 690, 11550, 22350, 871, 11188, 1626, 2972, 19522, 21387, 13153, 6395, 6478, 271, 20978, 10

In [46]:
# 8 write validation data (no test file is written)
write_data(path_validation, final_data_validation)
# write_data(path_test, final_data_test)
print("write success")

write success


In [47]:
# 9 other: read the written file back and confirm the number of records
with open(path_validation) as f:
    # write_data() emits the whole document on one line.
    line = f.readline()
data_val = json.loads(line)
num_validation = len(data_val)
print("confirm data_val, num=",num_validation)

confirm data_val, num= 102588


In [49]:
# count validation: user action_list length distribution
# history length -> number of validation records with that length; the
# history is the last field of every record. Counter replaces the
# try/bare-except counting.
dic_val_action_list_length = Counter(len(user_real_action) for (*_, user_real_action) in data_val)

# Same counts, keyed in ascending length order (used for plotting and bucketing).
sorted_keys = sorted(dic_val_action_list_length)
sorted_dict = {key: dic_val_action_list_length[key] for key in sorted_keys}

In [50]:
# Split the length -> count mapping into parallel x / y lists for plotting.
x1 = list(sorted_dict.keys())
y1 = list(sorted_dict.values())
len(x1)
len(y1)

700

In [51]:
import plotly.offline as py                    # offline charting; same role as plotly.plotly, plus offline support
py.init_notebook_mode(connected=True)          # extra initialization required before plotting offline
import plotly.graph_objects as go              # plotly's low-level figure objects

# x: history length, y: number of validation records with that length.
trace = go.Scatter(
     x = x1,
     y = y1,
     showlegend=True
)
# Use a dedicated name so the `data` variable holding the loaded training
# records is not overwritten by the plot traces.
validation_length_traces = [trace]

py.iplot(validation_length_traces)

In [52]:
# Inclusive upper bound and label of each history-length bucket, ascending.
VALIDATION_LENGTH_BUCKETS = [
    (100, '0-100'),
    (200, '100-200'),
    (300, '200-300'),
    (400, '300-400'),
    (500, '400-500'),
    (1000, '500-1000'),
    (2000, '1000-2000'),
    (2500, '2000-2500'),
    (10000, '2500-10000'),
]
# Lengths above the last bound are reported here instead of being dropped silently.
VALIDATION_OVERFLOW_LABEL = '10000+'

# bucket label -> number of records whose history length falls in that bucket
dic_table = {}
for key, value in sorted_dict.items():
    # The first bucket whose upper bound covers the length wins.
    label = next((name for upper, name in VALIDATION_LENGTH_BUCKETS if key <= upper), VALIDATION_OVERFLOW_LABEL)
    dic_table[label] = dic_table.get(label, 0) + value

for key, value in dic_table.items():
    print("ation_list length:", key, "log num:", value)

ation_list length: 0-100 log num: 67183
ation_list length: 100-200 log num: 24301
ation_list length: 200-300 log num: 7487
ation_list length: 300-400 log num: 2243
ation_list length: 400-500 log num: 891
ation_list length: 500-1000 log num: 483
