In [30]:
import pandas as pd

import random
import json

import os

In [54]:
# NOTE  tenrec_dataset.json (has time seq)
core = 10
path_train_read = 'tenrec_dataset.json'

# write
path_train = 'tenrec_train_data.json'
path_validation = 'tenrec_validation_data.json'
path_all_data_pred = 'tenrec_pred_data.json'

In [52]:
def read_data_with_pd(path):
    data = []
    with open(path) as f:
        line = f.readline()
        data = json.loads(line)
        f.close()
        log_num = len(data)
    print("confirm data, log_num=",log_num)
    
    analysis_data=pd.DataFrame(data, columns=['user_id', 'item_id', 'click', 'like', 'follow', 'forward'])
    print(analysis_data.head())
    
    return analysis_data

# write json
def write_data(path, data):
    f = open(path, 'w')
    jsObj = json.dumps(data)
    f.write(jsObj)
    f.close()

In [33]:
# train data func
def dataset_filtering(interaction, core):
    # filter the cold users and items within 10 interactions
    user_id_dic = {}  # record the number of interaction for each user and item
    item_id_dic = {}
    for(user_id, item_id, click, like, follow, forward) in interaction:
        try: user_id_dic[user_id] += 1
        except: user_id_dic[user_id] = 1
        try: item_id_dic[item_id] += 1
        except: item_id_dic[item_id] = 1
    print('# Original training dataset')
    print('  User:', len(user_id_dic), 'Item:', len(item_id_dic), 'Interaction:', len(interaction), 'Sparsity:', 100 - len(interaction) * 100.0 / len(user_id_dic) / len(item_id_dic), '%')
    sort_user = []
    sort_item = []
    for user_id in user_id_dic:
        sort_user.append((user_id, user_id_dic[user_id]))
    for item_id in item_id_dic:
        sort_item.append((item_id, item_id_dic[item_id]))
    
    sort_user.sort(key=lambda x: x[1])
    sort_item.sort(key=lambda x: x[1])
    print('Fitering (core = ' + str(core) + ') ... ', end = 'number of remained interactions: ')
    
    while sort_user[0][1] < core or sort_item[0][1] < core:
        # find out all users and items with less than core recorders
        user_LessThanCore = set()
        item_LessThanCore = set()
        for pair in sort_user:
            if pair[1] < core: user_LessThanCore.add(pair[0])
            else: break
        for pair in sort_item:
            if pair[1] < core: item_LessThanCore.add(pair[0])
            else: break
        # reconstruct the interaction record, remove the cool one
        interaction_filtered = []
        for (user_id, item_id, click, like, follow, forward) in interaction:
            if not (user_id in user_LessThanCore or item_id in item_LessThanCore):
                interaction_filtered.append((user_id, item_id, click, like, follow, forward))
        # update the record
        interaction = interaction_filtered
        # count the number of each user and item in new data, check if all cool users and items are removed
        # reset all memory variables
        user_id_dic = {}  # record the number of interaction for each user and item
        item_id_dic = {}
        for (user_id, item_id, click, like, follow, forward) in interaction:
            try: user_id_dic[user_id] += 1
            except: user_id_dic[user_id] = 1
            try: item_id_dic[item_id] += 1
            except: item_id_dic[item_id] = 1

        sort_user = []
        sort_item = []
        for user_id in user_id_dic:
            sort_user.append((user_id, user_id_dic[user_id]))
        for item_id in item_id_dic:
            sort_item.append((item_id, item_id_dic[item_id]))
        sort_user.sort(key=lambda x: x[1])
        sort_item.sort(key=lambda x: x[1])
        print (len(interaction), end = ' ')
    print()
    print ('# Filtered training dataset')
    print ('  User:', len(user_id_dic), 'Item:', len(item_id_dic), 'Interaction:', len(interaction), 'Sparsity:', 100 - len(interaction) * 100.0 / len(user_id_dic) / len(item_id_dic), '%')
    
    return interaction

In [42]:
# train data func
def index_encoding(interaction):
    # mapping id into number
    # after filtering the dataset, we need to re-encode the index of users and items
    user_id_set = set()
    item_id_set = set()

    for (user_id, item_id, click, like, follow, forward) in interaction:
        user_id_set.add(user_id)
        item_id_set.add(item_id)
    user_num2id = list(user_id_set)
    item_num2id = list(item_id_set)
    user_num2id.sort()
    item_num2id.sort()
    # user_id2num maps id to number, and user_num2id dictionary is not needed, user_ID
    user_id2num = {}
    for num in range(0, len(user_id_set)):
        user_id2num[user_num2id[num]] = num
    item_id2num = {}
    for num in range(0, len(item_id_set)):
        item_id2num[item_num2id[num]] = num
    interaction_number = []
    for (user_id, item_id, click, like, follow, forward) in interaction:
        interaction_number.append((user_id2num[user_id], item_id2num[item_id], click, like, follow, forward))
    interaction = interaction_number
    return interaction, user_id2num, item_id2num

In [47]:
# train & validation & test func
def user_action_list_making(Interaction_train, user_id2num):
    user_num = len(user_id2num)
    user_action_list = []
    for i in range(user_num):
        user_action_list.append([]);
    index = 0
    for (user_id, item_id, click, like, follow, forward) in Interaction_train:
        user_action_list[user_id].append((item_id, index));
        index = index + 1
        
    print('user_action_list init success, waiting filter...')
    
    Interaction_train_with_action = []
    for (user_id, item_id, click, like, follow, forward) in Interaction_train:
        history = user_action_list[user_id] # [(item_id, index_time_ms), ]
        history.sort(key=lambda x: x[1])     # index_time_ms rank, small->big
        
        user_real_action = []
        for (p1, _) in history :
            if (p1 == item_id):
                break
            else:
                user_real_action.append(p1)
        
        Interaction_train_with_action.append((user_id, item_id, click, like, follow, forward, user_real_action))
    return Interaction_train_with_action

In [71]:
# pred func
# click action_list; non-click sample's action filter with 'if (time_ms < ts)'
def user_action_list_making_with_all_sample(Interaction_train, user_id2num, is_click = True):
    # Note: Interaction_train with time seq
    user_num = len(user_id2num)
    user_action_list = []
    for i in range(user_num):
        user_action_list.append([]);
    index = 0
    Interaction_train_with_index = []
    for (user_id, item_id, click, like, follow, forward) in Interaction_train:
        if click:
            if click == 1:
                user_action_list[user_id].append((item_id, index))
        else:
            user_action_list[user_id].append((item_id, index))
        index = index + 1
        Interaction_train_with_index.append((user_id, item_id, click, like, follow, forward, index))

    print("len(user_action_list) = ", len(user_action_list))
    # print("user_action_list[:3] = ", user_action_list[:3])
    print('user_action_list init success, waiting filter...')
    
    Interaction_train_with_action = []
    step = 0
    for (user_id, item_id, click, like, follow, forward, index_time) in Interaction_train_with_index:
        history = user_action_list[user_id] # [(item_id, index_time_ms), ]
        history.sort(key=lambda x: x[1])     # index_time_stamp rank, small->big
        
        user_real_action = []
        for (p1, ts) in history :
            if (ts < index_time):
                user_real_action.append(p1)
            else:
                break
        Interaction_train_with_action.append((user_id, item_id, click, like, follow, forward, user_real_action))
        step = step + 1
        if step % 1000000 == 0:
            print("step=", step)
    return Interaction_train_with_action

## Directory
##### 1 generate &nbsp;&nbsp; 'tenrec_train_data.json' & 'tenrec_validation_data.json' &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;for mmoe train
##### 2 generate &nbsp;&nbsp; 'tenrec_pred_data.json' &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;for mmoe pred  
----------------------------------------------------------------------------------------------------------------

## 1 generate &nbsp;&nbsp; 'tenrec_train_data.json' & 'tenrec_validation_data.json' &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;for mmoe train

In [34]:
# 1 load data
data_train = []
origin_data = read_data_with_pd(path_train_read)

raw_data_train = origin_data

confirm data, log_num= 12031351
   user_id  item_id  click  like  follow  forward
0        1        4      0     0       0        0
1        1     1201      1     0       0        0
2        1   250502      1     0       0        0
3        1    50885      1     0       0        0
4        1    16934      1     0       0        0


In [36]:
# 2 click sample
raw_data_train_with_click = raw_data_train[(raw_data_train.click ==1)]
raw_data_train_with_click.head()

Unnamed: 0,user_id,item_id,click,like,follow,forward
1,1,1201,1,0,0,0
2,1,250502,1,0,0,0
3,1,50885,1,0,0,0
4,1,16934,1,0,0,0
5,1,311,1,0,0,0


In [39]:
# 3 add list
train_data_click = []
row = len(raw_data_train_with_click['user_id'])
print("row=", row)
          
for index, row in raw_data_train_with_click.iterrows():
    train_data_click.append((row['user_id'], row['item_id'], int(row['click']), int(row['like']), int(row['follow']), int(row['forward'])))
    if index % 200000 == 0:
        print ("step=", index)

row= 2894768
step= 400000
step= 2000000
step= 2400000
step= 2800000
step= 3200000
step= 3600000
step= 3800000
step= 4200000
step= 4400000
step= 5000000
step= 6200000
step= 7200000
step= 7800000
step= 9800000
step= 10000000
step= 10200000
step= 11400000


In [40]:
# 4 train data filter: frequence
core = 10
train_data_click = dataset_filtering(train_data_click, core)

# Original training dataset
  User: 99808 Item: 374640 Interaction: 2894768 Sparsity: 99.9922583369465 %
Fitering (core = 10) ... number of remained interactions: 2090024 2025248 2009798 2006653 2005657 2005362 2005254 2005228 2005219 
# Filtered training dataset
  User: 56751 Item: 37453 Interaction: 2005219 Sparsity: 99.90565873791812 %


In [43]:
# 5 train data index_encoding
train_data_click, user_id2num, item_id2num = index_encoding(train_data_click)
print(train_data_click[0:10])

[(0, 679, 1, 0, 0, 0), (0, 7224, 1, 0, 0, 0), (0, 179, 1, 0, 0, 0), (0, 1746, 1, 0, 0, 0), (0, 1974, 1, 0, 0, 0), (0, 23597, 1, 0, 0, 0), (0, 23599, 1, 0, 0, 0), (0, 24260, 1, 0, 0, 0), (0, 21016, 1, 0, 0, 0), (0, 25051, 1, 0, 0, 0)]


In [48]:
# 6 make action_list
train_data_click_with_action = user_action_list_making(train_data_click, user_id2num)
print(train_data_click_with_action[0:10])

user_action_list init success, waiting filter...
[(0, 679, 1, 0, 0, 0, []), (0, 7224, 1, 0, 0, 0, [679]), (0, 179, 1, 0, 0, 0, [679, 7224]), (0, 1746, 1, 0, 0, 0, [679, 7224, 179]), (0, 1974, 1, 0, 0, 0, [679, 7224, 179, 1746]), (0, 23597, 1, 0, 0, 0, [679, 7224, 179, 1746, 1974]), (0, 23599, 1, 0, 0, 0, [679, 7224, 179, 1746, 1974, 23597]), (0, 24260, 1, 0, 0, 0, [679, 7224, 179, 1746, 1974, 23597, 23599]), (0, 21016, 1, 0, 0, 0, [679, 7224, 179, 1746, 1974, 23597, 23599, 24260]), (0, 25051, 1, 0, 0, 0, [679, 7224, 179, 1746, 1974, 23597, 23599, 24260, 21016])]


In [49]:
# 7 fianl train data && shuffle
final_data_train = []
random.shuffle(train_data_click_with_action)
# del time_ms
# for (user_id, item_id, time_ms, click, like, follow, comment, forward, longview, user_real_action) in train_data_click_with_action:
#     final_data_train.append((user_id, item_id, click, like, follow, comment, forward, longview, user_real_action))

# keep time_ms
final_data_train = train_data_click_with_action
print(final_data_train[0:3])

[(30758, 23961, 1, 0, 0, 0, [7853, 17681, 10773, 22096, 1986, 37316, 36321, 36920, 33659, 35623, 27597, 37351, 36673, 35737, 24225, 35664, 37121, 36777, 37264, 24343, 23861]), (19879, 28451, 1, 0, 0, 0, [3185, 1762, 11268, 397, 930, 9193, 895, 2359, 1687, 7209, 4967, 5257, 4984, 3985, 22106, 722, 2204, 2772, 2816, 1096, 2202, 21067, 2192, 13564, 835, 16208, 8505, 23855, 23838, 27946, 23853, 28183, 29357]), (16270, 26119, 1, 0, 0, 0, [3260, 200, 1762, 80, 9149, 357, 962, 6477, 2022])]


In [50]:
print("fianl tain data, row", len(final_data_train))

fianl tain data, row 2005219


In [55]:
# 8 train & validation
row = len(final_data_train)
train_row = int(row * 0.9)
validation_row = row - train_row
print("train_row:", train_row, ", validation_row=", validation_row)
tenrec_train = final_data_train[:train_row]
tenrec_validation = final_data_train[train_row:]

train_row: 1804697 , validation_row= 200522


In [57]:
# 8 wirte train & validation data
write_data(path_train, tenrec_train)
write_data(path_validation, tenrec_validation)
print("write success")

write success


In [58]:
# 9 other 
with open(path_train) as f:
    line = f.readline()
    data = json.loads(line)
    f.close()
    user_num = len(data)
print("confirm data, num=",user_num)
data[:1]

confirm data, num= 1804697


[[30758,
  23961,
  1,
  0,
  0,
  0,
  [7853,
   17681,
   10773,
   22096,
   1986,
   37316,
   36321,
   36920,
   33659,
   35623,
   27597,
   37351,
   36673,
   35737,
   24225,
   35664,
   37121,
   36777,
   37264,
   24343,
   23861]]]

In [59]:
# 10 other: distribution
# count user action_list length distribution
dic_action_list_length = {}
for row in data:
    (user_id, item_id, click, like, follow, forward, user_real_action) = row
    length = len(user_real_action)
    try: dic_action_list_length[length] += 1
    except: dic_action_list_length[length] = 1

for key, value in dic_action_list_length.items():
    print ("length=", key, "number=", value)
    break

sorted_keys = sorted(dic_action_list_length.keys())
sorted_dict = {}
for key in sorted_keys:
    sorted_dict[key] = dic_action_list_length[key]

length= 21 number= 30914


In [60]:
x1 = []
y1 = []
for key, value in sorted_dict.items():
    x1.append(key)
    y1.append(value)
print("len(x1)=", len(x1), ", len(y1)=", len(y1))

len(x1)= 673 , len(y1)= 673


In [61]:
import plotly.offline as py                    #保存图表，相当于plotly.plotly as py，同时增加了离线功能
py.init_notebook_mode(connected=True)          #离线绘图时，需要额外进行初始化
import plotly.graph_objects as go # 引入plotly底层绘图库

trace = go.Scatter(
     x = x1,
     y = y1,
     showlegend=True
)
data = [trace]

py.iplot(data)

In [65]:
dic_table = {}
for key, value in dic_action_list_length.items():
    if (key <= 10):
        try:dic_table['0-10'] += value;
        except: dic_table['0-10'] = value
    elif (key <= 30):
        try:dic_table['10-30'] += value;
        except: dic_table['10-30'] = value
    elif (key <= 50):
        try:dic_table['30-50'] += value;
        except: dic_table['30-50'] = value
    elif (key <= 100):
        try:dic_table['50-100'] += value;
        except: dic_table['50-100'] = value
    elif (key <= 200):
        try:dic_table['100-200'] += value;
        except: dic_table['100-200'] = value
    elif (key <= 300):
        try:dic_table['200-300'] += value;
        except: dic_table['200-300'] = value
    elif (key <= 400):
        try:dic_table['300-400'] += value;
        except: dic_table['300-400'] = value
    elif (key <= 500):
        try:dic_table['400-500'] += value;
        except: dic_table['400-500'] = value
    elif (key <= 1000):
        try:dic_table['500-1000'] += value;
        except: dic_table['500-1000'] = value
    elif (key <= 2000):
        try:dic_table['1000-2000'] += value;
        except: dic_table['1000-2000'] = value
    elif (key <= 2500):
        try:dic_table['2000-2500'] += value;
        except: dic_table['2000-2500'] = value
    elif (key <= 3000):
        try:dic_table['2500-3000'] += value;
        except: dic_table['2500-3000'] = value

for key, value in dic_table.items():
    print("ation_list length:", key, "log num:", value)

ation_list length: 10-30 log num: 649151
ation_list length: 30-50 log num: 293065
ation_list length: 0-10 log num: 561122
ation_list length: 200-300 log num: 5866
ation_list length: 50-100 log num: 232348
ation_list length: 100-200 log num: 61505
ation_list length: 300-400 log num: 1266
ation_list length: 400-500 log num: 200
ation_list length: 500-1000 log num: 174


## 2 generate &nbsp;&nbsp; 'tenrec_pred_data.json' &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;for mmoe pred  

In [66]:
# 1 click 1 & 0
# 2 get data
all_data = origin_data
print("len(all_data) = ", len(all_data))
all_data.head()

len(all_data) =  12031351


Unnamed: 0,user_id,item_id,click,like,follow,forward
0,1,4,0,0,0,0
1,1,1201,1,0,0,0
2,1,250502,1,0,0,0
3,1,50885,1,0,0,0
4,1,16934,1,0,0,0


In [68]:
# 3 add list  
# && filter data with user_id2num & item_id2num  
# && replace with user_id2num[row['user_id']]
train_data_all = []
row = len(all_data['user_id'])
print("row =", row)

print("prepare generating train_data_all....")
set_user_id = set()
set_item_id = set()
for key, _ in user_id2num.items():
    set_user_id.add(key)
for key, _ in item_id2num.items():
    set_item_id.add(key)

print("len(set_user_id) = ", len(set_user_id), ", len(set_item_id) = ", len(set_item_id))

for index, row in all_data.iterrows():
    if row['user_id'] in set_user_id and row['item_id'] in set_item_id:
        train_data_all.append((user_id2num[row['user_id']], item_id2num[row['item_id']], int(row['click']), int(row['like']), int(row['follow']), int(row['forward'])))
    if index % 2000000 == 0:
        print("step=", index)
print ("len(train_data_all) = ", len(train_data_all))
print ("train_data_all[0:5] = ", train_data_all[0:5])

row = 12031351
prepare generating train_data_all....
len(set_user_id) =  56751 , len(set_item_id) =  37453
step= 0
step= 2000000
step= 4000000
step= 6000000
step= 8000000
step= 10000000
step= 12000000
len(train_data_all) =  6338947
train_data_all[0:5] =  [(0, 2, 0, 0, 0, 0), (0, 679, 1, 0, 0, 0), (0, 7224, 1, 0, 0, 0), (0, 179, 1, 0, 0, 0), (0, 1746, 1, 0, 0, 0)]


In [72]:
# 4 5
# 6 make action_list
print("len(user_id2num) = ", len(user_id2num))
train_data_all_with_action = user_action_list_making_with_all_sample(train_data_all, user_id2num, is_click = True)
print(train_data_all_with_action[0:10])

len(user_id2num) =  56751
len(user_action_list) =  56751
user_action_list init success, waiting filter...
step= 1000000
step= 2000000
step= 3000000
step= 4000000
step= 5000000
step= 6000000
[(0, 2, 0, 0, 0, 0, [2]), (0, 679, 1, 0, 0, 0, [2, 679]), (0, 7224, 1, 0, 0, 0, [2, 679, 7224]), (0, 179, 1, 0, 0, 0, [2, 679, 7224, 179]), (0, 1746, 1, 0, 0, 0, [2, 679, 7224, 179, 1746]), (0, 1974, 1, 0, 0, 0, [2, 679, 7224, 179, 1746, 1974]), (0, 23597, 1, 0, 0, 0, [2, 679, 7224, 179, 1746, 1974, 23597]), (0, 23598, 0, 0, 0, 0, [2, 679, 7224, 179, 1746, 1974, 23597, 23598]), (0, 23599, 1, 0, 0, 0, [2, 679, 7224, 179, 1746, 1974, 23597, 23598, 23599]), (0, 23600, 0, 0, 0, 0, [2, 679, 7224, 179, 1746, 1974, 23597, 23598, 23599, 23600])]


In [73]:
# 7 fianl all data
final_all_data_pred = []
final_all_data_pred = train_data_all_with_action
print(final_all_data_pred[0:5])

[(0, 2, 0, 0, 0, 0, [2]), (0, 679, 1, 0, 0, 0, [2, 679]), (0, 7224, 1, 0, 0, 0, [2, 679, 7224]), (0, 179, 1, 0, 0, 0, [2, 679, 7224, 179]), (0, 1746, 1, 0, 0, 0, [2, 679, 7224, 179, 1746])]


In [74]:
print("final_all_data_pred, row=", len(final_all_data_pred))

final_all_data_pred, row= 6338947


In [75]:
# 8 wirte train data
write_data(path_all_data_pred, final_all_data_pred)
print("write success")

write success


In [76]:
# 9 other 
with open(path_all_data_pred) as f:
    line = f.readline()
    data = json.loads(line)
    f.close()
    user_num = len(data)
print("confirm data, num=",user_num)
data[:1]

confirm data, num= 6338947


[[0, 2, 0, 0, 0, 0, [2]]]

In [77]:
# 10 other: distribution
# count user action_list length distribution
dic_action_list_length = {}
for row in data:
    (user_id, item_id, click, like, follow, forward, user_real_action) = row
    length = len(user_real_action)
    try: dic_action_list_length[length] += 1
    except: dic_action_list_length[length] = 1

for key, value in dic_action_list_length.items():
    print ("length=", key, "number=", value)
    break

sorted_keys = sorted(dic_action_list_length.keys())
sorted_dict = {}
for key in sorted_keys:
    sorted_dict[key] = dic_action_list_length[key]

length= 1 number= 56751


In [78]:
x1 = []
y1 = []
for key, value in sorted_dict.items():
    x1.append(key)
    y1.append(value)
print("len(x1)=", len(x1), ", len(y1)=", len(y1))

len(x1)= 1135 , len(y1)= 1135


In [79]:
import plotly.offline as py                    #保存图表，相当于plotly.plotly as py，同时增加了离线功能
py.init_notebook_mode(connected=True)          #离线绘图时，需要额外进行初始化
import plotly.graph_objects as go # 引入plotly底层绘图库

trace = go.Scatter(
     x = x1,
     y = y1,
     showlegend=True
)
data = [trace]

py.iplot(data)

In [80]:
dic_table = {}
for key, value in dic_action_list_length.items():
    if (key <= 10):
        try:dic_table['0-10'] += value;
        except: dic_table['0-10'] = value
    elif (key <= 30):
        try:dic_table['10-30'] += value;
        except: dic_table['10-30'] = value
    elif (key <= 50):
        try:dic_table['30-50'] += value;
        except: dic_table['30-50'] = value
    elif (key <= 100):
        try:dic_table['50-100'] += value;
        except: dic_table['50-100'] = value
    elif (key <= 200):
        try:dic_table['100-200'] += value;
        except: dic_table['100-200'] = value
    elif (key <= 300):
        try:dic_table['200-300'] += value;
        except: dic_table['200-300'] = value
    elif (key <= 400):
        try:dic_table['300-400'] += value;
        except: dic_table['300-400'] = value
    elif (key <= 500):
        try:dic_table['400-500'] += value;
        except: dic_table['400-500'] = value
    elif (key <= 1000):
        try:dic_table['500-1000'] += value;
        except: dic_table['500-1000'] = value
    elif (key <= 2000):
        try:dic_table['1000-2000'] += value;
        except: dic_table['1000-2000'] = value
    elif (key <= 2500):
        try:dic_table['2000-2500'] += value;
        except: dic_table['2000-2500'] = value
    elif (key <= 3000):
        try:dic_table['2500-3000'] += value;
        except: dic_table['2500-3000'] = value

for key, value in dic_table.items():
    print("ation_list length:", key, "log num:", value)

ation_list length: 0-10 log num: 567510
ation_list length: 10-30 log num: 1116684
ation_list length: 30-50 log num: 997313
ation_list length: 50-100 log num: 1711510
ation_list length: 100-200 log num: 1377958
ation_list length: 200-300 log num: 392313
ation_list length: 300-400 log num: 119223
ation_list length: 400-500 log num: 37317
ation_list length: 500-1000 log num: 18858
ation_list length: 1000-2000 log num: 261
