### pipeline:
#### 1. load data
#### 2. remove actions of which timestamp are not accurate
#### 3. remove infrequent items
#### 4. remove infrequent users
#### 5. map item from 1 to n
#### 6. map category from 1 to n (or unknown for minor categories)

In [1]:
import pandas as pd
import numpy as np
import random
import pickle
import matplotlib.pyplot as plt

In [2]:
random.seed(30)

#### 1. load data

In [3]:
item_file = "../Data/tmall/taobao_100000_user.pickle"
df = pd.read_pickle(item_file)

print("initial data size, item num, user num", df.shape, df.userid.nunique(), df.itemid.nunique())

initial data size, item num, user num (10142896, 5) 100000 1600922


#### remove actions out of time range

In [4]:
sorted_df = df.sort_values(by=['userid', 'timestamp'], ascending=[True, True])
print("before processing", sorted_df.shape)

before processing (10142896, 5)


In [5]:
sorted_df = sorted_df[sorted_df.timestamp >= 1511481600]
print("remove time stamp > 1511481600", sorted_df.shape)

remove time stamp > 1511481600 (10141285, 5)


In [6]:
sorted_df = sorted_df[sorted_df.timestamp <= 1512345600]
print("remove time stamp < 1512345600", sorted_df.shape)

remove time stamp < 1512345600 (10141068, 5)


##### split data into train and test df

In [7]:
trainTime_threshold = 1512172800
validTime_threshold = 1512259200

train_df = sorted_df[sorted_df.timestamp <= trainTime_threshold]
valid_test_df = sorted_df[sorted_df.timestamp > trainTime_threshold]
valid_df = valid_test_df[valid_test_df.timestamp <= validTime_threshold]
test_df = valid_test_df[valid_test_df.timestamp > validTime_threshold]
print("train num", train_df.shape)
print("valid num", valid_df.shape)
print("test num", test_df.shape)

train num (7509968, 5)
valid num (1421671, 5)
test num (1209429, 5)


#### remove infrequent items

In [8]:
remove_item_list = []
item_freq_map = train_df['itemid'].value_counts()
for item, freq in item_freq_map.items():
    if freq < 20:
        remove_item_list.append(item)
remove_item_num = len(remove_item_list)
print("remove item num", remove_item_num, remove_item_num*1.0/len(item_freq_map.keys()))
print("before filtering item num", train_df.itemid.nunique())
train_df = train_df[~train_df.itemid.isin(remove_item_list)]
print("after filtering itemnum", train_df.itemid.nunique())

remove item num 1304091 0.9504343706726915
before filtering item num 1372100
after filtering itemnum 68009


#### remove infrequent users

In [9]:
user_freq_map = train_df.userid.value_counts()
filter_users_num_total = 0

filter_users = user_freq_map[user_freq_map>=300].index
filter_users_num = len(filter_users)
print("filter user actions >=300", filter_users_num)
filter_users_num_total += filter_users_num

train_df = train_df[~train_df.userid.isin(filter_users)]

filter_users = user_freq_map[user_freq_map<=5].index
filter_users_num = len(filter_users)
print("filter user actions <= 5", filter_users_num)
filter_users_num_total += filter_users_num

train_df = train_df[~train_df.userid.isin(filter_users)]

print("filter user num", filter_users_num_total)

filter user actions >=300 41
filter user actions <= 5 14937
filter user num 14978


In [10]:
unique_item_num = train_df.itemid.nunique()
total_action_num = train_df.shape[0]
print("unique item num", unique_item_num)
print("total action num", total_action_num)
print("action num per item", total_action_num*1.0/unique_item_num)

unique item num 68008
total action num 3381110
action num per item 49.716356899188334


In [None]:
#### remove users whose train actions < 15
# train_user_freq_map = train_df.userid.value_counts()
# print("before filtering", len(train_user_freq_map))
# train_userid_list = train_user_freq_map[train_user_freq_map > 15].index
# print("train_userid_list", len(train_userid_list))

In [11]:
train_userid_list = train_df.userid.unique()
train_df = train_df[train_df.userid.isin(train_userid_list)]
valid_df = valid_df[valid_df.userid.isin(train_userid_list)]
test_df = test_df[test_df.userid.isin(train_userid_list)]

In [12]:
train_itemid_list = train_df.itemid.unique()
train_df = train_df[train_df.itemid.isin(train_itemid_list)]
valid_df = valid_df[valid_df.itemid.isin(train_itemid_list)]
test_df = test_df[test_df.itemid.isin(train_itemid_list)]

In [13]:
sorted_df = pd.concat([train_df, valid_df, test_df])
print("action num", sorted_df.shape)

action num (4391921, 5)


#### remove duplicate categories

In [14]:
itemids = sorted_df.itemid.unique()
categoryids = sorted_df.categoryid.unique()
print("unique cate num", sorted_df.categoryid.nunique())
category_map = {}
idx = 0
for c in categoryids:
    category_map[c] = idx
    idx += 1

unique cate num 2299


In [15]:
adj = np.zeros((len(categoryids), len(categoryids)), dtype=int)
for i in itemids:
    same_category = sorted_df[sorted_df.itemid == i].categoryid.unique()
    N = len(same_category)
    if N > 1:
#         print('found', same_category)
        for i in range(N):
            c1 = category_map[ same_category[i] ]
            for j in range(i, N):
                c2 = category_map[ same_category[j] ]
                adj[c1][c2] = 1
                adj[c2][c1] = 1

In [19]:
groupids = np.zeros((len(categoryids)), int)
gid = 1
def dfs(idx, gid):
    if groupids[idx] != 0: 
        return
#     print(adj[idx].nonzero())
    for n in adj[idx].nonzero()[0]:
        groupids[idx] = gid
        dfs(n, gid)
        
for i in range(len(categoryids)):
    if groupids[i] == 0:
        dfs(i, gid)
        groupids[i] = gid
        gid += 1

In [20]:
sorted_df['categoryid'] = sorted_df['categoryid'].apply(lambda x: groupids[ category_map[x] ])
print("unique cate num", sorted_df.categoryid.nunique())

KeyError: 1

In [33]:
import torch

In [35]:
a = torch.randn(1, 2, 4)
b = torch.randn(1, 4, 3)
torch.bmm(a, b)

tensor([[[-0.2769, -4.2810, -1.6323],
         [ 1.1940, -2.6774, -0.8209]]])

In [21]:
sorted_df.shape

(4391921, 5)

In [22]:
sorted_df.head()

Unnamed: 0,userid,itemid,categoryid,behavior,timestamp
41582965,16,3840502,1,pv,1511626494
41582966,16,3716887,2,pv,1511626551
41582968,16,3840502,1,pv,1511626597
41582969,16,4115794,1,pv,1511626761
41582970,16,2111454,3,pv,1511626770


In [23]:
sorted_df.timestamp.max(), sorted_df.timestamp.min()

(1512322883, 1511481682)

In [24]:
trainTime_threshold = 1512172800
validTime_threshold = 1512259200

train_df = sorted_df[sorted_df.timestamp <= trainTime_threshold]
valid_test_df = sorted_df[sorted_df.timestamp > trainTime_threshold]
valid_df = valid_test_df[valid_test_df.timestamp <= validTime_threshold]
test_df = valid_test_df[valid_test_df.timestamp > validTime_threshold]
print("train num", train_df.shape)
print("valid num", valid_df.shape)
print("test num", test_df.shape)

train num (3381110, 5)
valid num (549380, 5)
test num (461431, 5)


#### map item

In [19]:
# itemid_list_train = train_df.itemid.unique()
# itemid_map_train = {}

# itemid_map_train['<PAD>'] = 0
# for itemid in itemid_list_train:
#     itemid_map_train[itemid] = len(itemid_map_train)
# print("train item num", len(itemid_map_train)-1)

train item num 96354


In [25]:
item_freq = train_df.itemid.value_counts()
itemid_map_train = {}

itemid_map_train['<PAD>'] = 0
for item, freq in item_freq.items():
    itemid_map_train[item] = len(itemid_map_train)
print("train item num", len(itemid_map_train)-1)

train item num 68008


In [26]:
sorted_df.itemid = sorted_df.itemid.apply(lambda x: itemid_map_train[x])

#### map category

In [27]:
itemNum_list_category = sorted_df.groupby('categoryid')['itemid'].nunique().tolist()
actionNum_list_category = sorted_df.groupby('categoryid')['itemid'].count().tolist()
cate_index_list = sorted_df.groupby('categoryid').groups.keys()

In [28]:
temp = sorted(zip(itemNum_list_category, actionNum_list_category, cate_index_list), reverse=True)
sorted_itemNum_list_cate, sorted_actionNum_list_cate, sorted_cate_index_list= zip(*temp)

In [29]:
major_cate_num = 200
minor_cate_index_list = sorted_cate_index_list[major_cate_num:]
major_cate_index_list = sorted_cate_index_list[:major_cate_num]

cate_id_map = {}
cate_id_map['<PAD>'] = 0

for cate in major_cate_index_list:
    cate_id_map[cate] = len(cate_id_map)
    
unknownCate_id = len(cate_id_map)

for cate in minor_cate_index_list:
    cate_id_map[cate] = unknownCate_id

sorted_df.categoryid = sorted_df.categoryid.apply(lambda x: cate_id_map[x])

#### save pickle

In [32]:
sorted_df.to_pickle("../Data/ctr_tmall/data_time.pickle")

In [3]:
sorted_df= pd.read_pickle("./100k_unknown_cate/data_time.pickle")

In [4]:
cate_nunique_list = sorted_df.groupby('userid')['categoryid'].nunique()

In [15]:
sum(list(cate_nunique_list.values))/

748554

In [16]:
userid_group_list = list(sorted_df.groupby(['userid']).groups.keys())
user_num = len(userid_group_list)
print("user num", user_num)

user num 51275


In [6]:
cate_list = sorted_df.groupby('userid')['categoryid'].apply(list)

In [11]:
cate_list[:6]

userid
16    [5, 6, 5, 5, 22, 3, 201, 3, 64, 5, 3, 29, 201,...
17    [21, 21, 1, 8, 4, 22, 201, 78, 19, 1, 3, 3, 3,...
44    [1, 5, 1, 1, 7, 1, 7, 17, 1, 201, 201, 201, 20...
45    [93, 2, 1, 1, 48, 2, 9, 2, 9, 2, 1, 1, 189, 2,...
72    [3, 3, 9, 15, 108, 201, 201, 201, 171, 171, 17...
74    [76, 76, 76, 76, 76, 76, 76, 76, 4, 4, 4, 64, ...
Name: categoryid, dtype: object

In [33]:
sorted_df = df.sort_values(by=['userid', 'timestamp'], ascending=[True, True])
print("before processing", sorted_df.shape)

before processing (10142896, 5)


In [57]:
userid_group_list = list(sorted_df.groupby(['userid']).groups.keys())
user_num = len(userid_group_list)
print("user num", user_num)

item_list = []
time_list = []
cate_list = []
user_list = []

item_list = sorted_df.groupby('userid')['itemid'].apply(list).values
time_list = sorted_df.groupby('userid')['timestamp'].apply(list).values
cate_list = sorted_df.groupby('userid')['categoryid'].apply(list).values

print(len(item_list), len(time_list), len(cate_list))

len_list = [len(item_list_user) for item_list_user in item_list]
user_list = []
for len_index in range(len(item_list)):
    len_i = len_list[len_index]
    user_list_per = [len_index for i in range(len_i)]
    user_list.append(user_list_per)


user num 51275
51275 51275 51275


In [60]:
def save_pickle_cate_unknown_time(item, time, cate, user):
    item_file = "100k_unknown_cate/"+"item_time.pickle"
    item_f = open(item_file, "wb")
    pickle.dump(item, item_f)
    
    time_file = "100k_unknown_cate/"+"time_time.pickle"
    time_f = open(time_file, "wb")
    pickle.dump(time, time_f)

    cate_file = "100k_unknown_cate/"+"cate_time.pickle"
    cate_f = open(cate_file, "wb")
    pickle.dump(cate, cate_f)
    
    user_file = "100k_unknown_cate/"+"user_time.pickle"
    user_f = open(user_file, "wb")
    pickle.dump(user, user_f)

In [61]:
save_pickle_cate_unknown_time(item_list, time_list, cate_list, user_list)

In [42]:
trainTime_threshold = 1512172800
validTime_threshold = 1512259200

train_df = sorted_df[sorted_df.timestamp <= trainTime_threshold]
valid_test_df = sorted_df[sorted_df.timestamp > trainTime_threshold]
valid_df = valid_test_df[valid_test_df.timestamp <= validTime_threshold]
test_df = valid_test_df[valid_test_df.timestamp > validTime_threshold]
print("train num", train_df.shape)
print("valid num", valid_df.shape)
print("test num", test_df.shape)

train num (2995258, 5)
valid num (430797, 5)
test num (359906, 5)


In [43]:
train_userid_group_list = list(train_df.groupby(['userid']).groups.keys())
train_user_num = len(train_userid_group_list)
print("train user num", train_user_num)

train_item_list = []
train_time_list = []
# rating_list = []
train_cate_list = []
train_user_list = []

train_item_list = train_df.groupby('userid')['itemid'].apply(list).values
train_time_list = train_df.groupby('userid')['timestamp'].apply(list).values
train_cate_list = train_df.groupby('userid')['categoryid'].apply(list).values

print(len(train_item_list), len(train_time_list), len(train_cate_list))

len_list = [len(item_list_user) for item_list_user in train_item_list]
train_user_list = []
for len_index in range(len(train_item_list)):
    len_i = len_list[len_index]
    user_list_per = [len_index for i in range(len_i)]
    train_user_list.append(user_list_per)

train user num 51275
51275 51275 51275


In [44]:
test_userid_group_list = list(test_df.groupby(['userid']).groups.keys())
test_user_num = len(test_userid_group_list)
print("test user num", test_user_num)

test_item_list = []
test_time_list = []
# rating_list = []
test_cate_list = []
test_user_list = []

test_item_list = test_df.groupby('userid')['itemid'].apply(list).values
test_time_list = test_df.groupby('userid')['timestamp'].apply(list).values
test_cate_list = test_df.groupby('userid')['categoryid'].apply(list).values

len_list = [len(item_list_user) for item_list_user in test_item_list]
test_user_list = []
for len_index in range(len(test_item_list)):
    len_i = len_list[len_index]
    user_list_per = [len_index for i in range(len_i)]
    test_user_list.append(user_list_per)

test user num 41935


In [45]:
valid_userid_group_list = list(valid_df.groupby(['userid']).groups.keys())
valid_user_num = len(valid_userid_group_list)
print("valid user num", valid_user_num)

valid_item_list = []
valid_time_list = []
# rating_list = []
valid_cate_list = []
valid_user_list = []

valid_item_list = valid_df.groupby('userid')['itemid'].apply(list).values
valid_time_list = valid_df.groupby('userid')['timestamp'].apply(list).values
valid_cate_list = valid_df.groupby('userid')['categoryid'].apply(list).values

len_list = [len(item_list_user) for item_list_user in valid_item_list]
valid_user_list = []
for len_index in range(len(valid_item_list)):
    len_i = len_list[len_index]
    user_list_per = [len_index for i in range(len_i)]
    valid_user_list.append(user_list_per)

valid user num 44148


### save train, valid, test data

In [48]:
def save_pickle_cate_unknown(train_test, item, time, cate, user):
    item_file = "100k_unknown_cate/"+train_test+"_item.pickle"
    item_f = open(item_file, "wb")
    pickle.dump(item, item_f)
    
    time_file = "100k_unknown_cate/"+train_test+"_time.pickle"
    time_f = open(time_file, "wb")
    pickle.dump(time, time_f)

    cate_file = "100k_unknown_cate/"+train_test+"_cate.pickle"
    cate_f = open(cate_file, "wb")
    pickle.dump(cate, cate_f)
    
    user_file = "100k_unknown_cate/"+train_test+"_user.pickle"
    user_f = open(user_file, "wb")
    pickle.dump(user, user_f)

In [49]:
save_pickle_cate_unknown("train", train_item_list, train_time_list, train_cate_list, train_user_list)
save_pickle_cate_unknown("valid", valid_item_list, valid_time_list, valid_cate_list, valid_user_list)
save_pickle_cate_unknown("test", test_item_list, test_time_list, test_cate_list, test_user_list)

In [47]:
train_df.categoryid.max(), train_df.categoryid.min()

(201, 1)

In [33]:
train_test = "train"
action_file = "100000_unknown_cate/"+train_test+"_item.pickle"
action_f = open(action_file, "rb")

In [34]:
action_seq_arr_total = pickle.load(action_f)

In [4]:
sorted_df = pd.read_pickle("./100k_unknown_cate/data_time.pickle")
print(sorted_df.head())

          userid  itemid  categoryid behavior   timestamp
41582965      16   38850           5       pv  1511626494
41582966      16   16692           6       pv  1511626551
41582968      16   38850           5       pv  1511626597
41582969      16    7369           5       pv  1511626761
41582970      16    2395          22       pv  1511626770


In [5]:
total_user_num = sorted_df.userid.nunique()
total_item_num = sorted_df.itemid.nunique()
total_action_num = sorted_df.shape[0]
total_cate_num = sorted_df.categoryid.nunique()

print("total user num", total_user_num)
print("total item num", total_item_num)
print("total action num", total_action_num)
print("total cate num", total_cate_num)
print("avg action per use", total_action_num/total_user_num)
print("avg action per item", total_action_num/total_item_num)
print("avg action per cate", total_action_num/total_cate_num)

total user num 51275
total item num 68007
total action num 3785961
total cate num 201
avg action per use 73.83639200390054
avg action per item 55.670166306409634
avg action per cate 18835.626865671642


In [7]:
user_cate_num_map = sorted_df.groupby('userid')['categoryid'].nunique()
total_cate_num = 0
total_user_num = 0
for user, cate_num in user_cate_num_map.items():
#     print("user", user, cate_num)
    total_cate_num += cate_num
    total_user_num += 1

print("user num", total_user_num)
print("avg cate num per user", total_cate_num/total_user_num)

user num 51275
avg cate num per user 14.598810336421257


In [55]:
trainTime_threshold = 1512172800
validTime_threshold = 1512259200

train_df = sorted_df[sorted_df.timestamp <= trainTime_threshold]
valid_test_df = sorted_df[sorted_df.timestamp > trainTime_threshold]
valid_df = valid_test_df[valid_test_df.timestamp <= validTime_threshold]
test_df = valid_test_df[valid_test_df.timestamp > validTime_threshold]
print("train num", train_df.shape)
print("valid num", valid_df.shape)
print("test num", test_df.shape)

train num (2995258, 5)
valid num (430797, 5)
test num (359906, 5)


In [56]:
train_df.itemid.value_counts()

1        2334
2        1976
3        1555
4        1474
5        1449
6        1433
7        1372
8        1362
10       1278
9        1278
11       1235
12       1186
13       1183
14       1160
15       1129
16       1114
17       1100
18       1098
19       1047
20       1004
21        983
22        930
23        909
24        906
25        887
26        866
27        861
28        856
29        853
30        851
         ... 
67979       5
67978       5
67988       5
67976       5
67977       5
67974       5
67971       5
67975       5
67972       5
67970       5
67983       5
67994       4
67998       4
67990       4
67996       4
67997       4
67995       4
67992       4
67993       4
67991       4
67989       4
68000       3
68001       3
67999       3
68002       2
68003       1
68004       1
68005       1
68006       1
68007       1
Name: itemid, Length: 68007, dtype: int64