In [1]:
import numpy as np
import pandas as pd
import datetime as dt



In [2]:
# Load only the columns this notebook uses from the Trivago training log.
train = pd.read_csv(
    'data/Trivago/train.csv',
    usecols=['session_id', 'timestamp', 'step', 'action_type', 'reference'],
)

In [3]:
# Frequency of every action type in the current frame.
train['action_type'].value_counts()

interaction item image     11860750
clickout item               1586586
filter selection             695917
search for destination       403066
change of sort order         400584
interaction item info        285402
interaction item rating      217246
interaction item deals       193794
search for item              152203
search for poi               137444
Name: action_type, dtype: int64

In [4]:
# Drop four action types: change of sort order, filter selection,
# search for destination, search for poi.
train = train[~train['action_type'].isin([
    'change of sort order',
    'filter selection',
    'search for destination',
    'search for poi',
])]

In [5]:
# Frequency of every action type in the current frame.
train['action_type'].value_counts()

interaction item image     11860750
clickout item               1586586
interaction item info        285402
interaction item rating      217246
interaction item deals       193794
search for item              152203
Name: action_type, dtype: int64

In [6]:
# Show how often each reference and each session occurs in the filtered log.
print(train['reference'].value_counts())
print(train['session_id'].value_counts())



1649739    7131
8796       6348
2053822    6044
36455      5149
104802     4899
           ... 
3166174       1
9955578       1
1294008       1
3366854       1
6554458       1
Name: reference, Length: 363879, dtype: int64
3167404ed3197    3515
948641e533837    2811
9233fb83c116b    2797
191ae48e3cb8e    2647
c9b863c921a2d    2601
                 ... 
54d57518b5f71       1
f0ab7ac79ffe3       1
966ff828dfce8       1
67afee1467fd5       1
af28a1aeb82ae       1
Name: session_id, Length: 885270, dtype: int64


In [None]:
# Keep only references (items) that occur at least 5 times.
# Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
item_supports = train.groupby('reference').size()
train = train[train['reference'].isin(item_supports[item_supports >= 5].index)]

# Then keep only sessions that still contain more than one event.
session_lengths = train.groupby('session_id').size()
train = train[train['session_id'].isin(session_lengths[session_lengths > 1].index)]



In [None]:
# renumber
# Map every distinct reference (in sorted order) to a contiguous id 1..N and
# store it as the 'item_id' column, replacing the original 'reference' column.
old_itemids = np.sort(train.reference.unique())
new_item_ids = np.arange(1, train.reference.nunique() + 1)
itemsid_map = pd.DataFrame({'item_id': old_itemids, 'new_item_id': new_item_ids})
train = (
    train.rename(columns={'reference': 'item_id'})
    .merge(itemsid_map, on='item_id')
    .drop(columns='item_id')
    .rename(columns={'new_item_id': 'item_id'})
)

def action_id_map(x):
    """Translate an action-type string into its integer code (1-6).

    :param x: action-type label.
    :return: the integer code, or None when the label is not one of the six
        known action types.
    """
    codes = {
        'clickout item': 1,
        'interaction item rating': 2,
        'interaction item info': 3,
        'interaction item image': 4,
        'interaction item deals': 5,
        'search for item': 6,
    }
    return codes.get(x)

# Encode every action type as an integer code in a new 'action' column.
train = train.assign(action=train.action_type.apply(action_id_map))

# Renumber sessions 1..N in order of first appearance (the unique values are
# intentionally left unsorted) and replace the original session_id column.
old_sessionids = train.session_id.unique()
new_session_ids = np.arange(1, train.session_id.nunique() + 1)
sessionid_map = pd.DataFrame({'session_id': old_sessionids, 'new_session_id': new_session_ids})
train = (
    train.merge(sessionid_map, on='session_id')
    .drop(columns='session_id')
    .rename(columns={'new_session_id': 'session_id'})
)


In [None]:
# split train, valid and test
# Order sessions by their last timestamp; the values found at the 70% and 80%
# positions of that ordering become the train / validation cut-off times.
session_max_time = train.groupby('session_id').timestamp.max().sort_values()
session_number = train.session_id.nunique()
train_time = session_max_time.values[int(session_number * 0.7)]
valid_time = session_max_time.values[int(session_number * 0.8)]


In [None]:
# Assign each session to a partition by the time of its last event:
# up to train_time -> train, (train_time, valid_time] -> valid, later -> test.
train_sessions = session_max_time.loc[session_max_time.le(train_time)].index
valid_sessions = session_max_time.loc[
    session_max_time.gt(train_time) & session_max_time.le(valid_time)
].index
test_sessions = session_max_time.loc[session_max_time.gt(valid_time)].index


In [None]:
# Materialise the three partitions by session membership.
# Series.isin replaces np.in1d, which is deprecated as of NumPy 2.0.
train_data = train[train['session_id'].isin(train_sessions)]
valid_data = train[train['session_id'].isin(valid_sessions)]
test_data = train[train['session_id'].isin(test_sessions)]
# Row counts of each partition.
print(len(train_data))
print(len(valid_data))
print(len(test_data))

In [None]:
# Events per session in each partition.
print(train_data['session_id'].value_counts())
print(valid_data['session_id'].value_counts())
print(test_data['session_id'].value_counts())

In [None]:
#  save item sequence  and action sequence
def _session_sequences(frame):
    """Return (item lists, action lists) with one list per session.

    Both outputs follow the same session order (the groupby key order), and
    the events inside a session keep the row order of ``frame``.
    """
    by_session = frame.groupby('session_id')
    items = by_session.item_id.apply(list).tolist()
    actions = by_session.action.apply(list).tolist()
    return items, actions


train_items, train_actions = _session_sequences(train_data)
valid_items, valid_actions = _session_sequences(valid_data)
test_items, test_actions = _session_sequences(test_data)



In [None]:
# filter the last item, generate pos and action_pairs
import torch

def filter_last_item(all_items_list, all_actions):
    """Strip trailing repeats of the final item from every session.

    While the last two items of a session are equal, the last item (and the
    action at the same position) is dropped. Sessions left with fewer than two
    items are discarded.

    Unlike the previous implementation, the input lists are not modified: the
    returned sequences are new lists.

    :param all_items_list: list of per-session item-id lists.
    :param all_actions: list of per-session action lists, aligned with
        ``all_items_list``.
    :return: ``(items, actions)`` — the kept, trimmed sequences.
    """
    kept_items = []
    kept_actions = []
    for idx, item_list in enumerate(all_items_list):
        action_list = all_actions[idx]
        # Find the trimmed length without popping from the caller's list.
        end = len(item_list)
        while end > 1 and item_list[end - 1] == item_list[end - 2]:
            end -= 1
        if end > 1:
            kept_items.append(list(item_list[:end]))
            kept_actions.append(list(action_list[:end]))
    return kept_items, kept_actions

# Trim trailing repeated items in every partition and drop sessions that end up
# with fewer than two items; the names are rebound to the filtered sequences.
train_items, train_actions = filter_last_item(train_items, train_actions)
valid_items, valid_actions = filter_last_item(valid_items, valid_actions)
test_items, test_actions = filter_last_item(test_items, test_actions)
        

In [None]:
# set length = 50
def _encode_and_pad(sequences, item_dict, max_len=50):
    """Re-index items and left-pad every sequence with zeros to ``max_len``.

    Each sequence is first truncated to its last ``max_len`` items. Items are
    then mapped through ``item_dict``; an unseen item receives the next free id
    (ids start at 1, 0 is reserved for padding) and ``item_dict`` is updated in
    place so that later calls share the same vocabulary.

    :param sequences: list of item-id lists (not modified).
    :param item_dict: mutable mapping old item id -> new contiguous id.
    :param max_len: fixed output length of every sequence.
    :return: list of lists, each exactly ``max_len`` long.
    """
    encoded = []
    for item_list in sequences:
        item_list = item_list[-max_len:]
        # setdefault evaluates len(item_dict) + 1 before inserting, so a new
        # item gets the next consecutive id.
        ids = [item_dict.setdefault(item, len(item_dict) + 1) for item in item_list]
        encoded.append([0] * (max_len - len(ids)) + ids)
    return encoded


# The vocabulary is built over train first, then valid, then test, so ids are
# assigned in that order of first appearance.
item_dict = dict()
new_train_items = _encode_and_pad(train_items, item_dict)
new_valid_items = _encode_and_pad(valid_items, item_dict)
new_test_items = _encode_and_pad(test_items, item_dict)
# Next unused item id (kept for compatibility with the original counter).
item_ctr = len(item_dict) + 1

In [None]:
def generate_actions_50(all_actions):
    """Keep at most the last 50 actions of every sequence.

    Sequences longer than 50 are replaced by a sliced copy; shorter ones are
    passed through as the very same list object (no copy).

    :param all_actions: list of per-session action lists.
    :return: list of action lists, each at most 50 long.
    """
    return [
        actions[-50:] if len(actions) > 50 else actions
        for actions in all_actions
    ]

# Truncate every action sequence to its last 50 entries (no padding yet).
new_train_actions = generate_actions_50(train_actions)
new_valid_actions = generate_actions_50(valid_actions)
new_test_actions = generate_actions_50(test_actions)

In [None]:
max_seqlen = 50
def generate_pairs(all_actions):
    """Build position ids and pairwise action codes for every sequence.

    For a sequence of n actions the result holds
      * ``pos``: 1..n, left-padded with zeros to ``max_seqlen``;
      * ``pairs``: an n x n grid whose (i, j) entry is
        ``(actions[i] - 1) * 6 + actions[j]``, left/top-padded with zeros to
        ``max_seqlen`` x ``max_seqlen``.

    Side effect: every list in ``all_actions`` is itself left-padded with zeros
    to ``max_seqlen`` in place. The notebook later converts those same lists to
    a tensor, so this mutation must be kept.

    :param all_actions: list of action lists (mutated in place).
    :return: ``(all_pairs, all_pos)`` as nested Python lists.
    """
    # One shared all-zero row is reused for every padding row.
    zero_row = [0] * max_seqlen
    all_pairs, all_pos = [], []
    for actions in all_actions:
        n_real = len(actions)
        n_pad = max(max_seqlen - n_real, 0)
        left_pad = [0] * n_pad

        pos = left_pad + list(range(1, n_real + 1))
        real_rows = [
            left_pad + [(current - 1) * 6 + other for other in actions]
            for current in actions
        ]
        pairs = [zero_row] * n_pad + real_rows

        # Pad the caller's list in place, only after the pairs were computed.
        actions[:0] = left_pad

        all_pos.append(pos)
        all_pairs.append(pairs)
    return all_pairs, all_pos

# Build pairwise action codes and position ids for each partition.
# NOTE: generate_pairs also left-pads the new_*_actions lists in place.
train_pairs, train_pos = generate_pairs(new_train_actions)
valid_pairs, valid_pos = generate_pairs(new_valid_actions)
test_pairs, test_pos = generate_pairs(new_test_actions)


In [None]:
import torch
data_name = 'Trivago'
# Convert the fixed-length item and action sequences to int64 tensors.
# The *_items / *_actions names are rebound from lists to tensors here.
train_items = torch.tensor(new_train_items, dtype=torch.long)
valid_items = torch.tensor(new_valid_items, dtype=torch.long)
test_items = torch.tensor(new_test_items, dtype=torch.long)
train_actions = torch.tensor(new_train_actions, dtype=torch.long)
valid_actions = torch.tensor(new_valid_actions, dtype=torch.long)
test_actions = torch.tensor(new_test_actions, dtype=torch.long)


In [None]:
def Construct_connection_matrix(seqs, seq_index, seq_len, actions):
    """
    Build, for every input sequence, the node-by-edge connection matrices of
    its session graph together with the per-edge ("macro item") bookkeeping.
    Because of memory constraints no padding is performed here.
    For v1,v2,v3,v2,v1 the in-degree matrix is:
       v1 v2 v3 v2 v1
    v1           1
    v2 1     1
    v3    1
    Note: this representation is used mainly because repeated edges exist.
    :param seqs: 2-D tensor of item ids, one row per sequence (0 is treated as padding)
    :param seq_index: per-sequence tensor of positions into the row of ``seqs``;
        only the first ``seq_len[i]`` entries are used
    :param seq_len: per-sequence number of valid entries in ``seq_index``
    :param actions: per-sequence action tensor, forwarded to
        ``seq_action_to_micro_action``
    :return: tuple of seven lists with one entry per sequence:
        items (sorted unique macro items), alias_inputs (node index of every
        non-zero input item), A (in-matrix stacked on top of out-matrix),
        seq_real_len (number of non-zero input items), macro_items,
        micro_actions, micro_actions_len
    """
    # mask = seqs.gt(0)
    # NOTE: n_node is created here but never filled or returned.
    items, n_node, A, alias_inputs, seq_real_len, macro_items, micro_actions, micro_actions_len = [], [], [], [], [], [], [], []
    num_seqs = seqs.size(0)
    # max_n_node = seqs.size(1)
    item_seq = seqs.cpu().numpy()
    for i in range(num_seqs):
        # All per-sequence inputs are read before the inner loops below, which
        # reuse the name `i` for their own loop variables.
        u_input = item_seq[i]
        u_index = seq_index[i]
        u_seq_len = seq_len[i]
        u_actions = actions[i]
        # Keep only the valid positions (drops the padding of seq_index).
        u_index = u_index[:u_seq_len.long()]
        # seq_index[-1] = seq_index[-2] + 1
        u_macro_items = torch.index_select(seqs[i], dim=0, index=u_index.long()) # gather all macro items
        u_macro_items = u_macro_items.numpy()
        # Graph nodes are the sorted distinct macro items.
        node = np.unique(u_macro_items)
        u_A_length = len(node)
        u_A_edge = len(u_macro_items)

        items.append(torch.LongTensor(node))
        # Rows are nodes, columns are positions in the macro-item sequence.
        u_A_in = np.zeros((u_A_length, u_A_edge))
        u_A_out = np.zeros((u_A_length, u_A_edge))
        for i in np.arange(len(u_macro_items) - 1):
            if u_macro_items[i] == 0:
                continue
            # Transition i goes from macro item i (source u) to i + 1 (target v).
            u = np.where(node == u_macro_items[i])[0][0]
            v = np.where(node == u_macro_items[i + 1])[0][0]
            u_A_in[v][i] = 1
            u_A_out[u][i] = 1
        macro_items.append(torch.LongTensor(u_macro_items))
        # Stack the in-matrix on top of the out-matrix: shape (2 * nodes, edges).
        u_A = np.concatenate([u_A_in, u_A_out])
        A.append(torch.LongTensor(u_A))
        alias_input = []
        for i in u_input:
            if i == 0:
                continue
            else:
                # Position of this input item inside `node`.
                alias_input.append(np.where(node == i)[0][0])
        seq_real_len.append(len(alias_input))
        alias_inputs.append(torch.LongTensor(alias_input))
        u_micro_actions, u_micro_actions_len = seq_action_to_micro_action(u_actions, u_index.long())
        micro_actions.append(u_micro_actions)
        micro_actions_len.append(u_micro_actions_len)
    # B, n_items; B, n_seq; B, n_items, n_edges; B, 1; B, n_edges; B, n_edges, n_micro_actions; B, n_edges, 1
    return items, alias_inputs, A, seq_real_len, macro_items, micro_actions, micro_actions_len

def seq_action_to_micro_action(actions, seq_index):
    """Cut one left-padded action sequence into consecutive chunks.

    The first chunk starts right after the padding (the number of leading
    positions equals the sequence length minus the count of entries > 0). Each
    value in ``seq_index`` is the inclusive end position of one chunk; the next
    chunk starts immediately after it.

    :param actions: 1-D tensor of action codes, 0 meaning padding.
    :param seq_index: iterable of inclusive end positions (1-D long tensor).
    :return: ``(chunks, chunk_lengths)`` — a list of LongTensors and a
        LongTensor holding the length of each chunk.
    """
    total_len = actions.size(0)
    n_real = torch.sum(actions.gt(0).float()).long()
    start = total_len - n_real

    chunks, chunk_sizes = [], []
    for end_index in seq_index:
        stop = end_index + 1
        chunk = actions[start:stop]
        chunks.append(torch.LongTensor(chunk.long()))
        chunk_sizes.append(len(chunk))
        start = stop

    return chunks, torch.LongTensor(chunk_sizes)



In [None]:
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F

class ListDataset(Dataset):
    """Dataset that zips several equally long indexable collections.

    Item ``k`` is the tuple made of the ``k``-th element of every collection.
    """

    def __init__(self, *datalist):
        # All collections must have the same length.
        assert len({len(column) for column in datalist}) <= 1
        self.datalist = datalist

    def __getitem__(self, index):
        """Return the ``index``-th element of every collection as a tuple."""
        return tuple(column[index] for column in self.datalist)

    def __len__(self):
        """Number of samples, taken from the first collection."""
        return len(self.datalist[0])
    
def batch_padding(batch):
    """Collate a list of samples into batch tensors, padding every field.

    Each sample is an 11-tuple: (item_ids, alias_inputs, A, seq_real_len,
    macro_items, micro_actions, micro_len, action, pairs, poses, y).

    :param batch: list of such tuples.
    :return: 11-tuple of batched tensors in the same field order.
    """
    # Pad every field according to the largest sample in the batch.
    # print(len(batch[0]))
    item_ids, alias_inputs, A, seq_real_len, macro_items, micro_actions, micro_len, action, pairs, poses, y = zip(*batch)
    # Node ids are padded with 0, alias indices with -1.
    item_ids = pad_sequence(item_ids, batch_first=True).long()
    alias_inputs = pad_sequence(alias_inputs, batch_first=True, padding_value=-1).long()
    # Despite its name this is the padded length of alias_inputs.
    max_action_len = alias_inputs.size(1)
    batch_max_length = item_ids.size(1)
    macro_items = pad_sequence(macro_items, batch_first=True, padding_value=0) # n_edges
    batch_edge_length = macro_items.size(1)
    new_A = []
    for a in A:
        # The first half of the rows is the in-matrix, the second half the out-matrix.
        node_len, edge_len = int(a.size(0)/2), a.size(1)
        a_in, a_out = a[:node_len, :], a[node_len:, :]
        pad_items_length = batch_max_length - node_len
        pad_edges_length = batch_edge_length - edge_len
        # F.pad order: (last dim left, last dim right, first dim top, first dim bottom).
        pad_tuple = (0, pad_edges_length, 0, pad_items_length)
        a_in, a_out = F.pad(a_in, pad_tuple), F.pad(a_out, pad_tuple)
        # Place in and out side by side: (batch_max_length, 2 * batch_edge_length).
        new_A.append(torch.cat((a_in, a_out), 1).tolist())
    new_micro_actions = []
    for ac in micro_actions:
        # ac is a list of 1-D chunks; pad them to a 2-D (chunks, longest chunk) grid,
        # then grow it to (batch_edge_length, max_action_len).
        new_ac = pad_sequence(ac, batch_first=True, padding_value=0)
        pad_edge_size = batch_edge_length - new_ac.size(0)
        pad_action_size = max_action_len - new_ac.size(1)
        new_ac = F.pad(new_ac, (0, pad_action_size, 0, pad_edge_size))
        new_micro_actions.append(new_ac.tolist())
    micro_len = pad_sequence(micro_len, batch_first=True, padding_value=1) # note: the padded entries are never actually used downstream
    action = pad_sequence(action, batch_first=True)
    poses = pad_sequence(poses, batch_first=True)
    new_pairs = []
    seq_len = action.size(1)
    for pair in pairs:
        # Pad each square pair matrix on the right and bottom up to seq_len x seq_len.
        length = pair.size(0)
        pad_length = seq_len - length
        pad_tuple = (0, pad_length, 0, pad_length)
        new_pair = F.pad(pair, pad_tuple)
        new_pairs.append(new_pair.tolist())
    # pairs are returned as a float tensor (torch.Tensor); the other rebuilt fields are LongTensors.
    return item_ids, alias_inputs, torch.LongTensor(new_A), torch.LongTensor(seq_real_len),torch.LongTensor(macro_items), torch.LongTensor(new_micro_actions), torch.LongTensor(micro_len), action, torch.Tensor(new_pairs), poses, torch.LongTensor(y)



In [None]:
# NOTE(review): 183561 is hard-coded; presumably the number of distinct items
# (+1 for the padding id 0) — verify against len(item_dict).
item_vocab_size = 183561 + 1
max_position = 50

special_item_index = item_vocab_size
# Inputs are all positions except the last; the target is the last item.
train_x_items = train_items[:, :-1]
train_x_action = train_actions[:, :]
# generate_pairs returns plain nested Python lists, which cannot be indexed
# with a tuple such as [:, :, :] (TypeError). The slices were no-ops anyway,
# so the sequences are passed through unchanged; re_data only needs plain
# integer indexing.
train_x_actions = train_pairs
train_y = train_items[:, -1]
train_pos_x = train_pos

test_x_items, valid_x_items = test_items[:, :-1], valid_items[:, :-1]
test_x_action, valid_x_action = test_actions[:, :], valid_actions[:, :]
test_x_actions, valid_x_actions = test_pairs, valid_pairs
test_y, valid_y = test_items[:, -1], valid_items[:, -1]
test_pos_x, valid_pos_x = test_pos, valid_pos

def get_unique_items(items):
    """Find, per row, the end position of every run of equal non-zero items.

    A position ``j`` (excluding the last column) is recorded when the item
    there is non-zero and differs from the item at ``j + 1``. The last column
    index is always recorded.

    :param items:  B, L
    :return: ``(items_index, items_len)`` — a list of B float tensors holding
        the recorded positions, and a LongTensor with the count per row.
    """
    n_rows, n_cols = items.size(0), items.size(1)
    items_index, items_len = [], []
    for row_id in range(n_rows):
        row = items[row_id]
        run_ends = [
            col
            for col in range(n_cols - 1)
            if row[col] != 0 and row[col] != row[col + 1]
        ]
        run_ends.append(n_cols - 1)
        items_index.append(torch.Tensor(run_ends))
        items_len.append(len(run_ends))
    return items_index, torch.LongTensor(items_len)

In [None]:
def re_data(bh_seqs, pos_seqs, pair_seqs):
    """Strip padding from every sample and move its last step to the front.

    For each sample, positions whose action is 0 are dropped from the action,
    position and pair sequences; zero entries are also removed from each kept
    pair row. The cleaned sequences are then passed through ``reverse_last``.

    :param bh_seqs: per-sample padded action sequences.
    :param pos_seqs: per-sample padded position sequences.
    :param pair_seqs: per-sample padded pair matrices (rows aligned with actions).
    :return: three lists of ``torch.Tensor`` (actions, positions, pairs), one
        entry per sample.
    """
    seqs_actions, seqs_poses, seqs_pairs = [], [], []
    for sample_id in range(len(bh_seqs)):
        bh_seq = bh_seqs[sample_id]
        pos_seq = pos_seqs[sample_id]
        pair_seq = pair_seqs[sample_id]

        kept_actions, kept_pos, kept_pairs = [], [], []
        for step in range(len(bh_seq)):
            action, pos, pair = bh_seq[step], pos_seq[step], pair_seq[step]
            if action == 0:
                # padding position
                continue
            kept_actions.append(action)
            kept_pos.append(pos)
            kept_pairs.append([code for code in pair if not code == 0])

        kept_actions, kept_pos, kept_pairs = reverse_last(kept_actions, kept_pos, kept_pairs)
        seqs_actions.append(torch.Tensor(kept_actions))
        seqs_poses.append(torch.Tensor(kept_pos))
        seqs_pairs.append(torch.Tensor(kept_pairs))
    return seqs_actions, seqs_poses, seqs_pairs

def reverse_last(one_actions, one_pos, one_pairs):
    """Rotate the last step of a sample to the front.

    The last action and last position become the first elements. In the pair
    matrix the last column of every row is moved to the front, and then the
    last row is moved to the top.

    The arguments are no longer modified (the previous version popped elements
    from them), and empty sequences are returned as empty lists instead of
    raising IndexError.

    :param one_actions: list of actions of one sample.
    :param one_pos: list of positions of one sample.
    :param one_pairs: list of pair rows (list of lists) of one sample.
    :return: ``(new_actions, new_pos, new_pairs)`` as new lists.
    """
    def _rotate(seq):
        # [a, b, c] -> [c, a, b]; an empty sequence stays empty.
        seq = list(seq)
        return seq[-1:] + seq[:-1]

    new_actions = _rotate(one_actions)
    new_pos = _rotate(one_pos)
    new_pairs = _rotate([_rotate(row) for row in one_pairs])
    return new_actions, new_pos, new_pairs

In [None]:
# Compute, for every input sequence, the positions returned by get_unique_items
# and how many there are.
train_items_index, train_items_len = get_unique_items(train_x_items)
test_items_index, test_items_len = get_unique_items(test_x_items)
valid_items_index, valid_items_len = get_unique_items(valid_x_items)
# Pad the ragged index lists with -1 so each partition becomes one 2-D tensor.
train_items_index = pad_sequence(train_items_index, batch_first=True,padding_value=-1)
test_items_index = pad_sequence(test_items_index, batch_first=True, padding_value=-1)
valid_items_index = pad_sequence(valid_items_index, batch_first=True, padding_value=-1)
# Persist indices and lengths under data/<data_name>/.
torch.save(train_items_index, 'data/%s/new_train_items_index.pt' % data_name)
torch.save(train_items_len,  'data/%s/new_train_items_len.pt' % data_name)
torch.save(test_items_index,  'data/%s/new_test_items_index.pt' % data_name)
torch.save(test_items_len,  'data/%s/new_test_items_len.pt' % data_name)
torch.save(valid_items_index, 'data/%s/new_valid_items_index.pt' % data_name)
torch.save(valid_items_len, 'data/%s/new_valid_items_len.pt' % data_name)

In [None]:
# Build the per-session graph inputs and the padding-free action/position/pair
# sequences for each partition.
# NOTE: train_items / valid_items / test_items, *_pos and *_pairs are rebound
# here to the new per-session structures, so this cell is not idempotent.
train_items, train_alias_inputs, train_A, train_rel_len, train_macro_items, train_micro_actions, train_micro_len = Construct_connection_matrix(train_x_items, train_items_index, train_items_len, train_x_action)
train_bh, train_pos, train_pairs = re_data(train_x_action, train_pos_x, train_x_actions)

# NOTE: 'valid_micro_acitons' is misspelled, but the dataset cell below uses the same name.
valid_items, valid_alias_inputs, valid_A, valid_rel_len, valid_macro_items, valid_micro_acitons, valid_micro_len = Construct_connection_matrix(valid_x_items, valid_items_index, valid_items_len, valid_x_action)
val_bh, val_pos, val_pairs = re_data(valid_x_action, valid_pos_x, valid_x_actions)

test_items, test_alias_inputs, test_A, test_rel_len, test_macro_items, test_micro_actions, test_micro_len = Construct_connection_matrix(test_x_items, test_items_index, test_items_len, test_x_action)
test_bh, test_pos, test_pairs = re_data(test_x_action, test_pos_x, test_x_actions)



In [None]:
# Bundle the eleven per-sample fields of each partition into a ListDataset.
# The field order must match the unpacking order in batch_padding.
train_sets = ListDataset(train_items, train_alias_inputs, train_A, train_rel_len, train_macro_items, train_micro_actions, train_micro_len, train_bh, train_pairs, train_pos, train_y.long())

valid_sets = ListDataset(valid_items, valid_alias_inputs, valid_A, valid_rel_len,valid_macro_items, valid_micro_acitons, valid_micro_len, val_bh, val_pairs, val_pos, valid_y.long())

test_sets = ListDataset(test_items, test_alias_inputs, test_A, test_rel_len, test_macro_items, test_micro_actions, test_micro_len, test_bh, test_pairs, test_pos,test_y.long())



In [None]:
# Serialize the three ListDataset objects under data/<data_name>/.
torch.save(train_sets, 'data/%s/train_sets_EMBSR.pt'% data_name)
torch.save(valid_sets, 'data/%s/valid_sets_EMBSR.pt'% data_name)
torch.save(test_sets, 'data/%s/test_sets_EMBSR.pt'% data_name)

In [3]:
import torch
data_name = 'Trivago'
# Reload the saved datasets. The files hold whole ListDataset objects, so the
# ListDataset class must already be defined in the kernel when this runs.
# SECURITY: torch.load unpickles arbitrary objects — only load files you created.
# NOTE(review): recent PyTorch releases appear to default to weights_only=True,
# which would reject these objects; confirm for the installed version.
train_sets = torch.load('data/%s/train_sets_EMBSR.pt'% data_name)
valid_sets = torch.load('data/%s/valid_sets_EMBSR.pt'% data_name)
test_sets = torch.load('data/%s/test_sets_EMBSR.pt'% data_name)
 