In [1]:
import time
import csv
import pickle
import numpy as np
import operator
import tensorflow as tf

In [116]:
# Load the raw query log into per-session dictionaries. Every CSV row is
# keyed by its `queryId` column (not by `sessionId`); a later row with the
# same id overwrites the earlier one.
# newline='' leaves newline handling to the csv module, as its documentation
# recommends for files handed to a reader.
with open("train-queries.csv", "r", newline='') as f:
    reader = csv.DictReader(f, delimiter=';')
    sess_clicks = {}   # queryId -> list of item-id strings
    sess_date = {}     # queryId -> event date as local-time seconds since the epoch
    sess_tokens = {}   # queryId -> list of search-token strings
    sess_divider = {}  # queryId -> raw value of the `is.test` column
    ctr = 0            # number of rows read so far
    # Containers for the train/test split performed in a later cell.
    trainset = []
    testset = []
    for data in reader:
        sessid = data['queryId']
        # `items` and `searchstring.tokens` are comma-separated strings.
        sess_clicks[sessid] = data['items'].split(',')
        sess_tokens[sessid] = data['searchstring.tokens'].split(',')
        # The event date only has day resolution ('%Y-%m-%d').
        curdate = data['eventdate']
        sess_date[sessid] = time.mktime(time.strptime(curdate, '%Y-%m-%d'))
        sess_divider[sessid] = data['is.test']
        ctr += 1
        if ctr % 100000 == 0:
            print ('Loaded', ctr)
    # Report the row counter. The previous version printed `sessid`, i.e. the
    # id of the last row, which is not a count and is undefined for an empty
    # file.
    print('total loaded sessions are:', ctr)

Loaded 100000
Loaded 200000
Loaded 300000
Loaded 400000
Loaded 500000
Loaded 600000
Loaded 700000
Loaded 800000
Loaded 900000
total loaded sessions are: 980503


In [117]:
# Drop every session whose click list holds exactly one entry, keeping the
# four per-session dictionaries in step with each other.
kept_ids = [sid for sid, clicks in sess_clicks.items() if len(clicks) != 1]
store_clicks = {sid: sess_clicks[sid] for sid in kept_ids}
store_date = {sid: sess_date[sid] for sid in kept_ids}
store_tokens = {sid: sess_tokens[sid] for sid in kept_ids}
store_divider = {sid: sess_divider[sid] for sid in kept_ids}
# Rebind the working names to the filtered copies.
sess_clicks, sess_date, sess_tokens, sess_divider = (
    store_clicks, store_date, store_tokens, store_divider)

In [118]:
# Tally, over all remaining sessions, how many times each item id occurs
# (repeats inside one session are counted individually).
iid_counts = {}
for seq in sess_clicks.values():
    for iid in seq:
        iid_counts[iid] = iid_counts.get(iid, 0) + 1

In [119]:
# (item id, count) pairs ordered from the rarest item to the most frequent
sorted_counts = sorted(iid_counts.items(), key=operator.itemgetter(1))

# Remove items seen fewer than 5 times overall, then keep only the sessions
# that still contain at least 2 items.
store_clicks = {}
store_date = {}
store_tokens = {}
store_divider = {}
for s, curseq in sess_clicks.items():
    filseq = [i for i in curseq if iid_counts[i] >= 5]
    if len(filseq) < 2:
        # Too short after filtering: the session is dropped from all four dicts
        continue
    store_clicks[s] = filseq
    store_date[s] = sess_date[s]
    store_tokens[s] = sess_tokens[s]
    store_divider[s] = sess_divider[s]
# Rebind the working dictionaries to the filtered versions
sess_clicks = store_clicks
sess_date = store_date
sess_tokens = store_tokens
sess_divider = store_divider

In [120]:
# Split session ids into training and test sets according to sess_divider.
# Both lists are rebuilt from scratch here so that running this cell again
# gives the same result; appending to lists created in the loading cell
# duplicated every id on each re-run.
trainset = []
testset = []
for key, istest in sess_divider.items():
    # Only the exact string 'FALSE' marks a training session; any other
    # value sends the session to the test set.
    if istest == 'FALSE':
        trainset.append(key)
    else:
        testset.append(key)

In [121]:
# Per-split views of the session dictionaries, keyed by session id and
# ordered as in trainset / testset.
train_items = {key: sess_clicks[key] for key in trainset}
train_tokens = {key: sess_tokens[key] for key in trainset}
train_date = {key: sess_date[key] for key in trainset}

test_items = {key: sess_clicks[key] for key in testset}
test_tokens = {key: sess_tokens[key] for key in testset}
test_date = {key: sess_date[key] for key in testset}

In [122]:
# Encode training sessions as integer sequences. Item ids and token ids are
# handed out from 1 upward, in order of first appearance.
item_dict = {}    # raw item id -> integer id
token_dict = {}   # raw token   -> integer id
item_ctr = 1
token_ctr = 1
train_seqs = []
train_dates = []
train_tks = []
for s, date in train_date.items():
    seq = train_items[s]
    tk_seq = train_tokens[s]

    outseq = []
    for i in seq:
        if i not in item_dict:
            # First sighting: register the item under the next free id
            item_dict[i] = item_ctr
            item_ctr += 1
        outseq.append(item_dict[i])

    tk_outseq = []
    for i in tk_seq:
        if i not in token_dict:
            # First sighting: register the token under the next free id
            token_dict[i] = token_ctr
            token_ctr += 1
        tk_outseq.append(token_dict[i])

    train_seqs.append(outseq)
    train_dates.append(date)
    train_tks.append(tk_outseq)
print("\ntotal number of sessions in the training set is:", len(train_date))


total number of sessions in the training set is: 633454


In [123]:
# Convert test sessions to integer sequences using the vocabularies built
# from the training set. Items and tokens that never occurred in training
# have no id and are left out.
store_tokens = {}   # raw tokens of the test sessions that are kept
test_seqs = []
test_dates = []
test_tks = []
for s, date in test_date.items():
    outseq = [item_dict[i] for i in test_items[s] if i in item_dict]
    if len(outseq) < 2:
        # Fewer than two known items: the session is skipped entirely
        continue
    store_tokens[s] = test_tokens[s]
    # Encode this session's own tokens into a fresh list. The previous
    # version ran a second loop that reused the token list of the last
    # session visited and kept appending to one shared list, so every entry
    # of test_tks was the same, ever-growing object.
    # NOTE: the list is empty when none of the session's tokens occurred in
    # training.
    tk_outseq = [token_dict[i] for i in test_tokens[s] if i in token_dict]
    test_seqs.append(outseq)
    test_dates.append(date)
    test_tks.append(tk_outseq)
# Report the sessions that were actually kept, not the unfiltered candidates
print("total number of sessions in the test set is:", len(test_seqs))
print("\ntotal number of different items in the training set is:", len(item_dict))

total number of sessions in the test set is: 285892

total number of different items in the training set is: 123422


In [124]:
# The three test-side lists are meant to stay aligned entry by entry;
# print each length so they can be compared by eye.
for _aligned in (test_seqs, test_dates, test_tks):
    print(len(_aligned))

285892
285892
285892


In [None]:
#Data augmentation and labels setting
def process_seqs(iseqs, idates, itokens):
    """Expand each sequence into (prefix, next-item) training examples.

    A sequence of length n produces n - 1 examples: for k = 1 .. n - 1 the
    input is ``seq[:-k]`` and the label is ``seq[-k]`` (longest prefix
    first). The sequence's date and token list are repeated once per
    example.

    Args:
        iseqs: list of item-id sequences.
        idates: dates, paired with ``iseqs`` position by position.
        itokens: token lists, matched to ``iseqs`` by index.

    Returns:
        Tuple ``(out_seqs, out_dates, out_tokens, labs)`` of lists.
    """
    out_seqs, out_dates, labs = [], [], []
    for seq, date in zip(iseqs, idates):
        cuts = range(1, len(seq))
        labs.extend(seq[-k] for k in cuts)
        out_seqs.extend(seq[:-k] for k in cuts)
        out_dates.extend(date for _ in cuts)

    # Tokens are expanded in a separate pass driven by itokens, so the number
    # of token entries follows len(itokens), not the zip above.
    out_tokens = []
    for pos, toks in enumerate(itokens):
        repeats = len(iseqs[pos]) - 1
        out_tokens.extend([toks] * repeats)
    return out_seqs, out_dates, out_tokens, labs

# Expand both splits into (prefix, label) examples
tr_seqs, tr_dates, tr_tokens, tr_labs = process_seqs(train_seqs, train_dates, train_tks)
te_seqs, te_dates, te_tokens, te_labs = process_seqs(test_seqs, test_dates, test_tks)

# (inputs, labels) pairs for each split
train, test = (tr_seqs, tr_labs), (te_seqs, te_labs)

print('\nAfter augmentation, total number of sessions in the training set is: ', len(tr_seqs))
print('\nAfter augmentation, total number of sessions in the test set is: ', len(te_seqs))
print('Done.')

In [None]:
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """Split the indices ``0 .. n-1`` into consecutive minibatches.

    Args:
        n: number of samples.
        minibatch_size: number of indices per minibatch.
        shuffle: when true, permute the indices with ``np.random.shuffle``
            before slicing them into minibatches.

    Returns:
        An iterator of ``(minibatch_number, index_array)`` pairs. The index
        arrays are int32; the last one is shorter when ``n`` is not a
        multiple of ``minibatch_size``.
    """
    order = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(order)

    # All complete minibatches first ...
    batches = [
        order[k * minibatch_size:(k + 1) * minibatch_size]
        for k in range(n // minibatch_size)
    ]

    # ... then whatever is left over becomes one final, shorter minibatch
    consumed = len(batches) * minibatch_size
    if consumed != n:
        batches.append(order[consumed:])

    return enumerate(batches)

In [None]:
def divide_minibatches(seqs, name, batch_size=512):
    """Pad sequences into a matrix and split the row indices into minibatches.

    Each sequence is right-aligned in its row and padded with zeros on the
    left up to the length of the longest sequence. Zero-length sequences
    produce an all-zero row.

    Args:
        seqs: list of integer sequences.
        name: label used in the printed summary line.
        batch_size: number of rows per minibatch (default 512).

    Returns:
        Tuple ``(X, X_mask, bb)``:
        X: int array of shape (len(seqs), maxlen) holding the padded
            sequences.
        X_mask: float array of the same shape, 1 where X is non-zero and 0
            elsewhere.
        bb: list of index arrays, one per minibatch, in order and
            unshuffled. The last minibatch produced by
            ``get_minibatches_idx`` is always dropped.
    """
    lengths = [len(s) for s in seqs]
    n_samples = len(seqs)
    # default=0 keeps an empty `seqs` from raising
    maxlen = max(lengths, default=0)

    padded = np.zeros((n_samples, maxlen))
    for row, s in enumerate(seqs):
        # A zero length must be skipped: the slice [-0:] would select the
        # whole row and the assignment of an empty sequence would fail.
        if lengths[row]:
            padded[row, -lengths[row]:] = s

    X_mask = (padded != 0).astype(padded.dtype)
    X = padded.astype(int)

    bb = [batch for _, batch in get_minibatches_idx(n_samples, batch_size, shuffle=False)]
    # Drop the last minibatch by position. list.remove(bb[-1]) compares the
    # stored numpy arrays with ==, which raises for multi-element arrays.
    # NOTE(review): the last batch is dropped even when it is a full one -
    # presumably only a partial batch was meant to be discarded; confirm.
    if bb:
        bb.pop()
    print('\nNumber of minibatches of ' + name + ' is: ', len(bb))
    return X, X_mask, bb


In [None]:
# Labels as arrays
labs = np.array(tr_labs)
labs_test = np.array(te_labs)

# Padded matrices, masks and minibatch index lists for the item sequences ...
X, X_mask, bb = divide_minibatches(tr_seqs, 'training set')
X_test, X_test_mask, bb_test = divide_minibatches(te_seqs, 'test set')

# ... and for the token sequences
X_tr_tokens, X_tr_tokens_mask, bb_tr_tokens = divide_minibatches(tr_tokens, 'tr_tokens set')
X_te_tokens, X_te_tokens_mask, bb_te_tokens = divide_minibatches(te_tokens, 'te_tokens set')

In [3]:
# Scratch example: a small two-entry dictionary
A = dict(a=1, b=2)
print(A)

{'a': 1, 'b': 2}


In [4]:
# Scratch example: iterating a dict yields its keys in insertion order
B = [key for key in A]
print(B)

['a', 'b']


In [5]:
import random

In [7]:
# Scratch example: random.shuffle reorders B in place and returns None.
# No seed is set in the visible cells, so the resulting order changes from
# run to run.
random.shuffle(B)