In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import os

## Preprocessing

In [2]:
## set parameters
#ROOT = 'PATH/TO/data/processed/'
ROOT = '/home/kddlab/swyoo/data/'
PATH_TO_TRAIN = ROOT + 'train_data.csv'
PATH_TO_TEST = ROOT + 'test_data.csv'
checkpoint_dir = './checkpoint'
if not os.path.exists(checkpoint_dir): os.mkdir(checkpoint_dir)
        
layers = 1
rnn_size = 100
batch_size = 50
drop_keep_prob = 0.7

n_epochs = 10
learning_rate = 0.001
decay = 0.96
decay_steps = 1e4
grad_cap = 0
print_step = 1e3

In [3]:
## load data
data = pd.read_csv(PATH_TO_TRAIN)
valid = pd.read_csv(PATH_TO_TEST)

In [4]:
data.columns = ['SessionId','ItemId','timestamp'] 
valid.columns = ['SessionId','ItemId','timestamp']

In [5]:
data = data.sort_values(['SessionId', 'timestamp'], ascending=[True,True])
valid = valid.sort_values(['SessionId', 'timestamp'], ascending=[True,True])

In [6]:
len(data)

2454710

In [7]:
## add item index 
itemids = data['ItemId'].unique()
n_items = len(itemids)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids).to_dict()
%time data['ItemIdx'] = data['ItemId'].map(lambda x: itemidmap[x])
data[:5]

CPU times: user 824 ms, sys: 23.1 ms, total: 847 ms
Wall time: 835 ms


Unnamed: 0,SessionId,ItemId,timestamp,ItemIdx
1214492,0000c0fb51e11,673981,1541240881,0
1214493,0000c0fb51e11,673981,1541240881,0
1214494,0000c0fb51e11,673981,1541240887,0
1214495,0000c0fb51e11,10369176,1541240932,1
1214496,0000c0fb51e11,10369176,1541240932,1


In [8]:
## offset sessions
offset_sessions = np.zeros(data['SessionId'].nunique()+1, dtype=np.int32)
offset_sessions[1:] = data.groupby('SessionId').size().cumsum()
offset_sessions[:5]

array([ 0,  5, 15, 20, 32], dtype=int32)

## Prepare Model

In [9]:
## placeholder & learning rate
X = tf.placeholder(tf.int32, [batch_size], name='input')
Y = tf.placeholder(tf.int32, [batch_size], name='output')
States = [tf.placeholder(tf.float32, [batch_size, rnn_size], name='rnn_state') for _ in range(layers)]
global_step = tf.Variable(0, name='global_step', trainable=False)
lr = tf.maximum(1e-5,tf.train.exponential_decay(
    learning_rate, global_step, decay_steps, decay, staircase=True
)) 

## gru weigths
with tf.variable_scope('gru_layer', reuse=tf.AUTO_REUSE):
    initializer = tf.glorot_uniform_initializer()
    embedding = tf.get_variable('embedding', [n_items, rnn_size], initializer=initializer)
    softmax_W = tf.get_variable('softmax_w', [n_items, rnn_size], initializer=initializer)
    softmax_b = tf.get_variable('softmax_b', [n_items], initializer=tf.zeros_initializer())
    
## gru_cell
with tf.variable_scope('gru_cell', reuse=tf.AUTO_REUSE):
    cell = tf.nn.rnn_cell.GRUCell(rnn_size, activation=tf.nn.tanh)
    drop_cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=drop_keep_prob)
    stacked_cell = tf.nn.rnn_cell.MultiRNNCell([drop_cell] * layers)

## feedforward gur_cell
inputs = tf.nn.embedding_lookup(embedding, X)
output, state_ = stacked_cell(inputs, tuple(States))
final_state = state_

### for training
sampled_W = tf.nn.embedding_lookup(softmax_W, Y)
sampled_b = tf.nn.embedding_lookup(softmax_b, Y)
logits = tf.matmul(output, sampled_W, transpose_b=True) + sampled_b
### cross-entropy loss
# yhat = tf.nn.softmax(logits)
# cost = tf.reduce_mean(-tf.log(tf.diag_part(yhat)+1e-24))
### bpr loss
yhat = logits
yhatT = tf.transpose(yhat)
cost = tf.reduce_mean(-tf.log(tf.nn.sigmoid(tf.diag_part(yhat)-yhatT)))
### top1 loss
# yhat = logits
# yhatT = tf.transpose(yhat)
# term1 = tf.reduce_mean(tf.nn.sigmoid(-tf.diag_part(yhat)+yhatT)+tf.nn.sigmoid(yhatT**2), axis=0)
# term2 = tf.nn.sigmoid(tf.diag_part(yhat)**2) / batch_size
# cost = tf.reduce_mean(term1 - term2)

### for prediction
logits_all = tf.matmul(output, softmax_W, transpose_b=True) + softmax_b
yhat_all = tf.nn.softmax(logits_all)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.GRUCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


#### Ranking Loss 

BPR loss <br>
$L_s = - \frac{1}{N_s} \cdot \sum_{j=1}^{N_s} \text{log}(\sigma(\hat{r}_{s,i} -\hat{r}_{s,j}))$ <br>
$N_s$: sample size，$\hat{r}_{s,i}$: yhat of positive sample， $\hat{r}_{s,j}$: yhat of positive sample

TOP loss <br>
$L_s = \frac{1}{N_s} \cdot \sum_{j=1}^{N_s} (\sigma(\hat{r}_{s,j} - \hat{r}_{s,i})) +\sigma(\hat{r^2_j})$ <br>

In [10]:
## optimize
optimizer = tf.train.AdamOptimizer(lr)
tvars = tf.trainable_variables()
gvs = optimizer.compute_gradients(cost, tvars)
if grad_cap > 0:
    capped_gvs = [(tf.clip_by_norm(grad, grad_cap), var) for grad, var in gvs]
else:
    capped_gvs = gvs 
train_op = optimizer.apply_gradients(capped_gvs, global_step=global_step)

Instructions for updating:
Use tf.cast instead.


## Training

In [11]:
## session start
sess = tf.Session()
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(tf.global_variables())

In [12]:
## training
tic = time.time()
for epoch in range(n_epochs):
    epoch_cost = []
    state = [np.zeros([batch_size, rnn_size], dtype=np.float32) for _ in range(layers)]
    iters = np.arange(batch_size)
    maxiter = iters.max()
    
    start = offset_sessions[iters]
    end = offset_sessions[iters+1]
    
    finished = False
    while not finished:
        minlen = (end-start).min()
        out_idx = data.ItemIdx.values[start]
        for i in range(minlen-1):
            in_idx = out_idx
            out_idx = data.ItemIdx.values[start+i+1]
            # prepare inputs, targeted outputs and hidden states
            fetches = [cost, final_state, global_step, lr, train_op]
            feed_dict = {X: in_idx, Y: out_idx}
            for j in range(layers): 
                feed_dict[States[j]] = state[j]
            
            cost_, state, step, lr_, _ = sess.run(fetches, feed_dict)
            epoch_cost.append(cost_)
                
            if step == 1 or step % print_step == 0:
                avgc = np.mean(epoch_cost)
                print('Epoch {}\tStep {}\tlr: {:.5f}\tloss: {:.4f}\tElapsed: {:.1f}'.
                      format(epoch, step, lr_, avgc, time.time()-tic))

        start = start+minlen-1
        mask = np.arange(len(iters))[(end-start)<=1]
        for idx in mask:
            maxiter += 1
            if maxiter >= len(offset_sessions)-1:
                finished = True
                break
            iters[idx] = maxiter
            start[idx] = offset_sessions[maxiter]
            end[idx] = offset_sessions[maxiter+1]
        if len(mask):
            for i in range(layers):
                state[i][mask] = 0
        
    avgc = np.mean(epoch_cost)
    if np.isnan(avgc):
        print('Epoch {}: Nan error!'.format(epoch, avgc))
        break
    saver.save(sess, '{}/gru-model'.format(checkpoint_dir), global_step=epoch)
print("1 epoch elapsed time:", time.time() - tic)

Epoch 0	Step 1	lr: 0.00100	loss: 0.6932	Elapsed: 0.4
Epoch 0	Step 1000	lr: 0.00100	loss: 0.4175	Elapsed: 16.8
Epoch 0	Step 2000	lr: 0.00100	loss: 0.3467	Elapsed: 33.3
Epoch 0	Step 3000	lr: 0.00100	loss: 0.3056	Elapsed: 49.7
Epoch 0	Step 4000	lr: 0.00100	loss: 0.2767	Elapsed: 66.2
Epoch 0	Step 5000	lr: 0.00100	loss: 0.2513	Elapsed: 82.6
Epoch 0	Step 6000	lr: 0.00100	loss: 0.2314	Elapsed: 99.0
Epoch 0	Step 7000	lr: 0.00100	loss: 0.2144	Elapsed: 115.5
Epoch 0	Step 8000	lr: 0.00100	loss: 0.2008	Elapsed: 131.9
Epoch 0	Step 9000	lr: 0.00100	loss: 0.1888	Elapsed: 148.4
Epoch 0	Step 10000	lr: 0.00100	loss: 0.1782	Elapsed: 164.8
Epoch 0	Step 11000	lr: 0.00096	loss: 0.1691	Elapsed: 181.2
Epoch 0	Step 12000	lr: 0.00096	loss: 0.1609	Elapsed: 197.6
Epoch 0	Step 13000	lr: 0.00096	loss: 0.1535	Elapsed: 214.0
Epoch 0	Step 14000	lr: 0.00096	loss: 0.1468	Elapsed: 230.5
Epoch 0	Step 15000	lr: 0.00096	loss: 0.1409	Elapsed: 246.9
Epoch 0	Step 16000	lr: 0.00096	loss: 0.1353	Elapsed: 263.4
Epoch 0	Step 17000

In [13]:
sess.close()

## Prediction & Evaluation

In [14]:
## parameters
cut_off = 20     # @20
batch_size = 50

In [15]:
## session restore
### 마지막(최신) 학습 checkpoint 정보를 restore한다.
sess = tf.Session()
saver = tf.train.Saver(tf.global_variables())
ckpt = tf.train.latest_checkpoint(checkpoint_dir)
saver.restore(sess, ckpt)

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ./checkpoint/gru-model-0


In [16]:
def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        
        return False
    
def f1(x):
    if not isfloat(x):
        return  
    elif int(float(x)) not in itemidmap:
        return
    else:
        return int(itemidmap[int(float(x))])

In [17]:
valid['ItemIdx'] = valid['ItemId'].map(f1)
valid = valid.dropna()
valid.ItemIdx = valid.ItemIdx.astype('int') 
display(valid[:2])
print(len(valid))

Unnamed: 0,SessionId,ItemId,timestamp,ItemIdx
700928,000138ab4f789,1211616,1541444221,4058
700929,000138ab4f789,2423456,1541444452,8334


1067819


In [18]:
## valdation data set
valid['ItemIdx'] = valid['ItemId'].map(lambda x: itemidmap[x])
valid[:5]

Unnamed: 0,SessionId,ItemId,timestamp,ItemIdx
700928,000138ab4f789,1211616,1541444221,4058
700929,000138ab4f789,2423456,1541444452,8334
700930,000138ab4f789,2423456,1541444609,8334
700931,000138ab4f789,152444,1541446323,4099
700932,000138ab4f789,2423456,1541446487,8334


In [19]:
## valid offset sessions
### 위 학습과 동일하게 각 세션의 시작점의 index list를 만든다.
offset_sessions = np.zeros(valid['SessionId'].nunique()+1, dtype=np.int32)
offset_sessions[1:] = valid.groupby('SessionId').size().cumsum()
offset_sessions[:5]

array([  0,  11,  17, 197, 360], dtype=int32)

In [20]:
## init prediction
### 예측 세션의 배치 사이즈 보다 작을 경우 배치 사이즈를 조정한다.
if len(offset_sessions) - 1 < batch_size:
    batch_size = len(offset_sessions) - 1
### training step과 동일
iters = np.arange(batch_size).astype(np.int32)
maxiter = iters.max()
start = offset_sessions[iters]
end = offset_sessions[iters+1]
in_idx = np.zeros(batch_size, dtype=np.int32)
predict_state = [np.zeros([batch_size, rnn_size], dtype=np.float32) for _ in range(layers)]

In [21]:
## prediction & evaluation
evalutation_point_count = 0
mrr, recall = 0.0, 0.0
tic = time.time()
while True:
    ### iters는 batch placeholder로 0보다 큰 즉, 마지막 세션까지는 모든 위치를 켜두고
    ### 아래에서 session 데이터가 다 소진되면 해당 위치를 -1로 할당할 것이다.
    ### valid_mask가 0이 되면 즉 모든 위치가 꺼지면 학습을 종료한다.
    valid_mask = iters >= 0
    if valid_mask.sum() == 0:
        print("break at endpoint", evalutation_point_count)
        break
        
    start_valid = start[valid_mask]
    minlen = (end[valid_mask]-start_valid).min()
    in_idx[valid_mask] = valid.ItemIdx.values[start_valid]
    
    for i in range(minlen-1):
        out_idx = valid.ItemIdx.values[start_valid+i+1]
        ## --- prediction --- ##
        fetches = [yhat_all, final_state]
        feed_dict = {X: in_idx}
        for j in range(layers): 
            feed_dict[States[j]] = predict_state[j]
        preds, predict_state = sess.run(fetches, feed_dict)
        preds = pd.DataFrame(data=np.asarray(preds).T)
        preds.fillna(0, inplace=True) ### preds shape: (item_size, batch_size)
        ## --- evaluation --- ##
        in_idx[valid_mask] = out_idx
        ranks = (preds.values.T[valid_mask].T > 
                 np.diag(preds.loc[in_idx].values)[valid_mask]).sum(axis=0) + 1
        rank_ok = ranks < cut_off
        recall += rank_ok.sum()
        mrr += (1.0 / ranks[rank_ok]).sum()
        evalutation_point_count += len(ranks)
        
    start = start+minlen-1
    mask = np.arange(len(iters))[(valid_mask) & (end-start<=1)]
    
    for idx in mask:
        maxiter += 1
        ## 더 이상 할당할 세션이 없으면 해당 위치에 -1을 할당하여 끈다.
        if maxiter >= len(offset_sessions)-1:
            iters[idx] = -1
        else:
            iters[idx] = maxiter
            start[idx] = offset_sessions[maxiter]
            end[idx] = offset_sessions[maxiter+1]
            
    if len(mask):
        for i in range(layers):
            predict_state[i][mask] = 0

recall = recall/evalutation_point_count
mrr = mrr/evalutation_point_count
print("recall: ", recall, "mrr:", mrr, "elapsed time:", time.time()-tic)

break at endpoint 1025935
recall:  0.9218654203238996 mrr: 0.7211920217467956 elapsed time: 131.9587483406067


In [22]:
### evalute that all dataset is used 
print(evalutation_point_count)
print(sum(valid.groupby('SessionId').size() - 1))

1025935
1025935


In [23]:
fetches = [yhat_all, final_state]
feed_dict = {X: in_idx}
for j in range(layers): 
            feed_dict[States[j]] = predict_state[j]        
preds = sess.run(yhat_all, feed_dict)
top_25_idx = np.argsort(preds, axis=1)[...,-25:]
#print(top_25_idx.shape)
clicked_it_idx = in_idx[-1]
submission = np.flip(top_25_idx[-1]) # decending order of impression
print("query: ")
print("clicked_item_id: ", clicked_it_idx)
print("submission: ", submission)
print("score: ", [preds[-1][i] for i in submission])

query: 
clicked_item_id:  2367
submission:  [5743 2367 7421 6036 8587 5086 6253 5870 8900    8 7812 2362 2248    9
 6148 9270 3082 8428 6747 2788 3617 8928 2063 5700 9105]
score:  [0.06477986, 0.05402035, 0.0250135, 0.0190592, 0.0145876305, 0.012599608, 0.012306675, 0.012305204, 0.011211358, 0.010889083, 0.007889688, 0.00744067, 0.0071209683, 0.006755861, 0.00675532, 0.0067508984, 0.006673782, 0.006638033, 0.006569237, 0.006359222, 0.006289683, 0.0061950884, 0.0060110167, 0.005997303, 0.005979884]


In [24]:
sess.close()