# User Variables

In [1]:
# Data
SESSIONS_SAMPLE = None  # None keeps every session; an integer keeps only the first N session ids
MIN_SESSIONS_SIZE = 4  # sessions with fewer rows are dropped; also bounds the prefixes built for SR-GNN
MIN_ITEM_OCCURENCE = 1  # sessions containing an item seen fewer times than this are dropped
DATASET = "dataset/case_study_sessions.csv"  # CSV read into df_sessions

# Predictive Models
TRAIN_RATIO = 0.8  # fraction of session ids sampled into the training set
MIN_TEST_SESSIONS_SIZE = 4  # the STAMP / NARM evaluation loops start predicting at position MIN_TEST_SESSIONS_SIZE-2
SRGNN_EPOCHS = 20  # passed to the SR-GNN script as --epoch
TOP_K = 1  # passed to the SR-GNN script as --topk

# Encoding
MIN_ATTENTION = 0.25  # NOTE(review): not referenced in the visible part of this notebook

# SR-GNN
OUTPUT_FILE = "tmp/sr-gnn_predictions.csv"  # written by the SR-GNN script (--output_file), read back afterwards
SRGNN_PREDICTION_FILE = "tmp/docomo_sr-gnn_predictions.csv"  # decoded SR-GNN predictions exported by this notebook

# STAMP
STAMP_EPOCHS = 20  # n_epochs given to the STAMP model
STAMP_PREDICTION_FILE = "tmp/docomo_stamp_predictions.csv"  # STAMP predictions exported by this notebook

# NARM
NARM_EPOCHS = 3  # epochs given to the NARM model; the original note "#20" presumably records the intended full-run value
NARM_PREDICTION_FILE = "tmp/docomo_narm_predictions.csv"  # NARM predictions exported by this notebook

# Others
RANDOM_SEED = 42  # seed used for Python's random module and for TensorFlow

# Constants

# Imports

In [2]:
import pandas as pd
import random
import pylfit

# Seed Python's RNG so the random.sample-based train/test split is repeatable.
random.seed(RANDOM_SEED)


# Dataset formatting

In [3]:
# Load the raw interaction log
df_sessions = pd.read_csv(DATASET)

# Harmonise the column names with the ones expected by the recommender code
df_sessions = df_sessions.rename(columns={"UserID": "SessionId", "ItemID": "ItemId"})

print("Found",df_sessions["SessionId"].nunique(),"sessions")

# Drop sessions that have fewer than MIN_SESSIONS_SIZE rows, then order rows by session and time
df_count = df_sessions.groupby("SessionId").count().reset_index()
short_sessions = list(df_count.loc[df_count["Time"] < MIN_SESSIONS_SIZE, "SessionId"].unique())
df_sessions = df_sessions[~df_sessions["SessionId"].isin(short_sessions)]
df_sessions = df_sessions.sort_values(by=["SessionId", "Time"])

print("Found",df_sessions["SessionId"].nunique(),"sessions of size >=",MIN_SESSIONS_SIZE)

# Drop every session that contains an item occurring fewer than MIN_ITEM_OCCURENCE times
df_count = df_sessions.groupby("ItemId").count().reset_index()
discard_item = list(df_count.loc[df_count["Time"] < MIN_ITEM_OCCURENCE, "ItemId"].unique())
rows_with_rare_item = df_sessions["ItemId"].isin(discard_item)
discard_sessions = list(df_sessions.loc[rows_with_rare_item, "SessionId"].unique())
df_sessions = df_sessions[~df_sessions["SessionId"].isin(discard_sessions)]

print("Found",df_sessions["SessionId"].nunique(),"sessions with item count >=",MIN_ITEM_OCCURENCE)

# Optionally keep only the first SESSIONS_SAMPLE session ids
if SESSIONS_SAMPLE is not None:
    sampled_ids = df_sessions["SessionId"].unique()[:SESSIONS_SAMPLE]
    df_sessions = df_sessions[df_sessions["SessionId"].isin(sampled_ids)]

print("Extracted",df_sessions["SessionId"].nunique(),"from it")


nb_item_ids = df_sessions["ItemId"].nunique()
print("Unique item ids:", nb_item_ids)

df_sessions

Found 12832 sessions
Found 12832 sessions of size >= 4
Found 12832 sessions with item count >= 1
Extracted 12832 from it
Unique item ids: 12


Unnamed: 0,SessionId,ItemId,Time
0,0,1,142646400
1,0,2,145152000
2,0,3,147744000
3,0,6,208731081
4,0,7,208731081
...,...,...,...
55372,12830,5,225763200
55373,12831,2,162345600
55374,12831,4,218592000
55375,12831,3,222825600


In [4]:
# Randomly assign TRAIN_RATIO of the session ids to the training set; the rest is the test set
sessions_ids = list(set(df_sessions["SessionId"].unique()))
train_size = int(TRAIN_RATIO*len(sessions_ids))
train_sessions = random.sample(sessions_ids, train_size)

# One boolean mask serves both partitions, so they are complementary by construction
is_train_row = df_sessions["SessionId"].isin(train_sessions)
df_train = df_sessions[is_train_row]
df_test = df_sessions[~is_train_row]

print("Sessions")
for label, frame in (("Total:", df_sessions), ("Train:", df_train), ("Test:", df_test)):
    print(label, len(frame["SessionId"].unique()))
print()

display(df_train)
display(df_test)

Sessions
Total: 12832
Train: 10265
Test: 2567



Unnamed: 0,SessionId,ItemId,Time
0,0,1,142646400
1,0,2,145152000
2,0,3,147744000
3,0,6,208731081
4,0,7,208731081
...,...,...,...
55372,12830,5,225763200
55373,12831,2,162345600
55374,12831,4,218592000
55375,12831,3,222825600


Unnamed: 0,SessionId,ItemId,Time
16,3,1,76464000
17,3,6,208223210
18,3,7,208223210
19,3,10,216421200
44,10,2,145411200
...,...,...,...
55359,12828,6,220826929
55360,12828,9,224600868
55361,12828,9,224605596
55362,12828,9,224605624


# SR-GNN

In [5]:
import argparse
import time
import csv
import pickle
import operator
import datetime
import os

# SessionId -> list of clicked ItemIds, in the row order of df_sessions
sess_clicks = {}
sess_date = {}  # left empty: this dataset carries no per-session date
ctr = 0  # number of rows processed
curid = -1  # id of the last session seen
curdate = None

for index, data in df_sessions.iterrows():
    sessid = data['SessionId']
    curid = sessid
    item = int(data['ItemId'])
    sess_clicks.setdefault(sessid, []).append(item)
    ctr += 1

# Session ids of each partition, as produced by the train/test split
tra_sess = df_train["SessionId"].unique()
tes_sess = df_test["SessionId"].unique()

print(len(tra_sess))
print(len(tes_sess))
print(tra_sess[:3])
print(tes_sess[:3])
print("-- Splitting train set and test set @ %ss" % datetime.datetime.now())

# Raw ItemId -> contiguous item index starting at 1; filled while encoding the training sessions
item_dict = {}
# Encode the training sessions with contiguous item indices (1, 2, 3, ...)
def obtian_tra():
    """Encode every training session and grow ``item_dict`` on the fly.

    Reads the module-level ``tra_sess`` and ``sess_clicks`` and mutates the
    module-level ``item_dict``: each raw item id met for the first time gets
    the next free index, starting at 1. Encoded sessions shorter than two
    items are skipped, but their items remain registered in ``item_dict``.

    Returns
    -------
    (train_ids, train_seqs, item_ctr)
        Kept session ids, their encoded sequences, and the next unused item
        index (number of distinct items + 1).
    """
    kept_ids, kept_seqs = [], []
    next_index = 1
    for session in tra_sess:
        encoded = []
        for raw_item in sess_clicks[session]:
            if raw_item not in item_dict:
                # First occurrence: register the item before encoding it
                item_dict[raw_item] = next_index
                next_index += 1
            encoded.append(item_dict[raw_item])
        if len(encoded) >= 2:
            kept_ids.append(session)
            kept_seqs.append(encoded)
    print(next_index)
    return kept_ids, kept_seqs, next_index


# Encode the test sessions, silently dropping items that were never seen in training
def obtian_tes():
    """Encode every test session with the indices stored in ``item_dict``.

    Reads the module-level ``tes_sess``, ``sess_clicks`` and ``item_dict``.
    Items missing from ``item_dict`` are removed from the sequence; sessions
    left with fewer than two items are skipped.

    Returns
    -------
    (test_ids, test_seqs)
        Kept session ids and their encoded sequences.
    """
    kept_ids, kept_seqs = [], []
    for session in tes_sess:
        encoded = [item_dict[raw_item] for raw_item in sess_clicks[session] if raw_item in item_dict]
        if len(encoded) >= 2:
            kept_ids.append(session)
            kept_seqs.append(encoded)
    return kept_ids, kept_seqs


# Encode train first: obtian_tra() fills item_dict, which obtian_tes() then only reads.
tra_ids, tra_seqs, item_ctr = obtian_tra()
tes_ids, tes_seqs = obtian_tes()

def process_seqs(iseqs, test=False, min_size=None):
    """Expand each sequence into (prefix, next item) samples.

    For a sequence ``seq`` the samples are ``(seq[:-1], seq[-1])``,
    ``(seq[:-2], seq[-2])``, ... for as long as prefix plus target keeps at
    least ``min_size`` items, so the shortest prefix has ``min_size - 1`` items.

    Parameters
    ----------
    iseqs : list of list
        Encoded sessions.
    test : bool
        Unused; kept so existing call sites keep working.
    min_size : int, optional
        Minimum length of prefix + target. Defaults to the notebook-level
        ``MIN_SESSIONS_SIZE`` when omitted.

    Returns
    -------
    (out_seqs, labs, ids)
        Prefixes, their target items, and for each sample the position of its
        source sequence in ``iseqs``.
    """
    if min_size is None:
        min_size = MIN_SESSIONS_SIZE
    out_seqs = []
    labs = []
    ids = []
    for seq_idx, seq in enumerate(iseqs):
        # Never cut further than len(seq) - 1: a prefix must keep at least one
        # item, otherwise a min_size below 2 would emit empty prefixes or
        # index past the start of the sequence.
        last_cut = min(len(seq) - min_size + 1, len(seq) - 1)
        for i in range(1, last_cut + 1):
            labs.append(seq[-i])
            out_seqs.append(seq[:-i])
            ids.append(seq_idx)
    return out_seqs, labs, ids


# Expand every encoded session into (prefix, next item) samples
tr_seqs, tr_labs, tr_ids = process_seqs(tra_seqs)
te_seqs, te_labs, te_ids = process_seqs(tes_seqs, True)
tra = (tr_seqs, tr_labs)
tes = (te_seqs, te_labs)
print(len(tr_seqs))
print(len(te_seqs))

# Average encoded session length over train + test.
# The accumulator is not called `all`, so the builtin stays usable in later cells.
total_items = sum(len(seq) for seq in tra_seqs) + sum(len(seq) for seq in tes_seqs)
print('avg length: ', total_items/(len(tra_seqs) + len(tes_seqs) * 1.0))

# Serialise the samples for the SR-GNN script. The context managers guarantee
# that both files are flushed and closed before the script reads them.
with open('tmp/train.txt', 'wb') as train_file:
    pickle.dump(tra, train_file)
with open('tmp/test.txt', 'wb') as test_file:
    pickle.dump(tes, test_file)

# item_ctr is the next unused item index, i.e. number of distinct items + 1
n_node = item_ctr
n_node

10265
2567
[0 1 2]
[ 3 10 13]
-- Splitting train set and test set @ 2023-11-30 18:34:36.925969s
13
13505
3376
avg length:  4.31553927680798


13

In [6]:
#display(te_seqs, te_labs)

# SR-GNN: training and evaluation

In [7]:
# Run the SR-GNN script on the pickled train/test samples; it is asked to write its predictions to OUTPUT_FILE.
%run algorithms/SR-GNN/pytorch_code/main.py --train tmp/train.txt --test tmp/test.txt --n_node {n_node} --epoch {SRGNN_EPOCHS} --topk {TOP_K} --output_file {OUTPUT_FILE}

Namespace(batchSize=100, dataset='sample', epoch=20, hiddenSize=100, l2=1e-05, lr=0.001, lr_dc=0.1, lr_dc_step=3, n_node=13, nonhybrid=False, output_file='tmp/sr-gnn_predictions.csv', patience=10, step=1, test='tmp/test.txt', topk=1, train='tmp/train.txt', valid_portion=0.1, validation=False)
Loading training data from tmp/train.txt
Loading test data from tmp/test.txt
-------------------------------------------------------
epoch:  0
start training:  2023-11-30 18:34:37.516067
[0/136] Loss: 2.4603




[28/136] Loss: 1.4006
[56/136] Loss: 0.7597
[84/136] Loss: 0.8954
[112/136] Loss: 0.9041
	Loss:	145.244
start predicting:  2023-11-30 18:34:39.643140
Best Result:
	Recall@1: 78.2879 	MMR@1: 78.2879 	Epoch: 0 	best: 0
-------------------------------------------------------
epoch:  1
start training:  2023-11-30 18:34:40.061065
[0/136] Loss: 0.7217
[28/136] Loss: 0.6837
[56/136] Loss: 0.6077
[84/136] Loss: 0.5822
[112/136] Loss: 0.5676
	Loss:	86.478
start predicting:  2023-11-30 18:34:42.172463
Best Result:
	Recall@1: 79.7097 	MMR@1: 79.7097 	Epoch: 1 	best: 1
-------------------------------------------------------
epoch:  2
start training:  2023-11-30 18:34:42.573948
[0/136] Loss: 0.5929
[28/136] Loss: 0.5631
[56/136] Loss: 0.5133
[84/136] Loss: 0.3603
[112/136] Loss: 0.5707
	Loss:	79.849
start predicting:  2023-11-30 18:34:44.634084
Best Result:
	Recall@1: 79.7097 	MMR@1: 79.7097 	Epoch: 2 	best: 2
-------------------------------------------------------
epoch:  3
start training:  2023-1

# Extract predictions with attention

In [8]:
# Reverse of item_dict: contiguous SR-GNN item index -> raw ItemId, used to decode the model output.
inv_map = {v: k for k, v in item_dict.items()}
inv_map

{1: 1, 2: 2, 3: 3, 4: 6, 5: 7, 6: 10, 7: 4, 8: 5, 9: 9, 10: 11, 11: 12, 12: 8}

In [9]:
df_preds = pd.read_csv(OUTPUT_FILE, index_col="SessionId")

# Decode item ids.
# NOTE(review): rows of OUTPUT_FILE are assumed to be in the same order as te_seqs — confirm against the SR-GNN script.
df_preds["Input"] = [[inv_map[j] for j in i] for i in te_seqs]
df_preds["Target"] = df_preds["Target"].apply(lambda x: inv_map[x])

# Decode the ranked predictions, stored in the CSV as the text of a list ("[a, b, c]")
predictions = [
    [inv_map[int(i)] for i in raw.replace("[","").replace("]","").split(", ")]
    for raw in df_preds["Predictions"]
]
df_preds["Predictions"] = predictions

# Save for EDA and LFIT notebook.
# All four columns are taken positionally from df_preds: the SessionId index is
# dropped in one step, so no column can be misaligned (or fail on duplicate
# labels) by index-based assignment onto a fresh RangeIndex.
df_out = (
    df_preds[["Target", "Input", "Attention", "Predictions"]]
    .reset_index(drop=True)
    .rename(columns={
        "Target": "Expected",
        "Input": "Model_input",
        "Attention": "Model_attention",
        "Predictions": "Model_prediction",
    })
)
df_out.to_csv(SRGNN_PREDICTION_FILE,index=False)
display(df_out)


Unnamed: 0,Expected,Model_input,Model_attention,Model_prediction
0,10,"[1, 6, 7]","[-3.7130167484283447, -3.8705673217773438, -4....","[2, 10, 4, 9, 12, 1, 11, 3, 8, 5, 6, 7]"
1,7,"[2, 3, 6]","[-3.1652047634124756, -3.1256532669067383, -3....","[7, 1, 9, 8, 10, 4, 12, 11, 6, 2, 5, 3]"
2,7,"[2, 3, 6]","[-3.1652047634124756, -3.1256532669067383, -3....","[7, 1, 9, 8, 10, 4, 12, 11, 6, 2, 5, 3]"
3,7,"[2, 3, 6]","[-3.1652047634124756, -3.1256532669067383, -3....","[7, 1, 9, 8, 10, 4, 12, 11, 6, 2, 5, 3]"
4,7,"[2, 3, 6]","[-3.1652047634124756, -3.1256532669067383, -3....","[7, 1, 9, 8, 10, 4, 12, 11, 6, 2, 5, 3]"
...,...,...,...,...
3371,6,"[2, 3, 7]","[-3.5456700325012207, -3.7342920303344727, -3....","[6, 1, 10, 4, 7, 9, 5, 2, 12, 11, 3, 8]"
3372,6,"[2, 3, 7]","[-3.5456700325012207, -3.7342920303344727, -3....","[6, 1, 10, 4, 7, 9, 5, 2, 12, 11, 3, 8]"
3373,4,"[7, 6, 5]","[-3.5796241760253906, -3.9678800106048584, -3....","[4, 10, 1, 12, 9, 2, 3, 11, 8, 7, 5, 6]"
3374,9,"[6, 9, 9, 9]","[-2.7884278297424316, -2.856088638305664, -2.8...","[9, 8, 12, 11, 4, 10, 7, 2, 5, 1, 6, 3]"


## STAMP

In [10]:
import pandas as pd
import numpy as np
import random
import os
import sys
import tensorflow as tf
import time
import warnings

# Reduce TensorFlow log noise.
# NOTE(review): this variable is set after `import tensorflow` above; presumably it must be set before the import to silence import-time messages — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.logging.set_verbosity(tf.logging.ERROR)

# STAMP
from algorithms.STAMP.model.STAMP import Seq2SeqAttNN
from algorithms.STAMP.util.batcher.equal_len.batcher_p import batcher

# NARM
from algorithms.NARM.narm import NARM

# Re-seed Python's and TensorFlow's RNGs before any model is built.
random.seed(RANDOM_SEED)
tf.random.set_random_seed(RANDOM_SEED)
#tf.random.set_seed(RANDOM_SEED)


# Same assignment as at the top of this cell (duplicate).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Hides every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# List the devices TensorFlow can see; the session is only opened for this query.
with tf.Session() as sess:
  devices = sess.list_devices()
devices

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


[_DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 268435456, 16693321766870628439)]

In [11]:
# Build the STAMP model for STAMP_EPOCHS epochs and fit it, passing both the train and the test frames.
model = Seq2SeqAttNN(n_epochs=STAMP_EPOCHS, model_save_path="", model_path="", is_save=True)
model.fit(df_train,df_test)

stamp
GPU: False
reload the datasets.
rsc15_64
read finish
sort finish
list finish
I am reading
id: 10264
session_id: 12831
items: [2, 7, 3, 8]
click_items: [2, 4, 3, 5]
out: [7, 3, 8]
in: [2, 7, 3]
label: []

13
read finish
sort finish
list finish
I am reading
id: 2566
session_id: 12828
items: [4, 9, 9, 9, 9]
click_items: [6, 9, 9, 9, 9]
out: [9, 9, 9, 9]
in: [4, 9, 9, 9]
label: []

13
-----
Epoch0	loss: 1.871697	time: 1.4365754127502441s
Epoch1	loss: 1.132598	time: 1.3490056991577148s
Epoch2	loss: 0.987764	time: 1.3671607971191406s
Epoch3	loss: 0.954674	time: 1.3280353546142578s
Epoch4	loss: 0.902993	time: 1.3680682182312012s
Epoch5	loss: 0.913747	time: 1.2980358600616455s
Epoch6	loss: 0.904447	time: 1.408010482788086s
Epoch7	loss: 0.886487	time: 1.298856496810913s
Epoch8	loss: 0.883468	time: 1.383084774017334s
Epoch9	loss: 0.871365	time: 1.3576443195343018s
Epoch10	loss: 0.860527	time: 1.3852713108062744s
Epoch11	loss: 0.881752	time: 1.285630702972412s
Epoch12	loss: 0.881961	time: 1

In [12]:
def stamp_predict_next(model, session_id, input_item_id):
  '''
  Score the next item of one test session with a fitted STAMP model.

  The samples of ``model.test_data`` whose ``session_id`` matches are batched,
  and for each of them the prefix ending at position ``input_item_id`` is fed
  to the network. The function returns inside the first batch, so only one
  batch is ever evaluated.

  Parameters
  --------
  model : fitted STAMP model
      Must expose ``test_data``, ``n_items``, ``sess``, ``mappingitem2idx`` and
      the tensors used in the feed dict below.
  session_id : int
      Session to look up in ``model.test_data.samples``.
  input_item_id : int
      Despite its name this is a 0-based POSITION in the session, not an item
      id: it is stored in ``model.s`` and the model input is the prefix
      ``tmp_in[:model.s + 1]``.
  Returns
  --------
  out : (input, attention, predictions), or None when the batcher yields no batch
      input: list
        The input prefix of the first sample, mapped back through the inverse
        of ``model.mappingitem2idx``.
      attention: array
        ``alpha[0][0]`` as returned by the network for that batch.
      predictions: pandas.Series
        One score per item, indexed by the keys of ``model.mappingitem2idx``.

  Side effects: overwrites ``model.s`` and stores the attention values in
  ``model.test_data`` through ``pack_ext_matrix``.'''

  sample = [x for x in model.test_data.samples if x.session_id == session_id]
  #if model.old_session_id != session_id:
  #    model.s = 0
  
  # DBG
  #print(input_item_id)
  model.s = input_item_id

  c_loss = []
  bt = batcher(
      samples=sample,
      class_num=model.n_items,
      random=False
  )

  while bt.has_next():  # batch round.
      batch_data = bt.next_batch()

      tmp_in_data = batch_data['in_idxes']
      tmp_out_data = batch_data['out_idxes']
      tmp_batch_ids = batch_data['batch_ids']
      # for s in range(len(tmp_in_data[0])):
      batch_in = []
      batch_out = []
      batch_last = []
      batch_seq_l = []
      for tmp_in, tmp_out in zip(tmp_in_data, tmp_out_data):
          # Item at the requested position is the "last click"; the label is the
          # expected output at that position, shifted by one.
          _in = tmp_in[model.s]
          _out = tmp_out[model.s] - 1
          batch_last.append(_in)
          batch_in.append(tmp_in[:model.s + 1])
          batch_out.append(_out)
          batch_seq_l.append(model.s + 1)
      feed_dict = {
          model.inputs: batch_in,
          model.last_inputs: batch_last,
          model.lab_input: batch_out,
          model.sequence_length: batch_seq_l

      }
      

      preds, loss, alpha = model.sess.run([model.softmax_input, model.loss, model.alph],feed_dict=feed_dict)
      model.test_data.pack_ext_matrix('alpha', alpha, tmp_batch_ids)
      c_loss += list(loss)
      rev_map = {v: k for k, v in model.mappingitem2idx.items()}
      # Returns on the first batch. NOTE(review): the reshape and the index assume a single sample
      # per batch and that the key order of mappingitem2idx matches the score order — confirm.
      return [rev_map[i] for i in feed_dict[model.inputs][0]], alpha[0][0], pd.DataFrame(data=np.asanyarray(preds.reshape(len(preds[0]), 1)), index=list(model.mappingitem2idx.keys()))[0]

# Smoke test: predict from position 0 of the session of the first test row.
stamp_predict_next(model, session_id=df_test.iloc[0]["SessionId"], input_item_id=0)

([1],
 array([2.757456], dtype=float32),
 1    -3.084484
 2     3.616011
 3    -0.862608
 6     1.639037
 7     1.969161
 10    0.586589
 4     0.196670
 5    -1.530922
 9    -1.620706
 11   -2.558436
 12   -3.767925
 8    -1.655533
 Name: 0, dtype: float32)

In [13]:
# Query STAMP for every test session at every position from MIN_TEST_SESSIONS_SIZE-2 up to
# the second-to-last one; each entry is [expected next item, input, attention, scores].
predictions = []
test_sessions = df_test.groupby('SessionId', observed=True)['ItemId'].apply(list).reset_index()
for index, row in test_sessions.iterrows():
  session_id = row["SessionId"]
  session_items = row["ItemId"]
  session_length = len(session_items)
  # Subsessions
  for i in range(MIN_TEST_SESSIONS_SIZE-2, session_length-1):
    expected_item = session_items[i+1]
    model_output = stamp_predict_next(model, session_id=session_id, input_item_id=i)
    predictions.append([expected_item, *model_output])

predictions[:2]

[[10,
  [1, 6, 7],
  array([3.5233486, 4.3522983, 4.5918145], dtype=float32),
  1    -0.757491
  2     1.888521
  3    -0.737072
  6    -2.527257
  7    -2.725819
  10    0.617299
  4     1.622910
  5    -1.104082
  9     0.257805
  11   -1.132285
  12    0.368262
  8    -2.480730
  Name: 0, dtype: float32],
 [7,
  [2, 3, 6],
  array([4.0930343, 4.648213 , 4.131812 ], dtype=float32),
  1     2.248339
  2    -3.191094
  3    -1.957756
  6    -1.785464
  7     5.277082
  10    0.592810
  4     0.022180
  5    -2.986837
  9     2.021633
  11   -5.117090
  12    0.307857
  8     1.012788
  Name: 0, dtype: float32]]

In [14]:
def predictions_to_dataframe(predictions):
    """Turn raw prediction records into a tidy DataFrame.

    Parameters
    ----------
    predictions : list
        Records of four elements: expected item, model input, attention
        values, and a pandas.Series of scores indexed by item id.

    Returns
    -------
    pandas.DataFrame
        Columns ``Expected``, ``Model_input``, ``Model_attention`` and
        ``Model_prediction``; the last one lists the item ids ranked by
        decreasing score.
    """
    return pd.DataFrame({
        "Expected": [expected for expected, _, _, _ in predictions],
        "Model_input": [list(model_input) for _, model_input, _, _ in predictions],
        "Model_attention": [list(attention) for _, _, attention, _ in predictions],
        "Model_prediction": [
            list(scores.sort_values(ascending=False).index)
            for _, _, _, scores in predictions
        ],
    })

# Tabulate the STAMP predictions, show them, and save them to STAMP_PREDICTION_FILE.
df_preds = predictions_to_dataframe(predictions)
display(df_preds)

df_preds.to_csv(STAMP_PREDICTION_FILE,index=False)

Unnamed: 0,Expected,Model_input,Model_attention,Model_prediction
0,10,"[1, 6, 7]","[3.5233486, 4.3522983, 4.5918145]","[2, 4, 10, 12, 9, 3, 1, 5, 11, 8, 6, 7]"
1,7,"[2, 3, 6]","[4.0930343, 4.648213, 4.131812]","[7, 1, 9, 8, 10, 12, 4, 6, 3, 5, 2, 11]"
2,7,"[2, 3, 6]","[4.0930343, 4.648213, 4.131812]","[7, 1, 9, 8, 10, 12, 4, 6, 3, 5, 2, 11]"
3,7,"[2, 3, 6]","[4.0930343, 4.648213, 4.131812]","[7, 1, 9, 8, 10, 12, 4, 6, 3, 5, 2, 11]"
4,7,"[2, 3, 6]","[4.0930343, 4.648213, 4.131812]","[7, 1, 9, 8, 10, 12, 4, 6, 3, 5, 2, 11]"
...,...,...,...,...
3371,6,"[2, 3, 7]","[4.3466716, 4.851342, 4.636951]","[6, 1, 4, 10, 3, 7, 11, 2, 9, 5, 8, 12]"
3372,6,"[2, 3, 7]","[4.3466716, 4.851342, 4.636951]","[6, 1, 4, 10, 3, 7, 11, 2, 9, 5, 8, 12]"
3373,4,"[7, 6, 5]","[4.818439, 4.5853662, 4.9188433]","[4, 1, 12, 9, 8, 2, 3, 7, 10, 11, 6, 5]"
3374,9,"[6, 9, 9]","[4.530071, 4.525478, 4.525478]","[9, 8, 4, 1, 7, 2, 11, 5, 12, 6, 3, 10]"


In [15]:
# Build and fit NARM. NOTE: this rebinds `model`, which held the STAMP model until here,
# so the STAMP prediction cells cannot be re-run after this cell without refitting STAMP.
model = NARM(epochs=NARM_EPOCHS,session_key='SessionId', item_key='ItemId')
model.fit(df_train,df_test)


model options {'test_size': -1, 'reload_model': None, 'use_dropout': True, 'valid_batch_size': 512, 'batch_size': 512, 'is_save': False, 'is_valid': True, 'saveto': 'gru_model.npz', 'encoder': 'gru', 'n_items': 13, 'lrate': 0.001, 'dispFreq': 10000, 'max_epochs': 3, 'patience': 5, 'hidden_units': 100, 'dim_proj': 100, 'self': <algorithms.NARM.narm.NARM object at 0x0000016B854718D0>}
Loading data
Building model
Optimization
30632 train examples
3403 valid examples
Best perfomance updated!
Valid Recall@20: 1.0    Valid Mrr@20: 0.8643077422463258
Seen 30632 samples


This epoch took 4315.2s


Best perfomance updated!
Valid Recall@20: 1.0    Valid Mrr@20: 0.8680081640043439
Seen 30632 samples


This epoch took 4319.5s


Best perfomance updated!
Valid Recall@20: 1.0    Valid Mrr@20: 0.8689702109037989
Seen 30632 samples
Valid Recall@20: nan    Valid Mrr@20: nan


This epoch took 4213.4s


In [16]:
def narm_predict_next(model, session):
    '''
    Score every known item as the possible next item of a session with NARM.

    Parameters
    --------
    model : fitted NARM model
        Must expose ``itemmap`` (a pandas.Series indexed by item id),
        ``prepare_data``, ``pred_function`` and ``attention``.
    session : sequence
        Item ids of the session prefix, in click order. Every id must be a
        label of ``model.itemmap``.

    Returns
    --------
    out : (session, attention, scores)
        session: the ``session`` argument, returned unchanged.
        attention: first row of ``model.attention`` for this input.
        scores: pandas.Series indexed by ``model.itemmap.index``, built from
            the first prediction row with its leading entry skipped.

    Side effect: ``model.session_items`` is overwritten with ``list(session)``.
    '''
    model.session_items = list(session)

    # The same encoded batch is supplied as both input and target
    encoded_batch = [model.itemmap[model.session_items].values]
    inputs, mask, _ = model.prepare_data(encoded_batch, encoded_batch)

    scores = model.pred_function(inputs, mask)
    attention = model.attention(inputs, mask)[0]

    next_item_scores = pd.Series(data=scores[0][1:], index=model.itemmap.index)
    return session, attention, next_item_scores

# Smoke test on a hand-written session made of item 1 repeated four times.
narm_predict_next(model, [1,1,1,1])

([1, 1, 1, 1],
 array([0.43813477, 0.23418211, 0.17892223, 0.14876089]),
 1     0.000009
 2     0.937193
 3     0.005729
 6     0.024083
 7     0.015368
 10    0.014318
 4     0.002655
 5     0.000186
 9     0.000043
 11    0.000079
 12    0.000061
 8     0.000172
 dtype: float64)

In [17]:
# Query NARM for every test session with each prefix items[:i+1], for i from
# MIN_TEST_SESSIONS_SIZE-2 up to the second-to-last position; each entry is
# [expected next item, input, attention, scores].
predictions = []
test_sessions = df_test.groupby('SessionId', observed=True)['ItemId'].apply(list).reset_index()
for index, row in test_sessions.iterrows():
  session_id = row["SessionId"]
  session_items = row["ItemId"]
  session_length = len(session_items)
  # Subsessions
  for i in range(MIN_TEST_SESSIONS_SIZE-2, session_length-1):
    expected_item = session_items[i+1]
    model_output = narm_predict_next(model, session_items[:i+1])
    predictions.append([expected_item, *model_output])

predictions[:2]

[[10,
  [1, 6, 7],
  array([0.33919114, 0.37246001, 0.28834885]),
  1     0.010914
  2     0.624937
  3     0.068836
  6     0.002471
  7     0.005747
  10    0.148283
  4     0.105451
  5     0.003280
  9     0.009637
  11    0.011189
  12    0.005197
  8     0.002264
  dtype: float64],
 [7,
  [2, 3, 6],
  array([0.22606462, 0.60602971, 0.16790567]),
  1     0.004677
  2     0.000083
  3     0.000098
  6     0.002088
  7     0.978083
  10    0.003283
  4     0.004192
  5     0.000298
  9     0.004977
  11    0.000174
  12    0.000620
  8     0.001264
  dtype: float64]]

In [18]:
# Tabulate the NARM predictions, show them, and save them to NARM_PREDICTION_FILE.
df_preds = predictions_to_dataframe(predictions)
display(df_preds)

df_preds.to_csv(NARM_PREDICTION_FILE, index=False)

Unnamed: 0,Expected,Model_input,Model_attention,Model_prediction
0,10,"[1, 6, 7]","[0.339191144203947, 0.3724600103787826, 0.2883...","[2, 10, 4, 3, 11, 1, 9, 7, 12, 5, 6, 8]"
1,7,"[2, 3, 6]","[0.22606461943432452, 0.606029710447257, 0.167...","[7, 9, 1, 4, 10, 6, 8, 12, 5, 11, 3, 2]"
2,7,"[2, 3, 6]","[0.22606461943432452, 0.606029710447257, 0.167...","[7, 9, 1, 4, 10, 6, 8, 12, 5, 11, 3, 2]"
3,7,"[2, 3, 6]","[0.22606461943432452, 0.606029710447257, 0.167...","[7, 9, 1, 4, 10, 6, 8, 12, 5, 11, 3, 2]"
4,7,"[2, 3, 6]","[0.22606461943432452, 0.606029710447257, 0.167...","[7, 9, 1, 4, 10, 6, 8, 12, 5, 11, 3, 2]"
...,...,...,...,...
3371,6,"[2, 3, 7]","[0.2324626796950567, 0.6231815080922732, 0.144...","[1, 10, 6, 4, 3, 7, 5, 11, 8, 9, 2, 12]"
3372,6,"[2, 3, 7]","[0.2324626796950567, 0.6231815080922732, 0.144...","[1, 10, 6, 4, 3, 7, 5, 11, 8, 9, 2, 12]"
3373,4,"[7, 6, 5]","[0.10122615086426462, 0.13078626627339607, 0.7...","[4, 10, 1, 3, 9, 2, 11, 7, 12, 6, 8, 5]"
3374,9,"[6, 9, 9]","[0.013656520003575513, 0.3182928013702186, 0.6...","[9, 8, 12, 11, 5, 4, 1, 10, 2, 7, 6, 3]"
