# NARM+




In [1]:
#Importing functions.
import pandas
import csv
from collections import Counter, OrderedDict, defaultdict, namedtuple
import gc
import gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import itertools
import math
import numpy as np
import pandas
import random
from sklearn.model_selection import train_test_split
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# PyTorch can run on CPU or on Nvidia GPU (video card) using CUDA
# This cell selects the GPU if one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device == "cuda":
    torch.cuda.synchronize() 

In [2]:
gensim.__version__

'3.6.0'

In [3]:
# Create sessions
sample = pandas.read_pickle("./data/processedData.pkl")
sample['SESSION'] = pandas.to_datetime(sample['TIMESTAMP'],unit='s').dt.date

print(len(sample["USERID"].unique()))
print(len(sample["PRODUCTID"].unique()))


11832
67172


In [4]:
print("Average number of sessions per user")
print(sample.groupby('USERID')['SESSION'].nunique().mean())

print("Average number of clicks per session")
sample.groupby(['USERID', 'SESSION'])['ACTION'].count().mean()

Average number of sessions per user
5.765128465179175
Average number of clicks per session


11.017386715142274

In [5]:
#DATASET CREATION

# read CSV file
# sample = pandas.read_csv("./data/UserBehavior.csv", names=["USERID", "PRODUCTID", "CATEGORYID", "ACTION", "TIMESTAMP"])

# edit size and print size of userID
# sample.set_index("USERID")
# sampleNew = sample[sample["USERID"] < 10319]
# print(len(sampleNew["USERID"].unique()))
# print(len(sampleNew["PRODUCTID"].unique()))

#sampleNew.to_pickle('./data/datasetName.pkl')

In [6]:
# ### DO NOT RUN THIS CELL IF YOU READ THE PICKLE processedData

# ##Preprocessing the Data
# #Remove products that occur less than 5 times
# productsize = sample.groupby(["PRODUCTID"]).size() > 4
# indexProducts = productsize.index[productsize]
# productTotal = sample[sample["PRODUCTID"].isin(indexProducts)]

# #Remove sessions that have a length of 1 or less
# #Remove users that have 2 or less sessions
# sessionList = productTotal.groupby(["USERID","SESSION"]).size() > 1
# sessionBase = sessionList.to_frame(name = "realSESSION").reset_index()
# enoughSessions = sessionBase.groupby(["USERID", "realSESSION"]).size() > 2
# trueSessionBase = enoughSessions.to_frame(name = "enoughSESSIONS").reset_index()


# #Merge DataSets such that original Dataset is restored with newly added filters
# totalDataSet = productTotal.merge(trueSessionBase)
# totalDataSet = totalDataSet[totalDataSet["enoughSESSIONS"] == True]

# #Drop useless columns
# totalDataSet = totalDataSet.drop("enoughSESSIONS", axis=1)
# totalDataSet = totalDataSet.drop("realSESSION", axis=1)
# sample = totalDataSet

# #Piece of mind stuff
# userList = totalDataSet["USERID"].unique()
# productList = totalDataSet["PRODUCTID"].unique()
# print(len(totalDataSet2))
# print(len(userList))
# print(len(productList))


# sample.to_pickle("./data/processedData.pkl")

In [7]:
userList = sample["USERID"].unique()
productList = sample["PRODUCTID"].unique()

In [8]:
# Here we first define a class that can map a product to an ID (p2i)
# and back (i2p).

class OrderedCounter(Counter, OrderedDict):
    """Counter that remembers the order elements are first seen"""
    def __repr__(self):
        return '%s(%r)' % (self.__class__.__name__,
                      OrderedDict(self))
    def __reduce__(self):
        return self.__class__, (OrderedDict(self),)


class Vocabulary:
    """A vocabulary, assigns IDs to tokens"""
    def __init__(self):
        self.freqs = OrderedCounter()
        self.users = []
        self.u2i = {}
        self.i2u = []
        self.p2i = {}
        self.i2p = []
        self.p2e = {}
        self.u2e = {}

    def count_product(self, t):
        self.freqs[t] += 1
    
    def count_user(self, t):
        self.users.append(t)

    def add_product(self, t):
        self.p2i[t] = str(len(self.p2i))
        self.i2p.append(t) 
        
    def add_user(self, t):
        self.u2i[t] = str(len(self.u2i))
        self.i2u.append(t)

    def build(self, min_freq=0):
#         self.add_product("<unk>")  # reserve 0 for <unk> (unknown products (products only occuring in test set))
#         self.add_user("<unk>")
        tok_freq = list(self.freqs.items())
        tok_freq.sort(key=lambda x: x[1], reverse=True)
        for tok, freq in tok_freq:
            if freq >= min_freq:
                self.add_product(tok)
        for user in self.users:
            self.add_user(user)

In [9]:
# This process should be deterministic and should have the same result 
# if run multiple times on the same data set.

def build_voc(userList, productList):
    v = Vocabulary()
    for product in productList:
        v.count_product(product)
    for user in userList:
        v.count_user(user)
    v.build()
    return v

v = build_voc(userList, productList)
print("Vocabulary size:", len(v.p2i))



Vocabulary size: 67172


In [10]:
# Create nested list of sessions and items per user
userBase = sample.groupby(['USERID', 'SESSION'])['PRODUCTID'].apply(list).groupby('USERID').apply(list)
print(userBase[1])

[[2268318, 2333346], [4365585, 230380], [2951368, 3108797], [2734026, 4152983, 266784, 266784, 1305059], [2087357, 3157558], [2087357, 1340922, 4954999], [3219016, 2028434, 3219016], [4954999, 818610, 271696]]


In [11]:
# More efficient create examples function
# A simple way to define a class is using namedtuple.
Example = namedtuple("Example", ["userID", "history", "inputs", "target"])

allSessions = []
allUsers = []

def f(userid, sessions, train):
    #print(sessions)
    sessions = [[v.p2i.get(t,0) for t in ses] for ses in sessions if len(ses) > 1]
    if userid == 11905:
        print(sessions)
    if train:
        object_train = Example(userID = str(userid), history = 
                               [item for sublist in sessions[:-2] for item in sublist], 
                               inputs = sessions[-2][:-1], target = sessions[-2][1:])
        return object_train
    else:
        # store info for the pretrained embeddings
        allSessions.extend(sessions)
        userDoc = [t for ses in sessions for t in ses]
        allUsers.append(TaggedDocument(userDoc, [str(userid)]))
        return Example(userID = str(userid), history = 
                       [item for sublist in sessions[:-1] for item in sublist], 
                       inputs = sessions[-1][:-1], 
                       target = sessions[-1][1:])

def createExamples(userBase):
    ''' Create training and testing set '''
    userBase = pandas.DataFrame(userBase)
    userBase.reset_index(level = 0, inplace = True)
    trainData = [x for x in 
                 userBase.apply(lambda x: f(x['USERID'], x['PRODUCTID'], True), axis = 1).tolist() 
                 if x is not None]
    testData = [x for x in 
                userBase.apply(lambda x: f(x['USERID'], x['PRODUCTID'], False), axis = 1).tolist() 
                if x is not None]
    return trainData, testData

trainData, testData = createExamples(userBase)
print(trainData[0])
print('')
print(testData[0])
    
    

[['25413', '55820'], ['19169', '11952', '15103', '33969', '8333', '35062', '16380', '18524', '33446', '50208', '13150', '33449', '33446', '33164', '33164'], ['22040', '33164', '22040'], ['36412', '44564', '24160'], ['44564', '8622', '52184'], ['46819', '27654', '46819', '27654']]
[['25413', '55820'], ['19169', '11952', '15103', '33969', '8333', '35062', '16380', '18524', '33446', '50208', '13150', '33449', '33446', '33164', '33164'], ['22040', '33164', '22040'], ['36412', '44564', '24160'], ['44564', '8622', '52184'], ['46819', '27654', '46819', '27654']]
Example(userID='1', history=['0', '1', '2', '3', '4', '5', '6', '7', '8', '8', '9', '10', '11', '10', '12', '13'], inputs=['14', '15'], target=['15', '14'])

Example(userID='1', history=['0', '1', '2', '3', '4', '5', '6', '7', '8', '8', '9', '10', '11', '10', '12', '13', '14', '15', '14'], inputs=['13', '16'], target=['16', '17'])


In [12]:
# product_embeddings = Word2Vec(allSessions, size=16, window=5, min_count=1)
# print(product_embeddings.wv['1'])
# user_embeddings = Doc2Vec(allUsers, vector_size=8, window=5, min_count=1)
# print(user_embeddings.wv['1'])
# v.u2e = user_embeddings.wv
# v.p2e = product_embeddings.wv

In [69]:
# HELPER FUNCTIONS

# function to yield one example at a time
def get_examples(data, shuffle=True, **kwargs):
    """Shuffle data set and return 1 example at a time (until nothing left)"""
    if shuffle:
#         print("Shuffling training data")
        random.shuffle(data)  # shuffle training data each epoch
    for example in data:
        yield example
        
def get_minibatch(data, batch_size=25, shuffle=True):
    """Return minibatches, optional shuffling"""

    if shuffle:
#         print("Shuffling training data")
        random.shuffle(data)  # shuffle training data each epoch

    batch = []

    # yield minibatches
    for example in data:
        batch.append(example)

        if len(batch) == batch_size:
            yield batch
            batch = []

        # in case there is something left
    if len(batch) > 0:
        yield batch
    
# function to prepare an example for usage by the model
def prepare_example(example, vocab):
    """
    Turn an example into tensors of inputs and target.
    """
    u = vocab.u2i.get(example.userID,0)
    v = torch.LongTensor([u])
    v = v.to(device)
    
    w = torch.LongTensor([int(t) for t in example.history])
    w = w.to(device)
    
    x = torch.LongTensor([int(t) for t in example.inputs])[:,None]
    x = x.to(device)

    y = torch.LongTensor([int(t) for t in example.target])[:,None]
    y = y.to(device)

    return (v, w, (x, 0)), y

def prepare_example_pre_trained(example, vocab):
    """
    Turn an example into tensors of inputs and target.
    """
    u = vocab.u2e[example.userID]
    v = torch.FloatTensor([u])
    v = v.to(device)
    
    w = torch.FloatTensor([vocab.p2e[t] for t in example.history])
    w = w.to(device)
    
    x = torch.FloatTensor([[vocab.p2e[t] for t in example.inputs]])
    x = x.to(device)

    y = torch.LongTensor([int(t)-1 for t in example.target])[:,None]
    y = y.to(device)

    return (v, w, (x, 0)), y

def pad(tokens, length, pad_value=-1):
    """add padding 0s to a sequence to that it has the desired length"""
    return tokens + [pad_value] * (length - len(tokens))

def prepare_minibatch(mb, vocab):
    """
    Minibatch is a list of examples.
    This function converts products to IDs and returns
    torch tensors to be used as input/targets.
    """
    batch_size = len(mb)
    
    u = [vocab.u2i.get(example.userID,0) for example in mb]
    v = torch.LongTensor(u)
    v = v.to(device)
    # shape v (batch size, user id)
    
    # vocab returns 0 if the word is not there
    maxlen = max([len(ex.history) for ex in mb])
    w = [[pad([int(t) for t in ex.history], maxlen)] for ex in mb]
    w = torch.LongTensor(w)
    w = w.to(device)
    # shape w (batch size, max history length)

    # vocab returns 0 if the word is not there
    maxlen = max([len(ex.inputs) for ex in mb])
    x = [pad([int(t) for t in ex.inputs], maxlen) for ex in mb]
    x = torch.LongTensor(x)
    x = x.to(device)
    xlengths = torch.LongTensor([len(ex.inputs) for ex in mb])
    # shape x (batch size, max current session length)

    y = [pad([int(t)-1 for t in ex.target], maxlen) for ex in mb]
    y = torch.LongTensor(y)
    y = y.to(device)
    
    gc.collect()

    return (v,w,(x,xlengths)), y

def prepare_minibatch_pre_trained(mb, vocab):
    """
    Minibatch is a list of examples.
    This function converts products to IDs and returns
    torch tensors to be used as input/targets.
    """
    batch_size = len(mb)
    embedding_dim = len(vocab.p2e['1'])
    
    u = [[vocab.u2e[example.userID]] for example in mb]
    v = torch.FloatTensor(u)
    v = v.to(device)
    # shape v (batch size, 1, embedding size)
    
    # vocab returns 0 if the word is not there
    maxlen = max([len(ex.history) for ex in mb])
    w = [pad(ex.history, maxlen, pad_value='0') for ex in mb]
    w = [[vocab.p2e[t] for t in ex] for ex in w]
    w = torch.FloatTensor(w)
    w = w.to(device)
    # shape w (batch size, max history length, embedding size)

    # vocab returns 0 if the word is not there
    maxlen = max([len(ex.inputs) for ex in mb])
    x = [pad(ex.inputs, maxlen, pad_value='0') for ex in mb]
    x = [[vocab.p2e[t] for t in ex] for ex in x]
    x = torch.FloatTensor(x)
    x = x.to(device)
    xlengths = torch.LongTensor([len(ex.inputs) for ex in mb])
    # shape x (batch size, max current session length, embedding size)

    y = [pad([int(t)-1 for t in ex.target], maxlen) for ex in mb]
    y = torch.LongTensor(y)
    y = y.to(device)
        
    gc.collect()
    
    return (v,w,(x,xlengths)), y

# simple evaluation function
def simple_evaluate(model, data, prep_fn=prepare_example, **kwargs):
    """Precision of a model on given data set."""
    model.eval()  # disable dropout
    targets = []
    predictions = []
    
    vocab = model.vocab
    for example in data:
        # convert the example input and targets to PyTorch tensors
        targets.extend([int(t) for t in example.target])
        x, target = prep_fn(example, vocab)

        # forward pass
        # get the output from the neural network for input x
        with torch.no_grad():
            output, alphas = model(x)
        # output shape: (sequence length, score for each item)
        prediction = torch.argmax(output, dim=1).tolist()
        predictions.extend(prediction)
            
    precision = sum([1 if p==t else 0 for p,t in zip(predictions,targets)])/len(targets)

    return precision, None

def recall(model, data, prep_fn=prepare_example, batch_fn=prepare_minibatch, at=5, batch_size=25, **kwargs):
    model.eval() # disable dropout
    targets = []
    predictions = []
    recall = 0
    
    vocab = model.vocab
    for batch in batch_fn(train_data, batch_size=batch_size):
        # convert the example input and targets to PyTorch tensors
#         targets.extend([int(t) for t in example.target])
        x, target = prep_fn(batch, vocab)
        # forward pass
        # get the output from the neural network for input x
        with torch.no_grad():
            output, alphas = model(x)
        # output shape: (batch size, sequence length, score for each item)
        prediction = torch.argsort(output, dim=2, descending=True)[:,:,:at].tolist()
        print(prediction)
        print(target.tolist())
        predictions.extend(prediction)
        
#     print(predictions[:10],targets[:10])
#     recall = sum([1 if t in p else 0 for t,p in zip(targets,predictions)])/len(targets)
    
    return recall, None

def mrr(model, data, prep_fn=prepare_example, batch_fn=prepare_minibatch, at=5, **kwargs):
    model.eval() # disable dropout
    targets = []
    predictions = []
    
    vocab = model.vocab
    for example in data:
        # convert the example input and targets to PyTorch tensors
        targets.extend([int(t)-1 for t in example.target])
        x, target = prep_fn(example, vocab)

        # forward pass
        # get the output from the neural network for input x
        with torch.no_grad():
            output, alphas = model(x)
        # output shape: (sequence length, nr of products)
        prediction = torch.argsort(output, dim=1, descending=True)[:,:at].tolist()
        predictions.extend(prediction)
        
    mrr = sum([1/(p.index(t) + 1) if t in p else 0 for t,p in zip(targets,predictions)])/len(targets)
    
    return mrr, None

In [70]:
# Custom NN

#Item embedding & User Embedding equal size
#

class NarmPlus(nn.Module):
    def __init__(self, 
                 item_embedding_dim, user_embedding_dim, hidden_size, output_dim, num_layers, 
                 vocab, pre_trained=False, batch_size=10,
                 activation_fn=nn.RReLU(), dropout=0.2):
        super(NarmPlus, self).__init__()
        # Store parameters
        self.item_embedding_dim = item_embedding_dim
        self.user_embedding_dim = user_embedding_dim
        self.hidden_size = hidden_size # hidden size is also user embedding dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.pre_trained = pre_trained
        self.batch_size = batch_size
        # Shape of hidden_state: (num_layers * num_directions, batch, hidden_size)
        self.hidden_state_dim = (num_layers, batch_size, hidden_size)
        self.hidden_state_size = num_layers * hidden_size
        self.vocab = vocab
        num_users = len(vocab.u2i)
        num_items = len(vocab.p2i)
        
        # General part
        self.ActivationFn = activation_fn
        self.Softmax = nn.Softmax(dim=1)
        self.loss = self.top1loss
        self.dropout = nn.Dropout(p=dropout)
        
        # History part
        self.UserEmbedding = nn.Embedding(num_users, user_embedding_dim)
        self.ItemEmbedding = nn.Embedding(num_items, item_embedding_dim)
        self.LatentItemHistory = nn.Linear(item_embedding_dim, user_embedding_dim)
        self.ProfileToHidden = nn.Linear(user_embedding_dim, self.hidden_state_size)
        
        # NARM Part
        # Input to the GRU is the item embedding: input_size = embedding_size
        # Hidden size is something we can experiment with
        self.Local = nn.GRU(item_embedding_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.Global = nn.GRU(item_embedding_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.Decoder = nn.Bilinear(item_embedding_dim, 2*hidden_size, output_dim)
        
        # Inner working of NARM attention part
        # Latent space for alpha: what value to pick?
        # I assume no bias, based on the paper
        latent_space = hidden_size
        self.A1 = nn.Linear(hidden_size,latent_space,bias=False)
        self.A2 = nn.Linear(hidden_size,latent_space,bias=False)
        self.v = nn.Linear(latent_space,1,bias=False)
        
        
    def forward(self,x):
        user, history, (inputs, input_lengths) = x
        
        if self.pre_trained:
            # user shape (1, embedding size)
            # history shape (history length, embedding size)
            # inputs shape (1, sequence length, embedding size)
            print(f'Inputs shape: {inputs.shape}')
            print(f'History shape: {history.shape}')
            print(f'User shape: {user.shape}')
            print(f'Input lengths shape: {input_lengths.shape}')
            print(f'Input lengths: {input_lengths}')
            
#             dense = self.LatentItemHistory(history)
#             dense = self.ActivationFn(dense)
            dense = history
#             print(f'Dense shape: {dense.shape}')
            # dense shape (history_length, hidden_state_size)
            if isinstance(input_lengths, int):
                alpha1 = torch.softmax(torch.matmul(user,torch.transpose(dense, 0, 1)),1)
                profile = torch.sum(torch.mul(alpha1,torch.transpose(dense,0,1)),1)
                h_0 = self.dropout(torch.reshape(profile, self.hidden_state_dim))
#             print(f'Alpha1 shape: {alpha1.shape}')
            # alpha shape (history_length)
            
#             print(f'Profile shape: {profile.shape}')
            # profile shape (hidden_state_size)
            # reshape to correct hidden state dimensions
#             h_0 = torch.reshape(self.ActivationFn(self.ProfileToHidden(profile)),self.hidden_state_dim)
            
#             print(f'h_0 shape: {h_0.shape}')
            else:
                alpha1 = self.Softmax(torch.matmul(dense,torch.transpose(user, 1, 2)))
                print(f'Alpha shape: {alpha1.shape}')
                profile = torch.sum(torch.mul(alpha1,dense),1)
                print(f'Profile shape: {profile.shape}')
                # profile shape (batch size, embedding size)
                h_0 = profile[None,:,:]
                print(f'h_0 shape: {h_0.shape}')
                # h_0 needs to be of shape: num layers, batch size, embedding size)

    
#             out_local, _ = self.Local(inputs, h_0)
#             print(f'Out_local shape: {out_local.shape}')
            
            if not isinstance(input_lengths, int):
                inputs = nn.utils.rnn.pack_padded_sequence(inputs, input_lengths, batch_first=True, enforce_sorted=False)
            out_global, _ = self.Global(inputs, h_0)
            if not isinstance(input_lengths, int):
                out_global, l = nn.utils.rnn.pad_packed_sequence(out_global, batch_first=True)
            print(f'Unpacked Out_global shape: {out_global.shape}')
            print(l)
            # out shape (batch_size, seq_length, hidden_size), containing hidden state output for every step
            # Shape of c_global and c_local should be: (sequence, hidden_size)
            c_global = out_global
            print(f'c_global shape: {c_global.shape}')
            # c_global shape (seq_length, hidden_size)
            c_local, alphas = self.calculate_c_local(out_global)
            print(f'c_local shape: {c_local.shape}')

            # Shape of c_global and c_local should be: (sequence, hidden_size)
            c = self.dropout(torch.cat((c_global, c_local), dim=2))
            # shape c (batch size, sequence length, embedding dim * 2)
            print(c.shape)
            # Decoder takes as inputs: embeddings for each item and c
            embeds = torch.FloatTensor([[self.vocab.p2e[str(i)] for i in range(len(self.vocab.p2i))]]).to(device)
            # shape embeds (1, number of products, embedding dim)
            print(embeds.shape)
#             embeds = self.ItemEmbedding(torch.LongTensor([i for i in range(len(self.vocab.p2i))]).to(device))
            batch_size = c.shape[0]
            sequence_length = c.shape[1]
            nr_of_products = embeds.shape[1]
#             out = c.new_empty(batch_size, sequence_length, nr_of_products)
#             for b in range(batch_size):
#                 for t in range(sequence_length):
#                     for p in range(nr_of_products):
# #                         tmp_c = c[b,t,:]
# #                         print(tmp_c.shape)
# #                         tmp_p = embeds[0,p,:]
# #                         print(tmp_p.shape)
#                         out[b,t,p] = self.Decoder(embeds[0,p,:].unsqueeze(0),c[b,t,:].unsqueeze(0)).squeeze()  
#                     gc.collect()
                
            # Make embeds and c the same shape
            embeds = embeds.repeat(c.shape[0],c.shape[1],1,1).contiguous()
            print(embeds.shape)
            c = c[:,:,None,:].repeat(1,1,embeds.shape[2],1).contiguous()
            print(c.shape)
            print(f'c shape: {c.shape}')
            print(f'All item embeds shape: {embeds.shape}')
            out = self.Decoder(embeds, c)[:,:,:,0]
            print(f'Output shape: {out.shape}')
            output = self.Softmax(out)
            return output, alphas
        else:    
            # user shape (1)
            # history shape (history_length)
            # inputs shape (seq_length,1)
            item_embeds_h = self.dropout(self.ItemEmbedding(history))
    #         print(f'Item embedding shape (history): {item_embeds_h.shape}')
            # item_embeds_h shape (history_length, embedding_size)
            user_embed = self.dropout(self.UserEmbedding(user))
    #         print(f'User embedding shape: {user_embed.shape}')
            # user_embed shape (1, embedding_size) <- shouldn't this be (1, hidden_state_size)?
            dense = self.LatentItemHistory(item_embeds_h)
            dense = self.ActivationFn(dense)
    #         print(f'Dense shape: {dense.shape}')
            # dense shape (history_length, hidden_state_size)
            alpha1 = self.Softmax(torch.matmul(user_embed,torch.transpose(dense, 0, 1)))
    #         print(f'Alpha1 shape: {alpha1.shape}')
            # alpha shape (history_length)
            profile = torch.sum(torch.mul(alpha1,torch.transpose(dense,0,1)),1)
    #         print(f'Profile shape: {profile.shape}')
            # profile shape (hidden_state_size)
            # reshape to correct hidden state dimensions
            h_0 = torch.reshape(self.ActivationFn(self.ProfileToHidden(profile)),self.hidden_state_dim)
    #         print(f'h_0 shape: {h_0.shape}')

            # Embed the items of the current session
            item_embeds_c = self.dropout(torch.transpose(self.ItemEmbedding(inputs),0,1))
    #         print(f'Item embedding shape (current): {item_embeds_c.shape}')
            # shape (seq_length, embedding_size)
            # add a batch dimension to the front, necessary for GRU
    #         item_embeds_c = item_embeds_c[None,:,:] 
    #         print(f'Item embedding shape (current): {item_embeds_c.shape}')

#             out_local, _ = self.Local(item_embeds_c,h_0)
    #         print(f'Out_local shape: {out_local.shape}')
            out_global, _ = self.Global(item_embeds_c,h_0)
    #         print(f'Out_global shape: {out_global.shape}')
            # out shape (batch_size, seq_length, hidden_size), containing hidden state output for every step
            # Shape of c_global and c_local should be: (sequence, hidden_size)
            c_global = out_global[0,:,:]
    #         print(f'c_global shape: {c_global.shape}')
            # c_global shape (seq_length, hidden_size)
            c_local, alphas = self.calculate_c_local(out_global)
    #         print(f'c_local shape: {c_local.shape}')

            # Shape of c_global and c_local should be: (sequence, hidden_size)
            c = self.dropout(torch.cat((c_global, c_local), dim=2))
            print(c.shape)
            # Decoder takes as inputs: embeddings for each item and c
            embeds = self.ItemEmbedding(torch.LongTensor([i for i in range(len(self.vocab.p2i))]).to(device))
            print(embeds.shape)
            # Make embeds and c the same shape
            embeds = embeds.repeat(c.shape[0],1,1).contiguous()
            c = c[:,None,:].repeat(1,embeds.shape[1],1).contiguous()
    #         print(f'c shape: {c.shape}')
    #         print(f'All item embeds shape: {embeds.shape}')
            out = self.Decoder(embeds, c)[:,:,0]
    #         print(f'Output shape: {out.shape}')
            output = self.Softmax(out)
            return output, alphas
    
    def calculate_c_local(self,H):
        # H: hidden states returned from the GRU
        # H shape (batch, sequence, hidden size)
        # Initialise c_local with the output hidden states: every hidden state has a similarity of 
        # 1 with itself, so the entire hidden state is taken into account
        c_local_base = H.clone().detach().requires_grad_(True)
        c_local = H.new_empty(H.shape)
  
        alphas = torch.ones((H.shape[0],H.shape[1],H.shape[1]),dtype=torch.float32).to(device)
        for b in range(H.shape[0]):
            cs = [torch.zeros((1,H.shape[2]), dtype=torch.float32).to(device)]
            for t in range(H.shape[1]):
                # If it is the first hidden state, then there are no previous hidden states to calculate the 
                # alpha from and the current hidden state has already been saved in c_local.
                if t == 0:
                    continue

                # Technically we do not need to store the alphas, but maybe we want to do something with these values
                alphas_t = torch.zeros(H.shape[1], dtype=torch.float32).to(device)
                A1 = self.A1(H[b,t,:])
                for j in range(t):
                    A2 = self.A2(H[b,j,:])
                    # next three lines could be done in one line
                    alphas_t[j] = self.v(self.ActivationFn(A1 + A2))
    #                 ct_j = torch.mul(alphas_t[j],H[j])
    #                 c_local[t] = torch.add(c_local[t],ct_j)
                ct = torch.sum(alphas_t[:t].unsqueeze(1) * H[b,:t,:],dim=0).unsqueeze(0)
                cs.append(ct)
                alphas[b][t] = alphas_t
            c_local[b] = torch.cat(cs,dim=0) + H[b]
            del(cs)
        return c_local, alphas
    
    def top1loss(self, output, targets):
        print(output.shape, targets.shape)
        output = output.view(1,-1,output.shape[2]).squeeze()
        targets = targets.view(-1,1)
        print(output.shape, targets.shape)
        
        # create a mask by filtering out all tokens that ARE NOT the padding token
        print(targets)
        mask = (targets > -1)
        masklist = mask.view(-1).tolist()
        inx = [r for r in range(len(masklist)) if masklist[r] == 1]
#         print(mask)
#         print(targets[mask])
#         print(output)
#         print(output[inx])
        targets = targets[mask].unsqueeze(1)
        output = output[inx]
#         # count how many tokens we have
#         nb_tokens = int(torch.sum(mask).data[0])
#         # pick the values for the label and zero out the rest with the mask
#         output = output[range(Y_hat.shape[0]), Y] * mask
        
        scores_for_targets = torch.gather(output, 1, targets)
        loss = torch.mean(torch.sigmoid(output - scores_for_targets) +
                torch.sigmoid(output**2))
        return loss
        

In [34]:
# function to train a model
name_extension = ''
def train_model(model, optimizer, num_epochs=200, 
                print_every=1, eval_every=1,
                batch_fn=get_examples, 
                prep_fn=prepare_example,
                eval_fn=simple_evaluate,
                batch_size=10, eval_batch_size=None,
                pre_trained=False,
                predict=False
               ):
    """Train a model."""  
    train_loss = 0.
    start = time.time()
    best_eval = 0.
    best_iter = 0
    eval_iter = 0
    criterion=model.loss

    # store train loss and validation accuracy during training
    # so we can plot them afterwards
    losses = []
    accuracies = []  

    if eval_batch_size is None:
        eval_batch_size = batch_size
    
    vocab = model.vocab
    
    product_embeddings = Word2Vec(allSessions, size=model.hidden_state_size, window=5, min_count=1)
#     print(product_embeddings.wv['1'])
    user_embeddings = Doc2Vec(allUsers, vector_size=model.hidden_state_size, window=5, min_count=1)
#     print(user_embeddings.wv['1'])
    vocab.u2e = user_embeddings.wv
    vocab.u2e.add(['0'], [np.zeros(model.hidden_state_size)])
    vocab.p2e = product_embeddings.wv
    vocab.p2e.add(['0'], [np.zeros(model.hidden_state_size)])
    
    for epoch in range(num_epochs):

        for example in batch_fn(train_data, batch_size=batch_size): # goes through the entire training data once, a.k.a. an epoch
            gc.collect()
            # forward pass, make sure the model is in train modus
            model.train()
            x, targets = prep_fn(example, vocab)
            print(targets)
#             print(example.userID)
            output, alphas = model(x)
            # output shape (sequence length, nr of products): a score for each product at each time step
            # alphas are the alphas used in the Narm part

            eval_iter += 1
                
            loss = criterion(output, targets)
            train_loss += float(loss.item())

            # backward pass
            # erase previous gradients
            model.zero_grad()

            # compute gradients
            loss.backward()

            # update weights - take a small step in the opposite dir of the gradient
            optimizer.step()
            
            break
            
            if eval_iter % 100 == 0:
                accuracy, _ = eval_fn(model, dev_data, batch_size=eval_batch_size,
                                         batch_fn=batch_fn, prep_fn=prep_fn)
                accuracies.append(accuracy)
                print("epoch %r: dev acc=%.4f" % (epoch + 1, accuracy))       

                # save best model parameters
                if accuracy > best_eval:
                    print("new highscore")
                    best_eval = accuracy
                    best_iter = epoch + 1
                    path = "{}{}.pt".format(model.__class__.__name__,name_extension)
                    ckpt = {
                      "state_dict": model.state_dict(),
                      "optimizer_state_dict": optimizer.state_dict(),
                      "best_eval": best_eval,
                      "best_iter": best_iter
                    }
                    torch.save(ckpt, path)
         
        break
        
        if (epoch + 1) % print_every == 0:
            print("Epoch %r: loss=%.4f, time=%.2fs" % 
                 (epoch + 1, train_loss, time.time()-start))
            losses.append(train_loss)       
            train_loss = 0.
            
        if (epoch + 1) % eval_every == 0:
            accuracy, _ = eval_fn(model, dev_data, batch_size=eval_batch_size,
                                         batch_fn=batch_fn, prep_fn=prep_fn)
            accuracies.append(accuracy)
            print("epoch %r: dev acc=%.4f" % (epoch + 1, accuracy))       

            # save best model parameters
            if accuracy > best_eval:
                print("new highscore")
                best_eval = accuracy
                best_iter = epoch + 1
                path = "{}{}.pt".format(model.__class__.__name__,name_extension)
                ckpt = {
                  "state_dict": model.state_dict(),
                  "optimizer_state_dict": optimizer.state_dict(),
                  "best_eval": best_eval,
                  "best_iter": best_iter
                }
                torch.save(ckpt, path)
    
    # Done training
    # evaluate on train, dev, and test with best model
    print("Loading best model")
    path = "{}{}.pt".format(model.__class__.__name__,name_extension)        
    ckpt = torch.load(path)
    model.load_state_dict(ckpt["state_dict"])

    train_acc, _ = eval_fn(
        model, train_data, batch_size=eval_batch_size, 
        batch_fn=batch_fn, prep_fn=prep_fn)
    dev_acc, _ = eval_fn(
        model, dev_data, batch_size=eval_batch_size,
        batch_fn=batch_fn, prep_fn=prep_fn)
    test_acc, predictions = eval_fn(
        model, test_data, batch_size=eval_batch_size, 
        batch_fn=batch_fn, prep_fn=prep_fn)

    print("best model iter {:d}: "
          "train acc={:.4f}, dev acc={:.4f}, test acc={:.4f}".format(
              best_iter, train_acc, dev_acc, test_acc))

    return test_acc, predictions

In [16]:
def createSplits(data, k):
    folds = {}
    for i in range(k):
        dev = data[math.ceil(i*len(data)/k) : math.ceil((i+1)*len(data)/k)]
        train = [x for x in data if x not in dev]
        folds[i] = train,dev
    return folds

In [71]:
gc.collect()
torch.cuda.empty_cache()
device='cpu'

train_data = trainData
test_data = testData
dev_data = testData[:100]

num_users = len(userList)
num_products = len(productList)

# item_embedding_dim, user_embedding_dim, hidden_size, output_dim, num_layers, vocab, 
# model = NarmPlus(math.ceil(num_products**0.25),math.ceil(num_users**0.25),10,1,1,v,dropout=0.2)
model = NarmPlus(50,50,50,1,1,v,dropout=0.2,pre_trained=True, batch_size=3)
model.to(device)
optimizer = optim.Adam(model.parameters())
a, p = train_model(model, optimizer, eval_fn=recall, 
                   num_epochs=1, 
                   prep_fn=prepare_minibatch_pre_trained,
                   batch_fn=get_minibatch,
                   batch_size=3)

tensor([[ 2902, 38878, 14828,  6834, 44964,   587, 58693,   587,    -1,    -1,
            -1,    -1],
        [21673, 32519, 32520, 32521, 32522, 17936,    -1,    -1,    -1,    -1,
            -1,    -1],
        [51765, 46554, 55729, 62367, 31223,  5902,  1896,  6156, 25988, 27341,
         52532,  4333]])
Inputs shape: torch.Size([3, 12, 50])
History shape: torch.Size([3, 25, 50])
User shape: torch.Size([3, 1, 50])
Input lengths shape: torch.Size([3])
Input lengths: tensor([ 8,  6, 12])
Alpha shape: torch.Size([3, 25, 1])
Profile shape: torch.Size([3, 50])
h_0 shape: torch.Size([1, 3, 50])
Unpacked Out_global shape: torch.Size([3, 12, 50])
tensor([ 8,  6, 12])
c_global shape: torch.Size([3, 12, 50])
c_local shape: torch.Size([3, 12, 50])
torch.Size([3, 12, 100])
torch.Size([1, 67172, 50])
torch.Size([3, 12, 67172, 50])
torch.Size([3, 12, 67172, 100])
c shape: torch.Size([3, 12, 67172, 100])
All item embeds shape: torch.Size([3, 12, 67172, 50])
Output shape: torch.Size([3, 12, 67172]

NameError: name 'example' is not defined

In [None]:
alph = torch.FloatTensor(
    [
        [[1],[0]],
        [[10],[-10]],
        [[5],[-5]],
        [[0],[1]],
        [[2],[3]]
    ]
)

prod = torch.FloatTensor(
    [
        [[1,1,1],[2,2,2]],
        [[1,1,1],[2,2,2]],
        [[1,1,1],[2,2,2]],
        [[1,1,1],[2,2,2]],
        [[1,1,1],[2,2,2]]
    ]
)

usr = torch.FloatTensor(
    [
        [[1,2,3]],
        [[4,5,6]],
        [[7,8,9]],
        [[10,11,12]],
        [[13,14,15]]
    ]
)
print(usr.shape)
print(prod.shape)

a = torch.matmul(prod,torch.transpose(usr,1,2))
print(a)
ass = torch.softmax(a,1)
print(ass)

torch.t(torch.sum(torch.mul(ass,prod),1)).shape

torch.__version__

In [64]:
alph = torch.FloatTensor(
    [
        [[1],[0]],
        [[10],[-10]],
        [[5],[-5]],
        [[0],[1]],
        [[2],[3]]
    ]
)
inx = [1,0,0,1,0]
selinx = [r for r in range(alph.shape[0]) if inx[r] == 1]
print(selinx)
alph[selinx] = 99
print(alph)
print(alph.shape)

[0, 3]
tensor([[[ 99.],
         [ 99.]],

        [[ 10.],
         [-10.]],

        [[  5.],
         [ -5.]],

        [[ 99.],
         [ 99.]],

        [[  2.],
         [  3.]]])
torch.Size([5, 2, 1])
