# NARM+




In [1]:
#Importing functions.
import pandas
import csv
from collections import Counter, OrderedDict, defaultdict, namedtuple
import itertools
import math
import numpy as np
import pandas
import random
from sklearn.model_selection import train_test_split
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# PyTorch can run on CPU or on an Nvidia GPU (video card) using CUDA.
# This cell selects the GPU if one is available and falls back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # torch.cuda.synchronize() initialises CUDA and raises when no GPU (or no
    # CUDA build) is present, so it must only run on the CUDA path.
    torch.cuda.synchronize()


In [2]:
# Create sessions
# Load the pickled interaction log and add a SESSION column holding the
# calendar date of each TIMESTAMP (interpreted as seconds since the epoch),
# i.e. all of a user's rows on the same date form one session.
# NOTE: read_pickle unpickles arbitrary objects - only load trusted files.
sample = pandas.read_pickle("./data/smallTaoBao.pkl")
sample['SESSION'] = pandas.to_datetime(sample['TIMESTAMP'],unit='s').dt.date
sample.head()

Unnamed: 0,USERID,PRODUCTID,CATEGORYID,ACTION,TIMESTAMP,SESSION
0,1,2268318,2520377,pv,1511544070,2017-11-24
1,1,2333346,2520771,pv,1511561733,2017-11-24
2,1,2576651,149192,pv,1511572885,2017-11-25
3,1,3830808,4181361,pv,1511593493,2017-11-25
4,1,4365585,2520377,pv,1511596146,2017-11-25


In [3]:
# Count the distinct SESSION values of every user, then average over users.
print("Average number of sessions per user")
sample.groupby('USERID')['SESSION'].nunique().mean()

Average number of sessions per user


7.2

In [4]:
# Count the non-null ACTION rows of every (user, session) pair, then average.
# Every row is counted regardless of the value stored in ACTION.
print("Average number of clicks per session")
sample.groupby(['USERID', 'SESSION'])['ACTION'].count().mean()

Average number of clicks per session


14.126388888888888

In [5]:
# Inspect the rows that belong to the user with USERID 1.
user1 = sample.loc[sample['USERID'] == 1]
user1.head()

Unnamed: 0,USERID,PRODUCTID,CATEGORYID,ACTION,TIMESTAMP,SESSION
0,1,2268318,2520377,pv,1511544070,2017-11-24
1,1,2333346,2520771,pv,1511561733,2017-11-24
2,1,2576651,149192,pv,1511572885,2017-11-25
3,1,3830808,4181361,pv,1511593493,2017-11-25
4,1,4365585,2520377,pv,1511596146,2017-11-25


In [6]:
# Create nested list of sessions and items per user
# First collect the PRODUCTID values of each (USERID, SESSION) group into a
# list, then collect those per-session lists into one list per USERID.
# The result is a Series indexed by USERID whose values are lists of lists.
userBase = sample.groupby(['USERID', 'SESSION'])['PRODUCTID'].apply(list).groupby('USERID').apply(list)
print(userBase[1])

[[2268318, 2333346], [2576651, 3830808, 4365585, 4606018, 230380], [3827899, 3745169, 1531036], [2266567, 2951368, 3108797, 1338525, 2286574], [5002615, 2734026, 5002615, 3239041, 4615417, 4152983, 266784, 46259, 266784, 4092065, 1305059], [2791761, 3239041, 46259, 4973305, 2087357, 3157558], [2087357, 4170517, 1340922, 3911125, 4170517, 3682069, 4954999, 79715, 4666650], [1323189, 4198227, 4954999], [2041056, 3219016, 2104483, 2028434, 3219016, 2278603, 929177], [4954999, 818610, 271696, 568695]]


In [7]:
# More efficient create examples function
# namedtuple gives a lightweight record type with named, read-only fields.
Example = namedtuple("Example", ["userID", "history", "inputs", "target"])

def f(userid, sessions, train):
    """Turn one user's list of sessions into a single next-item Example.

    The training example is built from the second-to-last session and the
    test example from the last one. Every session before the chosen one is
    flattened into ``history``; within the chosen session ``inputs`` is
    everything but the final item and ``target`` is everything but the
    first item (the inputs shifted by one position).

    Raises IndexError when ``sessions`` is too short to contain the
    requested session.
    """
    position = -2 if train else -1
    current_session = sessions[position]
    earlier_items = list(itertools.chain.from_iterable(sessions[:position]))
    return Example(
        userID=userid,
        history=earlier_items,
        inputs=current_session[:-1],
        target=current_session[1:],
    )

def createExamples(userBase):
    """Create the training and testing examples, one of each per user.

    ``userBase`` is wrapped in a DataFrame and its first index level is
    turned into a regular column, so that every row exposes 'USERID' and
    'PRODUCTID'. Each row is then passed to ``f`` twice: once to build the
    training example and once to build the test example.

    Returns a ``(trainData, testData)`` pair of lists.
    """
    frame = pandas.DataFrame(userBase).reset_index(level=0)

    def build(train):
        # One Example per row, collected into a plain list.
        rows = frame.apply(
            lambda row: f(row['USERID'], row['PRODUCTID'], train), axis=1)
        return rows.tolist()

    return build(True), build(False)

# Build both example lists and print the first example of each, separated by
# an empty line, as a quick visual check.
trainData, testData = createExamples(userBase)
print(trainData[0])
print('')
print(testData[0])
    
    

Example(userID=1, history=[2268318, 2333346, 2576651, 3830808, 4365585, 4606018, 230380, 3827899, 3745169, 1531036, 2266567, 2951368, 3108797, 1338525, 2286574, 5002615, 2734026, 5002615, 3239041, 4615417, 4152983, 266784, 46259, 266784, 4092065, 1305059, 2791761, 3239041, 46259, 4973305, 2087357, 3157558, 2087357, 4170517, 1340922, 3911125, 4170517, 3682069, 4954999, 79715, 4666650, 1323189, 4198227, 4954999], inputs=[2041056, 3219016, 2104483, 2028434, 3219016, 2278603], target=[3219016, 2104483, 2028434, 3219016, 2278603, 929177])

Example(userID=1, history=[2268318, 2333346, 2576651, 3830808, 4365585, 4606018, 230380, 3827899, 3745169, 1531036, 2266567, 2951368, 3108797, 1338525, 2286574, 5002615, 2734026, 5002615, 3239041, 4615417, 4152983, 266784, 46259, 266784, 4092065, 1305059, 2791761, 3239041, 46259, 4973305, 2087357, 3157558, 2087357, 4170517, 1340922, 3911125, 4170517, 3682069, 4954999, 79715, 4666650, 1323189, 4198227, 4954999, 2041056, 3219016, 2104483, 2028434, 3219016, 

In [8]:
# Here we first define a class that can map a product to an ID (p2i)
# and back (i2p).

class OrderedCounter(Counter, OrderedDict):
    """A Counter whose keys keep the order in which they were first counted."""

    def __repr__(self):
        # Show the class name around an OrderedDict view of the counts.
        snapshot = OrderedDict(self)
        return '{}({!r})'.format(self.__class__.__name__, snapshot)

    def __reduce__(self):
        # Pickle as "call the class with an OrderedDict copy of the counts".
        constructor_args = (OrderedDict(self),)
        return self.__class__, constructor_args


class Vocabulary:
    """A vocabulary, assigns IDs to tokens.

    Attributes:
        freqs: occurrence count per token, in first-seen order.
        p2i:   mapping token -> integer ID.
        i2p:   list where position ``i`` holds the token with ID ``i``.
    """
    def __init__(self):
        self.freqs = OrderedCounter()
        self.p2i = {}
        self.i2p = []

    def count_token(self, t):
        """Record one more occurrence of token ``t``."""
        self.freqs[t] += 1

    def add_token(self, t):
        """Assign the next free ID to ``t``; a token that already has an ID keeps it."""
        # Without this guard a repeated add overwrites the ID in p2i while
        # i2p grows, so the two mappings stop agreeing with each other.
        if t in self.p2i:
            return
        self.p2i[t] = len(self.i2p)
        self.i2p.append(t)

    def build(self, min_freq=0):
        """Assign IDs to all counted tokens with at least ``min_freq`` occurrences.

        Tokens are numbered from most to least frequent. The sort is stable,
        so tokens with equal counts keep their first-seen order. Calling
        ``build`` again is safe: existing IDs are kept and only tokens
        without an ID are appended.
        """
        # An "<unk>" token for products unseen during training was planned
        # (ID 0) but is currently not reserved.
        by_frequency = sorted(self.freqs.items(), key=lambda item: item[1], reverse=True)
        for tok, freq in by_frequency:
            if freq >= min_freq:
                self.add_token(tok)

In [9]:
# This process should be deterministic and should have the same result 
# if run multiple times on the same data set.

def build_voc(data_sets):
    """Build a Vocabulary from the products in the given data sets.

    Every product in the ``history`` and ``inputs`` of every example is
    counted; IDs are assigned once, after all data sets have been counted,
    so that the ordering reflects the combined frequencies.

    NOTE(review): ``target`` items are not counted, so the final item of a
    session only gets an ID if it also occurs elsewhere - confirm that this
    is intended.
    """
    v = Vocabulary()
    for data_set in data_sets:
        for ex in data_set:
            for product in ex.history + ex.inputs:
                v.count_token(product)
    # Build exactly once: doing it per data set would assign IDs from partial
    # counts and re-add tokens that were already numbered.
    v.build()
    return v

# Only the training examples are used to build the vocabulary.
v = build_voc([trainData])
print("Vocabulary size:", len(v.p2i))


Vocabulary size: 6701


In [10]:
# HELPER FUNCTIONS

# function to yield one example at a time
def get_examples(data, shuffle=True, **kwargs):
    """Yield the examples of ``data`` one at a time until none are left.

    With ``shuffle`` enabled the list is shuffled in place first, so the
    caller's list is reordered as well. Additional keyword arguments are
    accepted and ignored.
    """
    if shuffle:
        # Reshuffled on every call, i.e. once per pass over the data.
        random.shuffle(data)
    yield from data
    
# function to prepare an example for usage by the model
def prepare_example(example):
    """
    Turn an example into tensors of inputs and target.

    Returns ``((user, history, inputs), target)`` where, all of dtype long
    and placed on ``device``:
        user    has shape (1,) and holds ``example.userID``
        history has shape (len(example.history),)
        inputs  has shape (len(example.inputs), 1)
        target  has shape (len(example.target), 1)

    NOTE(review): the values are copied as-is. If they are fed to embedding
    layers they must already be indices smaller than the embedding table
    size - presumably a vocabulary lookup is still needed before this step;
    verify against the caller.
    """
    # The id must be wrapped in a list: torch.LongTensor(n) with a bare int
    # allocates an uninitialised tensor of length n rather than a tensor
    # containing n.
    user = torch.tensor([example.userID], dtype=torch.long, device=device)

    history = torch.tensor(example.history, dtype=torch.long, device=device)

    # [:, None] appends a trailing axis of size one.
    inputs = torch.tensor(example.inputs, dtype=torch.long, device=device)[:, None]

    target = torch.tensor(example.target, dtype=torch.long, device=device)[:, None]

    return (user, history, inputs), target

# simple evaluation function
def simple_evaluate(model, data, prep_fn=prepare_example, **kwargs):
    """Explained Variance Score of a model on given data set.

    Args:
        model:   module called as ``model(x)``; it is switched to eval mode
                 and left in that mode.
        data:    iterable of examples exposing a ``target`` attribute.
        prep_fn: callable turning an example into ``(x, target_tensor)``;
                 only ``x`` is used here.
        **kwargs: accepted for call compatibility and ignored.

    Returns:
        ``(score, None)`` - the second slot is reserved for predictions.

    NOTE(review): the score compares the raw ``example.target`` lists with
    the first row of the model output; confirm that this is the intended
    metric for this model.
    """
    # Imported here because the file only imports sklearn.model_selection.
    from sklearn.metrics import explained_variance_score

    model.eval()  # disable dropout and other train-only behaviour
    targets = []
    predictions = []

    for example in data:
        targets.append(example.target)
        # Use the preparation function that was passed in, not a fixed one.
        x, _ = prep_fn(example)

        # forward pass without gradient tracking
        with torch.no_grad():
            output = model(x)
        predictions.append(output[0].tolist())

    score = explained_variance_score(targets, predictions, multioutput='variance_weighted')

    return score, None

In [27]:
# Custom NN

class NarmPlus(nn.Module):
    """NARM-style session recommender whose GRUs start from a user profile.

    The forward pass handles one session at a time:
      1. History part: the user's earlier items are embedded, projected to
         the GRU hidden-state size and combined into one profile vector by
         attention against the user embedding. The profile is the initial
         hidden state of both GRUs.
      2. NARM part: a "global" GRU and a "local" GRU read the current
         session; the local outputs are additionally mixed with attention
         over the preceding steps.
      3. Decoder: every scored item ``i`` gets the bilinear score
         ``emb_i^T B c_t`` for every step ``t``, followed by a softmax over
         the items.

    Args:
        embedding_dim:   size of the item embeddings.
        hidden_size:     GRU hidden size.
        output_dim:      stored for reference only; the width of the output
                         is ``len(item_vocabulary)``.
        num_layers:      number of stacked GRU layers.
        num_items:       number of rows of the item embedding table.
        item_vocabulary: sequence of item indices that are scored.
        num_users:       number of rows of the user embedding table.
        activation_fn:   activation used in the history projection and in
                         the local attention.
    """

    def __init__(self,
                 embedding_dim, hidden_size, output_dim, num_layers,
                 num_items, item_vocabulary, num_users,
                 activation_fn=nn.RReLU()):
        super(NarmPlus, self).__init__()
        # Store parameters
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.output_dim = output_dim
        self.num_layers = num_layers
        # Shape of hidden_state: (num_layers * num_directions, batch, hidden_size)
        self.hidden_state_dim = (num_layers, 1, hidden_size)
        self.hidden_state_size = num_layers * hidden_size
        self.items = item_vocabulary

        # General part
        self.ActivationFn = activation_fn
        # Explicit dim: all softmaxes in this model normalise the last axis.
        self.Softmax = nn.Softmax(dim=-1)
        self.loss = self.top1

        # History part
        # The user embedding is compared with the projected history items, so
        # it lives in the hidden-state space (identical to the item embedding
        # size only when embedding_dim == num_layers * hidden_size).
        self.UserEmbedding = nn.Embedding(num_users, self.hidden_state_size)
        self.ItemEmbedding = nn.Embedding(num_items, embedding_dim)
        self.LatentItemHistory = nn.Linear(embedding_dim, self.hidden_state_size)

        # NARM part
        # Input to the GRUs is the item embedding: input_size = embedding_dim
        self.Local = nn.GRU(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True)
        self.Global = nn.GRU(embedding_dim, hidden_size, num_layers=num_layers, batch_first=True)
        # Matrix B of the bilinear score emb_i^T B c_t, where c_t is the
        # concatenation of the global and local representation (2 * hidden).
        self.Decoder = nn.Linear(2 * hidden_size, embedding_dim, bias=False)

        # Local attention, no biases
        latent_space = hidden_size
        self.A1 = nn.Linear(hidden_size, latent_space, bias=False)
        self.A2 = nn.Linear(hidden_size, latent_space, bias=False)
        self.v = nn.Linear(latent_space, 1, bias=False)

    def forward(self, x):
        """Score all items of ``self.items`` for every step of one session.

        Args:
            x: tuple ``(user, history, inputs)`` of long tensors holding one
               user index, the indices of the history items and the indices
               of the session items (any shape, flattened internally).

        Returns:
            Tensor of shape (seq_length, len(self.items)); every row is a
            probability distribution over the scored items.
        """
        user, history, inputs = x

        # --- user profile from the history -------------------------------
        item_embeds_h = self.ItemEmbedding(history.reshape(-1))
        # (history_length, embedding_dim)
        dense = self.ActivationFn(self.LatentItemHistory(item_embeds_h))
        # (history_length, hidden_state_size)
        user_embed = self.UserEmbedding(user.reshape(1))
        # (1, hidden_state_size)
        alpha1 = self.Softmax(torch.matmul(user_embed, dense.t()))
        # (1, history_length): attention weight of every history item
        profile = torch.matmul(alpha1, dense)
        # (1, hidden_state_size): weighted sum of the projected history
        h_0 = profile.reshape(self.hidden_state_dim)

        # --- current session -----------------------------------------------
        # Flatten instead of squeeze() so a one-step session keeps its
        # sequence axis, then add the batch axis the GRUs expect.
        item_embeds_c = self.ItemEmbedding(inputs.reshape(-1)).unsqueeze(0)
        # (1, seq_length, embedding_dim)

        out_local, _ = self.Local(item_embeds_c, h_0)
        out_global, _ = self.Global(item_embeds_c, h_0)
        # (1, seq_length, hidden_size): hidden state of the top layer per step
        c_global = out_global.squeeze(0)
        c_local, _ = self.calculate_c_local(out_local)
        # both (seq_length, hidden_size)

        c = torch.cat((c_global, c_local), dim=1)
        # (seq_length, 2 * hidden_size)

        # --- decoder ---------------------------------------------------------
        # Create the index tensor on the device of the parameters.
        item_ids = torch.as_tensor(self.items, dtype=torch.long,
                                   device=self.ItemEmbedding.weight.device)
        embeds = self.ItemEmbedding(item_ids)
        # (num_scored_items, embedding_dim)
        out = torch.matmul(self.Decoder(c), embeds.t())
        # (seq_length, num_scored_items)
        return self.Softmax(out)

    def calculate_c_local(self, H):
        """Mix every hidden state with attention over the preceding states.

        For step ``t``:
            c_local[t] = H[t] + sum_{j < t} alpha[t, j] * H[j]
            alpha[t, j] = v^T activation(A1 H[t] + A2 H[j])

        Args:
            H: GRU outputs of shape (1, seq_length, hidden_size) or
               (seq_length, hidden_size).

        Returns:
            ``(c_local, alphas)`` with shapes (seq_length, hidden_size) and
            (seq_length, seq_length). ``alphas[t, j]`` is the weight of state
            ``j`` in ``c_local[t]``: the attention score for ``j < t``, one
            for ``j == t`` and zero for ``j > t``.
        """
        H = H.reshape(-1, H.size(-1))
        seq_length = H.size(0)

        # All (t, j) pairs at once; no in-place writes, so autograd can
        # differentiate through the result.
        current = self.A1(H).unsqueeze(1)    # (seq_length, 1, latent)
        previous = self.A2(H).unsqueeze(0)   # (1, seq_length, latent)
        scores = self.v(self.ActivationFn(current + previous)).squeeze(-1)
        # (seq_length, seq_length), entry [t, j]

        # Keep only strictly earlier steps (j < t) ...
        earlier = torch.tril(
            torch.ones(seq_length, seq_length, dtype=H.dtype, device=H.device),
            diagonal=-1)
        # ... and give every state weight one for itself.
        alphas = scores * earlier + torch.eye(seq_length, dtype=H.dtype, device=H.device)

        c_local = torch.matmul(alphas, H)
        return c_local, alphas

    def top1(self, yhat, targets=None):
        """TOP1 ranking loss.

        Args:
            yhat: square score matrix whose diagonal holds the scores of the
                  positive items, or - when ``targets`` is given - a matrix of
                  shape (seq_length, num_items) with the scores of all items.
            targets: optional long tensor with one target item index per
                  row of ``yhat``. The columns of these items are selected,
                  so the targets of the other rows act as negatives.

        Returns:
            Scalar loss tensor.

        Raises:
            ValueError: if the (selected) score matrix is not square.

        Adapted from https://github.com/mquad/hgru4rec/blob/master/src/hgru4rec.py
        NOTE(review): the reference implementation appears to divide the last
        term by the batch size - confirm which variant is wanted.
        """
        if targets is not None:
            yhat = yhat[:, targets.reshape(-1)]
        if yhat.dim() != 2 or yhat.size(0) != yhat.size(1):
            raise ValueError("top1 expects a square score matrix, got shape %s"
                             % (tuple(yhat.shape),))
        yhatT = torch.transpose(yhat, 0, 1)
        loss = torch.mean(
            torch.mean(torch.sigmoid(-torch.diag(yhat) + yhatT)
                       + torch.sigmoid(yhat ** 2), dim=0)
            - torch.sigmoid(torch.diag(yhat ** 2)))
        return loss
        

In [12]:
# function to train a model
name_extension = ''
def train_model(model, optimizer, num_iterations=100000,
                print_every=100000, eval_every=100000,
                batch_fn=get_examples,
                prep_fn=prepare_example,
                eval_fn=simple_evaluate,
                batch_size=1, eval_batch_size=None,
                predict=False
               ):
    """Train a model.

    Runs ``num_iterations`` optimisation steps, one example per step. Every
    ``eval_every`` steps the model is scored on ``dev_data`` and checkpointed
    when the score improves. After the last step the best checkpoint of this
    run (if any) is restored and the model is scored on the train, dev and
    test data.

    The data is read from the module-level names ``train_data``, ``dev_data``
    and ``test_data``, and the checkpoint file name uses the module-level
    ``name_extension``; all of them must be defined before calling.

    Args:
        model: module exposing ``loss(output, targets)``.
        optimizer: optimizer over the model parameters.
        num_iterations: number of update steps (at least 1).
        print_every: interval at which the accumulated loss is recorded.
        eval_every: interval at which the dev score is computed.
        batch_fn: ``batch_fn(data, batch_size=...)`` yielding examples.
        prep_fn: turns one example into ``(x, targets)``.
        eval_fn: ``eval_fn(model, data, ...)`` returning ``(score, predictions)``.
        batch_size: forwarded to ``batch_fn``.
        eval_batch_size: forwarded to ``eval_fn``; defaults to ``batch_size``.
        predict: when true and predictions are returned, write them to CSV.

    Returns:
        ``(test_acc, predictions)`` as returned by ``eval_fn`` on the test data.

    Raises:
        ValueError: if ``num_iterations`` < 1 or ``train_data`` is empty; both
            cases would otherwise loop forever.
    """
    if num_iterations < 1:
        raise ValueError("num_iterations must be at least 1")
    if len(train_data) == 0:
        raise ValueError("train_data is empty; there is nothing to train on")

    iter_i = 0
    train_loss = 0.
    # Start below every possible score so that the first evaluation always
    # produces a checkpoint, also when the score is zero or negative.
    best_eval = float('-inf')
    best_iter = 0
    checkpoint_saved = False
    criterion = model.loss
    path = "{}{}.pt".format(model.__class__.__name__, name_extension)

    # store train loss and validation accuracy during training
    # so we can plot them afterwards
    losses = []
    accuracies = []

    if eval_batch_size is None:
        eval_batch_size = batch_size

    while True:  # when we run out of examples, shuffle and continue
        for batch in batch_fn(train_data, batch_size=batch_size):

            # forward pass
            model.train()
            x, targets = prep_fn(batch)
            output = model(x)

            # compute the loss with the model's own criterion
            loss = criterion(output, targets)
            train_loss += loss.item()

            # backward pass: erase previous gradients, compute new ones and
            # take a small step in the opposite direction of the gradient
            model.zero_grad()
            loss.backward()
            optimizer.step()

            iter_i += 1

            # record the loss accumulated since the previous record
            if iter_i % print_every == 0:
                losses.append(train_loss)
                train_loss = 0.

            # evaluate
            if iter_i % eval_every == 0:
                accuracy, _ = eval_fn(model, dev_data, batch_size=eval_batch_size,
                                      batch_fn=batch_fn, prep_fn=prep_fn)
                accuracies.append(accuracy)

                # save best model parameters
                if accuracy > best_eval:
                    best_eval = accuracy
                    best_iter = iter_i
                    ckpt = {
                      "state_dict": model.state_dict(),
                      "optimizer_state_dict": optimizer.state_dict(),
                      "best_eval": best_eval,
                      "best_iter": best_iter
                    }
                    torch.save(ckpt, path)
                    checkpoint_saved = True

            # done training
            if iter_i == num_iterations:
                # Restore the best parameters of THIS run. Without a
                # checkpoint from this run the current parameters are kept,
                # instead of failing on a missing file or silently loading a
                # stale file left by an earlier run.
                if checkpoint_saved:
                    ckpt = torch.load(path)
                    model.load_state_dict(ckpt["state_dict"])

                train_acc, _ = eval_fn(
                    model, train_data, batch_size=eval_batch_size,
                    batch_fn=batch_fn, prep_fn=prep_fn)
                dev_acc, _ = eval_fn(
                    model, dev_data, batch_size=eval_batch_size,
                    batch_fn=batch_fn, prep_fn=prep_fn)
                test_acc, predictions = eval_fn(
                    model, test_data, batch_size=eval_batch_size,
                    batch_fn=batch_fn, prep_fn=prep_fn)

                if predictions and predict:
                    # NOTE(review): new_index_columns is not defined in the
                    # visible part of this file - confirm where the header
                    # columns should come from before enabling predict.
                    with open(f"predictions-{name_extension}.csv", "w", newline="") as out:
                        wr = csv.writer(out)
                        wr.writerow(['gwb_code_8'] + new_index_columns[3:])
                        wr.writerows(predictions)

                return test_acc, predictions

In [13]:
def createSplits(data, k):
    """Split ``data`` into ``k`` consecutive folds for cross-validation.

    Args:
        data: sliceable sequence of examples.
        k: number of folds.

    Returns:
        Dict mapping the fold index ``i`` to ``(train, dev)``, where ``dev``
        is the i-th consecutive slice of ``data`` and ``train`` is a list of
        all remaining elements in their original order.
    """
    n = len(data)
    folds = {}
    for i in range(k):
        lo = math.ceil(i * n / k)
        hi = math.ceil((i + 1) * n / k)
        dev = data[lo:hi]
        # Split by position rather than by membership: an element outside the
        # dev slice stays in train even if it equals a dev element, and no
        # quadratic search is needed.
        train = list(data[:lo]) + list(data[hi:])
        folds[i] = train, dev
    return folds

In [28]:
# Positional arguments of NarmPlus, in order:
# embedding_dim, hidden_size, output_dim, num_layers, num_items, item_vocabulary, num_users
# train_model reads the training examples from the module-level train_data.
train_data = trainData

# Embedding and hidden size 64, one GRU layer, item tables and scored items
# sized by the vocabulary, and room for 100 users.
# NOTE(review): the recorded "device-side assert" is the typical symptom of an
# embedding index outside its table; presumably raw PRODUCTID / USERID values
# (not vocabulary indices) reach the embeddings - verify.
model = NarmPlus(64,64,len(v.p2i),1,len(v.p2i),[i for i in range(len(v.p2i))],100)
model.to(device)
# Adam with its default hyper-parameters.
optimizer = optim.Adam(model.parameters())
a, p = train_model(model,optimizer)

RuntimeError: CUDA error: device-side assert triggered