## Libraries Used

In [1]:
import tensorflow as tf
import pandas as pd          
import numpy as np
import scipy.sparse as sp
from tqdm import tqdm

## Data loading

In [2]:
df = pd.read_excel('C:/Users/chand/Downloads/Online Retail.xlsx')

In [3]:
df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


In [4]:
## Checking for NAs
df.isna().sum()

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [5]:
df.shape

(541909, 8)

In [6]:
## Making columns names smaller
df.columns = df.columns.str.lower()

In [7]:
## On looking close at data some of the transaction are of return so we have to remove them
## Also filling -1 where customerid is not present
df = df[~df.invoiceno.astype('str').str.startswith('C')].reset_index(drop=True)
df.customerid = df.customerid.fillna(-1).astype('int32')

In [8]:
## we should encode all item IDs (stockcode) with integers
stockcode_values = df.stockcode.astype('str')
stockcodes = sorted(set(stockcode_values))
stockcodes = {c: i for (i, c) in enumerate(stockcodes)}
df.stockcode = stockcode_values.map(stockcodes).astype('int32')

In [9]:
df.head()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
0,536365,3527,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,2791,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,3040,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,2981,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,2980,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom


## Splitting of Data

In [10]:
df_train = df[df.invoicedate < '2011-10-09']
df_val = df[(df.invoicedate >= '2011-10-09') & 
            (df.invoicedate <= '2011-11-09') ]
df_test = df[df.invoicedate >= '2011-11-09']

Our evaluation measure is then the number of successful recommendations (the items the user has actually bought) divided by the number of total recommendations we made. This is called precision—a common measure of evaluating the performance of machine learning models.

## First baseline model

First baseline is how many times a user has bought an item, then take the most frequent five items, and recommend these items to all the users.

In [11]:
top = df_train.stockcode.value_counts().head(5).index.values

In [12]:
top

array([3527, 3506, 1347, 2730,  180], dtype=int64)

Now we use this array to recommend it to all the users. So we repeat the top array as many times as there are transactions in the validation dataset, and then we use this as the recommendations and calculate the precision metric to evaluate the quality.

In [13]:
num_groups = len(df_val.invoiceno.drop_duplicates())
baseline = np.tile(top, num_groups).reshape(-1, 5)

In [14]:
baseline

array([[3527, 3506, 1347, 2730,  180],
       [3527, 3506, 1347, 2730,  180],
       [3527, 3506, 1347, 2730,  180],
       ...,
       [3527, 3506, 1347, 2730,  180],
       [3527, 3506, 1347, 2730,  180],
       [3527, 3506, 1347, 2730,  180]], dtype=int64)

Now we are ready to calculate the precision of this recommendation.However, there is a complication: the way the items are stored makes it difficult to calculate the number of correctly classified elements per group. Using groupby from pandas is one way of solving the problem.The reason it is slow is the way groupby is implemented in pandas: it internally performs sorting, which we do not need. However, we can improve the speed by exploiting the way the data is stored: we know that the elements of our dataframe are always ordered. That is, if a transaction starts at a certain row number i, then it ends at the number i + k, where k is the number of items in this transaction. In other words, all the rows between i and i + k belong to the same invoiceid.

Let us call this array indptr. For each transaction t:  
indptr[t] returns the number of the row in the dataframe where the transaction starts.  
indptr[t + 1] returns the row where it ends.

In [15]:
df.shape

(532621, 8)

In [16]:
df.tail()

Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country
532616,581587,1526,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France
532617,581587,1802,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.1,12680,France
532618,581587,2144,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France
532619,581587,2145,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France
532620,581587,1092,BAKING SET 9 PIECE RETROSPOT,3,2011-12-09 12:50:00,4.95,12680,France


In [17]:
def group_indptr(df):
    indptr, = np.where(df.invoiceno != df.invoiceno.shift())   
    indptr = np.append(indptr, len(df)).astype('int32')
    return indptr

In [18]:
# Getting pointer arrays for validation
val_indptr = group_indptr(df_val)
val_indptr

array([    0,     3,    32, ..., 63531, 64049, 64460])

In [19]:
df_val.stockcode.values

array([ 663, 2766,  810, ..., 1989, 2262, 2261])

In [20]:
## Using numba for fast computation
from numba import njit
@njit
def precision(group_indptr, true_items, predicted_items):
    tp = 0    
    n, m = predicted_items.shape    
    for i in range(n):
        group_start = group_indptr[i]
        group_end = group_indptr[i + 1]
        group_true_items = true_items[group_start:group_end]
        for item in group_true_items: 
            for j in range(m):
                if item == predicted_items[i, j]: 
                    tp = tp + 1   
                    continue   
    return tp / (n * m)

Now we can check what is the precision of this baseline:

In [21]:
val_items = df_val.stockcode.values
precision(val_indptr, val_items, baseline)

0.0642299794661191

Executing this code should produce 0.064. That is, in 6.4% of the cases we made the correct recommendation. This means that the user ended up buying the recommended item only in 6.4% cases.

Now when we take a first look at the data and establish a simple baseline, we can proceed to more complex techniques such as matrix factorization.

## Implicit baseline model 

 
 we will establish another baseline have stronger than the previous one. We will use the implicit library, which uses ALS.

In [22]:
# We have already converted items (the column stockcode) to integers.
# we need to perform the same on the user IDs (the column customerid):
df_train_user = df_train[df_train.customerid != -1].reset_index(drop=True)
customers = sorted(set(df_train_user.customerid))
customers = {c: i for (i, c) in enumerate(customers)}
df_train_user.customerid = df_train_user.customerid.map(customers)

In [23]:
## Doing same for validation data
df_val.customerid = df_val.customerid.apply(lambda c: customers.get(c, -1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [24]:
### we use these integer codes to construct the matrix X
uid = df_train_user.customerid.values.astype('int32')
iid = df_train_user.stockcode.values.astype('int32')
ones = np.ones_like(uid, dtype='uint8')
X_train = sp.csr_matrix((ones, (uid, iid)))


Now let us use implicit to factorize the matrix X and learn the user and item vectors:


In [25]:
from implicit.als import AlternatingLeastSquares
item_user = X_train.T.tocsr()
als = AlternatingLeastSquares(factors=128, regularization=0.000001)
als.fit(item_user)

100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:00<00:00, 23.43it/s]


In [26]:
als_U = als.user_factors
als_I = als.item_factors

Matrix factorization methods have a problem: they cannot deal with new users. To overcome this problem, we can simply combine it with the baseline method: use the baseline to make a recommendation to new and unknown users, but apply Matrix Factorization to known users.

In [27]:
uid_val = df_val.drop_duplicates(subset='invoiceno').customerid.values
known_mask = uid_val != -1
uid_val = uid_val[known_mask] 

In [28]:
imp_baseline = baseline.copy()
pred_all = als_U[uid_val].dot(als_I.T)
top_val = (-pred_all).argsort(axis=1)[:, :5]
imp_baseline[known_mask] = top_val
precision(val_indptr, val_items, imp_baseline)

0.13782340862422998

This outputs 14.1%. This is a lot stronger baseline than our previous baseline of 6.4%

## SGD Based Matrix Factorization

In [29]:
#Let us define a helper function for declaring embedding layers:
def embed(inputs, size, dim, name=None):
    std = np.sqrt(2 / dim)
    emb = tf.Variable(tf.random_uniform([size, dim], -std, std), name=name) 
    lookup = tf.nn.embedding_lookup(emb, inputs)   
    return lookup

#### MODEL GRAPH

In [30]:
tf.placeholder(tf.int32, shape=(None, 1))

<tf.Tensor 'Placeholder:0' shape=(?, 1) dtype=int32>

In [31]:
# parameters of the model
num_users = uid.max() + 1
num_items = iid.max() + 1
num_factors = 128
lambda_user = 0.0000001
lambda_item = 0.0000001
K = 5
lr = 0.005
graph = tf.Graph()
graph.seed = 1
with graph.as_default(): 
    # this is the input to the model
    place_user = tf.placeholder(tf.int32, shape=(None, 1))  
    place_item = tf.placeholder(tf.int32, shape=(None, 1))  
    place_y = tf.placeholder(tf.float32, shape=(None, 1))   
    # user features    
    user_factors = embed(place_user, num_users, num_factors,         "user_factors") 
    user_bias = embed(place_user, num_users, 1, "user_bias")    
    user_bias = tf.reshape(user_bias, [-1, 1])   
    # item features    
    item_factors = embed(place_item, num_items, num_factors,         "item_factors") 
    item_bias = embed(place_item, num_items, 1, "item_bias")   
    item_bias = tf.reshape(item_bias, [-1, 1])  
    global_bias = tf.Variable(0.0, name='global_bias')  
    # prediction is dot product followed by a sigmoid    
    pred = tf.reduce_sum(user_factors * item_factors, axis=2)  
    pred = tf.sigmoid(global_bias + user_bias + item_bias + pred)   
    reg = lambda_user * tf.reduce_sum(user_factors * user_factors) + lambda_item * tf.reduce_sum(item_factors * item_factors)    
    # we have a classification model, so minimize logloss    
    loss = tf.losses.log_loss(place_y, pred)    
    loss_total = loss + reg   
    opt = tf.train.AdamOptimizer(learning_rate=lr)    
    step = opt.minimize(loss_total)    
    init = tf.global_variables_initializer()

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


Instructions for updating:
Use tf.cast instead.


In [32]:
## Now let us train the model. For that, we need to cut the input into small batches. Let us use a helper function for that:
def prepare_batches(seq, step):
    n = len(seq)    
    res = []   
    for i in range(0, n, step):
        res.append(seq[i:i+step]) 
    return res

#### Training model


In [33]:
def get_variable(graph, session, name):
    v = graph.get_operation_by_name(name) 
    v = v.values()[0]   
    v = v.eval(session=session)   
    return v
def calculate_validation_precision(graph, session, uid): 
    U = get_variable(graph, session, 'user_factors')
    I = get_variable(graph, session, 'item_factors') 
    bi = get_variable(graph, session, 'item_bias').reshape(-1)
    pred_all = U[uid_val].dot(I.T) + bi    
    top_val = (-pred_all).argsort(axis=1)[:, :5]    
    imp_baseline = baseline.copy()    
    imp_baseline[known_mask] = top_val
    return precision(val_indptr, val_items, imp_baseline)

In [34]:
session = tf.Session(config=None, graph=graph)
session.run(init)
np.random.seed(0)
for i in range(10):
    train_idx_shuffle = np.arange(uid.shape[0])
    np.random.shuffle(train_idx_shuffle)   
    batches = prepare_batches(train_idx_shuffle, 5000) 
    progress = tqdm(total=len(batches))    
    for idx in batches:        
        pos_samples = len(idx)    
        neg_samples = pos_samples * K   
        label = np.concatenate([np.ones(pos_samples, dtype='float32'), np.zeros(neg_samples, dtype='float32')]).reshape(-1, 1)
        # negative sampling 
        neg_users = np.random.randint(low=0, high=num_users, size=neg_samples, dtype='int32')       
        neg_items = np.random.randint(low=0, high=num_items,size=neg_samples, dtype='int32')     
        batch_uid = np.concatenate([uid[idx], neg_users]).reshape(-1, 1)       
        batch_iid = np.concatenate([iid[idx], neg_items]).reshape(-1, 1)      
        feed_dict = {place_user: batch_uid,
                     place_item: batch_iid,
                     place_y: label }       
        _, l = session.run([step, loss], feed_dict)       
        progress.update(1)       
        progress.set_description('%.3f' % l)   
    progress.close()    
    val_precision = calculate_validation_precision(graph, session, uid_val)
    print('epoch %02d: precision: %.3f' % (i+1, val_precision))

0.538: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:03<00:00, 14.21it/s]


epoch 01: precision: 0.069


0.258: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 13.65it/s]


epoch 02: precision: 0.084


0.245: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 13.44it/s]


epoch 03: precision: 0.107


0.221: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 12.96it/s]


epoch 04: precision: 0.127


0.201: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 14.59it/s]


epoch 05: precision: 0.137


0.190: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 13.29it/s]


epoch 06: precision: 0.147


0.176: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:03<00:00, 14.16it/s]


epoch 07: precision: 0.147


0.165: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 14.10it/s]


epoch 08: precision: 0.150


0.156: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 11.66it/s]


epoch 09: precision: 0.152


0.152: 100%|███████████████████████████████████████████████████████████████████████████| 56/56 [00:04<00:00, 13.86it/s]


epoch 10: precision: 0.151


On 10 epoch it reaches 15.1%

## Bayesian Personalized Ranking

In [35]:
def init_variable(size, dim, name=None):
    std = np.sqrt(2 / dim)
    return tf.Variable(tf.random_uniform([size, dim], -std, std), name=name)
def embed(inputs, size, dim, name=None):  
    emb = init_variable(size, dim, name) 
    return tf.nn.embedding_lookup(emb, inputs)

In [36]:
num_factors = 128
lambda_user = 0.0000001
lambda_item = 0.0000001
lambda_bias = 0.0000001
lr = 0.0005
graph = tf.Graph()
graph.seed = 1
with graph.as_default():
    place_user = tf.placeholder(tf.int32, shape=(None, 1)) 
    place_item_pos = tf.placeholder(tf.int32, shape=(None, 1))  
    place_item_neg = tf.placeholder(tf.int32, shape=(None, 1))  
    # no place_y 
    user_factors = embed(place_user, num_users, num_factors,"user_factors")
    # no user bias anymore as well as no global bias
    item_factors = init_variable(num_items, num_factors,"item_factors")
    item_factors_pos = tf.nn.embedding_lookup(item_factors, place_item_pos)  
    item_factors_neg = tf.nn.embedding_lookup(item_factors, place_item_neg)   
    item_bias = init_variable(num_items, 1, "item_bias")  
    item_bias_pos = tf.nn.embedding_lookup(item_bias, place_item_pos) 
    item_bias_pos = tf.reshape(item_bias_pos, [-1, 1])  
    item_bias_neg = tf.nn.embedding_lookup(item_bias, place_item_neg) 
    item_bias_neg = tf.reshape(item_bias_neg, [-1, 1])  
    # predictions for each item are same as previously 
    # but no user bias and global bias   
    pred_pos = item_bias_pos +  tf.reduce_sum(user_factors * item_factors_pos, axis=2)   
    pred_neg = item_bias_neg + tf.reduce_sum(user_factors * item_factors_neg, axis=2) 
    pred_diff = pred_pos- pred_neg  
    loss_bpr = -tf.reduce_mean(tf.log(tf.sigmoid(pred_diff)))  
    loss_reg = lambda_user * tf.reduce_sum(user_factors * user_factors) +lambda_item * tf.reduce_sum(item_factors_pos * item_factors_pos)+ lambda_item * tf.reduce_sum(item_factors_neg * item_factors_neg)+lambda_bias * tf.reduce_sum(item_bias_pos) +lambda_bias * tf.reduce_sum(item_bias_neg) 
    loss_total = loss_bpr + loss_reg
    opt = tf.train.AdamOptimizer(learning_rate=lr) 
    step = opt.minimize(loss_total)   
    init = tf.global_variables_initializer()

In [37]:
session = tf.Session(config=None, graph=graph)
session.run(init)
size_total = uid.shape[0]
size_sample = 15000
np.random.seed(0)
for i in range(75):
    for k in range(30):  
        idx = np.random.randint(low=0, high=size_total, size=size_sample)  
        batch_uid = uid[idx].reshape(-1, 1)        
        batch_iid_pos = iid[idx].reshape(-1, 1)   
        batch_iid_neg = np.random.randint(low=0, high=num_items, size=(size_sample, 1), dtype='int32')  
        feed_dict = { place_user: batch_uid, 
                     place_item_pos: batch_iid_pos, 
                     place_item_neg: batch_iid_neg
                    }        
        _, l = session.run([step, loss_bpr], feed_dict)
    val_precision = calculate_validation_precision(graph, session, uid_val) 
    print('epoch %02d: precision: %.3f' % (i+1, val_precision))

epoch 01: precision: 0.022
epoch 02: precision: 0.025
epoch 03: precision: 0.027
epoch 04: precision: 0.031
epoch 05: precision: 0.035
epoch 06: precision: 0.040
epoch 07: precision: 0.045
epoch 08: precision: 0.048
epoch 09: precision: 0.051
epoch 10: precision: 0.057
epoch 11: precision: 0.060
epoch 12: precision: 0.061
epoch 13: precision: 0.063
epoch 14: precision: 0.065
epoch 15: precision: 0.067
epoch 16: precision: 0.067
epoch 17: precision: 0.069
epoch 18: precision: 0.070
epoch 19: precision: 0.073
epoch 20: precision: 0.076
epoch 21: precision: 0.077
epoch 22: precision: 0.080
epoch 23: precision: 0.083
epoch 24: precision: 0.085
epoch 25: precision: 0.087
epoch 26: precision: 0.090
epoch 27: precision: 0.092
epoch 28: precision: 0.093
epoch 29: precision: 0.095
epoch 30: precision: 0.097
epoch 31: precision: 0.099
epoch 32: precision: 0.102
epoch 33: precision: 0.103
epoch 34: precision: 0.104
epoch 35: precision: 0.107
epoch 36: precision: 0.108
epoch 37: precision: 0.109
e

Accuracy is 14.2%. So we will try RNN now

## RNN 

In [38]:
df = pd.read_excel('C:/Users/chand/Downloads/Online Retail.xlsx')

In [39]:
df.columns = df.columns.str.lower()
df = df[~df.invoiceno.astype('str').str.startswith('C')].reset_index(drop=True)
df.customerid = df.customerid.fillna(-1).astype('int32')

In [40]:
class LabelEncoder:
    def fit(self, seq):
        self.vocab = sorted(set(seq))
        self.idx = {c: i + 1 for i, c in enumerate(self.vocab)}

    def vocab_size(self):
        return len(self.vocab) + 1

    def transfrom(self, seq):
        n = len(seq)
        result = np.zeros(n, dtype='int32')

        for i in range(n):
            result[i] = self.idx.get(seq[i], 0)

        return result

    def fit_transform(self, seq):
        self.fit(seq)
        return self.transfrom(seq)

In [41]:
item_enc = LabelEncoder()
df.stockcode = item_enc.fit_transform(df.stockcode.astype('str'))
df.stockcode = df.stockcode.astype('int32')

TRAIN-TEST SPLIT

In [42]:
df_train = df[df.invoicedate < '2011-10-09'].reset_index(drop=True)
df_val = df[(df.invoicedate >= '2011-10-09') & (df.invoicedate <= '2011-11-09') ].reset_index(drop=True)
df_test = df[df.invoicedate >= '2011-11-09'].reset_index(drop=True)

In [43]:
df_train.shape, df_val.shape, df_test.shape

((378470, 8), (64460, 8), (89691, 8))

In [44]:
user_enc = LabelEncoder()
user_enc.fit(df_train[df_train.customerid != -1].customerid)

df_train.customerid = user_enc.transfrom(df_train.customerid)
df_val.customerid = user_enc.transfrom(df_val.customerid)

In [46]:
uid_train = df_train.drop_duplicates(subset='invoiceno').customerid.values
uid_val = df_val.drop_duplicates(subset='invoiceno').customerid.values

In [47]:
def group_indptr(df):
    indptr, = np.where(df.invoiceno != df.invoiceno.shift())
    indptr = np.append(indptr, len(df)).astype('int32')
    return indptr

indptr_train = group_indptr(df_train)
indptr_val = group_indptr(df_val)

In [48]:
from collections import Counter
top_train = Counter(df_train.stockcode)

In [50]:
def baseline(uid, indptr, items, top, k=5):
    n_groups = len(uid)
    n_items = len(items)

    pred_all = np.zeros((n_items, k), dtype=np.int32)

    for g in range(n_groups):
        t = top.copy()

        start = indptr[g]
        end = indptr[g+1]
        
        for i in range(start, end):
            pred = [k for (k, c) in t.most_common(5)]
            pred_all[i] = pred

            actual = items[i]
            if actual in t:
                del t[actual]

    return pred_all

In [51]:
iid_val = df_val.stockcode.values
pred_baseline = baseline(uid_val, indptr_val, iid_val, top_train, k=5)

In [53]:
pred_baseline

array([[3528, 3507, 1348, 2731,  181],
       [3528, 3507, 1348, 2731,  181],
       [3528, 3507, 1348, 2731,  181],
       ...,
       [1348, 2731,  181,  454, 1314],
       [1348, 2731,  181,  454, 1314],
       [1348, 2731,  181,  454, 1314]])

In [52]:
@njit
def accuracy_k(y_true, y_pred):
    n, k = y_pred.shape

    acc = 0
    for i in range(n):
        for j in range(k):
            if y_pred[i, j] == y_true[i]:
                acc = acc + 1
                break

    return acc / n

In [54]:
accuracy_k(iid_val, pred_baseline)

0.012705553831833695

RNN Data preparation

In [55]:
def pack_items(users, items_indptr, items_vals):
    n = len(items_indptr) - 1

    result = []
    for i in range(n):
        start = items_indptr[i]
        end = items_indptr[i+1]
        result.append(items_vals[start:end])

    return result

In [56]:
train_items = pack_items(indptr_train, indptr_train, df_train.stockcode.values)

df_train_wrap = pd.DataFrame()
df_train_wrap['customerid'] = uid_train
df_train_wrap['items'] = train_items

In [57]:
df_train_wrap.head(5)

Unnamed: 0,customerid,items
0,3439,"[3528, 2792, 3041, 2982, 2981, 1662, 800]"
1,3439,"[1547, 1546]"
2,459,"[3301, 1655, 1658, 1659, 1247, 3368, 1537, 153..."
3,459,"[1862, 1816, 1815, 1817]"
4,459,[818]


In [58]:
def pad_seq(data, num_steps):
    data = np.pad(data, pad_width=(1, 0), mode='constant')

    n = len(data)

    if n <= num_steps:
        pad_right = num_steps - n + 1
        data = np.pad(data, pad_width=(0, pad_right), mode='constant')

    return data

def prepare_train_data(data, num_steps):
    data = pad_seq(data, num_steps)

    X = []
    Y = []

    for i in range(num_steps, len(data)):
        start = i - num_steps
        X.append(data[start:i])
        Y.append(data[start+1:i+1])

    return X, Y

In [59]:
import tensorflow as tf
rnn = tf.contrib.rnn


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.



In [60]:
class Config:
    num_steps = 5

    num_items = item_enc.vocab_size()
    num_users = user_enc.vocab_size()

    init_scale = 0.1
    learning_rate = 1.0
    max_grad_norm = 5
    num_layers = 2
    hidden_size = 200
    embedding_size = 200
    batch_size = 20    

config = Config()

In [61]:
train_items = df_train_wrap['items']

X_train = []
Y_train = []

for i in range(len(train_items)):
    X, Y = prepare_train_data(train_items[i], config.num_steps)
    X_train.extend(X)
    Y_train.extend(Y)

X_train = np.array(X_train, dtype='int32')
Y_train = np.array(Y_train, dtype='int32')

#### MODEL GRAPH

In [63]:
def lstm_cell(hidden_size, is_training):
    return rnn.BasicLSTMCell(hidden_size, forget_bias=0.0, 
                             state_is_tuple=True, reuse=not is_training)

def rnn_model(inputs, hidden_size, num_layers, batch_size, num_steps, is_training):
    cells = [lstm_cell(hidden_size, is_training) for _ in range(num_layers)]
    cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

    initial_state = cell.zero_state(batch_size, tf.float32)
    inputs = tf.unstack(inputs, num=num_steps, axis=1)
    outputs, final_state = rnn.static_rnn(cell, inputs, initial_state=initial_state)
    output = tf.reshape(tf.concat(outputs, 1), [-1, hidden_size])

    return output, initial_state, final_state


def model(config, is_training):
    batch_size = config.batch_size
    num_steps = config.num_steps
    embedding_size = config.embedding_size
    hidden_size = config.hidden_size
    num_items = config.num_items
    place_x = tf.placeholder(shape=[batch_size, num_steps], dtype=tf.int32)
    place_y = tf.placeholder(shape=[batch_size, num_steps], dtype=tf.int32)

    embedding = tf.get_variable("items", [num_items, embedding_size], dtype=tf.float32)
    inputs = tf.nn.embedding_lookup(embedding, place_x)

    output, initial_state, final_state = \
        rnn_model(inputs, hidden_size, config.num_layers, batch_size, num_steps, is_training)

    W = tf.get_variable("W", [hidden_size, num_items], dtype=tf.float32)
    b = tf.get_variable("b", [num_items], dtype=tf.float32)
    logits = tf.nn.xw_plus_b(output, W, b)
    logits = tf.reshape(logits, [batch_size, num_steps, num_items])

    loss = tf.losses.sparse_softmax_cross_entropy(place_y, logits)
    total_loss = tf.reduce_mean(loss)

    tvars = tf.trainable_variables()
    gradient = tf.gradients(total_loss, tvars)
    clipped, _ = tf.clip_by_global_norm(gradient, config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(config.learning_rate)

    global_step = tf.train.get_or_create_global_step()
    train_op = optimizer.apply_gradients(zip(clipped, tvars), global_step=global_step)
    out = {}
    out['place_x'] = place_x
    out['place_y'] = place_y
    
    out['logits'] = logits
    out['initial_state'] = initial_state
    out['final_state'] = final_state

    out['total_loss'] = total_loss
    out['train_op'] = train_op

    return out

Initialization

In [64]:
config = Config()
config_val = Config()
config_val.batch_size = 1
config_val.num_steps = 1

graph = tf.Graph()
graph.seed = 1

with graph.as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = model(config, is_training=True)

    with tf.name_scope("Valid"):
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            val_model = model(config_val, is_training=False)

    init = tf.global_variables_initializer()
    

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.


Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API


Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API


In [65]:
def prepare_batches(seq, step):
    n = len(seq)
    res = []
    for i in range(0, n, step):
        res.append(seq[i:i+step])
    return res

In [66]:
def run_epoch(session, model, X, Y, batch_size):
    fetches = {
        "total_loss": model['total_loss'],
        "final_state": model['final_state'],
        "eval_op": model['train_op']
    }

    num_steps = X.shape[1]
    all_idx = np.arange(X.shape[0])
    np.random.shuffle(all_idx)
    batches = prepare_batches(all_idx, batch_size)

    initial_state = session.run(model['initial_state'])
    current_state = initial_state

    progress = tqdm(total=len(batches))
    for idx in batches:
        if len(idx) < batch_size:
            continue

        feed_dict = {}
        for i, (c, h) in enumerate(model['initial_state']):
            feed_dict[c] = current_state[i].c
            feed_dict[h] = current_state[i].h

        feed_dict[model['place_x']] = X[idx]
        feed_dict[model['place_y']] = Y[idx]

        vals = session.run(fetches, feed_dict)
        loss = vals["total_loss"]
        current_state = vals["final_state"]

        progress.update(1)
        progress.set_description('%.3f' % loss)
    progress.close()

In [67]:
session = tf.Session(config=None, graph=graph) 
session.run(init)

np.random.seed(0)

run_epoch(session, train_model, X_train, Y_train, batch_size=config.batch_size)

6.426: 100%|████████████████████████████████████████████████████████████████████▉| 16376/16377 [09:08<00:00, 29.87it/s]


In [68]:
def generate_prediction(uid, indptr, items, model, k):
    n_groups = len(uid)
    n_items = len(items)

    pred_all = np.zeros((n_items, k), dtype=np.int32)
    initial_state = session.run(model['initial_state'])

    fetches = {
        "logits": model['logits'],
        "final_state": model['final_state'],
    }

    for g in tqdm(range(n_groups)):    
        start = indptr[g]
        end = indptr[g+1]

        current_state = initial_state

        feed_dict = {}
        for i, (c, h) in enumerate(model['initial_state']):
            feed_dict[c] = current_state[i].c
            feed_dict[h] = current_state[i].h

        prev = np.array([[0]], dtype=np.int32)

        for i in range(start, end):
            feed_dict[model['place_x']] = prev

            actual = items[i]
            prev[0, 0] = actual

            values = session.run(fetches, feed_dict)
            current_state = values["final_state"]

            logits = values['logits'].reshape(-1)
            pred = np.argpartition(-logits, k)[:k]
            pred_all[i] = pred

    return pred_all

In [69]:
pred_lstm = generate_prediction(uid_val, indptr_val, iid_val, val_model, k=5)

100%|██████████████████████████████████████████████████████████████████████████████| 2435/2435 [01:08<00:00, 35.41it/s]


In [70]:
accuracy_k(iid_val, pred_lstm)

0.07053986968662737

### Adding user features to model

In [71]:
X_train = []
U_train = []
Y_train = []


for t in df_train_wrap.itertuples():
    X, Y = prepare_train_data(t.items, config.num_steps)
    U_train.extend([t.customerid] * len(X))
    X_train.extend(X)
    Y_train.extend(Y)

X_train = np.array(X_train, dtype='int32')
Y_train = np.array(Y_train, dtype='int32')
U_train = np.array(U_train, dtype='int32')

In [73]:
def user_model(config, is_training):
    batch_size = config.batch_size
    num_steps = config.num_steps
    embedding_size = config.embedding_size
    hidden_size = config.hidden_size
    num_items = config.num_items
    num_users = config.num_users

    place_x = tf.placeholder(shape=[batch_size, num_steps], dtype=tf.int32)
    place_u = tf.placeholder(shape=[batch_size, 1], dtype=tf.int32)
    place_y = tf.placeholder(shape=[batch_size, num_steps], dtype=tf.int32)

    item_embedding = tf.get_variable("items", [num_items, embedding_size], dtype=tf.float32)
    item_inputs = tf.nn.embedding_lookup(item_embedding, place_x)
    
    user_embedding = tf.get_variable("users", [num_items, embedding_size], dtype=tf.float32)
    u_repeat = tf.tile(place_u, [1, num_steps])
    user_inputs = tf.nn.embedding_lookup(user_embedding, u_repeat)

    inputs = tf.concat([user_inputs, item_inputs], axis=2)
    
    output, initial_state, final_state = \
        rnn_model(inputs, hidden_size, config.num_layers, batch_size, num_steps, is_training)

    W = tf.get_variable("W", [hidden_size, num_items], dtype=tf.float32)
    b = tf.get_variable("b", [num_items], dtype=tf.float32)

    logits = tf.nn.xw_plus_b(output, W, b)
    logits = tf.reshape(logits, [batch_size, num_steps, num_items])
    loss = tf.losses.sparse_softmax_cross_entropy(place_y, logits)
    total_loss = tf.reduce_mean(loss)

    tvars = tf.trainable_variables()
    gradient = tf.gradients(total_loss, tvars)
    clipped, _ = tf.clip_by_global_norm(gradient, config.max_grad_norm)
    optimizer = tf.train.GradientDescentOptimizer(config.learning_rate)

    global_step = tf.train.get_or_create_global_step()
    train_op = optimizer.apply_gradients(zip(clipped, tvars), global_step=global_step)

    out = {}
    out['place_x'] = place_x
    out['place_u'] = place_u
    out['place_y'] = place_y
    

    out['logits'] = logits
    out['initial_state'] = initial_state
    out['final_state'] = final_state

    out['total_loss'] = total_loss
    out['train_op'] = train_op

    return out

In [74]:
graph = tf.Graph()
graph.seed = 1

with graph.as_default():
    initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)

    with tf.name_scope("Train"):
        with tf.variable_scope("Model", reuse=None, initializer=initializer):
            train_model = user_model(config, is_training=True)

    with tf.name_scope("Valid"):
        with tf.variable_scope("Model", reuse=True, initializer=initializer):
            val_model = user_model(config_val, is_training=False)

    init = tf.global_variables_initializer()

session = tf.Session(config=None, graph=graph) 
session.run(init)

In [75]:
## Training
def user_model_epoch(session, model, X, U, Y, batch_size):
    fetches = {
        "total_loss": model['total_loss'],
        "final_state": model['final_state'],
        "eval_op": model['train_op']
    }

    num_steps = X.shape[1]
    all_idx = np.arange(X.shape[0])
    np.random.shuffle(all_idx)
    batches = prepare_batches(all_idx, batch_size)

    initial_state = session.run(model['initial_state'])
    current_state = initial_state

    progress = tqdm(total=len(batches))
    for idx in batches:
        if len(idx) < batch_size:
            continue

        feed_dict = {}
        for i, (c, h) in enumerate(model['initial_state']):
            feed_dict[c] = current_state[i].c
            feed_dict[h] = current_state[i].h

        feed_dict[model['place_x']] = X[idx]
        feed_dict[model['place_y']] = Y[idx]
        feed_dict[model['place_u']] = U[idx].reshape(-1, 1)

        vals = session.run(fetches, feed_dict)
        loss = vals["total_loss"]
        current_state = vals["final_state"]

        progress.update(1)
        progress.set_description('%.3f' % loss)
    progress.close()

In [76]:
session = tf.Session(config=None, graph=graph) 
session.run(init)

np.random.seed(0)

user_model_epoch(session, train_model, X_train, U_train, Y_train, batch_size=config.batch_size)

5.789: 100%|████████████████████████████████████████████████████████████████████▉| 16376/16377 [10:22<00:00, 26.29it/s]


In [77]:
def generate_prediction_user_model(uid, indptr, items, model, k):
    n_groups = len(uid)
    n_items = len(items)

    pred_all = np.zeros((n_items, k), dtype=np.int32)
    initial_state = session.run(model['initial_state'])

    fetches = {
        "logits": model['logits'],
        "final_state": model['final_state'],
    }

    for g in tqdm(range(n_groups)):    
        start = indptr[g]
        end = indptr[g+1]
        u = uid[g]

        current_state = initial_state

        feed_dict = {}
        feed_dict[model['place_u']] = np.array([[u]], dtype=np.int32)

        for i, (c, h) in enumerate(model['initial_state']):
            feed_dict[c] = current_state[i].c
            feed_dict[h] = current_state[i].h

        prev = np.array([[0]], dtype=np.int32)
        for i in range(start, end):
            feed_dict[model['place_x']] = prev

            actual = items[i]
            prev[0, 0] = actual

            values = session.run(fetches, feed_dict)
            current_state = values["final_state"]

            logits = values['logits'].reshape(-1)
            pred = np.argpartition(-logits, k)[:k]
            pred_all[i] = pred

    return pred_all

In [78]:
pred_lstm = generate_prediction_user_model(uid_val, indptr_val, iid_val, val_model, k=5)
accuracy_k(iid_val, pred_lstm)

100%|██████████████████████████████████████████████████████████████████████████████| 2435/2435 [01:09<00:00, 35.12it/s]


0.22804840210983557