In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy.sparse as sps
import time



# Input data files are available in the read-only "./input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


# Read Data

In [2]:
RM_train=pd.read_csv('./input/data_train.csv')
R_test=pd.read_csv('./input/data_target_users_test.csv')
URM=pd.read_csv('./input/data_train.csv')
ICM = pd.read_csv('./input/data_ICM_title_abstract.csv')

## URM all

In [3]:
URM_tuples = [tuple(x) for x in URM.to_numpy()]

userList, itemList, ratingList = zip(*URM_tuples)

userList = list(userList)
userList=np.array(userList,dtype=np.int64)
itemList = list(itemList)
itemList=np.array(itemList,dtype=np.int64)

ratingList = list(ratingList)                        #not needed
ratingList=np.array(ratingList,dtype=np.int64)       #not needed

URM_all = sps.coo_matrix((ratingList, (userList, itemList)))
URM_all = URM_all.tocsr()

In [4]:
userList_unique = list(set(userList))
itemList_unique = list(set(itemList))

numUsers = len(userList_unique)
numItems = len(itemList_unique)

numberInteractions= len(URM_tuples)
print ("Number of items\t {}, Number of users\t {}".format(numItems, numUsers))
print("Number of Intraction \t {}" .format(numberInteractions))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemList_unique), max(userList_unique)))
print ("Average interactions per user {:.2f}".format(numberInteractions/numUsers))
print ("Average interactions per item {:.2f}\n".format(numberInteractions/numItems))

print ("Sparsity {:.2f} %".format((1-float(numberInteractions)/(numItems*numUsers))*100))

Number of items	 24896, Number of users	 7947
Number of Intraction 	 113268
Max ID items	 25974, Max Id users	 7946

Average interactions per user 14.25
Average interactions per item 4.55

Sparsity 99.94 %


## ICM all

In [5]:

ICM_tuples = [tuple(x) for x in ICM.to_numpy()]
itemList_icm, featureList_icm, scoreList_icm = zip(*ICM_tuples)

itemList_icm = list(itemList_icm)
itemList_icm = np.array(itemList_icm,dtype=np.int64)

featureList_icm = list(featureList_icm)
featureList_icm = np.array(featureList_icm,dtype=np.int64)

scoreList_icm = list(scoreList_icm)
scoreList_icm = np.array(scoreList_icm,dtype=np.float64)

ICM_all = sps.coo_matrix((scoreList_icm, (itemList_icm, featureList_icm)))

ICM_all

<25975x20000 sparse matrix of type '<class 'numpy.float64'>'
	with 490691 stored elements in COOrdinate format>

## Test users

In [6]:
userTestList = [x for x in R_test.to_numpy()]
userTestList = zip(*userTestList)
userTestList = [list(a) for a in userTestList][0]

In [7]:
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

URM_train, URM_test = split_train_in_two_percentage_global_sample(URM_all, train_percentage = 0.80)
# URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_train, train_percentage = 0.80)

# evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])



## Let's implement SLIM BPR 

## Step 1 - Sampling

#### Create a mask of positive interactions. How to build it depends on the data

In [8]:
URM_mask = URM_train.copy()
URM_mask.data[URM_mask.data < 1] = 0

URM_mask.eliminate_zeros()
URM_mask

<7947x25975 sparse matrix of type '<class 'numpy.float64'>'
	with 90614 stored elements in Compressed Sparse Row format>

In [9]:
n_users = URM_mask.shape[0]
n_items = URM_mask.shape[1]


# Extract users having at least one interaction to choose from
eligibleUsers = []

for user_id in range(n_users):
    start_pos = URM_mask.indptr[user_id]
    end_pos = URM_mask.indptr[user_id+1]
    if len(URM_mask.indices[start_pos:end_pos]) > 0:
        eligibleUsers.append(user_id)
                

def sampleTriplet():
    
    # By randomly selecting a user in this way we could end up 
    # with a user with no interactions
    #user_id = np.random.randint(0, n_users)
    
    user_id = np.random.choice(eligibleUsers)
    
    # Get user seen items and choose one
    userSeenItems = URM_mask[user_id,:].indices
    pos_item_id = np.random.choice(userSeenItems)

    negItemSelected = False

    # It's faster to just try again then to build a mapping of the non-seen items
    while (not negItemSelected):
        neg_item_id = np.random.randint(0, n_items)

        if (neg_item_id not in userSeenItems):
            
            negItemSelected = True

    return user_id, pos_item_id, neg_item_id


In [10]:
for _ in range(10):
    print(sampleTriplet())

(4033, 14140, 11281)
(4697, 20222, 9922)
(1712, 4804, 22414)
(6976, 15269, 13412)
(1259, 12914, 11529)
(3711, 17442, 10000)
(4248, 13552, 8017)
(4776, 23943, 5448)
(2934, 10689, 5761)
(7323, 24721, 18953)


## Step 2 - Computing prediction

#### The prediction depends on the model: SLIM, Matrix Factorization... 

### We have to initialize our model. In case of SLIM it works best to initialize S as zero, in case of MF you cannot because of how the gradient is computed and you have to initialize at random. Here we initialize SLIM at random just so that we have some numbers to show

In [11]:
similarity_matrix = np.random.random((n_items,n_items))
similarity_matrix[np.arange(n_items),np.arange(n_items)] = 0

In [12]:
user_id, positive_item_id, negative_item_id = sampleTriplet()

In [13]:
positive_item_id

13309

In [14]:
negative_item_id

22376

In [15]:
userSeenItems = URM_mask[user_id,:].indices
userSeenItems

array([ 1331,  3907,  6952,  8360,  9018, 13309, 14562, 18417, 18495,
       19618, 21936, 22217, 22848, 24672], dtype=int32)

In [16]:
x_i = similarity_matrix[positive_item_id, userSeenItems].sum()
x_j = similarity_matrix[negative_item_id, userSeenItems].sum()

print("x_i is {:.2f}, x_j is {:.2f}".format(x_i, x_j))

x_i is 6.08, x_j is 6.03


## Step 3 - Computing gradient

#### The gradient depends on the objective function: RMSE, BPR... 

In [17]:
x_ij = x_i - x_j
x_ij

0.052418597579121595

In [18]:
#### The original BPR paper uses the logarithm of the sigmoid of x_ij, whose derivative is the following

In [19]:
gradient = 1 / (1 + np.exp(x_ij))
gradient

0.4868983504282882

## Step 4 - Update model

#### How to update depends on the model itself, here we have just one paramether, the similarity matrix, so we perform just one update. In matrix factorization we have two.

#### We need a learning rate, which influences how fast the model will change. Small ones lead to slower convergence but often higher results

In [20]:
learning_rate = 1e-3

similarity_matrix[positive_item_id, userSeenItems] += learning_rate * gradient
similarity_matrix[positive_item_id, positive_item_id] = 0

similarity_matrix[negative_item_id, userSeenItems] -= learning_rate * gradient
similarity_matrix[negative_item_id, negative_item_id] = 0

#### Usually there is no relevant change in the scores over a single iteration

In [21]:
x_i = similarity_matrix[positive_item_id, userSeenItems].sum()
x_j = similarity_matrix[negative_item_id, userSeenItems].sum()

print("x_i is {:.2f}, x_j is {:.2f}".format(x_i, x_j))

x_i is 6.08, x_j is 6.02


## Step 5 - Write the iterative epochs

In [22]:
import time

def epochIteration():

    # Get number of available interactions
    numPositiveIteractions = int(URM_mask.nnz*0.01)

    start_time_epoch = time.time()
    start_time_batch = time.time()

    # Uniform user sampling without replacement
    for num_sample in range(numPositiveIteractions):

        # Sample
        user_id, positive_item_id, negative_item_id = sampleTriplet()
        
        userSeenItems = URM_mask[user_id,:].indices
        
        # Prediction
        x_i = similarity_matrix[positive_item_id, userSeenItems].sum()
        x_j = similarity_matrix[negative_item_id, userSeenItems].sum()
        
        # Gradient
        x_ij = x_i - x_j

        gradient = 1 / (1 + np.exp(x_ij))
        
        # Update
        similarity_matrix[positive_item_id, userSeenItems] += learning_rate * gradient
        similarity_matrix[positive_item_id, positive_item_id] = 0

        similarity_matrix[negative_item_id, userSeenItems] -= learning_rate * gradient
        similarity_matrix[negative_item_id, negative_item_id] = 0
        

        if(time.time() - start_time_batch >= 30 or num_sample == numPositiveIteractions-1):
            print("Processed {} ( {:.2f}% ) in {:.2f} seconds. Sample per second: {:.0f}".format(
                num_sample,
                100.0* float(num_sample)/numPositiveIteractions,
                time.time() - start_time_batch,
                float(num_sample) / (time.time() - start_time_epoch)))


            start_time_batch = time.time()


In [23]:
epochIteration()

Processed 905 ( 99.89% ) in 1.71 seconds. Sample per second: 530


In [24]:
import scipy.sparse as sps


def similarityMatrixTopK(item_weights, forceSparseOutput = True, k=100, verbose = False, inplace=True):
    """
    The function selects the TopK most similar elements, column-wise

    :param item_weights:
    :param forceSparseOutput:
    :param k:
    :param verbose:
    :param inplace: Default True, WARNING matrix will be modified
    :return:
    """

    assert (item_weights.shape[0] == item_weights.shape[1]), "selectTopK: ItemWeights is not a square matrix"

    start_time = time.time()

    if verbose:
        print("Generating topK matrix")

    nitems = item_weights.shape[1]
    k = min(k, nitems)

    # for each column, keep only the top-k scored items
    sparse_weights = not isinstance(item_weights, np.ndarray)

    if not sparse_weights:

        idx_sorted = np.argsort(item_weights, axis=0)  # sort data inside each column

        if inplace:
            W = item_weights
        else:
            W = item_weights.copy()

        # index of the items that don't belong to the top-k similar items of each column
        not_top_k = idx_sorted[:-k, :]
        # use numpy fancy indexing to zero-out the values in sim without using a for loop
        W[not_top_k, np.arange(nitems)] = 0.0

        if forceSparseOutput:
            W_sparse = sps.csr_matrix(W, shape=(nitems, nitems))

            if verbose:
                print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time))

            return W_sparse

        if verbose:
            print("Dense TopK matrix generated in {:.2f} seconds".format(time.time()-start_time))

        return W

    else:
        # iterate over each column and keep only the top-k similar items
        data, rows_indices, cols_indptr = [], [], []

        item_weights = check_matrix(item_weights, format='csc', dtype=np.float32)

        for item_idx in range(nitems):

            cols_indptr.append(len(data))

            start_position = item_weights.indptr[item_idx]
            end_position = item_weights.indptr[item_idx+1]

            column_data = item_weights.data[start_position:end_position]
            column_row_index = item_weights.indices[start_position:end_position]

            non_zero_data = column_data!=0

            idx_sorted = np.argsort(column_data[non_zero_data])  # sort by column
            top_k_idx = idx_sorted[-k:]

            data.extend(column_data[non_zero_data][top_k_idx])
            rows_indices.extend(column_row_index[non_zero_data][top_k_idx])


        cols_indptr.append(len(data))

        # During testing CSR is faster
        W_sparse = sps.csc_matrix((data, rows_indices, cols_indptr), shape=(nitems, nitems), dtype=np.float32)
        W_sparse = W_sparse.tocsr()

        if verbose:
            print("Sparse TopK matrix generated in {:.2f} seconds".format(time.time() - start_time))

        return W_sparse



In [25]:
class SLIM_BPR_Recommender(object):
    """ SLIM_BPR recommender with cosine similarity and no shrinkage"""

    def __init__(self, URM):
        self.URM = URM
        
        self.similarity_matrix = np.zeros((n_items,n_items))
        
        self.URM_mask = self.URM.copy()
        self.URM_mask.data[self.URM_mask.data < 1] = 0
        self.URM_mask.eliminate_zeros()
        
        self.n_users = URM_mask.shape[0]
        self.n_items = URM_mask.shape[1]


        # Extract users having at least one interaction to choose from
        self.eligibleUsers = []

        for user_id in range(n_users):

            start_pos = self.URM_mask.indptr[user_id]
            end_pos = self.URM_mask.indptr[user_id+1]

            if len(self.URM_mask.indices[start_pos:end_pos]) > 0:
                self.eligibleUsers.append(user_id)



    def sampleTriplet(self):

        # By randomly selecting a user in this way we could end up 
        # with a user with no interactions
        #user_id = np.random.randint(0, n_users)

        user_id = np.random.choice(self.eligibleUsers)

        # Get user seen items and choose one
        userSeenItems = URM_mask[user_id,:].indices
        pos_item_id = np.random.choice(userSeenItems)

        negItemSelected = False

        # It's faster to just try again then to build a mapping of the non-seen items
        while (not negItemSelected):
            neg_item_id = np.random.randint(0, n_items)

            if (neg_item_id not in userSeenItems):

                negItemSelected = True

        return user_id, pos_item_id, neg_item_id


        
    def epochIteration(self):

        # Get number of available interactions
        numPositiveIteractions = int(self.URM_mask.nnz*0.01)

        start_time_epoch = time.time()
        start_time_batch = time.time()

        # Uniform user sampling without replacement
        for num_sample in range(numPositiveIteractions):

            # Sample
            user_id, positive_item_id, negative_item_id = self.sampleTriplet()

            userSeenItems = self.URM_mask[user_id,:].indices

            # Prediction
            x_i = self.similarity_matrix[positive_item_id, userSeenItems].sum()
            x_j = self.similarity_matrix[negative_item_id, userSeenItems].sum()

            # Gradient
            x_ij = x_i - x_j

            gradient = 1 / (1 + np.exp(x_ij))

            # Update
            self.similarity_matrix[positive_item_id, userSeenItems] += learning_rate * gradient
            self.similarity_matrix[positive_item_id, positive_item_id] = 0

            self.similarity_matrix[negative_item_id, userSeenItems] -= learning_rate * gradient
            self.similarity_matrix[negative_item_id, negative_item_id] = 0


            if(time.time() - start_time_batch >= 30 or num_sample == numPositiveIteractions-1):
                print("Processed {} ( {:.2f}% ) in {:.2f} seconds. Sample per second: {:.0f}".format(
                    num_sample,
                    100.0* float(num_sample)/numPositiveIteractions,
                    time.time() - start_time_batch,
                    float(num_sample) / (time.time() - start_time_epoch)))

                start_time_batch = time.time()

                
    def fit(self, learning_rate = 0.01, epochs = 10):
 
        self.learning_rate = learning_rate
        self.epochs = epochs

        for numEpoch in range(self.epochs):
            self.epochIteration()
            
        self.similarity_matrix = self.similarity_matrix.T
        
        self.similarity_matrix = similarityMatrixTopK(self.similarity_matrix, k=100)
        
        
        
    def recommend(self, user_id, at=None, exclude_seen=True):
        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.similarity_matrix).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]
            
        return ranking[:at]
    
    
    def filter_seen(self, user_id, scores):

        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id+1]

        user_profile = self.URM.indices[start_pos:end_pos]
        
        scores[user_profile] = -np.inf

        return scores  

In [26]:
recommender = SLIM_BPR_Recommender(URM_train)
recommender.fit(epochs=1)

Processed 905 ( 99.89% ) in 0.79 seconds. Sample per second: 1151


In [27]:
from Notebooks_utils.evaluation_function import evaluate_algorithm

evaluate_algorithm(URM_test, recommender)

Evaluated user 0 of 7947
Recommender performance is: Precision = 0.0048, Recall = 0.0077, MAP = 0.0058


{'precision': 0.004815409309791321,
 'recall': 0.007749077250882628,
 'MAP': 0.005782206765352834}

In [None]:
from SLIM_BPR.Cython.SLIM_BPR_Cython import SLIM_BPR_Cython



recommender = SLIM_BPR_Cython(URM_train, recompile_cython=False)

recommender.fit(epochs=50, batch_size=1, sgd_mode='sgd', learning_rate=1e-4, positive_threshold_BPR=1)

SLIM_BPR_Recommender: URM Detected 77 (0.97 %) cold users.
SLIM_BPR_Recommender: URM Detected 2360 (9.09 %) cold items.
Deallocating Cython objects
Unable to read memory status: list index out of range
SLIM_BPR_Recommender: Automatic selection of fastest train mode. Unable to get current RAM status, you may be using a non-Linux operating system. Using dense matrix.
Processed 7947 ( 100.00% ) in 0.26 seconds. BPR loss is 1.84E-08. Sample per second: 31100
SLIM_BPR_Recommender: Epoch 1 of 50. Elapsed time 0.18 sec
Processed 7947 ( 100.00% ) in 0.50 seconds. BPR loss is 7.39E-08. Sample per second: 15881
SLIM_BPR_Recommender: Epoch 2 of 50. Elapsed time 0.42 sec
Processed 7947 ( 100.00% ) in 0.91 seconds. BPR loss is 1.69E-07. Sample per second: 8705
SLIM_BPR_Recommender: Epoch 3 of 50. Elapsed time 0.83 sec
Processed 7947 ( 100.00% ) in 1.09 seconds. BPR loss is 2.66E-07. Sample per second: 7271
SLIM_BPR_Recommender: Epoch 4 of 50. Elapsed time 1.01 sec
Processed 7947 ( 100.00% ) in 0.31

In [32]:
from Base.Evaluation.Evaluator import EvaluatorHoldout

evaluator_validation = EvaluatorHoldout(URM_test, cutoff_list=[5])


evaluator_validation.evaluateRecommender(recommender)

EvaluatorHoldout: Processed 5607 ( 100.00% ) in 2.91 sec. Users per second: 1927


({5: {'ROC_AUC': 0.09944414719695624,
   'PRECISION': 0.03777421080791838,
   'PRECISION_RECALL_MIN_DEN': 0.07408299149872129,
   'RECALL': 0.0661762215840626,
   'MAP': 0.0441943602243227,
   'MRR': 0.0941709767552461,
   'NDCG': 0.057549240640812224,
   'F1': 0.04809512547599055,
   'HIT_RATE': 0.18887105403959337,
   'ARHR': 0.10281196123892751,
   'NOVELTY': 0.002398388748912338,
   'AVERAGE_POPULARITY': 0.1316498590143871,
   'DIVERSITY_MEAN_INTER_LIST': 0.9852515404111407,
   'DIVERSITY_HERFINDAHL': 0.9970151644567453,
   'COVERAGE_ITEM': 0.25570741097208854,
   'COVERAGE_ITEM_CORRECT': 0.01832531280076997,
   'COVERAGE_USER': 0.7055492638731596,
   'COVERAGE_USER_CORRECT': 0.11350195042154272,
   'DIVERSITY_GINI': 0.08708712420003395,
   'SHANNON_ENTROPY': 10.693234120895994}},
 'CUTOFF: 5 - ROC_AUC: 0.0994441, PRECISION: 0.0377742, PRECISION_RECALL_MIN_DEN: 0.0740830, RECALL: 0.0661762, MAP: 0.0441944, MRR: 0.0941710, NDCG: 0.0575492, F1: 0.0480951, HIT_RATE: 0.1888711, ARHR: 0

In [36]:
recommender = SLIM_BPR_Cython(URM_all, recompile_cython=False)

recommender.fit(epochs=10, batch_size=1, sgd_mode='sgd', learning_rate=1e-4, positive_threshold_BPR=1)

SLIM_BPR_Recommender: URM Detected 1079 (4.15 %) cold items.
Deallocating Cython objects
Unable to read memory status: list index out of range
SLIM_BPR_Recommender: Automatic selection of fastest train mode. Unable to get current RAM status, you may be using a non-Linux operating system. Using dense matrix.
Processed 7947 ( 100.00% ) in 0.41 seconds. BPR loss is 2.59E-08. Sample per second: 19553
SLIM_BPR_Recommender: Epoch 1 of 10. Elapsed time 0.02 sec
Processed 7947 ( 100.00% ) in 0.43 seconds. BPR loss is 1.15E-07. Sample per second: 18271
SLIM_BPR_Recommender: Epoch 2 of 10. Elapsed time 0.05 sec
Processed 7947 ( 100.00% ) in 0.47 seconds. BPR loss is 2.34E-07. Sample per second: 16952
SLIM_BPR_Recommender: Epoch 3 of 10. Elapsed time 0.09 sec
Processed 7947 ( 100.00% ) in 0.50 seconds. BPR loss is 4.34E-07. Sample per second: 15788
SLIM_BPR_Recommender: Epoch 4 of 10. Elapsed time 0.12 sec
Processed 7947 ( 100.00% ) in 0.55 seconds. BPR loss is 6.44E-07. Sample per second: 14362


In [37]:
recoms = recommender.recommend(userTestList, cutoff=10)

In [38]:
recomList = []
for i in range(len(recoms)):
    recomList.append(' '.join(str(e) for e in recoms[i]))
# print(recoms)

res = {"user_id": userTestList, "item_list": recomList}
result = pd.DataFrame(res, columns= ['user_id', 'item_list'])

result.to_csv ('outputs/slim_brp1.csv', index = False, header=True)