In [1]:
from ast import literal_eval
from os import listdir
from os.path import isfile, join
from scipy.sparse import csr_matrix, load_npz, save_npz
from tqdm import tqdm

import seaborn as sns
import datetime
import json
import numpy as np
import pandas as pd
import time
import yaml
import scipy.sparse as sparse

# Load Data

In [2]:
def get_yelp_df(path = 'data/', filename = 'data0.txt', sampling=False, top_user_num=6100, top_item_num=4000):
    """
    Get the pandas dataframe
    Sampling only the top users/items by density 
    Implicit representation applies
    """
    with open(filename,'r') as f:
        data = f.readlines()
        data = list(map(json.loads, data))
    #Get all the data from the data file
    df = pd.DataFrame(data)
    
    df.rename(columns={'stars': 'review_stars', 'text': 'review_text', 'cool': 'review_cool',
                       'funny': 'review_funny', 'useful': 'review_useful'},
              inplace=True)

    df['business_num_id'] = df.business_id.astype('category').\
        cat.rename_categories(range(0, df.business_id.nunique()))
    df['business_num_id'] = df['business_num_id'].astype('int')

    df['user_num_id'] = df.user_id.astype('category').\
    cat.rename_categories(range(0, df.user_id.nunique()))
    df['user_num_id'] = df['user_num_id'].astype('int')

    df['timestamp'] = df['date'].apply(date_to_timestamp)

    if sampling:
        df = filter_yelp_df(df, top_user_num=top_user_num, top_item_num=top_item_num)
        # Refresh num id
        df['business_num_id'] = df.business_id.astype('category').\
        cat.rename_categories(range(0, df.business_id.nunique()))
        df['business_num_id'] = df['business_num_id'].astype('int')
        
        df['user_num_id'] = df.user_id.astype('category').\
        cat.rename_categories(range(0, df.user_id.nunique()))
        df['user_num_id'] = df['user_num_id'].astype('int')
#     drop_list = ['date','review_id','review_funny','review_cool','review_useful']
#     df = df.drop(drop_list, axis=1)

    df = df.reset_index(drop = True)

    return df 


def filter_yelp_df(df, top_user_num=6100, top_item_num=4000):
    #Getting the reviews where starts are above 3
    df_implicit = df[df['review_stars']>3]
    frequent_user_id = df_implicit['user_num_id'].value_counts().head(top_user_num).index.values
    frequent_item_id = df_implicit['business_num_id'].value_counts().head(top_item_num).index.values
    return df.loc[(df['user_num_id'].isin(frequent_user_id)) & (df['business_num_id'].isin(frequent_item_id))]


def date_to_timestamp(date):
    dt = datetime.datetime.strptime(date, '%Y-%m-%d')
    return time.mktime(dt.timetuple())

def df_to_sparse(df, row_name='userId', col_name='movieId', value_name='rating',
                 shape=(138494, 131263)):
    rows = df[row_name]
    cols = df[col_name]
    if value_name is not None:
        values = df[value_name]
    else:
        values = [1]*len(rows)

    return csr_matrix((values, (rows, cols)), shape=shape)

In [3]:
#Getting the yelp data -> reviews, only get the data for the top frequent users & restuarants that had a history of rating > 3
df = get_yelp_df(path = '', filename="data0.txt", sampling= True)

In [4]:
df.shape

(39620, 12)

In [5]:
def get_rating_timestamp_matrix(df, sampling=False, top_user_num=6100, top_item_num=4000):
    """
    """
    #make the df implicit with top frenquent users and 
    #no need to sample anymore if df was sampled before 
    if sampling:
        df = filter_yelp_df(df, top_user_num=top_user_num, top_item_num=top_item_num)

    rating_matrix = df_to_sparse(df, row_name='user_num_id',
                                 col_name='business_num_id',
                                 value_name='review_stars',
                                 shape=None)

    timestamp_matrix = df_to_sparse(df, row_name='user_num_id',
                                    col_name='business_num_id',
                                    value_name='timestamp',
                                    shape=None)

    return rating_matrix, timestamp_matrix

In [6]:
rating_matrix, timestamp_matrix = get_rating_timestamp_matrix(df)

In [7]:
print(rating_matrix[2:10])

  (0, 924)	5
  (0, 1464)	5
  (0, 2186)	5
  (0, 2707)	5
  (0, 3174)	3
  (1, 2570)	5
  (1, 2861)	4
  (1, 3803)	4
  (2, 1888)	4
  (2, 2245)	5
  (2, 2453)	4
  (2, 3047)	4
  (2, 3105)	4
  (2, 3135)	5
  (3, 15)	5
  (3, 358)	2
  (3, 453)	4
  (3, 1248)	5
  (3, 1631)	5
  (3, 1642)	4
  (3, 1869)	5
  (3, 2229)	4
  (3, 2305)	4
  (3, 2931)	5
  (3, 3076)	5
  :	:
  (4, 3340)	5
  (4, 3489)	4
  (4, 3781)	3
  (5, 100)	5
  (5, 1770)	1
  (5, 2311)	4
  (5, 2377)	5
  (5, 3284)	2
  (6, 167)	1
  (6, 466)	5
  (6, 823)	4
  (6, 1611)	1
  (6, 3207)	5
  (6, 3390)	5
  (6, 3743)	5
  (7, 156)	4
  (7, 174)	3
  (7, 423)	4
  (7, 783)	2
  (7, 1701)	2
  (7, 1765)	2
  (7, 2115)	2
  (7, 2459)	2
  (7, 2612)	4
  (7, 3814)	4


# Time ordered split

In [8]:
def time_ordered_split(rating_matrix, timestamp_matrix, ratio=[0.5, 0.2, 0.3],
                       implicit=True, remove_empty=False, threshold=3,
                       sampling=False, sampling_ratio=0.1):
    """
    Split the data to train,valid,test by time
    ratio:  train:valid:test
    threshold: for implicit representation
    """
    if implicit:
        temp_rating_matrix = sparse.csr_matrix(rating_matrix.shape)
        temp_rating_matrix[(rating_matrix > threshold).nonzero()] = 1
        rating_matrix = temp_rating_matrix
        timestamp_matrix = timestamp_matrix.multiply(rating_matrix)

    nonzero_index = None

    #Default false, not removing empty columns and rows
    #Should not have this case, since users should have at least 1 record of 4,5 
    #And restuarant should have at least 1 record of 4,5 
    if remove_empty:
        # Remove empty columns. record original item index
        nonzero_index = np.unique(rating_matrix.nonzero()[1])
        rating_matrix = rating_matrix[:, nonzero_index]
        timestamp_matrix = timestamp_matrix[:, nonzero_index]

        # Remove empty rows. record original user index
        nonzero_rows = np.unique(rating_matrix.nonzero()[0])
        rating_matrix = rating_matrix[nonzero_rows]
        timestamp_matrix = timestamp_matrix[nonzero_rows]

    user_num, item_num = rating_matrix.shape

    rtrain = []
    rtime = []
    rvalid = []
    rtest = []
    # Get the index list corresponding to item for train,valid,test
    item_idx_train = []
    item_idx_valid = []
    item_idx_test = []
    
    for i in tqdm(range(user_num)):
        #Get the non_zero indexs, restuarants where the user visited/liked if implicit 
        item_indexes = rating_matrix[i].nonzero()[1]
        
        #Get the data for the user
        data = rating_matrix[i].data
        
        #Get time stamp value 
        timestamp = timestamp_matrix[i].data
        
        #Non zero reviews for this user
        num_nonzeros = len(item_indexes)
        
        #If the user has at least one review
        if num_nonzeros >= 1:
            #Get number of test and valid data 
            #train is 30%
            num_test = int(num_nonzeros * ratio[2])
            #validate is 50%
            num_valid = int(num_nonzeros * (ratio[1] + ratio[2]))

            valid_offset = num_nonzeros - num_valid
            test_offset = num_nonzeros - num_test

            #Sort the timestamp for each review for the user
            argsort = np.argsort(timestamp)
            
            #Sort the reviews for the user according to the time stamp 
            data = data[argsort]
            
            #Non-zero review index sort according to time
            item_indexes = item_indexes[argsort]
            
            #list of ratings, num of valid_offset index, index where there's non-zeros
            rtrain.append([data[:valid_offset], np.full(valid_offset, i), item_indexes[:valid_offset]])
            #50%-70%
            rvalid.append([data[valid_offset:test_offset], np.full(test_offset - valid_offset, i),
                           item_indexes[valid_offset:test_offset]])
            #remaining 30%
            rtest.append([data[test_offset:], np.full(num_test, i), item_indexes[test_offset:]])
            
            item_idx_train.append(item_indexes[:valid_offset])
            
#             item_idx_valid.append(item_indexes[valid_offset:test_offset])
#             item_idx_test.append(item_indexes[test_offset:])
        else:
            item_idx_train.append([])
#             item_idx_valid.append([])
#             item_idx_test.append([])
    rtrain = np.array(rtrain)
    rvalid = np.array(rvalid)
    rtest = np.array(rtest)
    
    #print(rtrain)
    
    
    #take non-zeros values, row index, and column (non-zero) index and store into sparse matrix
    rtrain = sparse.csr_matrix((np.hstack(rtrain[:, 0]), (np.hstack(rtrain[:, 1]), np.hstack(rtrain[:, 2]))),
                               shape=rating_matrix.shape, dtype=np.float32)
    rvalid = sparse.csr_matrix((np.hstack(rvalid[:, 0]), (np.hstack(rvalid[:, 1]), np.hstack(rvalid[:, 2]))),
                               shape=rating_matrix.shape, dtype=np.float32)
    rtest = sparse.csr_matrix((np.hstack(rtest[:, 0]), (np.hstack(rtest[:, 1]), np.hstack(rtest[:, 2]))),
                              shape=rating_matrix.shape, dtype=np.float32)


    return rtrain, rvalid, rtest, nonzero_index, timestamp_matrix, item_idx_train, item_idx_valid, item_idx_test

In [12]:
rtrain, rvalid, rtest, nonzero_index, rtime, item_idx_matrix_train,item_idx_matrix_valid, item_idx_matrix_test = time_ordered_split(rating_matrix=rating_matrix,timestamp_matrix=timestamp_matrix,
                                                                     ratio=[0.5,0.2,0.3],
                                                                     implicit=False)  

100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:01<00:00, 5297.83it/s]


In [13]:
int(1.8)

1

In [14]:
item_idx_matrix_train

[array([2776,  346, 2554]),
 array([  16, 2206, 2830, 2074]),
 array([ 924, 2707, 2186]),
 array([2861, 2570]),
 array([1888, 2245, 2453]),
 array([3741, 2305, 1248, 1642, 1869, 2931, 2229]),
 array([1921, 3209, 1416, 3174,  402, 1237, 3340, 3781, 2595, 2738,  432]),
 array([1770, 2311, 2377]),
 array([ 466, 3207,  823, 1611]),
 array([2459, 1765,  156, 1701, 2115]),
 array([ 792, 3544,  514, 1928, 3024]),
 array([103,  62]),
 array([ 701, 3216]),
 array([3348,  277, 2378]),
 array([ 375, 2339]),
 array([3832, 1050]),
 array([3785]),
 array([1034]),
 array([2087, 1898]),
 array([ 574, 1205, 3851,  720]),
 array([2825,  468, 2106]),
 array([1062]),
 array([1361]),
 array([2142,  881, 1013, 2416]),
 array([2612, 3133, 3839,  194, 2965]),
 array([2085, 3531]),
 array([1157, 1653]),
 array([1169,  245, 1983, 2495,  325, 1212, 2673]),
 array([3120]),
 array([1418]),
 array([ 338, 3525]),
 array([1840,  505, 1113, 3874]),
 array([2004, 1637, 3542, 3603, 3369]),
 array([3192, 3399]),
 array([

In [15]:
print(rtrain)

  (0, 346)	4.0
  (0, 2554)	5.0
  (0, 2776)	5.0
  (1, 16)	5.0
  (1, 2074)	5.0
  (1, 2206)	5.0
  (1, 2830)	5.0
  (2, 924)	5.0
  (2, 2186)	5.0
  (2, 2707)	5.0
  (3, 2570)	5.0
  (3, 2861)	4.0
  (4, 1888)	4.0
  (4, 2245)	5.0
  (4, 2453)	4.0
  (5, 1248)	5.0
  (5, 1642)	4.0
  (5, 1869)	5.0
  (5, 2229)	4.0
  (5, 2305)	4.0
  (5, 2931)	5.0
  (5, 3741)	5.0
  (6, 402)	5.0
  (6, 432)	5.0
  (6, 1237)	4.0
  :	:
  (5785, 754)	5.0
  (5785, 1786)	4.0
  (5785, 3319)	5.0
  (5786, 977)	1.0
  (5786, 2823)	5.0
  (5787, 52)	5.0
  (5787, 635)	4.0
  (5787, 1745)	4.0
  (5787, 1791)	4.0
  (5787, 2552)	4.0
  (5787, 3223)	5.0
  (5787, 3352)	5.0
  (5788, 1370)	5.0
  (5788, 2057)	1.0
  (5788, 2373)	5.0
  (5789, 3409)	5.0
  (5790, 178)	5.0
  (5790, 271)	4.0
  (5790, 282)	3.0
  (5790, 551)	5.0
  (5790, 1474)	5.0
  (5790, 1500)	4.0
  (5790, 2134)	3.0
  (5790, 2735)	5.0
  (5790, 3215)	4.0


## Get df for training corpus

In [16]:
#Item idex matrix stores the reivews starts
#This function returns a list of index for the reviews included in training set 
def get_corpus_idx_list(df, item_idx_matrix):
    """
    Input: 
    df: total dataframe
    item_idx_matrix: train index list got from time_split 
    Output: row index in original dataframe for training data by time split
    """
    lst = []
    #For all the users: 5791
    for i in tqdm(range(len(item_idx_matrix))):
        
        #find row index where user_num_id is i
        a = df.index[df['user_num_id'] == i].tolist()
        
        #loop through the busienss id that the user i reviewed for in offvalid set 
        for item_idx in  item_idx_matrix[i]:
            
            #get the row index for reviews for business that the user liked in the train set
            b = df.index[df['business_num_id'] == item_idx].tolist()
            
            #Find the index for which this user liked, one user only rate a business once
            idx_to_add = list(set(a).intersection(b))
            
            if idx_to_add not in lst:
                lst.extend(idx_to_add)
    return lst

In [17]:
lst_train = get_corpus_idx_list(df, item_idx_matrix_train)

# Get the training dataframe from the original dataframe
df_train = df.loc[lst_train]

100%|█████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:09<00:00, 484.69it/s]


In [18]:
#Resetting the index of the train data
df_train = df_train.reset_index(drop=True)

In [19]:
df_train.head(5)

Unnamed: 0,business_id,review_cool,date,review_funny,review_id,review_stars,review_text,review_useful,user_id,business_num_id,user_num_id,timestamp
0,h7s4MRVvB726jWHCHiQ9kw,0,2015-07-10,0,50CAr36tKFbhnZuUg1HEag,5,"I've come to this place several times, and eac...",0,-1immOUG00aBdpcQPFYQhA,2776,0,1436501000.0
1,4VkLiFoIEjTTpbCRgxjnCQ,0,2015-09-10,0,oUluqUz1h9G-TwrFHIRQ7g,4,A quaint little hole-in-the-wall deli/restaura...,0,-1immOUG00aBdpcQPFYQhA,346,0,1441858000.0
2,duw-3Aj7gAKbQG-isS5UtQ,0,2015-09-10,0,aPTCoZa_5hb-LJ4QHST6zQ,5,Terrific service and comfortable atmosphere as...,0,-1immOUG00aBdpcQPFYQhA,2554,0,1441858000.0
3,-Bdw-5H5C4AYSMGnAvmnzw,4,2011-01-08,15,2yjShsBAIqUN4NZ2UuQH8A,5,Sadly I'm going to rate this restaurant based ...,7,-3gIDSqZ04FROn3du4CK2A,16,1,1294463000.0
4,ZHQU79PMUGCX0fEoyCcWlw,2,2011-01-08,0,hwf4-6O18fP0PJ3bP2i2BA,5,The 5-Star rating is a combo good food+cool pl...,4,-3gIDSqZ04FROn3du4CK2A,2206,1,1294463000.0


In [20]:
df_train.shape

(21304, 12)

In [21]:
import pickle as pkl
# Save and load df_train
# with open('df_train.pkl', 'wb') as handle:
#     pkl.dump(df_train, handle)

# with open('df_train.pkl', 'rb') as handle:
#     df_train = pkl.load(handle)

## Preprocess using tf-idf

In [22]:
#Stemming and Lemmatisation
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('wordnet')
lem = WordNetLemmatizer()
stem = PorterStemmer()
word = 'inversely'
print ('stemming:', stem.stem(word))
print ('lemmatization:', lem.lemmatize(word, "v"))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\songya25\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\songya25\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


stemming: invers
lemmatization: inversely


In [23]:
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

In [24]:
nltk.download('stopwords')
nltk.download('wordnet')
# Create a list of stopwords and add custom stopwords
stop_words = set(stopwords.words("english"))
# stop_words.remove('not')
## A list of custom stopwords
# new_words = ['using','show','result','large','also']
new_words = ['not_the']
stop_words = stop_words.union(new_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\songya25\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\songya25\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [25]:
# Get corpus and CountVector
from sklearn.feature_extraction.text import CountVectorizer
import re

In [26]:
from tqdm import tqdm

#Should 'because' added?
def preprocess(df, reset_list = [',','.','?',';','however','but']):
    corpus = []
    for i in tqdm(range(df.shape[0])):
        text = df['review_text'][i]
        change_flg = 0
        #Convert to lowercase
        text = text.lower()
        
        ##Convert to list from string, loop through the review text
        text = text.split()
        
        #any sentence that encounters a not, the folloing words will become not phrase until hit the sentence end
        for j in range(len(text)):
            #Make the not_ hack
            if text[j] == 'not':
                change_flg = 1
#                 print 'changes is made after ', i
                continue
            #if was 1 was round and not hit a 'not' in this round
            if change_flg == 1 and any(reset in text[j] for reset in reset_list):
                text[j] = 'not_' + text[j]
                change_flg = 0
#                 print 'reset at ', i
            if change_flg == 1:
                text[j] = 'not_' + text[j]
        
        #Convert back to string
        text = " ".join(text)
        
        #Remove punctuations
#         text = re.sub('[^a-zA-Z]', ' ', text)
        
        #remove tags
        text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
        
        # remove special characters and digits
        text=re.sub("(\\d|\\W)+"," ",text)
        
        ##Convert to list from string
        text = text.split()
        
        ##Stemming
        ps=PorterStemmer()
        
        #Lemmatisation
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in  
                stop_words] 
        text = " ".join(text)
        corpus.append(text)
    return corpus

In [27]:
# The entire corpus
corpus = preprocess(df_train)

100%|██████████████████████████████████████████████████████████████████████████| 21304/21304 [00:05<00:00, 3622.66it/s]


In [28]:
len(corpus)

21304

In [29]:
# X row: df_train row, column: key words frequency 
# When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold
cv=CountVectorizer(max_df=0.9,stop_words=stop_words, max_features=5000, ngram_range=(1,1))
X=cv.fit_transform(corpus)

In [30]:
X.shape

(21304, 5000)

In [31]:
# X should be a document(review) vs. term frequency matrix 
import sys
np.set_printoptions(threshold=sys.maxsize)
X[6].toarray().nonzero()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       dtype=int64),
 array([ 136,  181,  265,  466,  678,  919,  929, 1067, 1098, 1380, 1396,
        1733, 1857, 2031, 2067, 2293, 2489, 2606, 2615, 2625, 2633, 3208,
        3225, 3284, 3449, 3856, 3919, 3920, 3997, 4055, 4097, 4176, 4203,
        4222, 4472, 4543, 4544, 4631, 4642, 4684, 4885, 4953, 4961, 4979],
       dtype=int64))

### Process the Corpus using TF-IDF

In [49]:
dict_text = {}
for i in range(len(corpus)):
    if df_train['business_num_id'][i] not in dict_text:
        dict_text[df_train['business_num_id'][i]] = corpus[i]
    else:
        temp = dict_text[df_train['business_num_id'][i]]
        temp = temp + corpus[i]
        dict_text[df_train['business_num_id'][i]] = temp

In [50]:
list_text = []
for key in range(0,max(list(dict_text.keys()))+1) :
    if key not in dict_text.keys():
        list_text.extend([""])
    else:
        list_text.extend([dict_text[key]])

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [57]:
vectorizer = TfidfVectorizer(max_df=0.9,stop_words=stop_words, max_features=5000, ngram_range=(1,1))
X_cleaned = vectorizer.fit_transform(list_text).toarray()

In [58]:
X_cleaned.shape

(3906, 5000)

In [59]:
type(X_cleaned)

numpy.ndarray

In [60]:
X_cleaned_sparse = csr_matrix(X_cleaned)

In [61]:
print(X_cleaned_sparse)


  (0, 1)	0.023610601009401558
  (0, 62)	0.041110869816430244
  (0, 63)	0.023359909160515398
  (0, 95)	0.020641076642448835
  (0, 102)	0.03654977597689457
  (0, 104)	0.020738763932256966
  (0, 107)	0.04418234451287187
  (0, 118)	0.023267987596309325
  (0, 131)	0.018066835560039858
  (0, 140)	0.10402296049059105
  (0, 141)	0.03694109970759077
  (0, 142)	0.029314121389190803
  (0, 160)	0.10172468159243515
  (0, 169)	0.03445821373124826
  (0, 175)	0.03531032713837482
  (0, 183)	0.016742987998210047
  (0, 201)	0.06878458114812552
  (0, 203)	0.04089876691648105
  (0, 217)	0.023515621685203485
  (0, 229)	0.021617842092445178
  (0, 247)	0.03857163900021131
  (0, 248)	0.05915385056980205
  (0, 259)	0.036432881307268565
  (0, 268)	0.07400517079463148
  (0, 269)	0.01886051789943975
  :	:
  (3905, 3636)	0.08041919023280394
  (3905, 3687)	0.07824110374547194
  (3905, 3691)	0.08191032523086332
  (3905, 3705)	0.053642618756116145
  (3905, 3726)	0.0754449748628456
  (3905, 3810)	0.05292978261543481
  

## all Model

In [32]:
from sklearn.metrics.pairwise import cosine_similarity
def train(matrix_train):
    similarity = cosine_similarity(X=matrix_train, Y=None, dense_output=True)
    return similarity

def get_I_K(df, X, row_name = 'business_num_id', binary = True, shape = (121994,6000)):
    """
    get the item-keyphrase matrix
    """
    rows = []
    cols = []
    vals = []
    
    #For each review history
    for i in tqdm(range(X.shape[0])):
        #Get the array of frequencies for document/review i 
        arr = X[i].toarray() 
        nonzero_element = arr.nonzero()[1]  # Get nonzero element in each line, keyphrase that appears index
        
        length_of_nonzero = len(nonzero_element) #number of important keyphrase that appears
        
        # df[row_name][i] is the item idex
        #Get a list row index that indicates the document/review
        rows.extend(np.array([df[row_name][i]]*length_of_nonzero)) ## Item index
        #print(rows)
        
        #Get a list of column index indicating the key phrase that appears in i document/review
        cols.extend(nonzero_element) ## Keyword Index
        if binary:
            #Create a bunch of 1s
            vals.extend(np.array([1]*length_of_nonzero))
        else:
            #If not binary 
            vals.extend(arr[arr.nonzero()])    
    return csr_matrix((vals, (rows, cols)), shape=shape)


#Get a UI matrix if it's not item_similarity based or else IU
def predict(matrix_train, k, similarity, item_similarity_en = False):
    prediction_scores = []
    
    #inverse to IU matrix
    if item_similarity_en:
        matrix_train = matrix_train.transpose()
        
    #for each user or item, depends UI or IU 
    for user_index in tqdm(range(matrix_train.shape[0])):
        # Get user u's prediction scores for all items
        #Get prediction/similarity score for each user 1*num or user or num of items
        vector_u = similarity[user_index]

        # Get closest K neighbors excluding user u self
        #Decending accoding to similarity score, select top k
        similar_users = vector_u.argsort()[::-1][1:k+1]
        
        # Get neighbors similarity weights and ratings
        similar_users_weights = similarity[user_index][similar_users]
        #print(similar_users_weights.shape)
        #shape: num of res * k
        similar_users_ratings = matrix_train[similar_users].toarray()
        
        
        prediction_scores_u = similar_users_ratings * similar_users_weights[:, np.newaxis]
        #print(prediction_scores_u)
        
        """should divide by the sum of the weights if explicit"""
        prediction_scores.append(np.sum(prediction_scores_u, axis=0))
        
    res = np.array(prediction_scores)
    
    if item_similarity_en:
        res = res.transpose()
    return res


#Preidction score is UI or IU?
def prediction(prediction_score, topK, matrix_Train):

    prediction = []

    #for each user
    for user_index in tqdm(range(matrix_Train.shape[0])):
        
        #take the prediction scores for user 1 * num res
        vector_u = prediction_score[user_index]
        
        #The restuarant the user rated
        vector_train = matrix_Train[user_index]
        
        if len(vector_train.nonzero()[0]) > 0:
            vector_predict = sub_routine(vector_u, vector_train, topK=topK)
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)

    return np.vstack(prediction)

#topK: the number of restuarants we are suggesting 
def sub_routine(vector_u, vector_train, topK=500):

    #index where non-zero
    train_index = vector_train.nonzero()[1]
    
    vector_u = vector_u
    
    #get topk + num rated res prediction score descending, top index 
    candidate_index = np.argpartition(-vector_u, topK+len(train_index))[:topK+len(train_index)]
    
    #sort top prediction score index in range topK+len(train_index) into vector_u`
    vector_u = candidate_index[vector_u[candidate_index].argsort()[::-1]]
    
    #deleted the rated res from the topk+train_index prediction score vector for user u 
    #Delete the user rated res index from the topk+numRated index
    vector_u = np.delete(vector_u, np.isin(vector_u, train_index).nonzero()[0])

    #so we only include the top K prediction score here
    return vector_u[:topK]


In [33]:
([df['business_num_id'][0]]*1)

[3660]

### Evaluation 

In [34]:
def recallk(vector_true_dense, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_true_dense)


def precisionk(vector_predict, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_predict)


def average_precisionk(vector_predict, hits, **unused):
    precisions = np.cumsum(hits, dtype=np.float32)/range(1, len(vector_predict)+1)
    return np.mean(precisions)


def r_precision(vector_true_dense, vector_predict, **unused):
    vector_predict_short = vector_predict[:len(vector_true_dense)]
    hits = len(np.isin(vector_predict_short, vector_true_dense).nonzero()[0])
    return float(hits)/len(vector_true_dense)


def _dcg_support(size):
    arr = np.arange(1, size+1)+1
    return 1./np.log2(arr)


def ndcg(vector_true_dense, vector_predict, hits):
    idcg = np.sum(_dcg_support(len(vector_true_dense)))
    dcg_base = _dcg_support(len(vector_predict))
    dcg_base[np.logical_not(hits)] = 0
    dcg = np.sum(dcg_base)
    return dcg/idcg


def click(hits, **unused):
    first_hit = next((i for i, x in enumerate(hits) if x), None)
    if first_hit is None:
        return 5
    else:
        return first_hit/10


def evaluate(matrix_Predict, matrix_Test, metric_names =['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP'], atK = [5, 10, 15, 20, 50], analytical=False):
    """
    :param matrix_U: Latent representations of users, for LRecs it is RQ, for ALSs it is U
    :param matrix_V: Latent representations of items, for LRecs it is Q, for ALSs it is V
    :param matrix_Train: Rating matrix for training, features.
    :param matrix_Test: Rating matrix for evaluation, true labels.
    :param k: Top K retrieval
    :param metric_names: Evaluation metrics
    :return:
    """
    global_metrics = {
        "R-Precision": r_precision,
        "NDCG": ndcg,
        "Clicks": click
    }

    local_metrics = {
        "Precision": precisionk,
        "Recall": recallk,
        "MAP": average_precisionk
    }

    output = dict()

    num_users = matrix_Predict.shape[0]

    for k in atK:

        local_metric_names = list(set(metric_names).intersection(local_metrics.keys()))
        results = {name: [] for name in local_metric_names}
        topK_Predict = matrix_Predict[:, :k]

        for user_index in tqdm(range(topK_Predict.shape[0])):
            vector_predict = topK_Predict[user_index]
            if len(vector_predict.nonzero()[0]) > 0:
                vector_true = matrix_Test[user_index]
                vector_true_dense = vector_true.nonzero()[1]
                hits = np.isin(vector_predict, vector_true_dense)

                if vector_true_dense.size > 0:
                    for name in local_metric_names:
                        results[name].append(local_metrics[name](vector_true_dense=vector_true_dense,
                                                                 vector_predict=vector_predict,
                                                                 hits=hits))

        results_summary = dict()
        if analytical:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = results[name]
        else:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = (np.average(results[name]),
                                                              1.96*np.std(results[name])/np.sqrt(num_users))
        output.update(results_summary)

    global_metric_names = list(set(metric_names).intersection(global_metrics.keys()))
    results = {name: [] for name in global_metric_names}

    topK_Predict = matrix_Predict[:]

    for user_index in tqdm(range(topK_Predict.shape[0])):
        vector_predict = topK_Predict[user_index]

        if len(vector_predict.nonzero()[0]) > 0:
            vector_true = matrix_Test[user_index]
            vector_true_dense = vector_true.nonzero()[1]
            hits = np.isin(vector_predict, vector_true_dense)

            # if user_index == 1:
            #     import ipdb;
            #     ipdb.set_trace()

            if vector_true_dense.size > 0:
                for name in global_metric_names:
                    results[name].append(global_metrics[name](vector_true_dense=vector_true_dense,
                                                              vector_predict=vector_predict,
                                                              hits=hits))

    results_summary = dict()
    if analytical:
        for name in global_metric_names:
            results_summary[name] = results[name]
    else:
        for name in global_metric_names:
            results_summary[name] = (np.average(results[name]), 1.96*np.std(results[name])/np.sqrt(num_users))
    output.update(results_summary)

    return output


In [35]:
## Utility functions
#3906 restuarant, 3000 keyphrase, 5791 user 
def add_two_matrix(ratio, U_I_matrix,I_K_matrix, shape = (3906, 3000+5791)):
    # ratio determine Keywords/User in the matrix
    rows = []
    cols = []
    datas = []
    I_U_matrix = U_I_matrix.transpose()
    
    #for each restuarant
    for i in tqdm(range(I_K_matrix.shape[0])):
        #key phrase that this item has, column(key phrase) index
        nonzero1 = I_K_matrix[i].nonzero()
        
        #user that rated this item, column(user) index 
        nonzero2 = I_U_matrix[i].nonzero()
        
        #Trying to create a sparse matrix that stores 
        #index of restuarant for (K + U) times
        row = [i]*(len(nonzero1[1])+len(nonzero2[1]))
        
        #column index for key phrase and users that are non-zero
        col = nonzero1[1].tolist()+ nonzero2[1].tolist()
        
        
        data = [ratio]*len(nonzero1[1])+[1-ratio]*len(nonzero2[1]) # Binary representation of I-K/U matrix
        
        rows.extend(row)
        cols.extend(col)
        datas.extend(data)
    return csr_matrix( (datas,(rows,cols)), shape=shape )

def transfer_to_implicit(rating_matrix, threshold = 0):
    temp_rating_matrix = sparse.csr_matrix(rating_matrix.shape)
    temp_rating_matrix[(rating_matrix > threshold).nonzero()] = 1
    rating_matrix = temp_rating_matrix
    return rating_matrix

## user_item KNN

In [36]:
#UU similarity
similarity = train(rtrain)
#get a user-item matrix  UI prediction
#Predict using UI matrix with ratings in it 
user_item_prediction_score = predict(rtrain, 10, similarity, item_similarity_en= False)
user_item_predict = prediction(user_item_prediction_score, 50, rtrain)

100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:01<00:00, 3694.86it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:01<00:00, 5327.00it/s]


In [37]:
#Check user item prediction score
user_item_prediction_score.shape

(5791, 3906)

In [38]:
user_item_res = evaluate(user_item_predict, rvalid)

100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7752.07it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7710.94it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7690.52it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7488.08it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7428.95it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7530.86it/s]


In [39]:
user_item_res

{'Precision@5': (0.008521207489491785, 0.001083345161010646),
 'MAP@5': (0.009604508979747803, 0.0014980399129590725),
 'Recall@5': (0.027657621854221014, 0.0038284227312722972),
 'Precision@10': (0.007336645013374094, 0.0007207138421056788),
 'MAP@10': (0.008640277853594629, 0.0010828056779517141),
 'Recall@10': (0.04666506830960404, 0.00490789032159672),
 'Precision@15': (0.006317666539294358, 0.0005550900831680399),
 'MAP@15': (0.007981665407152607, 0.0008838342244627093),
 'Recall@15': (0.05916937263899434, 0.00547178567846764),
 'Precision@20': (0.00550248376003057, 0.0004484214352208597),
 'MAP@20': (0.007436901396458298, 0.0007604941845718976),
 'Recall@20': (0.06834653585704407, 0.005874313750771494),
 'Precision@50': (0.0028467711119602598, 0.0002022636456795373),
 'MAP@50': (0.005292439152889286, 0.0004483106388828079),
 'Recall@50': (0.08762987161439588, 0.006543335984180451),
 'R-Precision': (0.010262858400038375, 0.002159512356554834),
 'NDCG': (0.0364597104610826, 0.00307

## item_based KNN using TF-IDF

In [75]:
IK_MATRIX = X_cleaned_sparse

In [74]:
I_I_similarity = train(IK_MATRIX)
item_based_prediction_score = predict(rtrain, 10, I_I_similarity, item_similarity_en= True)
#for each restuarant top50 users 
item_based_predict = prediction(item_based_prediction_score, 50, rtrain)

100%|████████████████████████████████████████████████████████████████████████████| 3906/3906 [00:03<00:00, 1195.51it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:01<00:00, 4452.89it/s]


In [71]:
item_based_res_TFIDF = evaluate(item_based_predict, rvalid)

100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7670.19it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7463.44it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7540.72it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7482.39it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7368.77it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7312.79it/s]


In [73]:
item_based_res_TFIDF

{'Precision@5': (0.005196790217806649, 0.0008617099481287719),
 'MAP@5': (0.006092854413450516, 0.0012065821524976797),
 'Recall@5': (0.013982822520747624, 0.0026087530571683427),
 'Precision@10': (0.004088651127244936, 0.0005460438377275104),
 'MAP@10': (0.005296656173614523, 0.0008686098122583183),
 'Recall@10': (0.02137126422806591, 0.0031659836297877175),
 'Precision@15': (0.0033116800407591387, 0.00039791432969878886),
 'MAP@15': (0.004731502240099872, 0.0007020225320323408),
 'Recall@15': (0.026695426755132524, 0.0035515715171223422),
 'Precision@20': (0.0028467711119602598, 0.0003219054913786238),
 'MAP@20': (0.004307008630787735, 0.0005986059835155256),
 'Recall@20': (0.030501408539238113, 0.00379772817508023),
 'Precision@50': (0.0020328620557890716, 0.00018068011648347503),
 'MAP@50': (0.0031535092848903887, 0.00035182771127505825),
 'Recall@50': (0.0509775066567195, 0.004766205477582822),
 'R-Precision': (0.006640201761046239, 0.0016763599138188996),
 'NDCG': (0.020998827765

## item_based KNN

In [40]:
X

<21304x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 962735 stored elements in Compressed Sparse Row format>

In [62]:
#3987 items? 5000 key phrase
I_K_matrix = get_I_K(df_train, X, shape = (rating_matrix.shape[1], 5000))
I_I_similarity = train(I_K_matrix)
item_based_prediction_score = predict(rtrain, 10, I_I_similarity, item_similarity_en= True)
#for each restuarant top50 users 
item_based_predict = prediction(item_based_prediction_score, 50, rtrain)

100%|██████████████████████████████████████████████████████████████████████████| 21304/21304 [00:02<00:00, 9085.93it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3906/3906 [00:03<00:00, 1191.86it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:01<00:00, 4422.25it/s]


In [64]:
I_K_matrix.shape

(3906, 5000)

In [65]:
item_based_res = evaluate(item_based_predict, rvalid)

100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7690.55it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7589.95it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7590.02it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7600.18it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7521.46it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7349.86it/s]


In [66]:
item_based_res

{'Precision@5': (0.004661826518914789, 0.0008154305536628502),
 'MAP@5': (0.006100496752006114, 0.0012543375211610756),
 'Recall@5': (0.012692147469564359, 0.002491705609306215),
 'Precision@10': (0.003802063431410011, 0.0005201087060859525),
 'MAP@10': (0.005101192750695999, 0.0008773434438779437),
 'Recall@10': (0.01964220925340528, 0.0030311374548420427),
 'Precision@15': (0.0032607311170551527, 0.000401012671738522),
 'MAP@15': (0.004551910005673858, 0.0007033252979959634),
 'Recall@15': (0.0241127705352008, 0.0033041115431208384),
 'Precision@20': (0.0029709591134887275, 0.0003351607803518368),
 'MAP@20': (0.00418789110910929, 0.0005987757870873808),
 'Recall@20': (0.02839681070002048, 0.0035343793608007884),
 'Precision@50': (0.0020596102407336647, 0.0001872347597298744),
 'MAP@50': (0.0031185600823783475, 0.00035244376851816087),
 'Recall@50': (0.04754841123297371, 0.004547225557607917),
 'R-Precision': (0.005801470957183605, 0.0014985094919413374),
 'NDCG': (0.02006206411854443

## User_based KNN

In [45]:
U_K_matrix = get_I_K(df_train, X, row_name = 'user_num_id', shape = (rating_matrix.shape[0],5000))
U_U_similarity = train(U_K_matrix)
#return UI matrix
user_based_prediction_score = predict(rtrain, 10, U_U_similarity, item_similarity_en= False)
user_based_predict = prediction(user_based_prediction_score, 50, rtrain)

100%|██████████████████████████████████████████████████████████████████████████| 21304/21304 [00:02<00:00, 9120.85it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:03<00:00, 1835.78it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:01<00:00, 5165.85it/s]


In [46]:
U_K_matrix.shape

(5791, 5000)

In [47]:
user_based_res = evaluate(user_based_predict, rvalid)

100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7670.71it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7609.88it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7550.51it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7580.08it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7472.80it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5791/5791 [00:00<00:00, 7359.13it/s]


In [44]:
user_based_res

{'MAP@5': (0.0025735575085976308, 0.0007676156126468466),
 'Precision@5': (0.0024073366450133742, 0.0005795062496393374),
 'Recall@5': (0.0066284467029595045, 0.0018404749909681256),
 'MAP@10': (0.002507725737091423, 0.000559410552105227),
 'Precision@10': (0.002426442491402369, 0.0004088981513066637),
 'Recall@10': (0.012451814945127898, 0.0024279126495880673),
 'MAP@15': (0.0024161872107993625, 0.00046230520394713445),
 'Precision@15': (0.0021525920264934403, 0.00031802087357710076),
 'Recall@15': (0.01635767853765561, 0.0027477111797367055),
 'MAP@20': (0.0023298605384057315, 0.0004018414719961806),
 'Precision@20': (0.0019296904852884986, 0.00026052312171035455),
 'Recall@20': (0.01908269467975238, 0.0029418874130037625),
 'MAP@50': (0.0019804868107647835, 0.0002494590150419936),
 'Precision@50': (0.0015819640810087888, 0.00015258737043166547),
 'Recall@50': (0.039183685481927744, 0.004186060385583187),
 'R-Precision': (0.0027842566722964123, 0.001053550976552734),
 'NDCG': (0.0138

## Combined KNN 

### lambda = .001

In [None]:
## Change to implicit first
I_K_matrix_implicit = transfer_to_implicit(I_K_matrix)

In [None]:
I_K_U_matrix = add_two_matrix(0.001, rtrain_implicit, I_K_matrix_implicit, shape = (3987, 5000+6049))
I_I_matrix_combined = train(I_K_U_matrix)
combined_prediction_score = predict(rtrain_implicit, 10, I_I_matrix_combined, item_similarity_en= True)
combined_predict = prediction(combined_prediction_score, 50, rtrain_implicit)

In [None]:
combined_res = evaluate(combined_predict, rvalid_implicit)

In [None]:
combined_res

## lambda = 0.5

In [None]:
I_K_U_matrix = add_two_matrix(0.5, rtrain_implicit, I_K_matrix_implicit, shape = (3987, 5000+6049))
I_I_matrix_combined = train(I_K_U_matrix)
combined_prediction_score = predict(rtrain_implicit, 10, I_I_matrix_combined, item_similarity_en= True)
combined_predict = prediction(combined_prediction_score, 50, rtrain_implicit)

In [None]:
combined_res = evaluate(combined_predict, rvalid_implicit)

In [None]:
combined_res

## lambda = .99

In [None]:
I_K_U_matrix = add_two_matrix(0.99, rtrain_implicit, I_K_matrix_implicit, shape = (3987, 5000+6049))
I_I_matrix_combined = train(I_K_U_matrix)
combined_prediction_score = predict(rtrain_implicit, 10, I_I_matrix_combined, item_similarity_en= True)
combined_predict = prediction(combined_prediction_score, 50, rtrain_implicit)

In [None]:
combined_res = evaluate(combined_predict, rvalid_implicit)

In [None]:
combined_res

## lambda = .3

In [None]:
I_K_U_matrix = add_two_matrix(0.3, rtrain_implicit, I_K_matrix_implicit, shape = (3987, 5000+6049))
I_I_matrix_combined = train(I_K_U_matrix)
combined_prediction_score = predict(rtrain_implicit, 10, I_I_matrix_combined, item_similarity_en= True)
combined_predict = prediction(combined_prediction_score, 50, rtrain_implicit)
combined_res = evaluate(combined_predict, rvalid_implicit)

In [None]:
combined_res

In [57]:
x = np.array([10, 20, 4, 3, 7, 8, 11])

In [58]:
np.argpartition(x, 0)

array([3, 1, 2, 0, 4, 5, 6], dtype=int64)

In [61]:
x[np.argpartition(x, 4)]

array([ 7,  3,  4,  8, 10, 20, 11])