In [1]:
pip install geopy

Collecting geopy
  Downloading https://files.pythonhosted.org/packages/80/93/d384479da0ead712bdaf697a8399c13a9a89bd856ada5a27d462fb45e47b/geopy-1.20.0-py2.py3-none-any.whl (100kB)
Collecting geographiclib<2,>=1.49
  Downloading https://files.pythonhosted.org/packages/8b/62/26ec95a98ba64299163199e95ad1b0e34ad3f4e176e221c40245f211e425/geographiclib-1.50-py3-none-any.whl
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.50 geopy-1.20.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ast import literal_eval
from os import listdir
from os.path import isfile, join
from scipy.sparse import csr_matrix, load_npz, save_npz
from tqdm import tqdm
#from scipy import integrate
import statistics as stats
#import seaborn as sns
import datetime
import json
import numpy as np
import pandas as pd
import time
import yaml
import scipy.sparse as sparse
import sys
from geopy.geocoders import Nominatim
from geopy import distance
from geopy import Point

## Dataset files 

In [3]:
#reviewJson = "..\\data\\Export_CleanedReview.json"
#reviewJsonWithClosedRes = "..\\data\\Export_CleanedReviewWithClosedRes.json"
#reviewJsonToronto = "..\\data\\Export_TorontoData.json"
reviewJsonToronto = "..\\data\\Cleaned_Toronto_Reviews.json"


## Load Data
Select top frenquent user and top frequenty restaurants that had at least 1 review >= 4 stars (Kickking out users that gave all  reviews <=3 and restaurants that never got start >= 4 stars)

In [4]:
def get_yelp_df(path = 'data/', filename = 'Export_CleanedReview.json', sampling=False, top_user_num=7000, top_item_num=5000):
    """
    Get the pandas dataframe
    Sampling only the top users/items by density 
    Implicit representation applies
    """
    with open(filename,'r') as f:
        data = f.readlines()
        data = list(map(json.loads, data))
    
    data = data[0]
    #Get all the data from the dggeata file
    df = pd.DataFrame(data)
    
    df.rename(columns={'stars': 'review_stars', 'text': 'review_text', 'cool': 'review_cool',
                       'funny': 'review_funny', 'useful': 'review_useful'},
              inplace=True)

    df['business_num_id'] = df.business_id.astype('category').\
        cat.rename_categories(range(0, df.business_id.nunique()))
    df['business_num_id'] = df['business_num_id'].astype('int')

    df['user_num_id'] = df.user_id.astype('category').\
    cat.rename_categories(range(0, df.user_id.nunique()))
    df['user_num_id'] = df['user_num_id'].astype('int')

    df['timestamp'] = df['date'].apply(date_to_timestamp)

    if sampling:
        df = filter_yelp_df(df, top_user_num=top_user_num, top_item_num=top_item_num)
        # Refresh num id
        df['business_num_id'] = df.business_id.astype('category').\
        cat.rename_categories(range(0, df.business_id.nunique()))
        df['business_num_id'] = df['business_num_id'].astype('int')
        
        df['user_num_id'] = df.user_id.astype('category').\
        cat.rename_categories(range(0, df.user_id.nunique()))
        df['user_num_id'] = df['user_num_id'].astype('int')
#     drop_list = ['date','review_id','review_funny','review_cool','review_useful']
#     df = df.drop(drop_list, axis=1)

    df = df.reset_index(drop = True)

    return df 

def filter_yelp_df(df, top_user_num=7000, top_item_num=5000):
    #Getting the reviews where starts are above 3
    df_implicit = df[df['review_stars']>3]
    frequent_user_id = df_implicit['user_num_id'].value_counts().head(top_user_num).index.values
    frequent_item_id = df_implicit['business_num_id'].value_counts().head(top_item_num).index.values
    return df.loc[(df['user_num_id'].isin(frequent_user_id)) & (df['business_num_id'].isin(frequent_item_id))]

def date_to_timestamp(date):
    dt = datetime.datetime.strptime(date, '%Y-%m-%d')
    return time.mktime(dt.timetuple())

def df_to_sparse(df, row_name='userId', col_name='movieId', value_name='rating',
                 shape=None):
    rows = df[row_name]
    cols = df[col_name]
    if value_name is not None:
        values = df[value_name]
    else:
        values = [1]*len(rows)

    return csr_matrix((values, (rows, cols)), shape=shape)

## Get rating-UI and timestamp-UI matrix from original df

In [5]:
def get_rating_timestamp_matrix(df, sampling=False, top_user_num=7000, top_item_num=5000):
    """
    """
    #make the df implicit with top frenquent users and 
    #no need to sample anymore if df was sampled before 
    if sampling:
        df = filter_yelp_df(df, top_user_num=top_user_num, top_item_num=top_item_num)

    rating_matrix = df_to_sparse(df, row_name='user_num_id',
                                 col_name='business_num_id',
                                 value_name='review_stars',
                                 shape=None)
    
    #Have same dimension and data entries with rating_matrix, except that the review stars are - user avg
#     ratingWuserAvg_matrix = df_to_sparse(df, row_name='user_num_id',
#                                  col_name='business_num_id',
#                                  value_name='reviewStars_userAvg',
#                                  shape=None)
    
    timestamp_matrix = df_to_sparse(df, row_name='user_num_id',
                                    col_name='business_num_id',
                                    value_name='timestamp',
                                    shape=None)
    
    
    IC_matrix, IC_dictionary = get_I_C(df)
#     ratingWuserAvg_matrix
    return rating_matrix, timestamp_matrix, IC_matrix, IC_dictionary

In [6]:
def get_I_C(df):
    lst = df.categories.values.tolist()
    cat = []
    for i in range(len(lst)):
        if lst[i] is None:
            print(i)
        cat.extend(lst[i].split(', '))
        
    unique_cat = set(cat)
    #     set categories id
    df_cat = pd.DataFrame(list(unique_cat),columns=["Categories"])
    df_cat['cat_id'] = df_cat.Categories.astype('category').cat.rename_categories(range(0, df_cat.Categories.nunique()))
    dict_cat = df_cat.set_index('Categories')['cat_id'].to_dict()
    
    df_I_C = pd.DataFrame(columns=['business_num_id', 'cat_id'])
    
    for i in range((df['business_num_id'].unique().shape)[0]):
        df_temp = df[df['business_num_id'] == i].iloc[:1]
        temp_lst = df_temp['categories'].to_list()[0].split(",")
        for j in range(len(temp_lst)):
            df_I_C = df_I_C.append({'business_num_id' : i  , 'cat_id' : dict_cat[temp_lst[j].strip()]} , ignore_index=True)
    
    IC_Matrix = df_to_sparse(df_I_C, row_name='business_num_id',
                                 col_name='cat_id',
                                 value_name=None,
                                 shape=None)    
    return IC_Matrix, dict_cat

# Construct IC, IP, IR, ID... for Critiquing

In [7]:
def get_IP_matrix_dictionary(df,item_sim):
    # get an initial item price dataframe(without onehot encoding)
    # drop duplicates
    df_temp = df[['business_num_id', 'price']].drop_duplicates()
    # with nontype with string "NaN"
    df_temp.fillna(value = "NaN", inplace = True)
    
    for i in tqdm(range(df_temp.shape[0])):
    # find all the items with no price
        if df_temp[df_temp['business_num_id'] == i]['price'].values[0] == "NaN":
            # get the index of the second large number in the similarity matrix
            temp_l = list(item_sim[i])
            index = [temp_l.index(x) for x in sorted(temp_l, reverse=True)[:2]][1]
            # get the dollar sign of the similar item
            dollar_of_sim_item = df_temp[df_temp['business_num_id'] == index]['price'].values[0]
            # replace the Nan
            df_temp.loc[df_temp['business_num_id'] == i,"price"] = dollar_of_sim_item

    # assign single dollar sign($) to the ones still with no price tag(since there is no items that are similar to this item)
    df_temp.loc[df_temp["price"] == "NaN","price"] = "$"

    # One hot encoding
    #note that the last column is price__$$$$
    #cat_columns = ["price"]
    #df_processed = pd.get_dummies(df_temp, prefix_sep="_",
    #                          columns=cat_columns)
    
    df_processed = df_temp.copy()
    df_processed['Price'] = df_processed.apply (lambda row: len(row.price), axis=1)
    
    #drop the $ column
    df_processed = df_processed.drop('price', 1)
    
    #Adding additional column of price label, range 1-4
    #df_preprocessed['Price_label'] = df.apply (lambda row: label_price(row), axis=1)
    df_processed.set_index("business_num_id", drop=True, inplace=True)
    I_P_dictionary = df_processed.to_dict()['Price']
    df_processed.reset_index(level=0, inplace=True)
    
    return df_processed, I_P_dictionary

In [8]:
def get_IS_dictionary(df):
    df_IS = df[['business_num_id', 'business_stars']].drop_duplicates()
    df_IS.set_index("business_num_id", drop=True, inplace=True)
    IS_dictionary = df_IS.to_dict()['business_stars']
    df_IS.reset_index(level=0, inplace=True)
    
    return IS_dictionary

In [9]:
# Input a list of prediction matrix
def get_ID_dictionary(df,prediction_matrix,intersection):
    ID_dictionary = dict()
    length = len(prediction_matrix)
    
    for j in tqdm(range(length)):
        #Save the coordinates of the business id to a dictionary 
        coordinateDict = yaml.safe_load(df[df["business_num_id"] == prediction_matrix[j]].iloc[0].coordinates)
    
        #Load the business latitude and longitude
        test_point = Point(coordinateDict['latitude'],coordinateDict['longitude'])

        #Get the distance with the test point, unit in km 
        result = round(distance.distance(intersection,test_point).kilometers,1)

        ID_dictionary[prediction_matrix[j]] = result
        
    return ID_dictionary

## Time ordered split 

In [10]:
def time_ordered_splitModified(rating_matrix, ratingWuserAvg_matrix, timestamp_matrix, ratio=[0.5, 0.2, 0.3],
                       implicit=True, remove_empty=False, threshold=3,
                       sampling=False, sampling_ratio=0.1, trainSampling=1):
    """
    Split the data to train,valid,test by time
    ratio:  train:valid:test
    threshold: for implicit representation
    """
    
    
    if implicit:
        temp_rating_matrix = sparse.csr_matrix(rating_matrix.shape)
        temp_rating_matrix[(rating_matrix > threshold).nonzero()] = 1
        rating_matrix = temp_rating_matrix
        timestamp_matrix = timestamp_matrix.multiply(rating_matrix)
        #ratingWuserAvg_matrix = ratingWuserAvg_matrix.multiply(rating_matrix)

    nonzero_index = None

    #Default false, not removing empty columns and rows
    #Should not have this case, since users should have at least 1 record of 4,5 
    #And restuarant should have at least 1 record of 4,5 
    if remove_empty:
        # Remove empty columns. record original item index
        nonzero_index = np.unique(rating_matrix.nonzero()[1])
        rating_matrix = rating_matrix[:, nonzero_index]
        timestamp_matrix = timestamp_matrix[:, nonzero_index]
        ratingWuserAvg_matrix = ratingWuserAvg_matrix[:, nonzero_index]

        # Remove empty rows. record original user index
        nonzero_rows = np.unique(rating_matrix.nonzero()[0])
        rating_matrix = rating_matrix[nonzero_rows]
        timestamp_matrix = timestamp_matrix[nonzero_rows]
        ratingWuserAvg_matrix = ratingWuserAvg_matrix[nonzero_rows]

    user_num, item_num = rating_matrix.shape

    rtrain = []
    rtrain_userAvg = []
    rtime = []
    rvalid = []
    rvalid_userAvg = []
    rtest = []
    rtest_userAvg = []
    # Get the index list corresponding to item for train,valid,test
    item_idx_train = []
    item_idx_valid = []
    item_idx_test = []
    
    for i in tqdm(range(user_num)):
        #Get the non_zero indexs, restuarants where the user visited/liked if implicit 
        item_indexes = rating_matrix[i].nonzero()[1]        
        #Get the data for the user
        data = rating_matrix[i].data      
        #Get time stamp value 
        timestamp = timestamp_matrix[i].data 
        #Get review stars with user avg data 
        if implicit == False:
            dataWuserAvg = ratingWuserAvg_matrix[i].data

            
        #Non zero reviews for this user
        num_nonzeros = len(item_indexes)
        
        #If the user has at least one review
        if num_nonzeros >= 1:
            num_test = int(num_nonzeros * ratio[2])
            num_valid = int(num_nonzeros * (ratio[1] + ratio[2]))
            valid_offset = num_nonzeros - num_valid
            
            # Adding this for sampling for training set
            valid_offsetSample = int(valid_offset*trainSampling)
            test_offset = num_nonzeros - num_test
            
            #Sort the timestamp for each review for the user
            argsort = np.argsort(timestamp)
            
            #Sort the reviews for the user according to the time stamp 
            data = data[argsort]
            
            #Sort the review with user avg accoridng to the time stamp
            if implicit == False:
                dataWuserAvg = dataWuserAvg[argsort]
            
            #Non-zero review index sort according to time
            item_indexes = item_indexes[argsort]
            
            #list of ratings, num of valid_offset index, index where there's non-zeros
            #if take from old to new
            #rtrain.append([data[:valid_offsetSample], np.full(valid_offsetSample, i), item_indexes[:valid_offsetSample]])
            #if take from new to old
            rtrain.append([data[valid_offset-valid_offsetSample:valid_offset], np.full(valid_offsetSample, i), item_indexes[valid_offset-valid_offsetSample:valid_offset]])
            rvalid.append([data[valid_offset:test_offset], np.full(test_offset - valid_offset, i),
                           item_indexes[valid_offset:test_offset]])
            rtest.append([data[test_offset:], np.full(num_test, i), item_indexes[test_offset:]])
            
            if implicit == False:
                #Now for the rating matrix that considers user average rating
                #list of ratings, num of valid_offset index, index where there's non-zeros
                #from old to new
                #rtrain_userAvg.append([dataWuserAvg[:valid_offsetSample], np.full(valid_offsetSample, i), item_indexes[:valid_offsetSample]])
                #take nearest
                rtrain_userAvg.append([dataWuserAvg[valid_offset-valid_offsetSample:valid_offset], np.full(valid_offsetSample, i), item_indexes[valid_offset-valid_offsetSample:valid_offset]])                
                    
                rvalid_userAvg.append([dataWuserAvg[valid_offset:test_offset], np.full(test_offset - valid_offset, i),
                               item_indexes[valid_offset:test_offset]])
                
                rtest_userAvg.append([dataWuserAvg[test_offset:], np.full(num_test, i), item_indexes[test_offset:]])
                
            item_idx_train.append(item_indexes[:valid_offsetSample])
            item_idx_valid.append(item_indexes[:test_offset])
            item_idx_test.append(item_indexes[test_offset:])
            
        else:
            item_idx_train.append([])
    
    rtrain = np.array(rtrain)
    rvalid = np.array(rvalid)
    rtest = np.array(rtest)
   
    if implicit == False:
        rtrain_userAvg = np.array(rtrain_userAvg)
        rvalid_userAvg = np.array(rvalid_userAvg)
        rtest_userAvg = np.array(rtest_userAvg)

    #take non-zeros values, row index, and column (non-zero) index and store into sparse matrix
    rtrain = sparse.csr_matrix((np.hstack(rtrain[:, 0]), (np.hstack(rtrain[:, 1]), np.hstack(rtrain[:, 2]))),
                               shape=rating_matrix.shape, dtype=np.float32)
    rvalid = sparse.csr_matrix((np.hstack(rvalid[:, 0]), (np.hstack(rvalid[:, 1]), np.hstack(rvalid[:, 2]))),
                               shape=rating_matrix.shape, dtype=np.float32)
    rtest = sparse.csr_matrix((np.hstack(rtest[:, 0]), (np.hstack(rtest[:, 1]), np.hstack(rtest[:, 2]))),
                              shape=rating_matrix.shape, dtype=np.float32)
    
    if implicit == False:
        rtrain_userAvg = sparse.csr_matrix((np.hstack(rtrain_userAvg[:, 0]), (np.hstack(rtrain_userAvg[:, 1]), np.hstack(rtrain_userAvg[:, 2]))),
                                   shape=rating_matrix.shape, dtype=np.float32)
        rvalid_userAvg = sparse.csr_matrix((np.hstack(rvalid_userAvg[:, 0]), (np.hstack(rvalid_userAvg[:, 1]), np.hstack(rvalid_userAvg[:, 2]))),
                                   shape=rating_matrix.shape, dtype=np.float32)
        rtest_userAvg = sparse.csr_matrix((np.hstack(rtest_userAvg[:, 0]), (np.hstack(rtest_userAvg[:, 1]), np.hstack(rtest_userAvg[:, 2]))),
                                  shape=rating_matrix.shape, dtype=np.float32)

    return rtrain, rvalid, rtest,rtrain_userAvg, rvalid_userAvg, rtest_userAvg, nonzero_index, timestamp_matrix, item_idx_train, item_idx_valid, item_idx_test


### Popularity Calculation

In [11]:
def get_three_popularity_matrix(df_original,rtrain):
    # get the list of popular items by ranking the number of reviews
    numUsers = rtrain.shape[0]
    numItems = rtrain.shape[1]
    
    dff_popular = df_original.copy()
    dff_popular = dff_popular.sort_values(by=["review_count_y"], ascending=False).drop_duplicates(subset = 'business_id', keep = 'first')
    popular_list_num_of_reviews = dff_popular["business_num_id"].tolist()
    
    # get the list of popular items by ranking average rating score
    dff_popular_rating = df_original.copy()
    dff_popular_rating = dff_popular_rating.sort_values(by=["business_stars"], ascending=False).drop_duplicates(subset = 'business_id', keep = 'first')
    popular_list_avg_stars = dff_popular_rating["business_num_id"].tolist()
    
    lst_temp = []
    for item in tqdm(range(numItems)):
        numOfUsersRated = len(rtrain.toarray()[:, item].nonzero()[0])
        if numOfUsersRated <= 50:
            lst_temp.append(item)
    popular_list_avg_stars = [x for x in popular_list_avg_stars if x not in lst_temp]
    
    # get the popularity items by using the percentage liked method(number of liked items / total items)
    predictionMatrix = np.zeros((numUsers , numItems))

    # Define function for converting 1-5 rating to 0/1 (like / don't like)
    vf = np.vectorize(lambda x: 1 if x >= 4 else 0)
    rtrain_array = rtrain.toarray()
    # For every item calculate the number of people liked (4-5) divided by the number of people that rated
    itemPopularity = np.zeros((numItems))
    for item in range(numItems):
        numOfUsersRated = len(rtrain_array[:, item].nonzero()[0])
        numOfUsersLiked = len(vf(rtrain_array[:, item]).nonzero()[0])
#         if numOfUsersRated == 0:
        # set a threshold to filter out restaurants with very few reviews
        if numOfUsersRated <= 30:
            itemPopularity[item] = 0
        else:
            itemPopularity[item] = numOfUsersLiked/numOfUsersRated
    popular_list_liked_ratio = itemPopularity.argsort()
    
    return np.asarray(popular_list_num_of_reviews),np.asarray(popular_list_avg_stars),popular_list_liked_ratio

## Get df for training corpus

In [12]:
#Item idex matrix stores the reivews starts
#This function returns a list of index for the reviews included in training set 
def get_corpus_idx_list(df, item_idx_matrix):
    """
    Input: 
    df: total dataframe
    item_idx_matrix: train index list got from time_split 
    Output: row index in original dataframe for training data by time split
    """
    lst = []
    #For all the users: 5791
    for i in tqdm(range(len(item_idx_matrix))):
        
        #find row index where user_num_id is i
        a = df.index[df['user_num_id'] == i].tolist()
        
        #loop through the busienss id that the user i reviewed for in offvalid set 
        for item_idx in  item_idx_matrix[i]:
            
            #get the row index for reviews for business that the user liked in the train set
            b = df.index[df['business_num_id'] == item_idx].tolist()
            
            #Find the index for which this user liked, one user only rate a business once
            idx_to_add = list(set(a).intersection(b))
            
            if idx_to_add not in lst:
                lst.extend(idx_to_add)
    return lst

## Preprocess using Term Frequency - CounterVectorizer

In [13]:
#Stemming and Lemmatisation
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
# Get corpus and CountVector
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('wordnet')
nltk.download('stopwords')
lem = WordNetLemmatizer()
stem = PorterStemmer()
stop_words = set(stopwords.words("english"))
new_words = ['not_the']
stop_words = stop_words.union(new_words)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shenti10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shenti10\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [14]:
from tqdm import tqdm

#Should 'because' added?
def preprocess(df, reset_list = [',','.','?',';','however','but']):
    corpus = []
    for i in tqdm(range(df.shape[0])):
        text = df['review_text'][i]
        change_flg = 0
        #Convert to lowercase
        text = text.lower()
        
        ##Convert to list from string, loop through the review text
        text = text.split()
        
        #any sentence that encounters a not, the folloing words will become not phrase until hit the sentence end
        for j in range(len(text)):
            #Make the not_ hack
            if text[j] == 'not':
                change_flg = 1
#                 print 'changes is made after ', i
                continue
            #if was 1 was round and not hit a 'not' in this round
            if change_flg == 1 and any(reset in text[j] for reset in reset_list):
                text[j] = 'not_' + text[j]
                change_flg = 0
#                 print 'reset at ', i
            if change_flg == 1:
                text[j] = 'not_' + text[j]
        
        #Convert back to string
        text = " ".join(text)
        
        #Remove punctuations
#       text = re.sub('[^a-zA-Z]', ' ', text)
        
        #remove tags
        text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
        
        # remove special characters and digits
        text=re.sub("(\\d|\\W)+"," ",text)
        
        ##Convert to list from string
        text = text.split()
        
        ##Stemming
        ps=PorterStemmer()
        
        #Lemmatisation
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in  
                stop_words] 
        text = " ".join(text)
        corpus.append(text)
    return corpus

## All Model

In [15]:
from sklearn.metrics.pairwise import cosine_similarity
def train(matrix_train):
    similarity = cosine_similarity(X=matrix_train, Y=None, dense_output=True)
    return similarity

def get_I_K(df, X, row_name = 'business_num_id', binary = True, shape = (121994,6000)):
    """
    get the item-keyphrase matrix
    """
    rows = []
    cols = []
    vals = []
    #For each review history
    for i in tqdm(range(X.shape[0])):
        #Get the array of frequencies for document/review i 
        arr = X[i].toarray() 
        nonzero_element = arr.nonzero()[1]  # Get nonzero element in each line, keyphrase that appears index 
        length_of_nonzero = len(nonzero_element) #number of important keyphrase that appears
        
        # df[row_name][i] is the item idex
        #Get a list row index that indicates the document/review
        rows.extend(np.array([df[row_name][i]]*length_of_nonzero)) ## Item index
        #print(rows)
        
        #Get a list of column index indicating the key phrase that appears in i document/review
        cols.extend(nonzero_element) ## Keyword Index
        if binary:
            #Create a bunch of 1s
            vals.extend(np.array([1]*length_of_nonzero))
        else:
            #If not binary 
            vals.extend(arr[arr.nonzero()])    
    return csr_matrix((vals, (rows, cols)), shape=shape)


#Get a UI matrix if it's not item_similarity based or else IU
def predict(matrix_train, k, similarity, item_similarity_en = False):
    prediction_scores = []
    
    #inverse to IU matrix
    if item_similarity_en:
        matrix_train = matrix_train.transpose()
        
    #for each user or item, depends UI or IU 
    for user_index in tqdm(range(matrix_train.shape[0])):
        # Get user u's prediction scores for all items
        #Get prediction/similarity score for each user 1*num or user or num of items
        vector_u = similarity[user_index]

        # Get closest K neighbors excluding user u self
        #Decending accoding to similarity score, select top k
        similar_users = vector_u.argsort()[::-1][1:k+1]
        
        # Get neighbors similarity weights and ratings
        similar_users_weights = similarity[user_index][similar_users]
        
        #similar_users_weights_sum = np.sum(similar_users_weights)
        #print(similar_users_weights.shape)
        #shape: num of res * k
        similar_users_ratings = matrix_train[similar_users].toarray()
              
        prediction_scores_u = similar_users_ratings * similar_users_weights[:, np.newaxis]
        #print(prediction_scores_u)
        
        
        prediction_scores.append(np.sum(prediction_scores_u, axis=0))
        
    res = np.array(prediction_scores)
    
    if item_similarity_en:
        res = res.transpose()
    return res


#Preidction score is UI or IU?
def prediction(prediction_score, topK, matrix_Train):

    prediction = []

    #for each user
    for user_index in tqdm(range(matrix_Train.shape[0])):
        
        #take the prediction scores for user 1 * num res
        vector_u = prediction_score[user_index]
        
        #The restuarant the user rated
        vector_train = matrix_Train[user_index]
        
        if len(vector_train.nonzero()[0]) > 0:
            vector_predict = sub_routine(vector_u, vector_train, topK=topK)
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)

    return np.vstack(prediction)

def prediction_modified(prediction_score, matrix_Train, user_id, topK = 50):
    prediction = []

    #for each user
    for user_index in tqdm(range(matrix_Train.shape[0])):
        
        #take the prediction scores for user 1 * num res
        vector_u = prediction_score[user_index]
        
        #The restuarant the user rated
        vector_train = matrix_Train[user_index]
        
        if len(vector_train.nonzero()[0]) > 0:
            vector_predict = sub_routine_modified(vector_u, vector_train)
        else:
            vector_predict = np.zeros(topK, dtype=np.float32)

        prediction.append(vector_predict)

    return prediction[user_id]

#topK: the number of restuarants we are suggesting 
#if vector_train has number, then the user has visited
def sub_routine(vector_u, vector_train, topK=500):

    #index where non-zero
    train_index = vector_train.nonzero()[1]
    
    vector_u = vector_u
    
    #get topk + num rated res prediction score descending, top index 
    candidate_index = np.argpartition(-vector_u, topK+len(train_index))[:topK+len(train_index)]
    
    #sort top prediction score index in range topK+len(train_index) into vector_u`
    vector_u = candidate_index[vector_u[candidate_index].argsort()[::-1]]
    
    #deleted the rated res from the topk+train_index prediction score vector for user u 
    #Delete the user rated res index from the topk+numRated index
    vector_u = np.delete(vector_u, np.isin(vector_u, train_index).nonzero()[0])

    #so we only include the top K prediction score here
    return vector_u[:topK]

def sub_routine_modified(vector_u, vector_train):

    #index where non-zero
    train_index = vector_train.nonzero()[1]
    
    vector_u = vector_u
    
    #get topk + num rated res prediction score descending, top index 
    candidate_index = np.argpartition(-vector_u, -1)
    
    #sort top prediction score index in range topK+len(train_index) into vector_u`
    vector_u = candidate_index[vector_u[candidate_index].argsort()[::-1]]
    
    #deleted the rated res from the topk+train_index prediction score vector for user u 
    #Delete the user rated res index from the topk+numRated index
    vector_u = np.delete(vector_u, np.isin(vector_u, train_index).nonzero()[0])

    #so we only include the top K prediction score here
    return vector_u

## Evaluation

In [16]:
def recallk(vector_true_dense, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_true_dense)


def precisionk(vector_predict, hits, **unused):
    hits = len(hits.nonzero()[0])
    return float(hits)/len(vector_predict)


def average_precisionk(vector_predict, hits, **unused):
    precisions = np.cumsum(hits, dtype=np.float32)/range(1, len(vector_predict)+1)
    return np.mean(precisions)


def r_precision(vector_true_dense, vector_predict, **unused):
    vector_predict_short = vector_predict[:len(vector_true_dense)]
    hits = len(np.isin(vector_predict_short, vector_true_dense).nonzero()[0])
    return float(hits)/len(vector_true_dense)


def _dcg_support(size):
    arr = np.arange(1, size+1)+1
    return 1./np.log2(arr)


def ndcg(vector_true_dense, vector_predict, hits):
    idcg = np.sum(_dcg_support(len(vector_true_dense)))
    dcg_base = _dcg_support(len(vector_predict))
    dcg_base[np.logical_not(hits)] = 0
    dcg = np.sum(dcg_base)
    return dcg/idcg


def click(hits, **unused):
    first_hit = next((i for i, x in enumerate(hits) if x), None)
    if first_hit is None:
        return 5
    else:
        return first_hit/10


def evaluate(matrix_Predict, matrix_Test, metric_names =['R-Precision', 'NDCG', 'Precision', 'Recall', 'MAP'], atK = [5, 10, 15, 20, 50], analytical=False):
    """
    :param matrix_U: Latent representations of users, for LRecs it is RQ, for ALSs it is U
    :param matrix_V: Latent representations of items, for LRecs it is Q, for ALSs it is V
    :param matrix_Train: Rating matrix for training, features.
    :param matrix_Test: Rating matrix for evaluation, true labels.
    :param k: Top K retrieval
    :param metric_names: Evaluation metrics
    :return:
    """
    global_metrics = {
        #"R-Precision": r_precision,
        #"NDCG": ndcg,
        #"Clicks": click
    }

    local_metrics = {
        #"Precision": precisionk,
        #"Recall": recallk,
        "MAP": average_precisionk
    }

    output = dict()

    num_users = matrix_Predict.shape[0]

    for k in atK:

        local_metric_names = list(set(metric_names).intersection(local_metrics.keys()))
        results = {name: [] for name in local_metric_names}
        topK_Predict = matrix_Predict[:, :k]

        for user_index in tqdm(range(topK_Predict.shape[0])):
            vector_predict = topK_Predict[user_index]
            if len(vector_predict.nonzero()[0]) > 0:
                vector_true = matrix_Test[user_index]
                vector_true_dense = vector_true.nonzero()[1]
                hits = np.isin(vector_predict, vector_true_dense)

                if vector_true_dense.size > 0:
                    for name in local_metric_names:
                        results[name].append(local_metrics[name](vector_true_dense=vector_true_dense,
                                                                 vector_predict=vector_predict,
                                                                 hits=hits))
        results_summary = dict()
        if analytical:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = round(results[name],4)
        else:
            for name in local_metric_names:
                results_summary['{0}@{1}'.format(name, k)] = (round((np.average(results[name])),4),
                                                              round((1.96*np.std(results[name])/np.sqrt(num_users)),4))
        output.update(results_summary)

    global_metric_names = list(set(metric_names).intersection(global_metrics.keys()))
    results = {name: [] for name in global_metric_names}

    topK_Predict = matrix_Predict[:]

    for user_index in tqdm(range(topK_Predict.shape[0])):
        vector_predict = topK_Predict[user_index]

        if len(vector_predict.nonzero()[0]) > 0:
            vector_true = matrix_Test[user_index]
            vector_true_dense = vector_true.nonzero()[1]
            hits = np.isin(vector_predict, vector_true_dense)

            # if user_index == 1:
            #     import ipdb;
            #     ipdb.set_trace()

            if vector_true_dense.size > 0:
                for name in global_metric_names:
                    results[name].append(global_metrics[name](vector_true_dense=vector_true_dense,
                                                              vector_predict=vector_predict,
                                                              hits=hits))
    results_summary = dict()
    if analytical:
        for name in global_metric_names:
            results_summary[name] = round(results[name],4)
    else:
        for name in global_metric_names:
            results_summary[name] = (round(np.average(results[name]),4), round((1.96*np.std(results[name])/np.sqrt(num_users)),4))
    output.update(results_summary)

    return output

In [17]:
#Pass in a vector of predicted restaurant for this specific user
#We specify the index of the recommended restaurant we want to display 
def displayRestaurantInfo(RecommendItem_Index, user_item_predictionMatrix,df,ID_dictionary):
    #RecommendItem_Index: 0,1,2,3.. changes as user critiques the restaurant itself
    print('------------------------------------------------------')
    recommend_item = user_item_predictionMatrix[RecommendItem_Index]
    print('Business_num_id: ', recommend_item)
    print('Restaurant name:', df[df['business_num_id'] == recommend_item].name.unique()[0],\
      '\nCuisine Type: ', df[df['business_num_id'] == recommend_item].categories.unique()[0],\
      '\nPrice:', df[df['business_num_id'] == recommend_item].price.unique()[0],\
      '\nRating:', df[df['business_num_id'] == recommend_item].business_stars.unique()[0],\
      '\nDistance:', ID_dictionary[recommend_item], 'km'
     )
    print('------------------------------------------------------')

#### Get original dataframe out of the review datastet

In [18]:
df = get_yelp_df(path ='', filename=reviewJsonToronto, sampling= True)

In [19]:
df.shape[0]

205968

### Investigating the cities

In [20]:
# np.set_printoptions(threshold=sys.maxsize)
# listCity = []
# for location in df.location.unique():
#     city = yaml.safe_load(location)['city']
#     if city not in listCity:
#         listCity.append(city)
# listCity

In [21]:
df.head(2)

Unnamed: 0,Day,Month,Unnamed: 0.1,Unnamed: 0_x,Unnamed: 0_y,Updated,Year,alias,business_id,business_stars,...,review_text,transactions,ufc,url,user_id,user_loc,vote_count,business_num_id,user_num_id,timestamp
0,23,8,6,6,21653,False,2016,happy-lemon-markham,Xo1LNzhnwE-ilqsM3ybs9Q,3.5,...,I ordered the lemon mango slush and the lemon ...,[],"[1, 1, 1]",https://www.yelp.com/biz/happy-lemon-markham?a...,zsJFjhBQEFQ6gJ7BsNM_Ug,"Toronto, Canada",1.0,2682,6984,1471925000.0
1,2,10,7,7,21653,False,2016,happy-lemon-markham,Xo1LNzhnwE-ilqsM3ybs9Q,3.5,...,"Came here on a Sunday afternoon, it wasn't bus...",[],"[1, 0, 0]",https://www.yelp.com/biz/happy-lemon-markham?a...,P7YuMh74-I2cDq7oU8frww,"York Regional Municipality, Canada",1.0,2682,2845,1475381000.0


#### Get rating-UI matrix and timestepm-UI matrix

In [22]:
rating_matrix, timestamp_matrix , I_C_matrix, IC_dictionary = get_rating_timestamp_matrix(df)

# get ratingWuserAvg_matrix
rating_array = rating_matrix.toarray()
user_average_array = rating_array.sum(axis = 1)/np.count_nonzero(rating_array,axis = 1)
init_UI = np.zeros(rating_array.shape)
init_UI[rating_array.nonzero()] = 1

#Creating rating with user average array array
for i in range(user_average_array.shape[0]):
    init_UI[i] = init_UI[i] * (user_average_array[i]-0.001) 
user_average_array = init_UI
ratingWuserAvg_array = rating_array - user_average_array
ratingWuserAvg_matrix=sparse.csr_matrix(ratingWuserAvg_array)

In [23]:
IC_dictionary

{'Chicken Wings': 47,
 'Tabletop Games': 199,
 'Syrian': 198,
 'Salad': 170,
 'Sandwiches': 172,
 'Nicaraguan': 144,
 'Chocolatiers & Shops': 49,
 'Desserts': 67,
 'Bistros': 23,
 'Sri Lankan': 192,
 'Chicken Shop': 46,
 'Hungarian': 110,
 'Mediterranean': 136,
 'Team Building Activities': 204,
 'Poke': 159,
 'Venezuelan': 214,
 'Peruvian': 155,
 'Restaurants': 167,
 'Dive Bars': 71,
 'Tea Rooms': 203,
 'Donairs': 73,
 'Custom Cakes': 62,
 'Sports Clubs': 191,
 'Live/Raw Food': 131,
 'Breweries': 28,
 'Irish': 117,
 'Asian Fusion': 10,
 'Thai': 206,
 'Arts & Entertainment': 9,
 'Specialty Food': 188,
 'Pool Halls': 161,
 'Breakfast & Brunch': 27,
 'Czech': 63,
 'Fast Food': 80,
 'Musical Instruments & Teachers': 143,
 'Coffee Roasteries': 54,
 'Cheese Shops': 44,
 'Social Clubs': 182,
 'Patisserie/Cake Shop': 151,
 'Cafes': 35,
 'Afghan': 1,
 'Tiki Bars': 207,
 'Cheesesteaks': 45,
 'Food Delivery Services': 87,
 'Art Galleries': 8,
 'Sushi Bars': 197,
 'Organic Stores': 147,
 'Persian/

#### Split to get rtrain-UI matrix and valid and test.. item_index_matrix_train

In [24]:
rtrain_implicit, rvalid_implicit, rtest_implicit, rtrain_userAvg_implicit, rvalid_userAvg_implicit, rtest_userAvg_implicit, nonzero_index, rtime, item_idx_matrix_train_implicit,item_idx_matrix_valid_implicit, item_idx_matrix_test_implicit = time_ordered_splitModified(rating_matrix=rating_matrix, ratingWuserAvg_matrix=ratingWuserAvg_matrix, timestamp_matrix=timestamp_matrix,
                                                                     ratio=[0.5,0.2,0.3],
                                                                     implicit=True,
                                                                     remove_empty=False, threshold=3,sampling=False, 
                                                                     sampling_ratio=0.1, trainSampling=0.95)

  del sys.path[0]
100%|████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 5076.14it/s]


In [25]:
rtrain, rvalid, rtest, rtrain_userAvg, rvalid_userAvg, rtest_userAvg, nonzero_index, rtime, item_idx_matrix_train,item_idx_matrix_valid, item_idx_matrix_test = time_ordered_splitModified(rating_matrix=rating_matrix, ratingWuserAvg_matrix=ratingWuserAvg_matrix, timestamp_matrix=timestamp_matrix,
                                                                     ratio=[0.5,0.2,0.3],
                                                                     implicit=False,
                                                                     remove_empty=False, threshold=3,
                                                                     sampling=False, sampling_ratio=0.1, 
                                                                     trainSampling=0.95)  

100%|████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:01<00:00, 4034.16it/s]


### Using entire dataset 

In [26]:
rtrain = rtrain + rvalid + rtest

In [27]:
rtrain_implicit = rtrain_implicit + rvalid_implicit + rtest_implicit

### Using TD-IDF to compute corpus and X (business vs. terms) TfIdfVectorizer

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
corpus = preprocess(df)

100%|████████████████████████████████████████████████████████████████████████| 205968/205968 [01:13<00:00, 2793.51it/s]


In [30]:
#Creating a dictionary to store business: review text
dict_text = {}
for i in range(len(corpus)):
    if df['business_num_id'][i] not in dict_text:
        dict_text[df['business_num_id'][i]] = corpus[i]
    else:
        temp = dict_text[df['business_num_id'][i]]
        temp = temp + corpus[i]
        dict_text[df['business_num_id'][i]] = temp

In [31]:
#Create a list for the review text, where the row dimension = total business ids
list_text = []
for key in range(0,max(list(dict_text.keys()))+1) :
    if key not in dict_text.keys():
        list_text.extend([""])
    else:
        list_text.extend([dict_text[key]])

In [32]:
#Get the X vector, where dimension is #business vs #terms like IK
vectorizer = TfidfVectorizer(max_df=0.9,stop_words=stop_words, max_features=5000, ngram_range=(1,1))
X_cleaned = vectorizer.fit_transform(list_text).toarray()
X_cleaned_sparse = csr_matrix(X_cleaned)

In [33]:
#Check keywords
#print(vectorizer.get_feature_names())
#keywordList = X_cleaned_sparse[50].nonzero()[1]
#for word in keywordList:
#    print(vectorizer.get_feature_names()[word])

## Cross Validation Section below

In [34]:
#Passing in the trained similarity matrx
def individualKNNPrediction (similarityMatrix, predictionMatrix, kRange, validOrTestMatrix, itemBased=False):
    "Declaration for kRange = range(50,120,10)"
    #similarity = train(similarityMatrix)
    MAP10 = {}
    #Loop through the kvalues 
    for kValue in kRange:
        if(itemBased==False):
            user_item_p
            rediction_score = predict(predictionMatrix, kValue, similarityMatrix, item_similarity_en= False)
        else:
            user_item_prediction_score = predict(predictionMatrix, kValue, similarityMatrix, item_similarity_en= True)
        user_item_predict = prediction(user_item_prediction_score, 50, predictionMatrix)
        user_item_res = evaluate(user_item_predict, validOrTestMatrix)
        
        
        MAP10[kValue] = user_item_res.get('MAP@10')
        
    return MAP10

def get_UC_Matrix(IC_Matrix,rtrain_implicit):
    U_C_matrix_explicit = rtrain_implicit*IC_Matrix
    U_C_matrix_implicit = getImplicitMatrix(U_C_matrix_explicit,3)
    return U_C_matrix_explicit,U_C_matrix_implicit

def getImplicitMatrix(sparseMatrix, threashold=0):
    temp_matrix = sparse.csr_matrix(sparseMatrix.shape)
    temp_matrix[(sparseMatrix > threashold).nonzero()] = 1
    return temp_matrix

In [35]:
#Passing in the trained similarity matrx
def KNNPrediction (similarityMatrix, predictionMatrix, kValue, validOrTestMatrix, itemBased=False):

    if(itemBased==False):
        user_item_prediction_score = predict(predictionMatrix, kValue, similarityMatrix, item_similarity_en= False)
    else:
        user_item_prediction_score = predict(predictionMatrix, kValue, similarityMatrix, item_similarity_en= True)
    user_item_predict = prediction(user_item_prediction_score, 50, predictionMatrix)
    user_item_res = evaluate(user_item_predict, validOrTestMatrix)

        
    return user_item_res.get('MAP@10')

In [36]:
def saveDictToJson(dictionary, fileName, trainOrTest='train'):
    json_fileName = "{:s}.json".format(fileName)
    if(trainOrTest == 'train'):
        json.dump(dictionary, open("crossValidation\\trainPerformance\\"+json_fileName, 'w') )
    else:
        json.dump(dictionary, open("crossValidation\\testPerformance\\"+json_fileName, 'w') )
    

def loadDict(fileName, trainOrTest='train'):
    json_fileName = "{:s}.json".format(fileName)
    # Read data from file:
    if(trainOrTest == 'train'):
        dataDict = json.load( open("crossValidation\\trainPerformance\\"+json_fileName))
    else:
        dataDict = json.load( open("crossValidation\\testPerformance\\"+json_fileName))
    return dataDict

In [37]:
#Get UC matrices
U_C_matrix_explicit,U_C_matrix_implicit = get_UC_Matrix(I_C_matrix,rtrain_implicit)



In [38]:
#Get User visit binary UI matrix
#userVisitMatrix = getImplicitMatrix(rtrain)

In [39]:
U_C_matrix_implicit.shape

(7000, 224)

## Item based recommend & critique

In [40]:
#intersections
yonge_and_finch = Point("43.779824, -79.415665")
bloor_and_bathurst = Point("43.665194,-79.411208")
queen_and_spadina = Point("43.648772,-79.396259")
bloor_and_yonge = Point("43.670409,-79.386814")
dundas_and_yonge = Point("43.6561,-79.3802")
spadina_and_dundas = Point("43.653004,-79.398082")

#Set intersection for test case:
intersection = spadina_and_dundas

In [41]:
#IK TF-IDF
IK_MATRIX = X_cleaned_sparse
IK_similarity = train(IK_MATRIX)
IC_similarity = train(I_C_matrix)

In [42]:
#Get IP dictionary
#IP_df is the dataframe, IP_dictionary maps business_num_id with price range from 1-4
IP_df, IP_dictionary = get_IP_matrix_dictionary(df, IK_similarity)
IS_dictionary = get_IS_dictionary(df)
ID_dictionary = get_ID_dictionary(df,list(set(df['business_num_id'])),intersection)

100%|████████████████████████████████████████████████████████████████████████████| 4996/4996 [00:03<00:00, 1425.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 4996/4996 [00:06<00:00, 737.82it/s]


In [43]:
I_C_matrix.shape

(4996, 224)

In [44]:
#IK TF-IDF
#testPerformance['ItemKeyphrase'] = KNNPrediction(IK_similarity, rtrain, 110, rtest, itemBased=True)
user_item_prediction_score = predict(rtrain, 110, IK_similarity, item_similarity_en= True)
#user_item_predict = prediction(user_item_prediction_score, 50, rtrain)

100%|█████████████████████████████████████████████████████████████████████████████| 4996/4996 [00:28<00:00, 176.41it/s]


In [45]:
#all_user_item_predict = prediction_modified(user_item_prediction_score, rtrain, 23)
#all_user_item_predict[:50]

## Take user num id 23 as example - Cuisine Type

In [46]:
# #Take number of cuisine type
# NumCuisine = I_C_matrix.shape[1]
# print('Number of cuisin types: ', NumCuisine)

# #Take the index of recommended restuarant
# recommendIndex = 0
# print('Recommending the ', recommendIndex, 'th restaurant')

# # 1 * Item
# sampleIndex = 23
# SampleUserVector = rtrain[sampleIndex]

# print('User initially rated restaurant :\n', SampleUserVector)

# #Get initial restuarant preferences
# initialRestaurants = SampleUserVector.nonzero()[1]
# print('initial user picked restaurants id: ', rtrain[23].nonzero()[1])

# initialLiked = sparse.lil_matrix(SampleUserVector.shape)

# initialLiked[(rtrain[23] >= 4.0).nonzero()] = 1

# print('User initially liked restaurant:\n', initialLiked)

# initialLikedRestaurants = initialLiked.nonzero()[1]
# print('initial user liked restaurants id:', initialLikedRestaurants)

In [47]:
# #Adding the 1st recommended item 
# initialPreferenceAndRecommend = np.append(initialLikedRestaurants, user_item_predict[sampleIndex][recommendIndex])
# print('Initial user restaurant preference vector: ', initialPreferenceAndRecommend)

In [48]:
# #Get the initial picked restaurant cuisine type and category attribute 
# InitialCategoryId_List = []

# for restuarant in initialPreferenceAndRecommend:
#     Restaurant_name = df[df['business_num_id'] == restuarant].name.unique()
#     Restaurant_category = df[df['business_num_id'] == restuarant].categories.unique()
#     Restaurant_cat_id = []
    
#     for category in Restaurant_category:
#         print(category)
#         if ',' in category:
#             Restaurant_cat_id.extend([IC_dictionary[cat] for cat in category.split(', ')])
#         else:
#             Restaurant_cat_id.append(IC_dictionary[category])
        
#     print(Restaurant_name, Restaurant_category, 'category id:', Restaurant_cat_id)
    
#     InitialCategoryId_List.extend(Restaurant_cat_id)

# InitialCategoryId_List = sorted(list(set(InitialCategoryId_List)),reverse=False)
# print(InitialCategoryId_List)

In [49]:
# #initial category preference to numpy 
# columnVector = np.array(InitialCategoryId_List)
# print('initial user prefernce list:', columnVector)

# #Now cosntruct initial user preference binary vector 
# initPrefVector = sparse.csr_matrix((np.array([1] * columnVector.shape[0]), (np.array([0] * columnVector.shape[0]), columnVector)), \
#                            shape=(1, NumCuisine), dtype=np.float32)

# print('Initial User Preference vector\n', initPrefVector)

In [50]:
#df[df['business_num_id'] == 20].categories.unique()[0]

#Print 1st recommended item 
#displayRestaurantInfo(recommendIndex, user_item_predict[sampleIndex])

## Check if item and categories are matched

In [51]:
#print('item category vector:\n', I_C_matrix[4682])

#print('item category in df:\n', set(df[df['business_num_id'] == 4682].categories.values))

#print('Category index in category dictionary:\n', IC_dictionary['Tapas/Small Plates'])

## Critique Upon Cuisine Type

In [52]:
#Set up criquited category 
#critiquiCategory = 'Asian Fusion'

### Scenario 1

In [53]:
#Scenario 1 Criquite Upon Restaurant itself 
#Update recommended restuarant # 
#recommendIndex += 1
#Update initialUserPreference to 

### Scenario 2

In [54]:
#Scenario 2 Criquite the existing cuisine type 
#e.g. I don't want Asian Fusion 
# critiquiedIndex = IC_dictionary[critiquiCategory]
# critiqueList = []
# critiqueList.append(critiquiedIndex)
# print('critiquing index at:', critiquiedIndex)

# #Update initial user preference vector 
# modifiedInitPrefVect = initPrefVector.copy()
# modifiedInitPrefVect[0,critiquiedIndex] = 0 
# modifiedInitPrefVect.eliminate_zeros()
# print('modified user preference \n', modifiedInitPrefVect)

In [55]:
#Recommend again 
#find the items that have this category, list of business_num_id's
# critiquedItemsList = I_C_matrix.getcol(10).nonzero()[0]
# print(critiquedItemsList)


In [56]:
#Find the items in the prediction vector for this user, set prediction score to 0 
# user_item_predict[sampleIndex]

In [57]:
#Filter out critiqued items, sequence must remain the same 
#Modified_UI_predict = [x for x in user_item_predict[sampleIndex] if x not in critiquedItemsList]

#Recommended
#print('new recommend restaurant:', Modified_UI_predict[recommendIndex])

#Display Recommend Restaurant Information
#displayRestaurantInfo(recommendIndex, Modified_UI_predict)

### Scenario 3 Positively Criquite Cuisine - "I want ..."

In [58]:
# #If want Hot Pot

# #User input positive critique cuisine type
# positiveCritiquiCategory = 'Chinese'

# #Retrieve cuisine type index 
# positiveCritiquiedIndex = IC_dictionary[positiveCritiquiCategory]

# #Modify the user preference vector 

# #Retrieve item matching item category 
# matchedItemList = I_C_matrix.getcol(positiveCritiquiedIndex).nonzero()[0]

In [59]:
#Need to look from the entire entire prediction set 
#[x for x in Modified_UI_predict if x in I_C_matrix.getcol(positiveCritiquiedIndex).nonzero()[0]]

In [60]:
#Scenario I want 

In [61]:
#Check businesses with categories
#IC_dictionary
#print(I_C_matrix.getcol(108))
#df[df['business_num_id'] == 250].categories

In [62]:
#Initialize
testUser_index = int(input("Enter test user index: "))
#Get user item initial prediction vector 
test_user_item_predict = prediction_modified(user_item_prediction_score, rtrain, testUser_index)

#Compute user intitial preference vector 
#TODO

categoryList = list(IC_dictionary.keys())
categoryList[:15]

maxDistance = max(ID_dictionary, key=ID_dictionary.get)

Enter test user index: 23


100%|████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:04<00:00, 1565.88it/s]


In [63]:
def checkIfExhaustedList (current_user_item_predict, original_user_item_predict, critique_Res_list):
    if len(current_user_item_predict) == 0:
        print('Exhausted the recommendation list, fall back to initial preference')
        current_user_item_predict = [item for item in original_user_item_predict if item not in critique_Res_list]
        
    return current_user_item_predict

In [64]:
def updateList (listTobeUpdated, listUsedToUpdate):
    listTobeUpdated.append(listUsedToUpdate)
    listUpdate = list(set(listTobeUpdated))
    
    return listUpdate

In [66]:
current_user_item_predict

[4139,
 2764,
 883,
 1908,
 2202,
 2261,
 4201,
 800,
 376,
 3298,
 3282,
 2766,
 2970,
 3277,
 3922,
 3442,
 3721,
 1681,
 4664,
 1176,
 304,
 4581,
 1889,
 1252,
 4638,
 1020,
 539,
 2254,
 171,
 3001,
 2569,
 4229,
 1550,
 4552,
 4101,
 1309,
 3168,
 32,
 1999,
 3387,
 2020,
 107,
 1543,
 3621,
 2216,
 2661,
 637,
 3388,
 2278,
 4870,
 4767,
 1945,
 2549,
 3129,
 3059,
 1919,
 4434,
 43,
 1559,
 3667,
 1169,
 1533,
 1805,
 4837,
 4656,
 1361,
 3108,
 4927,
 35,
 1027,
 2568,
 2612,
 2307,
 4645,
 4548,
 151,
 4460,
 4108,
 3375,
 435,
 2226,
 4891,
 4076,
 2863,
 464,
 1290,
 317,
 22,
 712,
 2128,
 506,
 1542,
 4188,
 290,
 551,
 1994,
 266,
 2050,
 948,
 4569,
 4075,
 4053,
 4324,
 1658,
 857,
 2657,
 2650,
 727,
 1325,
 387,
 395,
 997,
 3316,
 2088,
 523,
 4333,
 609,
 4480,
 616,
 2172,
 2234,
 970,
 882,
 2306,
 1966,
 3710,
 900,
 241,
 4089,
 2499,
 2294,
 4503,
 4805,
 3756,
 264,
 2848,
 4682,
 4178,
 15,
 2367,
 2579,
 4745,
 861,
 3703,
 654,
 2368,
 4161,
 179,
 3402,
 

In [75]:
#Initialize variable 

#Need to initialize to have only restaurants within 2.0km recommended, restaurants distance above 2.0 will be critiqued
critique_Distance = 2.0

#list of restaurant don't wanted, initialized as restaurants distance >2.0km
critique_Res_list = [key for (key, value) in ID_dictionary.items() if value > critique_Distance] 
critique_Price_list = []
ciritiqued_Rating_list = []
#list of categories don't wanted - Accumulated 
critique_Cat_list = []

#Used to fall back, when user inputs are conflicted 
critiqued_Res_AccrdName_list= []
critiqued_Res_AccrdCuisine_list = []
critiqued_Res_AccrdPrice_list = []
critiqued_Res_AccrdStar_list = []
critiqued_Res_AccrdDistance_list = [key for (key, value) in ID_dictionary.items() if value > critique_Distance] 

#Categories that explicitly wanted, ONLY 1 FOR NOW
wanted_Category_index = None
#wanted_Price_list = []
#Recommendation info
recommendIndex = 0  
outPutString = ''

#Initial user item vector for critiquing process 
current_user_item_predict = [item for item in test_user_item_predict if item not in critique_Res_list]

print('Initial Recommendation')
print('Enter Stop any time to exist loop')
#displayRestaurantInfo(recommendIndex, test_user_item_predict)

while True:
    satisfied = 'None'
    feature = 'None'
    positiveOrNegative = 'None'
    critiqueValue = 'None'
    
    print('Recommending...')
    displayRestaurantInfo(recommendIndex, current_user_item_predict,df,ID_dictionary)
    
    #Current recommendation cuisine type 
    currentCuisineType = df[df['business_num_id'] == current_user_item_predict[recommendIndex]]\
                                                        .categories.unique()[0].split(', ')
    currentPriceLabel = IP_dictionary[current_user_item_predict[recommendIndex]]
    currentRating = IS_dictionary[current_user_item_predict[recommendIndex]]
    currentDistance = ID_dictionary[current_user_item_predict[recommendIndex]]
    
    #First testing cuisine type
    print('\n????????????????')
    
    while satisfied.lower() not in ['yes', 'no', 'stop']: 
        satisfied = input("You Like? ('yes', 'no') ").strip().lower()
    
    if satisfied == 'stop' or satisfied == 'yes':
        print('BYE :)')
        break
        
    #When satisfied is NO, take in feature
    while feature.lower() not in ['name', 'cuisine', 'price', 'distance', 'rating', 'stop']: 
        feature = input("What feature to critique: (name, cuisine, price, rating, distance)")
    
    if feature == 'stop':
        break
    
    #Take in Positive or nagative 
    if feature != 'name':
        while positiveOrNegative.lower() not in ['positive', 'negative', 'stop']:
            positiveOrNegative = input("Positive or negative: ")
    
    if positiveOrNegative == 'stop':
        break
    
    #Only ask for critique value when not critiquing restaurant name, or not negatively critiquing price 
    if feature != 'name' and not(feature == 'price' and positiveOrNegative == 'negative')\
    and not(feature == 'rating' and positiveOrNegative == 'negative')\
    and not(feature == 'distance' and positiveOrNegative == 'negative'): 
        #The valid values to be critiuqed that can pass in 
        validCritiqueValueList = []
        #negatively critique current cuisine type, should only enter current cruisine type
        if 'cuisine' in feature and 'negative' in positiveOrNegative:
            validCritiqueValueList = [cat.strip().lower() for cat in currentCuisineType] + ['stop']
            outPutString = '(' + currentCuisineType[0] +')'
        elif 'cuisine' in feature and 'positive' in positiveOrNegative:
            validCritiqueValueList = [cat.strip().lower() for cat in categoryList] 
            categories = list(set(IC_dictionary.keys())) + ['stop']
            outPutString = '(' + categories[0] +categories[1] +categories[2] +'...)'
        #Can only enter cheapter or more expensive
        elif 'price' in feature:
            validCritiqueValueList = ['cheaper', 'more expensive','stop']
            outPutString = '(cheaper, more expensive)'
        elif 'rating' in feature:
            validCritiqueValueList = [star/10 for star in range(0,51,1)] + ['stop']
            outPutString = '(0 ~ 5.0 with 0.1 increment)'
        #positively critique distance
        elif 'distance' in feature:
            validCritiqueDis = [str(i/10) for i in range(0,int((maxDistance+0.5)*10),5)][1:] 
            validCritiqueValueList = ['closer', 'further'] + validCritiqueDis + ['stop']
            outPutString = '(closer, further, or distance in range 0.5 ~' + str(maxDistance) + 'in every 0.5km)' 
        #Prompt to ask critique value
        while critiqueValue not in validCritiqueValueList:
            critiqueValue = input("Critique value: " + outPutString).strip().lower()
        
        if critiqueValue == 'stop':
            break
            
    print('????????????????\n')
    
    #Need to check if the critiqued value is listed for the recommended item 
    
    #If user starts critiquing, the current showend restaurant will be in critiqued list 
    critique_Res_list.append(current_user_item_predict[recommendIndex])
    critique_Res_list = list(set(critique_Res_list))
    
    
    #Scenario 1 - Critique Restaurant name 
    if 'name' in feature:
        
        print("Saving critiqued item: ", current_user_item_predict[recommendIndex])
        
        #Save critiqued restaurant to list 
        #critique_Res_list.append(current_user_item_predict[recommendIndex])
        #critique_Res_list = list(set(critique_Res_list))
        #critique_Res_list = updateList(critique_Res_list, current_user_item_predict[recommendIndex])
        
        #update critiqued restaurant list according to name
        #critiqued_Res_AccrdName_list.append(current_user_item_predict[recommendIndex])
        critiqued_Res_AccrdName_list = updateList(critiqued_Res_AccrdName_list, current_user_item_predict[recommendIndex])
        
        current_user_item_predict = current_user_item_predict[recommendIndex+1 :]
        #Handling all items critiqued case
        """BUT I'M NOT COUNTING THE FACTOR THAT THE USER HAD LIKED CATEGORIES"""
        current_user_item_predict = checkIfExhaustedList(current_user_item_predict, test_user_item_predict, critique_Res_list)
            
    
    #Scenario 2 - Negatively critique restaurant features 
    if ('cuisine' in feature.lower()) and ('negative' in positiveOrNegative.lower()):
        
        #Find the correct category name the user want to critique in current recommended item categories
        critiqueValue = [cuisine for cuisine in currentCuisineType if critiqueValue.strip().lower() in cuisine.lower()][0]
        
        critiquied_Cat_Index = IC_dictionary[critiqueValue]
        print('Saving negatively critiqued cuisine type:', critiqueValue, ', cuisine index: ', critiquied_Cat_Index)
        critique_Cat_list.append(critiquied_Cat_Index)
        
        #Handling extreme case - critique categories previously requested 
        if critiquied_Cat_Index == wanted_Category_index:
            #Reset wanted category index
            wanted_Category_index = None
            print('You are critiquing a cuisine type you previously requested\n Fall back to initial preference')
            current_user_item_predict = [item for item in test_user_item_predict if item not in critique_Res_list]
        
        
        #Find list of restaurants to filter out 
        critiquedItemsList = I_C_matrix.getcol(critiquied_Cat_Index).nonzero()[0]
        
        critiqued_Res_AccrdCuisine_list.append(critiquedItemsList)
        critiqued_Res_AccrdCuisine_list =list(set(critique_Res_list))
        
        #Updating the critiqued items to list
        print('Saving critiqued items at index: ', critiquedItemsList[:5], '...')
        critique_Res_list.extend(list(critiquedItemsList))
        critique_Res_list = list(set(critique_Res_list))
        
        #Filter out critiqued items, sequence must remain the same 
        #Update current valid set 
        current_user_item_predict = [item for item in current_user_item_predict if item not in critique_Res_list]

        #Handle case where run out of items! - Fall Back!
        current_user_item_predict = checkIfExhaustedList(current_user_item_predict, test_user_item_predict, critique_Res_list)
        
        
    
    #Scenario 3 - Positively critique restaurant cuisine type 
    if 'cuisine' in feature and 'positive' in positiveOrNegative:

        #Find the correct category within all the categories list - assuming exact word typed in 
        """Add ERROR HANDLING HERE"""
        positiveCritiquiCategory = [cuisine for cuisine in categoryList if critiqueValue.strip().lower() in cuisine.lower()][0]
        
        #Retrieve cuisine type index 
        positiveCritiquiedIndex = IC_dictionary[positiveCritiquiCategory]
        
        #Get the preferred category index 
        wanted_Category_index = positiveCritiquiedIndex
        
        #Check if user have previously critiqued - remove it
        if positiveCritiquiedIndex in critique_Cat_list:
            critique_Cat_list.remove(positiveCritiquiedIndex)
            
            #UPDATE CRITIQUE RESTAURANT LIST 
            """TODO"""
        
        #Retrieve items matching item category 
        matchedResList = I_C_matrix.getcol(positiveCritiquiedIndex).nonzero()[0]
        print('matching list:', matchedResList[:10])
        
        #Update current valid set, make sure not in critiqued restaurant set 
        current_user_item_predict = [item for item in matchedResList if item not in critique_Res_list]
        
        
    #Scenario 4 - Negative critique restuarant price e.g. "I don't want expensive restaurants" OR positive "I want cheaper"
    #Does not pass in anything ASSUMING ONLY GOING DOWN
    #Scenario 5 - Positively critique restaurant price, "I want fine dining", wanting more expensive restaurant 
    if 'price' in feature:
        
        #Update critiqued price list
        if 'positive' in positiveOrNegative and critiqueValue == 'more expensive':
            critique_Price_list.extend([price for price in range(1,currentPriceLabel+1,1)])
        #Negative or positive, cheaper     
        else:     
            critique_Price_list.extend([price for price in range(currentPriceLabel,5,1)])
            
        #Deduplicate
        critique_Price_list = list(set(critique_Price_list))
        
        #Check if critiqued all price range, if exhausted, fall back 
        if len(critique_Price_list) == 4:
            print('You have exhaused the price range option, showing the most', critiqueValue, 'option')
            #Clear the previous critiqued restuarants based on price out of critiqued restuarant list 
            #POTENTIAL ISSUE HERE, MAY BE ERASING SOME RESTAURANTS CRITIQUED IN OTHER STEPS ... NEED TO RECHECK
            critique_Res_list = [item for item in test_user_item_predict if\
                                item in list(set(critiqued_Res_AccrdName_list+ critiqued_Res_AccrdCuisine_list+\
                                                 critiqued_Res_AccrdStar_list + critiqued_Res_AccrdDistance_list))]
            
            #Clear out the restuarants critiquied by price 
            critiqued_Res_AccrdPrice_list = []
            
            #Restore default critiquing price list 
            if critiqueValue == 'more expensive':
                critique_Price_list = [1,2,3]
            else:
                critique_Price_list = [2,3,4]  
        
        print('Critiqing price at range:', ['$'*label for label in critique_Price_list])
        
        #Find the list of restaurants to critique 
        listCritiqueRestaurant = [key  for (key, value) in IP_dictionary.items() if value in critique_Price_list]
        
        #Record the list of restaurants critiqued so far based on price 
        critiqued_Res_AccrdPrice_list.extend(listCritiqueRestaurant)
        critiqued_Res_AccrdPrice_list = list(set(critiqued_Res_AccrdPrice_list))
        
        #Update the critiqued restaurant list
        critique_Res_list.extend(listCritiqueRestaurant)
        critique_Res_list = list(set(critique_Res_list))
        
        #Updating the critiqued items to list
        print('Saving critiqued items at index: ', listCritiqueRestaurant[:5], '...')
        
        #Filter out critiqued items, sequence must remain the same 
        #Update current valid set 
        current_user_item_predict = [item for item in current_user_item_predict if item not in critique_Res_list]
        
        #Check if exhaused list
        current_user_item_predict = checkIfExhaustedList(current_user_item_predict, test_user_item_predict, critique_Res_list)
        
        #Recommended
        print('Re-recommending...')
        
    #Scenario 6 negatively critique rating & positively critique rating: both goes up 
    #"I don't want ratings this low?" "I don't want restuarants with rating below XXX" "I want restaurants with rating above XXX"
    #I will critique the ratings below the current rating or specific rating 
    if 'rating' in feature:
        #Critiquing the restaurants that has ratings equal and below this restaurant 
        if 'negative' in positiveOrNegative:
            ciritiqued_Rating_list.extend([rating/10 for rating in range(0,int(currentRating*10+1),5)])
        else:
            ciritiqued_Rating_list.extend([rating/10 for rating in range(0,int(critiqueValue*10+1),5)])
        
        ciritiqued_Rating_list = list(set(ciritiqued_Rating_list))
            
        #If critiqued all price
        if ciritiqued_Rating_list == [price/10 for price in range(0,int(5.0 *10 +1),5)]:
            print('exhausted list, no better restaurants, recommending he finest restaurants')
            #reset critiqued restaurant list 
            critique_Res_list = [item for item in test_user_item_predict if\
                                item in list(set(critiqued_Res_AccrdName_list+critiqued_Res_AccrdCuisine_list+\
                                                critiqued_Res_AccrdPrice_list+critiqued_Res_AccrdDistance_list))] 

            #Clear out the restuarants critiquied by rating 
            critiqued_Res_AccrdStar_list = []

            ciritiqued_Rating_list = [rating/10 for rating in range(0,int(currentRating*10+1),5)]

        print('critiquing restaurants at rating rangeing at:',ciritiqued_Rating_list)

        #Get the list of restaurants to critique
        listCritiqueRes = [key  for (key, value) in IS_dictionary.items() if value in ciritiqued_Rating_list]

        #Updating the critiqued items to list
        print('Saving critiqued items at index: ', listCritiqueRes[:5], '...')

        #Update the critiqued restaurant list
        critique_Res_list.extend(listCritiqueRes)
        critique_Res_list = list(set(critique_Res_list))

        #Record those as well
        critiqued_Res_AccrdStar_list.extend(listCritiqueRes)
        critiqued_Res_AccrdStar_list = list(set(critiqued_Res_AccrdStar_list))

        #Update current valid set 
        current_user_item_predict = [item for item in current_user_item_predict if item not in critique_Res_list]
        
        
    #Scenario #7 Critique on Distance, goes up or down, only say positive critiques
    #Want closer, further, or specific value 
    if 'distance' in feature:
        #If want further distance restaurants
        if 'positive' in positiveOrNegative and 'further' in critiqueValue:
            print('Critiquing distance <=', currentDistance)

            #New requirements for restaurants to be critiqued
            listCritiquedRestaurant = [key for (key, value) in ID_dictionary.items() if value <= currentDistance]
        
        #Closer distance
        else:
            if (critiqueValue != 'None') and not 'closer' in critiqueValue:
                currentDistance = int(critiqueValue)
                
            print('Critiquing distance >=', currentDistance)
            
            #New requirements for restaurants to be critiqued
            listCritiquedRestaurant = [key for (key, value) in ID_dictionary.items() if value >= currentDistance]
        

        #Update critique restaurant list, only keep the ones that are critiqued under other features
        critique_Res_list = [item for item in test_user_item_predict if\
                            (item in list(set(listCritiquedRestaurant+critiqued_Res_AccrdPrice_list+\
                                              critiqued_Res_AccrdStar_list+critiqued_Res_AccrdName_list+\
                                                critiqued_Res_AccrdCuisine_list)))] 
        
        #Update list of restaurants critiqued by the restuarnt distance, entire replacement
        critiqued_Res_AccrdDistance_list= listCritiquedRestaurant
        
        #Update initial valid restuarnt list
        current_user_item_predict = [item for item in  current_user_item_predict if item not in critique_Res_list]
        
        #Handle case where critiqued everything, there's no restuarnt that satisfy all the critiques 
        if(len(current_user_item_predict) == 0):
            print('Exhausted sytem')
            
            new_Distance= input('There\'s no restaurant that matches your preference within this area, please input a new larger distance')

            listCritiquedRestaurant = [key for (key, value) in ID_dictionary.items() if value > new_Distance]
            
            #Restore original critiqued restaurants under distance feature 
            critique_Res_list = [item for item in test_user_item_predict if\
                                (item in list(set(listCritiquedRestaurant+critiqued_Res_AccrdPrice_list\
                                                  +critiqued_Res_AccrdStar_list+critiqued_Res_AccrdName_list+\
                                                    critiqued_Res_AccrdCuisine_list)))] 
            
            #New critiquing restuarants
            critiqued_Res_AccrdDistance_list= listCritiquedRestaurant
        
            #Update initial valid restuarnt list
            current_user_item_predict = [item for item in  current_user_item_predict if item not in critique_Res_list]
        
    

Initial Recommendation
Enter Stop any time to exist loop
Recommending...
------------------------------------------------------
Business_num_id:  4501
Restaurant name: Lai Wah Heen 
Cuisine Type:  Dim Sum 
Price: $$$ 
Rating: 3.5 
Distance: 1.0 km
------------------------------------------------------

????????????????
You Like? ('yes', 'no') no
What feature to critique: (name, cuisine, price, rating, distance)cuisine
Positive or negative: positive
Critique value: (Chicken WingsTabletop GamesSyrian...)chinese
????????????????

matching list: [  4   9  21  36 111 114 125 132 151 153]
Recommending...
------------------------------------------------------
Business_num_id:  111
Restaurant name: Wah Too 
Cuisine Type:  Seafood, Chinese 
Price: $$ 
Rating: 3.5 
Distance: 0.9 km
------------------------------------------------------

????????????????
You Like? ('yes', 'no') no
What feature to critique: (name, cuisine, price, rating, distance)distance
Positive or negative: positive
Critique va

In [74]:
positiveCritiquiCategory

'Chinese'

In [None]:
merge = listCritiquedRestaurant+critiqued_Res_AccrdPrice_list+ critiqued_Res_AccrdStar_list+critiqued_Res_AccrdName_list + critiqued_Res_AccrdCuisine_list
a = 4139
4139 not in critiqued_Res_AccrdDistance_list or 4139 in list(set(merge))

### Popularity 

In [None]:
# number of reviews popularity list, redundent with the output of the next method
dff_popular = df.copy()
dff_popular = dff_popular.sort_values(by=["review_count_y"], ascending=False).drop_duplicates(subset = 'business_id', keep = 'first')
#Get the list of restaurants accoridng to their popularity level
popular_list = dff_popular["business_num_id"].tolist()[:50]

In [None]:
rtrainValide = rtrain + rvalid
numUsers = rtrainValide.shape[0]
# transfer to a matrix(list * number of users)
matrix_popular_list_num_of_reviews = np.tile(popular_list,(numUsers,1))
matrix_popular_list_num_of_reviews

In [None]:
popularity_res = evaluate(matrix_popular_list_num_of_reviews, rtest)
popularity_res['MAP@10']

## Test set

In [None]:
rtrain = rtrain + rvalid 
rtrain_implicit = rtrain_implicit + rvalid_implicit 

In [None]:
rtrain_implicit_similarity_trainValid = train(rtrain_implicit)

In [None]:
IK_MATRIX_trainValid = X_cleaned_sparse_trainValid
IK_similarity_trainValid = train(IK_MATRIX_trainValid)

In [None]:
testPerformance = {}

In [None]:
testPerformance['Implicit_UserReview'] = KNNPrediction(rtrain_implicit_similarity_trainValid, rtrain_implicit, 140, rtest_implicit, itemBased=False)

In [None]:
testPerformance['Implicit_Explicit_Combined_UserReview'] = KNNPrediction(rtrain_implicit_similarity_trainValid, rtrain, 100, rtest, itemBased=False)

In [None]:
testPerformance['ItemKeyphrase'] = KNNPrediction(IK_similarity_trainValid, rtrain, 110, rtest, itemBased=True)

In [None]:
testPerformance['ItemCategory'] = KNNPrediction(IC_similarity, rtrain, 130, rtest, itemBased=True)

In [None]:
testPerformance['Popularity_reviewNumber'] = popularity_res['MAP@10']

In [None]:
testPerformance

In [None]:
testPerformance.items()


### Plot

In [None]:
import matplotlib.pyplot as plt
listPrediction = testPerformance.items()

x1, y1 = zip(*listPrediction) # unpack a list of pairs into two tuples

x = []
y = []
err = []
for item in x1:
    x.append(item)
    
for item in y1:
    y.append(item[0])
    err.append(item[1])
    
plt.figure(figsize=(13,12))

plt.scatter(x,y)

plt.errorbar(x,y,yerr=err, linestyle="None", fmt='o')

for x2, y2 in zip(x, y): 
    plt.text(x2, y2, str(y2))
      
plt.title('Test Performance of algorithms')
plt.legend(bbox_to_anchor=(1, 1),
           bbox_transform=plt.gcf().transFigure)
plt.xlabel('Algorithms')
plt.ylabel('MAP@10')
plt.show()