Let's begin by importing libraries and making sure we only deal with valid data.

In [31]:
import os
import re
import string

import numpy as np 
import pandas as pd 

from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer

In [32]:
df_yelp_business = pd.read_json('../input/yelp_academic_dataset_business.json', lines=True)
df_yelp_business.fillna('NA', inplace=True)

df_yelp_business = df_yelp_business[df_yelp_business['categories'].str.contains('Restaurants')]
print('Final Shape: ',df_yelp_business.shape)

Final Shape:  (59371, 14)


Now we bring the reviews and perform some preprocessing on those reviews..

In [33]:
df_yelp_review_iter = pd.read_json('../input/yelp_academic_dataset_review.json', chunksize=100000, lines=True)
df_yelp_review_iter

<pandas.io.json.json.JsonReader at 0x7fbecf102b00>

Because reviews are too big, we will read them in chunks, and make sure we delete reviews of places that are not in our list of businesses filtered earlier. Note here we choose 5 chunks, but we could have chosen any number (larger numbers will give MemoryError later on).

In [34]:
df_yelp_review = pd.DataFrame()
i=0
for df in df_yelp_review_iter:
    df = df[df['business_id'].isin(df_yelp_business['business_id'])]
    df_yelp_review = pd.concat([df_yelp_review, df])
    i=i+1
    print(i)
    if i==4: break

1
2
3
4


In [35]:
df_yelp_business = df_yelp_business[df_yelp_business['business_id'].isin(df_yelp_review['business_id'])]

In [36]:
print('Final businesses shape: ', df_yelp_business.shape)
print('Final review shape: ', df_yelp_review.shape)

Final businesses shape:  (5747, 14)
Final review shape:  (254171, 9)


In [37]:
def clean_text(text):
    ## Remove puncuation
    text = text.translate(string.punctuation)
    
    ## Convert words to lower case and split them
    text = text.lower().split()
    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops and len(w) >= 3]
    
    text = " ".join(text)
    
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)    
    return text

The next step will apply those transformations. Note that it will take a couple of minutes to finish.

In [38]:
%%time
df_yelp_review['text'] = df_yelp_review['text'].apply(clean_text)

CPU times: user 1min 21s, sys: 3.62 s, total: 1min 25s
Wall time: 1min 25s


Now we want to vectorize both reviews and categories. Note that min_df and max_df arguments in both.

In [39]:
vectorizer_reviews = CountVectorizer(min_df = .01,max_df = .99, tokenizer = WordPunctTokenizer().tokenize)
vectorized_reviews = vectorizer_reviews.fit_transform(df_yelp_review['text'])

In [40]:
print(vectorized_reviews.shape)

(254171, 924)


 top 100 vocabularies:

In [41]:
' | '.join(vectorizer_reviews.get_feature_names()[:100]) # only the first 100

'! | + | - | 00 | 1 | 10 | 12 | 15 | 2 | 20 | 3 | 30 | 4 | 5 | 50 | 6 | 7 | 8 | 99 | : | ; | a | able | about | absolutely | across | actually | add | added | after | afternoon | again | ago | all | almost | along | already | also | although | always | am | amazing | ambiance | american | amount | and | another | anyone | anything | anyway | anywhere | appetizer | appetizers | are | area | around | arrived | as | asian | ask | asked | asking | at | ate | atmosphere | attention | attentive | attitude | authentic | available | average | avocado | away | awesome | awful | back | bacon | bad | baked | bar | barely | bartender | based | basically | bbq | be | bean | beans | beautiful | beef | beer | beers | before | behind | believe | best | better | beyond | big | bill'

In [42]:
vectorizer_categories = CountVectorizer(min_df = 1, max_df = 1., tokenizer = lambda x: x.split(', '))
vectorized_categories = vectorizer_categories.fit_transform(df_yelp_business['categories'])

In [43]:
print(vectorized_categories.shape)

(5747, 405)


We also show 100 categories..

In [44]:
' | '.join(vectorizer_categories.get_feature_names()[:100]) # only the first 100

'acai bowls | accessories | active life | adult education | adult entertainment | afghan | african | air duct cleaning | airport shuttles | amateur sports teams | american (new) | american (traditional) | amusement parks | antiques | apartments | appliances | arabian | arcades | argentine | armenian | art classes | art galleries | art schools | arts & crafts | arts & entertainment | asian fusion | australian | austrian | automotive | bagels | bakeries | bangladeshi | bar crawl | barbeque | barbers | bars | bartenders | basque | beach bars | beaches | beauty & spas | bed & breakfast | beer | beer bar | beer gardens | belgian | bingo halls | bistros | boat charters | body shops | books | bookstores | bowling | brasseries | brazilian | breakfast & brunch | breweries | brewpubs | british | bubble tea | buffets | burgers | burmese | bus tours | butcher | cafes | cafeteria | cajun/creole | cambodian | canadian (new) | candy stores | cannabis dispensaries | cantonese | car dealers | car renta

In [45]:
%%time
from scipy import sparse
businessxreview = sparse.csr_matrix(pd.get_dummies(df_yelp_review['business_id']).values)

CPU times: user 11 s, sys: 744 ms, total: 11.8 s
Wall time: 11.8 s


Let's print out the shapes of the matrices we have prepared and make sure they make sense (by matching their dimensions):

In [46]:
print('restuarants x categories: \t', vectorized_categories.shape) 
print('restuarants x reviews: \t\t' , businessxreview.shape) 
print('reviews x words: \t\t', vectorized_reviews.shape)

restuarants x categories: 	 (5747, 405)
restuarants x reviews: 		 (254171, 5747)
reviews x words: 		 (254171, 924)


Now we are ready to choose a seed restaurant and find  other restaurants that might be as good as the seed restaurant. We make sure to choose a restaurant with good number of reviews and ratings.

In [47]:
# to choose a restaurant, just copy the business id and paste it in the next cell
# you can always rerun the cell to choose another restuarant. 
df_yelp_business.sample(10)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
15453,"3560 Rutherford Rd, Unit 60","{'RestaurantsGoodForGroups': 'True', 'GoodForK...",H9WgoLOMS5emyetYD6YVUA,"Pizza, Restaurants, Italian",Vaughan,"{'Tuesday': '12:0-20:0', 'Wednesday': '12:0-20...",1,43.828219,-79.549951,Taste of Naples Pizzeria,L4H 2J3,32,5.0,ON
13008,6008 MacLeod Trail SW,"{'RestaurantsPriceRange2': '2', 'RestaurantsDe...",WlDC_QWKuEVrSMQh5CuGSA,"Vietnamese, Restaurants",Calgary,"{'Monday': '10:30-21:0', 'Tuesday': '10:30-21:...",1,50.999562,-114.070702,Golden Bell Saigon & Vietnamese Restaurant,T2H 0K1,35,3.5,AB
4296,1090 Don Mills Rd,"{'BusinessParking': '{'garage': False, 'street...",7UPTUpex3O1Gav3td7GOEw,"Restaurants, Burgers",Toronto,,0,43.736442,-79.344201,South St Burger Co,M3C 3R6,6,3.0,ON
9128,820 S Rampart Blvd,"{'OutdoorSeating': 'True', 'RestaurantsReserva...",Ke0nQzmHSGTdSqmagaNr4Q,"Restaurants, American (New)",Las Vegas,,0,36.162198,-115.288825,Panda Express,89145,3,2.5,NV
17475,2813 N Sherman Ave,"{'GoodForKids': 'True', 'RestaurantsTakeOut': ...",5sEcWQd9-LqRvE98ouEILA,"Sandwiches, Fast Food, Restaurants",Madison,"{'Monday': '7:0-22:0', 'Tuesday': '7:0-22:0', ...",1,43.127623,-89.363022,Subway,53704,4,3.0,WI
15879,3124 Milton Rd,"{'GoodForKids': 'True', 'Alcohol': 'u'none'', ...",0gfGLDe_gSr9CYUBO57i0g,"Restaurants, Seafood",Charlotte,,1,35.234822,-80.735712,Mayflower Seafood Restaurant,28215,3,3.5,NC
10218,2365 E Windmill Ln,"{'HasTV': 'False', 'Alcohol': ''none'', 'Resta...",taaYWCkUulZL56qZmS1VzQ,"Fast Food, Restaurants",Las Vegas,,1,36.041935,-115.120227,Jack In The Box,89123,24,3.0,NV
10902,114 Avenue Laurier Ouest,"{'BusinessParking': '{'garage': False, 'street...",5eHHAGIrDDyvFZfQn1kX4Q,"French, Restaurants",Montréal,,0,45.521543,-73.59404,Raza,H2T 2N7,5,5.0,QC
13241,35 Peel Centre Drive,"{'GoodForKids': 'True', 'BikeParking': 'True',...",6SxrnI0BKc7bVj7Vcuu7Vw,"Restaurants, Food, Sandwiches, Donuts, Coffee ...",Brampton,,1,43.713822,-79.72112,Tim Horton's,L6T 5T9,5,2.5,ON
9710,4338 Macleod Trail SW,"{'RestaurantsReservations': 'False', 'Restaura...",W41b6wGEZ5YBJxyqdN1RpA,"Middle Eastern, Hookah Bars, Restaurants, Nigh...",Calgary,"{'Monday': '17:0-0:0', 'Tuesday': '17:0-0:0', ...",0,51.014576,-114.064933,Babylon Restaurant,T2G 0A4,10,3.0,AB


In [48]:
business_choose = 'aUrOyWFKxKeVXiFzwbTXSA' # vegan, vegetarian, cafes

First, we pull up the reivews and then show some of them

In [49]:
new_reviews = df_yelp_review.loc[df_yelp_review['business_id'] == business_choose, 'text']

In [50]:
print('\n'.join([r[:100] for r in new_reviews.tolist()])) # restaurant reviews




*Then we pull up the categories:

In [51]:
new_categories = df_yelp_business.loc[df_yelp_business['business_id'] == business_choose, 'categories']

In [52]:
new_categories.tolist() #  restaurant categories

[]

Here, we compute two sets of distancecs: we compute the correlation distance of the average vectorized reviews to all the reviews, and compute the correlation distance between this category and all other categories. The category trick will be clearer when we see the results.

In [53]:
from scipy.spatial.distance import cdist
# find most similar reviews
dists1 = cdist(vectorizer_reviews.transform(new_reviews).todense().mean(axis=0), 
              vectorized_reviews.T.dot(businessxreview).T.todense(), 
               metric='correlation')
# find most similar categories
dists2 = cdist(vectorizer_categories.transform(new_categories).todense().mean(axis=0), 
              vectorized_categories.todense(), 
               metric='correlation')

  return N.ndarray.mean(self, axis, dtype, out, keepdims=True)._collapse(axis)
  ret, rcount, out=ret, casting='unsafe', subok=False)


*Now we combine the two sets of distances and take the average of those (we can take other metrics such as min or max, depending on your priority).

In [54]:
# combine the two vectors in one matrix
dists_together = np.vstack([dists1.ravel(), dists2.ravel()]).T

In [56]:
# this is a key cell: how are we going to prioritize ?
dists = dists_together.mean(axis=1)

Let's select the closest 10 restaurants to the seed restaurant.

In [59]:
# select the closest 10
closest = dists.argsort().ravel()[:10]

Here is our seed restaurant:

In [60]:
df_yelp_business.loc[df_yelp_business['business_id']== business_choose, ['business_id', 'categories', 'name', 'stars']]

Unnamed: 0,business_id,categories,name,stars


Now let's see what the top matches:

In [61]:
df_yelp_business.loc[df_yelp_business['business_id'].isin(df_yelp_business['business_id'].iloc[closest]), ['business_id', 'categories', 'name', 'stars']]

Unnamed: 0,business_id,categories,name,stars
1,QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Emerald Chinese Restaurant,2.5
12921,viHZE3j_XZ9Mp4DwmvscVw,"Ice Cream & Frozen Yogurt, Food, Restaurants, ...",Pinkberry,4.0
12930,DQbbE26lEFq4rEFSaEEhRg,"American (Traditional), Restaurants, Comfort Food",Swiss Chalet Rotisserie & Grill,2.0
12931,2ZEnhnWEascgZDMdf6kfAg,"Chinese, Restaurants",Chez Chen,4.0
12937,nICEDIVnk6OJZMaDr1Dz3A,"Mexican, Restaurants",Los Alegres Compadres Family Mexican Restaurant,3.5
12939,W7hCuNdn2gzehta6eSHzgQ,"Restaurants, Fish & Chips, Seafood",Pete's Fish & Chips,2.0
12947,YyyqcbVyoBo4wPsgowUtUQ,"Japanese, Restaurants",SASA,4.0
12953,KN0gPRzDvA6uVYims2KA0w,"Restaurants, Canadian (New), Greek",Rosemary & Thyme Family Restaurant,4.0
12975,PtBYoyHs9P3SNG-8G8-EGw,"Italian, Canadian (New), Barbeque, Seafood, Re...",Le Pirate de Laval,3.5
12978,DI8SC8MF0wh3ba45aN9dTw,"Restaurants, American (Traditional)",Dunkin' Donuts,2.5


Although many of those seem to come from the same category (Vegetarian and Vegan), there is a considerable variation in those categories (Ethiopian, African, Asian Fusion, .. etc). Most importantly, they all seem highly rated.

Would you go and try those out and be little surprised ? It is really up to you.