In [1]:
import numpy as np
import pandas as pd

from operator import add
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Word2Vec, Word2VecModel
from pyspark.ml.feature import HashingTF

from pyspark.ml import Pipeline

from pyspark.sql.functions import *

from pyspark.sql.types import *

import folium
import html

In [2]:
data_path = '/home/osboxes/yelp-data/dataset/'
model_path = '/home/osboxes/yelp-data/'

In [3]:
business_df = spark.read.parquet(data_path + 'business-small.parquet')

In [4]:
business_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- businessId: long (nullable = true)



In [8]:
business_df.createOrReplaceTempView("businesses")

In [9]:
user_df = spark.read.parquet(data_path + 'user-small.parquet')
user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)
 |-- $9: long (nullable = true)



In [10]:
user_df.count()

66424

In [13]:
user_df = user_df.withColumnRenamed('name', 'user_name')

In [14]:
user_df.createOrReplaceTempView("users")

In [15]:
query = """
SELECT * FROM USERS 
limit 10
"""

sqlContext.sql(query).toPandas()


Unnamed: 0,user_id,user_name,review_count,yelping_since,useful,funny,cool,fans,average_stars,$9
0,--7gjElmOrthETJ8XqzMBw,Elizabeth,17,2014-11-16,14,0,0,0,3.88,1
1,--Br-QsbO9ad5GbZxVGxaw,Melanie,12,2015-06-10,0,0,0,0,3.25,2
2,--BumyUHiO_7YsHurb9Hkw,Sapna,38,2017-01-13,0,0,0,1,3.87,3
3,--DKDJlRHfsvufdGSk_Sdw,Tony,1,2016-07-25,0,0,0,0,1.0,4
4,--EVSb3jbKVL3WJ5NUCuCA,Janet,26,2012-03-15,0,0,0,1,4.65,5
5,--KQJPdrU0Md97DiOliDzw,Steve,166,2006-01-17,211,185,179,6,3.42,6
6,--Qh8yKWAvIP4V4K8ZPfHA,Dixie,503,2011-01-19,21,32,23,41,3.19,7
7,--RYvmB6UYRyZQqXkBv4eQ,James,4,2011-01-19,0,0,0,0,1.75,8
8,--UOvCH5qEgdNQ8lzR8QYQ,Chuan,13,2013-09-09,8,0,1,0,3.86,9
9,--WhK4MJx0fKvg64LqwStg,Rachel,10,2014-11-10,0,0,0,0,4.6,10


In [16]:
review_df = spark.read.parquet(data_path + 'review-small.parquet')

In [17]:
review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [18]:
review_df.count()

276887

In [19]:
review_df.show(3)

+--------------------+--------------------+--------------------+-----+-----------+--------------------+------+-----+----+
|           review_id|             user_id|         business_id|stars|review_date|         review_text|useful|funny|cool|
+--------------------+--------------------+--------------------+-----+-----------+--------------------+------+-----+----+
|Z5l99h18E3_g1GLcD...|djpMXOA1ic5wv3FPt...|mr4FiPaXTWlJ3qGzp...|    3| 2009-07-21|I left Table 17 f...|     3|    0|   0|
|Z3Fw292i0Eg8liW0D...|-pXs08gJq9ExIk275...|mr4FiPaXTWlJ3qGzp...|    3| 2008-12-13|for the time bein...|     1|    0|   0|
|hsKINx1dIKeFTDe-Z...|PTj29rhujYETuFlAZ...|mr4FiPaXTWlJ3qGzp...|    5| 2013-10-12|Love this place. ...|     1|    0|   1|
+--------------------+--------------------+--------------------+-----+-----------+--------------------+------+-----+----+
only showing top 3 rows



In [20]:
review_df.createOrReplaceTempView("reviews")

In [22]:
query = """
SELECT
    business_id,
    COUNT(*) as 5_stars_count
FROM reviews
WHERE stars = '5'
GROUP BY business_id 
ORDER BY COUNT(*) DESC
limit 10
"""

sqlContext.sql(query).toPandas()


Unnamed: 0,business_id,5_stars_count
0,r_BrIgzYcwo1NAuG9dLbpg,604
1,aLcFhMe6DDJ430zelCpd2A,462
2,RtUvSWO_UZ8V3Wpj0n077w,458
3,iGEvDk6hsizigmXhDKs2Vg,457
4,N93EYZy9R0sdlEvubu94ig,407
5,Yl2TN9c23ZGLUBSD9ks5Uw,279
6,ZumOnWbstgsIE6bJlxw0_Q,267
7,mZRKH9ngRY92bI_irrHq6w,267
8,k6zmSLmYAquCpJGKNnTgSQ,259
9,JMiaNitMzMbJm6Kh0RbT5A,247


In [23]:
reviews_text = spark.sql("SELECT business_id, review_text FROM reviews")

In [24]:
reviews_text.show(3)

+--------------------+--------------------+
|         business_id|         review_text|
+--------------------+--------------------+
|mr4FiPaXTWlJ3qGzp...|I left Table 17 f...|
|mr4FiPaXTWlJ3qGzp...|for the time bein...|
|mr4FiPaXTWlJ3qGzp...|Love this place. ...|
+--------------------+--------------------+
only showing top 3 rows



In [25]:
reviews_text_rdd = reviews_text.rdd
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_business_df = spark.createDataFrame(reviews_by_business_rdd)
reviews_by_business_df = reviews_by_business_df \
                            .withColumnRenamed('_1', 'business_id') \
                            .withColumnRenamed('_2', 'text')
reviews_by_business_df.count()   

6750

In [26]:
reviews_by_business_df.show(3)

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|
+--------------------+--------------------+
only showing top 3 rows



In [27]:

regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'token')

reviews_by_business_token_df = regexTokenizer.transform(reviews_by_business_df)
reviews_by_business_token_df.show(3)


+--------------------+--------------------+--------------------+
|         business_id|                text|               token|
+--------------------+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|[attention, aller...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|[i, don, t, under...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|[food, here, is, ...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [28]:
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')

reviews_by_business_token_nostopwrd_df = stopWordsRemover.transform(reviews_by_business_token_df)
reviews_by_business_token_nostopwrd_df.show(3)

+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|           nostopwrd|
+--------------------+--------------------+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|[attention, aller...|[attention, aller...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|[i, don, t, under...|[understand, prev...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|[food, here, is, ...|[food, always, fr...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



#### The follwoing step of creating wordevec model is resource intensive and time consuming.  
#### Just load the previusly trained model unless you need to rerun / refresh the existing model

In [33]:
#word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec')
#word2Vec_model = word2Vec.fit(reviews_by_business_token_nostopwrd_df)

# save the word2vec model
#word2Vec_model.write().overwrite().save(model_path + 'word2Vec')

In [34]:
# load the word2vec trained model

word2Vec_mdl = Word2VecModel.load(model_path + 'word2Vec')

In [35]:
reviews_by_business_vec_df = word2Vec_mdl.transform(reviews_by_business_token_nostopwrd_df)

reviews_by_business_vec_df.show(3)

reviews_by_business_vec_df.select('word_vec').show(1, truncate = True)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|           nostopwrd|            word_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|[attention, aller...|[attention, aller...|[-0.0949216104917...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|[i, don, t, under...|[understand, prev...|[-0.0657136221337...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|[food, here, is, ...|[food, always, fr...|[-0.0036732712861...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows

+--------------------+
|            word_vec|
+--------------------+
|[-0.0949216104917...|
+--------------------+
only showing top 1 row



In [36]:
word2Vec_mdl.findSynonyms("good", 5).show()

+-------+------------------+
|   word|        similarity|
+-------+------------------+
| decent|0.7664605379104614|
|  great|0.6792967319488525|
|  tasty| 0.577859103679657|
|  solid|0.5670980215072632|
|amazing|0.5537719130516052|
+-------+------------------+



In [37]:
word2Vec_mdl.findSynonyms("chinese", 5).show() 

+----------+------------------+
|      word|        similarity|
+----------+------------------+
|     asian|0.8069467544555664|
|vietnamese|0.7198611497879028|
|  northern|0.7195839881896973|
|     hakka|0.7080489993095398|
|    korean|0.7080056667327881|
+----------+------------------+



In [38]:
word2Vec_mdl.findSynonyms("burger", 5).show()

+------------+------------------+
|        word|        similarity|
+------------+------------------+
|     burgers|0.8352599143981934|
|   hamburger|0.7846404910087585|
|       patty|0.7472882270812988|
|      priest|0.7470744252204895|
|cheeseburger|0.7308202981948853|
+------------+------------------+



In [39]:
def CosineSim(vec1, vec2): 
    return np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1)) / np.sqrt(np.dot(vec2, vec2)) 

In [40]:
all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

In [41]:
all_vecs[0]

('bfR-vJvrjdOJaWsXGJgzPA',
 DenseVector([-0.0949, -0.0314, 0.0272, 0.0091, 0.056, -0.0003, 0.0116, 0.02, 0.0641, 0.0029, -0.0145, -0.0746, 0.0152, 0.0599, 0.0564, 0.0002, 0.0737, -0.0414, 0.0264, 0.0784, 0.0352, 0.0016, -0.0452, -0.0517, 0.0131, 0.0295, -0.0073, -0.0313, -0.0145, -0.0208, 0.0482, -0.0188, -0.0831, -0.124, -0.0201, 0.0666, -0.0406, 0.0484, -0.0362, -0.0267, -0.082, -0.0519, -0.011, 0.0132, 0.0178, 0.0408, -0.0015, -0.0176, 0.0003, -0.0212, 0.0747, 0.0396, -0.0601, -0.0949, 0.0083, -0.0494, -0.046, 0.0526, -0.0364, -0.0131, 0.0674, -0.0098, 0.0806, 0.0081, -0.0304, 0.0295, -0.0349, -0.0166, 0.0589, -0.0113, -0.0102, 0.0147, 0.1199, -0.0484, 0.0634, -0.0595, 0.0359, -0.0389, 0.0082, 0.0057, 0.0475, -0.0751, -0.03, -0.0067, 0.0143, -0.013, 0.0334, -0.0109, -0.0418, 0.0716, -0.0147, 0.061, 0.0054, -0.0138, -0.0156, -0.0109, -0.0053, 0.0462, -0.0234, 0.092]))

In [42]:
all_vecs[0][1]

DenseVector([-0.0949, -0.0314, 0.0272, 0.0091, 0.056, -0.0003, 0.0116, 0.02, 0.0641, 0.0029, -0.0145, -0.0746, 0.0152, 0.0599, 0.0564, 0.0002, 0.0737, -0.0414, 0.0264, 0.0784, 0.0352, 0.0016, -0.0452, -0.0517, 0.0131, 0.0295, -0.0073, -0.0313, -0.0145, -0.0208, 0.0482, -0.0188, -0.0831, -0.124, -0.0201, 0.0666, -0.0406, 0.0484, -0.0362, -0.0267, -0.082, -0.0519, -0.011, 0.0132, 0.0178, 0.0408, -0.0015, -0.0176, 0.0003, -0.0212, 0.0747, 0.0396, -0.0601, -0.0949, 0.0083, -0.0494, -0.046, 0.0526, -0.0364, -0.0131, 0.0674, -0.0098, 0.0806, 0.0081, -0.0304, 0.0295, -0.0349, -0.0166, 0.0589, -0.0113, -0.0102, 0.0147, 0.1199, -0.0484, 0.0634, -0.0595, 0.0359, -0.0389, 0.0082, 0.0057, 0.0475, -0.0751, -0.03, -0.0067, 0.0143, -0.013, 0.0334, -0.0109, -0.0418, 0.0716, -0.0147, 0.061, 0.0054, -0.0138, -0.0156, -0.0109, -0.0053, 0.0462, -0.0234, 0.092])

In [43]:
# test similarity by Business

b_id = 'RtUvSWO_UZ8V3Wpj0n077w'

bus_details_df = business_df.filter(col("business_id") == b_id) \
                            .select(['business_id', 'name', 'categories'])
print('Buiness details:')           
bus_details_df.show(truncate = False) 

input_vec = reviews_by_business_vec_df.select('word_vec')\
            .filter(reviews_by_business_vec_df['business_id'] == b_id)\
            .collect()[0][0]
        
#all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

similar_business_rdd = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_vecs)

similar_business_df = spark.createDataFrame(similar_business_rdd).\
    withColumnRenamed('_1', 'business_id').\
    withColumnRenamed('_2', 'similarity_score').\
    orderBy("similarity_score", ascending = False)

a = similar_business_df.filter(col("business_id") != b_id).limit(10).alias("a")

b = business_df.alias("b")
j = a.join(b, col("a.business_id") == col("b.business_id"), 'inner')\
     .select([col('a.'+xx) for xx in a.columns] + [col('b.name'),col('b.categories'),
                                                   col('b.stars'),col('b.review_count'),
                                                   col('b.latitude'),col('b.longitude')])
print('Top 10 similar businesses:')
j.toPandas()

Buiness details:
+----------------------+----------------------+------------------------------------------------------------------------------+
|business_id           |name                  |categories                                                                    |
+----------------------+----------------------+------------------------------------------------------------------------------+
|RtUvSWO_UZ8V3Wpj0n077w|KINKA IZAKAYA ORIGINAL|[Pubs, Japanese, Restaurants, Bars, Nightlife, Tapas Bars, Tapas/Small Plates]|
+----------------------+----------------------+------------------------------------------------------------------------------+

Top 10 similar businesses:


Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,longitude
0,CN5nuUQod0f8g3oh99qq0w,0.993978,KINKA IZAKAYA BLOOR,"[Nightlife, Restaurants, Pubs, Japanese, Tapas...",4.0,351,43.665157,-79.410658
1,CfxVkwEJk1NAqgqMSesLzA,0.980039,KINKA IZAKAYA NORTH YORK,"[Bars, Nightlife, Restaurants, Tapas/Small Pla...",3.5,209,43.76019,-79.410112
2,wpQsmMvdhefqIlxvRt_Jbg,0.974047,DonDon Izakaya,"[Restaurants, Japanese, Tapas/Small Plates, Ta...",3.0,225,43.655741,-79.384625
3,L82O1ZFFQfjJxF0_PYWPnA,0.970267,Guu Izakaya Toronto,"[Tapas Bars, Izakaya, Japanese, Restaurants]",4.0,50,43.641867,-79.43109
4,sYKB4nITCLLFcCZPn3QECQ,0.961989,Teppan Kenta,"[Japanese, Restaurants, Food]",3.5,58,43.665279,-79.385945
5,g6GXqg-QdDiQGLYMVqNOUw,0.948832,Hapa Izakaya,"[Japanese, Restaurants]",3.5,148,43.655264,-79.414242
6,478TIlfHXfT3wvww54QsPg,0.942653,Ki Modern Japanese + Bar,"[Sushi Bars, Restaurants, Japanese]",3.5,169,43.647208,-79.379381
7,SjgeuBlgKER9yegpoxT99w,0.941614,Nomé Izakaya,"[Bars, Nightlife, Restaurants, Lounges, Tapas ...",4.0,374,43.76265,-79.411469
8,KxcQs2Lkm3FJiltVWXOz_Q,0.938435,Hashi Izakaya,"[Tapas Bars, Nightlife, Japanese, Restaurants,...",3.5,37,43.779256,-79.415713
9,8J0NuWmoFfSGe5LuaiMfpg,0.9376,Sake Bar Kushi,"[Sushi Bars, Japanese, Restaurants, Tapas Bars]",4.0,67,43.704833,-79.406917


In [44]:
def getBusinessDetails(in_bus):
    
    a = in_bus.alias("a")
    b = business_df.alias("b")
    
    return a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.name'),col('b.categories'),
                                                           col('b.stars'),col('b.review_count'),
                                                           col('b.latitude'),col('b.longitude')])
    

In [58]:
def showInMap(df):
    
    mp = folium.Map(location=[43.70011, -79.4163], zoom_start=12)

    for i, r in df.toPandas().iterrows():
        folium.Marker(
                    location =[r.latitude, r.longitude], 
                    popup = html.escape(r["name"]) + '<br>' + 'Stars: ' + str(r.stars) + '<br>' + 'Reviews: ' + str(r.review_count),    
                    icon = folium.Icon(color='green')).add_to(mp)
    mp.
    return mp

In [45]:
def getKeyWordsRecoms(key_words, sim_bus_count):
    
    print('Businesses similar to key words: "' + key_words + '"')
    
    input_words_df = sc.parallelize([(0, key_words)]).toDF(['business_id', 'key_words'])

    regexToken = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'key_words', outputCol = 'token')
    stopWrdRem = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')


    # Build the pipeline
    pipeline = Pipeline(stages=[regexToken, stopWrdRem])


    mdl = pipeline.fit(input_words_df)
    input_words_token_nostopwrd_df = mdl.transform(input_words_df)

    input_vec_df = word2Vec_mdl.transform(input_words_token_nostopwrd_df)

    input_key_words_vec = input_vec_df.select('word_vec').collect()[0][0]

    #all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

    similar_business_by_key_word_rdd = sc.parallelize((i[0], float(CosineSim(input_key_words_vec, i[1]))) \
                                                                                      for i in all_vecs)

    similar_business_by_key_word_df = spark.createDataFrame(similar_business_by_key_word_rdd).\
        withColumnRenamed('_1', 'business_id').\
        withColumnRenamed('_2', 'similarity_score').\
        orderBy("similarity_score", ascending = False)

    a = similar_business_by_key_word_df.limit(sim_bus_count)
    return getBusinessDetails(a)


In [46]:
key_words = 'chicken cheese burger'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_df.toPandas()

Businesses similar to key words: "chicken cheese burger"


Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,longitude
0,37joQpD9m5AIcrW1c8OBnQ,0.706561,Urban Smoke Fusion BBQ Food Truck,"[Desserts, Barbeque, Food, Restaurants, Food T...",4.0,8,43.718711,-79.470037
1,3Cu-af4en3uWCrAkkqfiHQ,0.698635,Epic Burgers and Waffles,"[Burgers, Food, Restaurants]",2.5,5,43.632351,-79.42128
2,nP87zXxeS-8got7IBvoAuA,0.657255,McCoy Burger Company,"[Local Flavor, Sandwiches, Restaurants, Poutin...",4.0,33,43.731511,-79.404081
3,DiCMYxT69I22-1nfsvYAJQ,0.651565,Gourmet Burger Co,"[Burgers, Restaurants]",3.5,37,43.664683,-79.368279
4,UN0UwUh7jaeX6Jg3lZImCg,0.638442,Holy Chuck,"[Food, Restaurants, Desserts, Poutineries, Bur...",3.0,43,43.665211,-79.384925
5,ZzF5098L4xg-0COjng2LVA,0.638042,Burgatory,"[Pubs, Burgers, Food Trucks, Nightlife, Bars, ...",3.0,9,43.655055,-79.418563
6,PkeaeQS8aJTeS8PS_Hl_-g,0.637751,Steak and Cheese Factory,"[Sandwiches, Cheesesteaks, Restaurants]",3.0,3,43.708213,-79.392367
7,ky9RbwLtChekSrqcYR39kw,0.635411,Big Smoke Burger,"[Burgers, Poutineries, Restaurants]",3.0,6,43.611289,-79.556867
8,ycAW6Q5quaCSDX5zwQ3tPg,0.630411,New York Fries,"[Canadian (New), Specialty Food, Food, Restaur...",3.5,8,43.776875,-79.256655
9,7UPTUpex3O1Gav3td7GOEw,0.625449,South St Burger Co,"[Burgers, Restaurants]",3.0,6,43.736442,-79.344201


In [47]:
def getSimilarBusinesses(b_ids, sim_bus_count):
    
    schema = StructType([
                            StructField("business_id", StringType(), True), 
                            StructField("similarity_score", IntegerType(), True)
                        ])
    
    similar_businesses_df = spark.createDataFrame([], schema)
    
    for b_id in b_ids:
        
        print('Businesses similar to: ' + b_id)
        
        input_vec = reviews_by_business_vec_df.select('word_vec')\
                    .filter(reviews_by_business_vec_df['business_id'] == b_id)\
                    .collect()[0][0]

        #all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

        similar_business_rdd = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_vecs)

        similar_business_df = spark.createDataFrame(similar_business_rdd) \
            .withColumnRenamed('_1', 'business_id') \
            .withColumnRenamed('_2', 'similarity_score') \
            .orderBy("similarity_score", ascending = False)
            
        similar_business_df = similar_business_df.filter(col("business_id") != b_id).limit(10)
        similar_business_df.show()
        
        similar_businesses_df = similar_businesses_df.union(similar_business_df)
    
    return similar_businesses_df
    

In [48]:
def getContentRecoms(u_id, sim_bus_count=10):
    
    query = """
    SELECT distinct business_id FROM reviews  
    where stars >= 3.0 
    and user_id = "{}"
    """.format(u_id)

    usr_rev_bus = sqlContext.sql(query)

    usr_rev_bus = usr_rev_bus.sample(False, 0.5).limit(5)

    usr_rev_bus_det = getBusinessDetails(usr_rev_bus)

    print('Businesses previously reviewed by user:')
    usr_rev_bus_det.select(['business_id', 'name', 'categories']).show(truncate = False)

    bus_list = [i.business_id for i in usr_rev_bus.collect()]

    sim_bus_df = getSimilarBusinesses(bus_list, sim_bus_count)

    s = sim_bus_df.alias("s")
    r = usr_rev_bus.alias("r")
    j = s.join(r, col("s.business_id") == col("r.business_id"), 'left_outer') \
         .where(col("r.business_id").isNull()) \
         .select([col('s.business_id'),col('s.similarity_score')])

    a = j.orderBy("similarity_score", ascending = False).limit(sim_bus_count)

    return getBusinessDetails(a)

     

In [66]:
u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'

content_recom_df = getContentRecoms(u_id)

print("Businesses recommended to user based on his previously reviewd businesses:")
content_recom_df.toPandas()

Businesses previously reviewed by user:
+----------------------+--------------------------------+-------------------------------------------------------------+
|business_id           |name                            |categories                                                   |
+----------------------+--------------------------------+-------------------------------------------------------------+
|wv0qXO8gg71HwIWOun_Mbw|Red Rock Restaurant             |[Mediterranean, Canadian (New), Restaurants]                 |
|6oq1tUbRkfC7Qye2SrdxMw|Tucker's Marketplace Restaurants|[Buffets, Restaurants]                                       |
|9Oc8UhYNarSisYM9SEBsKQ|Chop Steakhouse & Bar           |[Seafood, Steakhouses, Nightlife, Restaurants, Lounges, Bars]|
|Jo3Cpvf8fpfcAIFG3TagbQ|Pantheon Restaurant             |[Mediterranean, Greek, Restaurants]                          |
|ZumOnWbstgsIE6bJlxw0_Q|Jacobs & Co. Steakhouse         |[Restaurants, Steakhouses]                                   |


Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,longitude
0,Qzbcq82RJKIcAl0HSoSBJQ,0.982819,Wildfire Steakhouse & Wine Bar,"[Steakhouses, Desserts, Nightlife, Bars, Resta...",3.0,106,43.733672,-79.404448
1,nDn2h-_c7Xk4UwM0aiXZlw,0.981278,Mezes,"[Mediterranean, Greek, Restaurants]",4.0,145,43.678004,-79.350213
2,JJ8ypBu3b--fy4HA5RB1gg,0.980275,Morton's The Steakhouse,"[Steakhouses, Restaurants]",4.0,85,43.669713,-79.394975
3,Vf_RHj0f1VViEF6OYnEfUA,0.978866,Quinn's Steakhouse & Irish Bar,"[Restaurants, Steakhouses, Irish]",3.5,84,43.651089,-79.382919
4,Q2ZNaN3p8s_-XXjBWyY2qA,0.97877,Ruth's Chris Steak House,"[Restaurants, Steakhouses, Party & Event Plann...",3.5,172,43.649612,-79.385306
5,fLxn7nVQzESaIqrFLsjEbQ,0.973278,Canyon Creek,"[Seafood, Steakhouses, Restaurants]",3.5,48,43.689328,-79.584311
6,usGow17X8F4qER7kNPBXzg,0.97145,Bâton Rouge Steakhouse & Bar,"[American (Traditional), Canadian (New), Glute...",3.0,39,43.778928,-79.257318
7,DuzxF9yWIxYqt4jDKw_bbA,0.971417,Pan On the Danforth,"[Restaurants, Mediterranean, Greek]",3.5,122,43.678359,-79.348685
8,dulgBrNtDCGjYcbQa2E6uA,0.970617,Michael's on Simcoe,"[Steakhouses, Restaurants, Italian, Seafood]",3.5,73,43.648092,-79.386363
9,aKESUG6MntqZNvC0DiFcJg,0.969492,Penelope Restaurant,"[Greek, Mediterranean, Restaurants]",3.0,54,43.646655,-79.388236


In [67]:
showInMap(content_recom_df)

In [68]:
u_id = 'Wc5L6iuvSNF5WGBlqIO8nw'

content_recom_df = getContentRecoms(u_id)

print("Businesses recommended to user based on his previously reviewd businesses:")
content_recom_df.toPandas()

Businesses previously reviewed by user:
+----------------------+----------------------------+-------------------------------------------------------------------------------------------+
|business_id           |name                        |categories                                                                                 |
+----------------------+----------------------------+-------------------------------------------------------------------------------------------+
|F_oPMHJrH42R67xp5eKtQA|Yummy Korean Food Restaurant|[Korean, Restaurants]                                                                      |
|JmZj7wzAJ7_4ksjG9WXdqw|Gladstone Hotel             |[Hotels & Travel, Lounges, Restaurants, Bars, Nightlife, Event Planning & Services, Hotels]|
|c78Pat78fVUBFPXYeVvbaQ|Odd Seoul                   |[Restaurants, Bars, Korean, Dive Bars, Asian Fusion, Nightlife]                            |
|9jYnZymuaW-XpMIS75YxgQ|The Beaver                  |[Canadian (New), Nightlife, Caf

Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,longitude
0,rO3lZpVSoRMhhd0AEJBjlg,0.986628,Sunrise House,"[Restaurants, Korean]",4.0,135,43.664068,-79.415668
1,rhyjGfqYlCJoi8Zeulg6QA,0.985523,Kimchi Korea House,"[Korean, Restaurants]",3.5,155,43.655256,-79.385475
2,j-Z_HAev26ZftdErMhIBuA,0.981557,Thumbs Up Korean Restaurant,"[Restaurants, Korean]",4.0,56,43.664451,-79.413786
3,_MA98TVmvVIy-XdI0poc7w,0.980885,Mom's Korean Food,"[Korean, Restaurants]",3.5,62,43.664686,-79.413785
4,SNkkuchbVtUzCwyENcai_g,0.980637,Danji,"[Restaurants, Chinese, Japanese, Korean]",3.5,57,43.6653,-79.384899
5,Fl2yDmC0B0TTVsNixxVXfA,0.976537,The Emerson,"[Canadian (New), Restaurants]",4.0,104,43.658353,-79.442003
6,X6ZZksefmR_piQj2Gbnduw,0.975906,Paldo Gangsan,"[Restaurants, Korean]",4.0,47,43.663799,-79.417393
7,uChTCA6MsLAciDRklpO-Fw,0.973643,Makkal Chon,"[Greek, Restaurants, Korean]",4.0,210,43.744944,-79.296636
8,ZCrK07xb6w5Vi1vathV0NQ,0.973478,Bapbo Korean Restaurant,"[Korean, Japanese, Restaurants]",3.0,86,43.655606,-79.384966
9,l5VG7_TWg4JGljx3UJStVw,0.973018,Hana Korea Restaurant,"[Korean, Restaurants]",3.5,30,43.655822,-79.393727


In [69]:
showInMap(content_recom_df)