In [1]:
import numpy as np
import pandas as pd

from operator import add
from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import HashingTF

from pyspark.ml import Pipeline

from pyspark.sql.functions import *

from pyspark.sql.types import *

In [2]:
business_df = spark.read.parquet('/home/osboxes/yelp-data/dataset/business-small.parquet')

In [3]:
business_df.printSchema()

root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- stars: double (nullable = true)
 |-- review_count: long (nullable = true)
 |-- categories: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [4]:
business_df.describe()

DataFrame[summary: string, business_id: string, name: string, neighborhood: string, address: string, city: string, state: string, postal_code: string, latitude: string, longitude: string, stars: string, review_count: string]

In [5]:
business_df.show(2)

+--------------------+--------------------+------------+--------------------+-------+-----+-----------+----------+-----------+-----+------------+--------------------+
|         business_id|                name|neighborhood|             address|   city|state|postal_code|  latitude|  longitude|stars|review_count|          categories|
+--------------------+--------------------+------------+--------------------+-------+-----+-----------+----------+-----------+-----+------------+--------------------+
|qim0lD112TkDhm8Zy...|McCarthy's Irish Pub| Upper Beach|1801 Gerrard Stre...|Toronto|   ON|    M4L 2B5|43.6780488|-79.3147736|  4.0|           5|[Pubs, Restaurant...|
|Wf5C8Amv_SlhoYE3_...|         Oishi Sushi|            |    1325 Finch Ave W|Toronto|   ON|    M3J 2G5|43.7635097|-79.4907499|  2.0|          27|[Asian Fusion, Re...|
+--------------------+--------------------+------------+--------------------+-------+-----+-----------+----------+-----------+-----+------------+--------------------

In [6]:
business_df.count()

6750

In [7]:
business_rdd = business_df.rdd

In [8]:
business_rdd.take(2)

[Row(business_id='qim0lD112TkDhm8ZyQlRnA', name="McCarthy's Irish Pub", neighborhood='Upper Beach', address='1801 Gerrard Street E', city='Toronto', state='ON', postal_code='M4L 2B5', latitude=43.6780488, longitude=-79.3147736, stars=4.0, review_count=5, categories=['Pubs', 'Restaurants', 'Bars', 'Irish', 'Nightlife']),
 Row(business_id='Wf5C8Amv_SlhoYE3_W66WQ', name='Oishi Sushi', neighborhood='', address='1325 Finch Ave W', city='Toronto', state='ON', postal_code='M3J 2G5', latitude=43.7635097, longitude=-79.4907499, stars=2.0, review_count=27, categories=['Asian Fusion', 'Restaurants', 'Sushi Bars'])]

In [9]:
business_df.createOrReplaceTempView("businesses")

In [10]:
query = """
SELECT * FROM businesses where review_count > 100 limit 10
"""

sqlContext.sql(query).toPandas()

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,categories
0,kLw_FmSiEqYH-MtFhDIUFQ,Big Daddy's Bourbon Street Bistro & Oyster Bar,Entertainment District,212 King Street W,Toronto,ON,M5H 1K5,43.647499,-79.386471,3.5,132,"[Cajun/Creole, Restaurants, Seafood]"
1,rxA9c0_XObabVL1WCTA4FA,Sneaky Dee's,Kensington Market,431 College Street,Toronto,ON,M5T 1T1,43.656333,-79.407487,3.5,362,"[Breakfast & Brunch, Nightlife, Dive Bars, Tex..."
2,769NudnrUxWFtJCGU66A_A,Thompson Diner,Niagara,550 Wellington Street W,Toronto,ON,M5V 2V4,43.642914,-79.402046,3.0,207,"[American (New), Restaurants, Canadian (New), ..."
3,ofw8aDSEg1HoQdmCgvLtaQ,The Pie Commission,Etobicoke,935 Queensway,Toronto,ON,M8Z 1P4,43.623881,-79.512074,4.5,183,"[Canadian (New), Fast Food, Food, Do-It-Yourse..."
4,hDy-uY7Vy_TZdGBzw59lhA,Saku Sushi,Alexandra Park,478 Queen Street W,Toronto,ON,M5V 2B2,43.648071,-79.400286,4.0,261,"[Japanese, Breakfast & Brunch, Restaurants, Su..."
5,fK1oj0dk9Bc6KsBk5mMDxg,Playa Cabana Cantina,The Junction,2883 Dundas Street W,Toronto,ON,M6P 1Y9,43.665303,-79.465505,3.5,229,"[Restaurants, Mexican]"
6,Vg4N2DsGrzzoam9jS1L3Wg,Johnny's Hamburgers,Scarborough,2595 Victoria Park Avenue,Toronto,ON,M1T 1A4,43.774878,-79.322278,3.5,166,"[Burgers, Restaurants]"
7,bz07FlaDmxHV9ER-cF6XuA,Platito Filipino Soul Food,Downtown Core,35 Baldwin Street,Toronto,ON,M5T 1L1,43.655859,-79.393467,3.5,113,"[Filipino, Restaurants]"
8,W2NzlS8OJzGfDfr9oRz11Q,Drake One Fifty,Financial District,150 York Street,Toronto,ON,M5H 3S5,43.649354,-79.384684,3.5,168,"[Cocktail Bars, Brasseries, Food, Canadian (Ne..."
9,XmgdFa3G_CZVfjtQEJMZfQ,Caplansky's Delicatessen,,356 College Street,Toronto,ON,M5T 1S6,43.657207,-79.404248,3.5,390,"[Restaurants, Delis, Caterers, Event Planning ..."


In [11]:
user_df = spark.read.parquet("/home/osboxes/yelp-data/dataset/user-small.parquet")
user_df.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- review_count: long (nullable = true)
 |-- yelping_since: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)
 |-- fans: long (nullable = true)
 |-- average_stars: double (nullable = true)



In [12]:
user_df.count()

66424

In [13]:
user_df.createOrReplaceTempView("users")

In [14]:
query = """
SELECT * FROM USERS 
limit 10
"""

sqlContext.sql(query).toPandas()


Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,fans,average_stars
0,om5ZiponkpRqUNa3pVPiRg,Andrea,2559,2006-01-18,83681,10882,40110,835,3.94
1,Wc5L6iuvSNF5WGBlqIO8nw,Risa,1122,2011-07-30,26395,4880,19108,435,4.1
2,uxKSnOVAoEj4I6X9YhLBlg,Vivian,73,2013-03-02,34,5,2,8,3.54
3,s8bVHRqx6cI8F8HGf3A_og,Colleen,32,2014-12-18,19,3,7,2,4.15
4,xEajChTkzWIYTMLkYNoIIw,Di,71,2012-09-26,31,8,4,3,3.29
5,YJLlvBPtvB8iJg8_WKxVzQ,Casey,72,2014-03-01,5,2,3,7,3.95
6,YTdNcIWAt2nEzZ7NY-fniw,Jeff,754,2011-05-16,151,105,125,68,3.74
7,ZWD8UH1T7QXQr0Eq-mcWYg,Jason,121,2013-11-13,192,29,54,33,3.91
8,YSDzb8DnvKozByqBjYiS4w,Jarita,68,2012-03-30,1,1,0,8,3.64
9,ljdo6-BZlywsF5RiGd5e5A,Justina,75,2014-06-18,7,2,0,5,3.82


In [15]:
review_df = spark.read.parquet("/home/osboxes/yelp-data/dataset/review-small.parquet")

In [16]:
review_df.printSchema()

root
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: long (nullable = true)
 |-- review_date: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- useful: long (nullable = true)
 |-- funny: long (nullable = true)
 |-- cool: long (nullable = true)



In [17]:
review_df.count()

276887

In [18]:
review_df.show(3)

+--------------------+--------------------+--------------------+-----+-----------+--------------------+------+-----+----+
|           review_id|             user_id|         business_id|stars|review_date|         review_text|useful|funny|cool|
+--------------------+--------------------+--------------------+-----+-----------+--------------------+------+-----+----+
|Z5l99h18E3_g1GLcD...|djpMXOA1ic5wv3FPt...|mr4FiPaXTWlJ3qGzp...|    3| 2009-07-21|I left Table 17 f...|     3|    0|   0|
|Z3Fw292i0Eg8liW0D...|-pXs08gJq9ExIk275...|mr4FiPaXTWlJ3qGzp...|    3| 2008-12-13|for the time bein...|     1|    0|   0|
|hsKINx1dIKeFTDe-Z...|PTj29rhujYETuFlAZ...|mr4FiPaXTWlJ3qGzp...|    5| 2013-10-12|Love this place. ...|     1|    0|   1|
+--------------------+--------------------+--------------------+-----+-----------+--------------------+------+-----+----+
only showing top 3 rows



In [19]:
review_df.createOrReplaceTempView("reviews")

In [20]:
query = """
SELECT
    business_id,
    COUNT(*) as 5_stars_count
FROM reviews
WHERE stars = '5'
GROUP BY business_id 
ORDER BY COUNT(*) DESC
limit 10
"""

sqlContext.sql(query).toPandas()


Unnamed: 0,business_id,5_stars_count
0,r_BrIgzYcwo1NAuG9dLbpg,604
1,aLcFhMe6DDJ430zelCpd2A,462
2,RtUvSWO_UZ8V3Wpj0n077w,458
3,iGEvDk6hsizigmXhDKs2Vg,457
4,N93EYZy9R0sdlEvubu94ig,407
5,Yl2TN9c23ZGLUBSD9ks5Uw,279
6,ZumOnWbstgsIE6bJlxw0_Q,267
7,mZRKH9ngRY92bI_irrHq6w,267
8,k6zmSLmYAquCpJGKNnTgSQ,259
9,JMiaNitMzMbJm6Kh0RbT5A,247


In [21]:
query = """
SELECT
    review_text
FROM reviews
WHERE stars = '1'
limit 10
"""

sqlContext.sql(query).show(2)


+--------------------+
|         review_text|
+--------------------+
|They messed up my...|
|#detox ...... wil...|
+--------------------+
only showing top 2 rows



In [22]:
reviews_text = spark.sql("SELECT business_id, review_text FROM reviews")

In [23]:
reviews_text.show(3)

+--------------------+--------------------+
|         business_id|         review_text|
+--------------------+--------------------+
|mr4FiPaXTWlJ3qGzp...|I left Table 17 f...|
|mr4FiPaXTWlJ3qGzp...|for the time bein...|
|mr4FiPaXTWlJ3qGzp...|Love this place. ...|
+--------------------+--------------------+
only showing top 3 rows



In [24]:
reviews_text_rdd = reviews_text.rdd
reviews_by_business_rdd = reviews_text_rdd.map(tuple).reduceByKey(add)  
reviews_by_business_df = spark.createDataFrame(reviews_by_business_rdd)
reviews_by_business_df = reviews_by_business_df \
                            .withColumnRenamed('_1', 'business_id') \
                            .withColumnRenamed('_2', 'text')
reviews_by_business_df.count()   

6750

In [25]:
reviews_by_business_df.show(3)

+--------------------+--------------------+
|         business_id|                text|
+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|
+--------------------+--------------------+
only showing top 3 rows



In [26]:

regexTokenizer = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'text', outputCol = 'token')

reviews_by_business_token_df = regexTokenizer.transform(reviews_by_business_df)
reviews_by_business_token_df.show(3)


+--------------------+--------------------+--------------------+
|         business_id|                text|               token|
+--------------------+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|[attention, aller...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|[i, don, t, under...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|[food, here, is, ...|
+--------------------+--------------------+--------------------+
only showing top 3 rows



In [27]:
stopWordsRemover = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')

reviews_by_business_token_nostopwrd_df = stopWordsRemover.transform(reviews_by_business_token_df)
reviews_by_business_token_nostopwrd_df.show(3)

+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|           nostopwrd|
+--------------------+--------------------+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|[attention, aller...|[attention, aller...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|[i, don, t, under...|[understand, prev...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|[food, here, is, ...|[food, always, fr...|
+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [28]:
word2Vec = Word2Vec(vectorSize = 100, minCount = 5, inputCol = 'nostopwrd', outputCol = 'word_vec')
word2Vec_model = word2Vec.fit(reviews_by_business_token_nostopwrd_df)


In [29]:
reviews_by_business_vec_df = word2Vec_model.transform(reviews_by_business_token_nostopwrd_df)

reviews_by_business_vec_df.show(3)

reviews_by_business_vec_df.select('word_vec').show(1, truncate = True)

+--------------------+--------------------+--------------------+--------------------+--------------------+
|         business_id|                text|               token|           nostopwrd|            word_vec|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|bfR-vJvrjdOJaWsXG...|Attention allergy...|[attention, aller...|[attention, aller...|[-0.0949216104917...|
|Dl2vgi5W_nbe-A97D...|I don't understan...|[i, don, t, under...|[understand, prev...|[-0.0657136221337...|
|65ZGMedBm7TBpWv6f...|Food here is alwa...|[food, here, is, ...|[food, always, fr...|[-0.0036732712861...|
+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows

+--------------------+
|            word_vec|
+--------------------+
|[-0.0949216104917...|
+--------------------+
only showing top 1 row



In [30]:
word2Vec_model.findSynonyms("good", 5).show()

+-------+------------------+
|   word|        similarity|
+-------+------------------+
| decent|0.7664605379104614|
|  great|0.6792967319488525|
|  tasty| 0.577859103679657|
|  solid|0.5670980215072632|
|amazing|0.5537719130516052|
+-------+------------------+



In [31]:
word2Vec_model.write().overwrite().save('/home/osboxes/yelp-data/word2Vec')

In [32]:
w2vw = word2Vec_model.load('/home/osboxes/yelp-data/word2Vec')

In [33]:

w2vw.findSynonyms("good", 5).show()

+-------+------------------+
|   word|        similarity|
+-------+------------------+
| decent|0.7664605379104614|
|  great|0.6792967319488525|
|  tasty| 0.577859103679657|
|  solid|0.5670980215072632|
|amazing|0.5537719130516052|
+-------+------------------+



In [34]:
w2vw.findSynonyms("chinese", 5).show() 

+----------+------------------+
|      word|        similarity|
+----------+------------------+
|     asian|0.8069467544555664|
|vietnamese|0.7198611497879028|
|  northern|0.7195839881896973|
|     hakka|0.7080489993095398|
|    korean|0.7080056667327881|
+----------+------------------+



In [35]:
w2vw.findSynonyms("burger", 5).show()

+------------+------------------+
|        word|        similarity|
+------------+------------------+
|     burgers|0.8352599143981934|
|   hamburger|0.7846404910087585|
|       patty|0.7472882270812988|
|      priest|0.7470744252204895|
|cheeseburger|0.7308202981948853|
+------------+------------------+



In [36]:
def CosineSim(vec1, vec2): 
    return np.dot(vec1, vec2) / np.sqrt(np.dot(vec1, vec1)) / np.sqrt(np.dot(vec2, vec2)) 

In [37]:
all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

In [38]:
all_vecs[0]

('bfR-vJvrjdOJaWsXGJgzPA',
 DenseVector([-0.0949, -0.0314, 0.0272, 0.0091, 0.056, -0.0003, 0.0116, 0.02, 0.0641, 0.0029, -0.0145, -0.0746, 0.0152, 0.0599, 0.0564, 0.0002, 0.0737, -0.0414, 0.0264, 0.0784, 0.0352, 0.0016, -0.0452, -0.0517, 0.0131, 0.0295, -0.0073, -0.0313, -0.0145, -0.0208, 0.0482, -0.0188, -0.0831, -0.124, -0.0201, 0.0666, -0.0406, 0.0484, -0.0362, -0.0267, -0.082, -0.0519, -0.011, 0.0132, 0.0178, 0.0408, -0.0015, -0.0176, 0.0003, -0.0212, 0.0747, 0.0396, -0.0601, -0.0949, 0.0083, -0.0494, -0.046, 0.0526, -0.0364, -0.0131, 0.0674, -0.0098, 0.0806, 0.0081, -0.0304, 0.0295, -0.0349, -0.0166, 0.0589, -0.0113, -0.0102, 0.0147, 0.1199, -0.0484, 0.0634, -0.0595, 0.0359, -0.0389, 0.0082, 0.0057, 0.0475, -0.0751, -0.03, -0.0067, 0.0143, -0.013, 0.0334, -0.0109, -0.0418, 0.0716, -0.0147, 0.061, 0.0054, -0.0138, -0.0156, -0.0109, -0.0053, 0.0462, -0.0234, 0.092]))

In [39]:
all_vecs[0][1]

DenseVector([-0.0949, -0.0314, 0.0272, 0.0091, 0.056, -0.0003, 0.0116, 0.02, 0.0641, 0.0029, -0.0145, -0.0746, 0.0152, 0.0599, 0.0564, 0.0002, 0.0737, -0.0414, 0.0264, 0.0784, 0.0352, 0.0016, -0.0452, -0.0517, 0.0131, 0.0295, -0.0073, -0.0313, -0.0145, -0.0208, 0.0482, -0.0188, -0.0831, -0.124, -0.0201, 0.0666, -0.0406, 0.0484, -0.0362, -0.0267, -0.082, -0.0519, -0.011, 0.0132, 0.0178, 0.0408, -0.0015, -0.0176, 0.0003, -0.0212, 0.0747, 0.0396, -0.0601, -0.0949, 0.0083, -0.0494, -0.046, 0.0526, -0.0364, -0.0131, 0.0674, -0.0098, 0.0806, 0.0081, -0.0304, 0.0295, -0.0349, -0.0166, 0.0589, -0.0113, -0.0102, 0.0147, 0.1199, -0.0484, 0.0634, -0.0595, 0.0359, -0.0389, 0.0082, 0.0057, 0.0475, -0.0751, -0.03, -0.0067, 0.0143, -0.013, 0.0334, -0.0109, -0.0418, 0.0716, -0.0147, 0.061, 0.0054, -0.0138, -0.0156, -0.0109, -0.0053, 0.0462, -0.0234, 0.092])

In [40]:
# test similarity by Business

b_id = 'RtUvSWO_UZ8V3Wpj0n077w'

bus_details_df = business_df.filter(col("business_id") == b_id) \
                            .select(['business_id', 'name', 'categories'])
print('Buiness details:')           
bus_details_df.show(truncate = False) 

input_vec = reviews_by_business_vec_df.select('word_vec')\
            .filter(reviews_by_business_vec_df['business_id'] == b_id)\
            .collect()[0][0]
        
#all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

similar_business_rdd = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_vecs)

similar_business_df = spark.createDataFrame(similar_business_rdd).\
    withColumnRenamed('_1', 'business_id').\
    withColumnRenamed('_2', 'similarity_score').\
    orderBy("similarity_score", ascending = False)

a = similar_business_df.filter(col("business_id") != b_id).limit(10).alias("a")

b = business_df.alias("b")
j = a.join(b, col("a.business_id") == col("b.business_id"), 'inner')\
     .select([col('a.'+xx) for xx in a.columns] + [col('b.name'),col('b.categories'),
                                                   col('b.stars'),col('b.review_count'),
                                                   col('b.latitude'),col('b.latitude')])
print('Top 10 similar businesses:')
j.toPandas()

Buiness details:
+----------------------+----------------------+------------------------------------------------------------------------------+
|business_id           |name                  |categories                                                                    |
+----------------------+----------------------+------------------------------------------------------------------------------+
|RtUvSWO_UZ8V3Wpj0n077w|KINKA IZAKAYA ORIGINAL|[Pubs, Japanese, Restaurants, Bars, Nightlife, Tapas Bars, Tapas/Small Plates]|
+----------------------+----------------------+------------------------------------------------------------------------------+

Top 10 similar businesses:


Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,latitude.1
0,CN5nuUQod0f8g3oh99qq0w,0.993978,KINKA IZAKAYA BLOOR,"[Nightlife, Restaurants, Pubs, Japanese, Tapas...",4.0,351,43.665157,43.665157
1,CfxVkwEJk1NAqgqMSesLzA,0.980039,KINKA IZAKAYA NORTH YORK,"[Bars, Nightlife, Restaurants, Tapas/Small Pla...",3.5,209,43.76019,43.76019
2,wpQsmMvdhefqIlxvRt_Jbg,0.974047,DonDon Izakaya,"[Restaurants, Japanese, Tapas/Small Plates, Ta...",3.0,225,43.655741,43.655741
3,L82O1ZFFQfjJxF0_PYWPnA,0.970267,Guu Izakaya Toronto,"[Tapas Bars, Izakaya, Japanese, Restaurants]",4.0,50,43.641867,43.641867
4,sYKB4nITCLLFcCZPn3QECQ,0.961989,Teppan Kenta,"[Japanese, Restaurants, Food]",3.5,58,43.665279,43.665279
5,g6GXqg-QdDiQGLYMVqNOUw,0.948832,Hapa Izakaya,"[Japanese, Restaurants]",3.5,148,43.655264,43.655264
6,478TIlfHXfT3wvww54QsPg,0.942653,Ki Modern Japanese + Bar,"[Sushi Bars, Restaurants, Japanese]",3.5,169,43.647208,43.647208
7,SjgeuBlgKER9yegpoxT99w,0.941614,Nomé Izakaya,"[Bars, Nightlife, Restaurants, Lounges, Tapas ...",4.0,374,43.76265,43.76265
8,KxcQs2Lkm3FJiltVWXOz_Q,0.938435,Hashi Izakaya,"[Tapas Bars, Nightlife, Japanese, Restaurants,...",3.5,37,43.779256,43.779256
9,8J0NuWmoFfSGe5LuaiMfpg,0.9376,Sake Bar Kushi,"[Sushi Bars, Japanese, Restaurants, Tapas Bars]",4.0,67,43.704833,43.704833


In [41]:
def getBusinessDetails(in_bus):
    
    a = in_bus.alias("a")
    b = business_df.alias("b")
    
    return a.join(b, col("a.business_id") == col("b.business_id"), 'inner') \
             .select([col('a.'+xx) for xx in a.columns] + [col('b.name'),col('b.categories'),
                                                           col('b.stars'),col('b.review_count'),
                                                           col('b.latitude'),col('b.latitude')])
    

In [42]:
def getKeyWordsRecoms(key_words, sim_bus_count):
    
    print('Businesses similar to key words: "' + key_words + '"')
    
    input_words_df = sc.parallelize([(0, key_words)]).toDF(['business_id', 'key_words'])

    regexToken = RegexTokenizer(gaps = False, pattern = '\w+', inputCol = 'key_words', outputCol = 'token')
    stopWrdRem = StopWordsRemover(inputCol = 'token', outputCol = 'nostopwrd')


    # Build the pipeline
    pipeline = Pipeline(stages=[regexToken, stopWrdRem])


    mdl = pipeline.fit(input_words_df)
    input_words_token_nostopwrd_df = mdl.transform(input_words_df)

    input_vec_df = word2Vec_model.transform(input_words_token_nostopwrd_df)

    input_key_words_vec = input_vec_df.select('word_vec').collect()[0][0]

    #all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

    similar_business_by_key_word_rdd = sc.parallelize((i[0], float(CosineSim(input_key_words_vec, i[1]))) \
                                                                                      for i in all_vecs)

    similar_business_by_key_word_df = spark.createDataFrame(similar_business_by_key_word_rdd).\
        withColumnRenamed('_1', 'business_id').\
        withColumnRenamed('_2', 'similarity_score').\
        orderBy("similarity_score", ascending = False)

    a = similar_business_by_key_word_df.limit(sim_bus_count)
    return getBusinessDetails(a)


In [43]:
key_words = 'chicken cheese burger'

keywords_recom_df = getKeyWordsRecoms(key_words, 10)
keywords_recom_df.toPandas()

Businesses similar to key words: "chicken cheese burger"


Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,latitude.1
0,37joQpD9m5AIcrW1c8OBnQ,0.706561,Urban Smoke Fusion BBQ Food Truck,"[Desserts, Barbeque, Food, Restaurants, Food T...",4.0,8,43.718711,43.718711
1,3Cu-af4en3uWCrAkkqfiHQ,0.698635,Epic Burgers and Waffles,"[Burgers, Food, Restaurants]",2.5,5,43.632351,43.632351
2,nP87zXxeS-8got7IBvoAuA,0.657255,McCoy Burger Company,"[Local Flavor, Sandwiches, Restaurants, Poutin...",4.0,33,43.731511,43.731511
3,DiCMYxT69I22-1nfsvYAJQ,0.651565,Gourmet Burger Co,"[Burgers, Restaurants]",3.5,37,43.664683,43.664683
4,UN0UwUh7jaeX6Jg3lZImCg,0.638442,Holy Chuck,"[Food, Restaurants, Desserts, Poutineries, Bur...",3.0,43,43.665211,43.665211
5,ZzF5098L4xg-0COjng2LVA,0.638042,Burgatory,"[Pubs, Burgers, Food Trucks, Nightlife, Bars, ...",3.0,9,43.655055,43.655055
6,PkeaeQS8aJTeS8PS_Hl_-g,0.637751,Steak and Cheese Factory,"[Sandwiches, Cheesesteaks, Restaurants]",3.0,3,43.708213,43.708213
7,ky9RbwLtChekSrqcYR39kw,0.635411,Big Smoke Burger,"[Burgers, Poutineries, Restaurants]",3.0,6,43.611289,43.611289
8,ycAW6Q5quaCSDX5zwQ3tPg,0.630411,New York Fries,"[Canadian (New), Specialty Food, Food, Restaur...",3.5,8,43.776875,43.776875
9,7UPTUpex3O1Gav3td7GOEw,0.625449,South St Burger Co,"[Burgers, Restaurants]",3.0,6,43.736442,43.736442


In [48]:
similar_business_df.printSchema


<bound method DataFrame.printSchema of DataFrame[business_id: string, similarity_score: double]>

In [57]:
def getSimilarBusinesses(b_ids, sim_bus_count):
    
    schema = StructType([
                            StructField("business_id", StringType(), True), 
                            StructField("similarity_score", IntegerType(), True)
                        ])
    
    similar_businesses_df = spark.createDataFrame([], schema)
    
    for b_id in b_ids:
        
        print('Businesses similar to: ' + b_id)
        
        input_vec = reviews_by_business_vec_df.select('word_vec')\
                    .filter(reviews_by_business_vec_df['business_id'] == b_id)\
                    .collect()[0][0]

        #all_vecs = reviews_by_business_vec_df.select('business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()

        similar_business_rdd = sc.parallelize((i[0], float(CosineSim(input_vec, i[1]))) for i in all_vecs)

        similar_business_df = spark.createDataFrame(similar_business_rdd) \
            .withColumnRenamed('_1', 'business_id') \
            .withColumnRenamed('_2', 'similarity_score') \
            .orderBy("similarity_score", ascending = False)
            
        similar_business_df = similar_business_df.filter(col("business_id") != b_id).limit(10)
        similar_business_df.show()
        
        similar_businesses_df = similar_businesses_df.union(similar_business_df)
    
    return similar_businesses_df
    

In [58]:
def getContentRecoms(u_id, sim_bus_count=10):
    
    query = """
    SELECT distinct business_id FROM reviews  
    where stars >= 3.0 
    and user_id = "{}"
    """.format(u_id)

    usr_rev_bus = sqlContext.sql(query)

    usr_rev_bus = usr_rev_bus.sample(False, 0.5).limit(5)

    usr_rev_bus_det = getBusinessDetails(usr_rev_bus)

    print('Businesses previously reviewed by user:')
    usr_rev_bus_det.select(['business_id', 'name', 'categories']).show(truncate = False)

    bus_list = [i.business_id for i in usr_rev_bus.collect()]

    sim_bus_df = getSimilarBusinesses(bus_list, sim_bus_count)

    s = sim_bus_df.alias("s")
    r = usr_rev_bus.alias("r")
    j = s.join(r, col("s.business_id") == col("r.business_id"), 'left_outer') \
         .where(col("r.business_id").isNull()) \
         .select([col('s.business_id'),col('s.similarity_score')])

    a = j.orderBy("similarity_score", ascending = False).limit(sim_bus_count)

    return getBusinessDetails(a)

     

In [59]:
u_id = 'ZWD8UH1T7QXQr0Eq-mcWYg'

content_recom_df = getContentRecoms(u_id)

print("Businesses recommended to user based on his previously reviewd businesses:")
content_recom_df.toPandas()

Businesses previously reviewed by user:
+----------------------+-----------------------+-----------------------------------+
|business_id           |name                   |categories                         |
+----------------------+-----------------------+-----------------------------------+
|Jo3Cpvf8fpfcAIFG3TagbQ|Pantheon Restaurant    |[Mediterranean, Greek, Restaurants]|
|ZumOnWbstgsIE6bJlxw0_Q|Jacobs & Co. Steakhouse|[Restaurants, Steakhouses]         |
|JJ8ypBu3b--fy4HA5RB1gg|Morton's The Steakhouse|[Steakhouses, Restaurants]         |
+----------------------+-----------------------+-----------------------------------+

Businesses similar to: Jo3Cpvf8fpfcAIFG3TagbQ
+--------------------+------------------+
|         business_id|  similarity_score|
+--------------------+------------------+
|nDn2h-_c7Xk4UwM0a...|0.9812778665324616|
|DuzxF9yWIxYqt4jDK...|0.9714170272798817|
|aKESUG6MntqZNvC0D...|0.9694920916328215|
|aO94Wsh2VzrgdCXu2...|0.9681354898988823|
|bGoGHSWHvhftHMWWx...|0.

Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,latitude.1
0,Q2ZNaN3p8s_-XXjBWyY2qA,0.98841,Ruth's Chris Steak House,"[Restaurants, Steakhouses, Party & Event Plann...",3.5,172,43.649612,43.649612
1,nDn2h-_c7Xk4UwM0aiXZlw,0.981278,Mezes,"[Mediterranean, Greek, Restaurants]",4.0,145,43.678004,43.678004
2,7ODXq--HE7QpzvWwgk5rMA,0.981036,Barberian's Steak House,"[Steakhouses, Restaurants]",4.0,195,43.657592,43.657592
3,Q2ZNaN3p8s_-XXjBWyY2qA,0.97877,Ruth's Chris Steak House,"[Restaurants, Steakhouses, Party & Event Plann...",3.5,172,43.649612,43.649612
4,0BW6h-igJinzbqc-prYUaQ,0.97544,Hy's Steakhouse & Cocktail Bar,"[Seafood, Nightlife, Bars, Steakhouses, Restau...",3.5,84,43.649731,43.649731
5,4POPYEONJpkfhWOMx_PyGg,0.973019,Harbour Sixty,"[Seafood, Steakhouses, Restaurants]",3.5,170,43.642064,43.642064
6,DuzxF9yWIxYqt4jDKw_bbA,0.971417,Pan On the Danforth,"[Restaurants, Mediterranean, Greek]",3.5,122,43.678359,43.678359
7,Vf_RHj0f1VViEF6OYnEfUA,0.970874,Quinn's Steakhouse & Irish Bar,"[Restaurants, Steakhouses, Irish]",3.5,84,43.651089,43.651089
8,tzl4KHt6ZAwyUJIEyemrtQ,0.970269,Smith Bros.,"[Steakhouses, Restaurants, Nightlife, Bars]",4.0,214,43.728887,43.728887
9,aKESUG6MntqZNvC0DiFcJg,0.969492,Penelope Restaurant,"[Greek, Mediterranean, Restaurants]",3.0,54,43.646655,43.646655


In [60]:
u_id = 'Wc5L6iuvSNF5WGBlqIO8nw'

content_recom_df = getContentRecoms(u_id)

print("Businesses recommended to user based on his previously reviewd businesses:")
content_recom_df.toPandas()

Businesses previously reviewed by user:
+----------------------+----------------------------+-------------------------------------------------------------------------------------------+
|business_id           |name                        |categories                                                                                 |
+----------------------+----------------------------+-------------------------------------------------------------------------------------------+
|F_oPMHJrH42R67xp5eKtQA|Yummy Korean Food Restaurant|[Korean, Restaurants]                                                                      |
|JmZj7wzAJ7_4ksjG9WXdqw|Gladstone Hotel             |[Hotels & Travel, Lounges, Restaurants, Bars, Nightlife, Event Planning & Services, Hotels]|
|c78Pat78fVUBFPXYeVvbaQ|Odd Seoul                   |[Restaurants, Bars, Korean, Dive Bars, Asian Fusion, Nightlife]                            |
|9jYnZymuaW-XpMIS75YxgQ|The Beaver                  |[Canadian (New), Nightlife, Caf

Unnamed: 0,business_id,similarity_score,name,categories,stars,review_count,latitude,latitude.1
0,rO3lZpVSoRMhhd0AEJBjlg,0.986628,Sunrise House,"[Restaurants, Korean]",4.0,135,43.664068,43.664068
1,rhyjGfqYlCJoi8Zeulg6QA,0.985523,Kimchi Korea House,"[Korean, Restaurants]",3.5,155,43.655256,43.655256
2,j-Z_HAev26ZftdErMhIBuA,0.981557,Thumbs Up Korean Restaurant,"[Restaurants, Korean]",4.0,56,43.664451,43.664451
3,_MA98TVmvVIy-XdI0poc7w,0.980885,Mom's Korean Food,"[Korean, Restaurants]",3.5,62,43.664686,43.664686
4,SNkkuchbVtUzCwyENcai_g,0.980637,Danji,"[Restaurants, Chinese, Japanese, Korean]",3.5,57,43.6653,43.6653
5,ShUh_MMkaVp_KXCtNjPvXA,0.976432,Universal Grill,"[American (Traditional), Canadian (New), Break...",3.5,45,43.670521,43.670521
6,X6ZZksefmR_piQj2Gbnduw,0.975906,Paldo Gangsan,"[Restaurants, Korean]",4.0,47,43.663799,43.663799
7,uChTCA6MsLAciDRklpO-Fw,0.973643,Makkal Chon,"[Greek, Restaurants, Korean]",4.0,210,43.744944,43.744944
8,ZCrK07xb6w5Vi1vathV0NQ,0.973478,Bapbo Korean Restaurant,"[Korean, Japanese, Restaurants]",3.0,86,43.655606,43.655606
9,oQylTvXwGIkKFdCjmafKVg,0.973024,Fire on the East Side,"[Southern, Restaurants, Breakfast & Brunch, Am...",3.5,119,43.666765,43.666765
