In [1]:
# import project directory helper
import os, sys
ROOT_DIR = os.path.abspath('/home/hduser/document/jupyter/FYP/') 
sys.path.insert(0, ROOT_DIR)

In [2]:
# import pyspark packages
# from pyspark import SparkContext
# from pyspark.sql import SQLContext
# set the kafka dependencies before create spark context or session
import os
os.environ[
    'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.4,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell'
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.types import *

In [3]:
# import util packages
from IPython.display import display
import pandas as pd

In [4]:
spark = SparkSession.builder.appName('attraction').getOrCreate()
spark

In [5]:
# define path
tripad_attr_path = ROOT_DIR + '/crawler/datasets/tripadvisor_dataset/attractions/'
parquet_path = ROOT_DIR + '/crawler/datasets/tripadvisor_dataset/spark/'

# Attraction Details

In [6]:
# read spark dataframe from parquet
final_attr_spark_df = spark.read.parquet(parquet_path + 'tripad_attr')

## Processing category

In [7]:
def get_cat(x):
    if isinstance(x, list):
        return x[0]
    else:
        return x

In [8]:
get_category_udf = functions.udf(lambda x: get_cat(x), StringType())

In [9]:
final_attr_spark_df = final_attr_spark_df.withColumn(
    'updated_category', get_category_udf(final_attr_spark_df.category))

In [10]:
final_attr_spark_df = final_attr_spark_df.drop('category')
final_attr_spark_df = final_attr_spark_df.withColumnRenamed('updated_category', 'category')

In [11]:
# visualize spark dataframe
display(final_attr_spark_df.count())
final_attr_spark_df.printSchema()
display(final_attr_spark_df.toPandas())

4257

root
 |-- activityId: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- name: string (nullable = true)
 |-- price: double (nullable = true)
 |-- rating: double (nullable = true)
 |-- category: string (nullable = true)



Unnamed: 0,activityId,city,state,country,latitude,longitude,name,price,rating,category
0,12962337,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,Malaysia Countryside and Batu Caves Tour from ...,43.27,4.5,featured_tours_and_tickets
1,12905660,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,Petronas Twin Towers Admission Tickets (E-Tick...,119.49,4.0,featured_tours_and_tickets
2,11807013,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,Discover the Best Local Food Tour by Night in ...,142.75,4.5,featured_tours_and_tickets
3,12471375,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,Genting Highlands Day Trip from Kuala Lumpur w...,103.01,4.5,featured_tours_and_tickets
4,12469580,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,Half-Day Kuala Lumpur City Tour,43.27,4.0,featured_tours_and_tickets
5,16722683,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,KL Hop On Hop Off City Tour,53.07,3.0,featured_tours_and_tickets
6,11454341,Kuah,Langkawi,Malaysia,6.326672,99.843025,Langkawi Archipelago Jet Ski Tour Including Da...,515.06,5.0,featured_tours_and_tickets
7,11812460,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,Private Half-Day Batu Caves Waterfalls and Hot...,377.67,5.0,featured_tours_and_tickets
8,11992275,Kuah,Langkawi,Malaysia,6.326672,99.843025,12 Flights: Zipline Eco Adventure in Langkawi,473.86,5.0,featured_tours_and_tickets
9,11991123,Penang Island,Penang,Malaysia,5.388071,100.29352,Cheong Fatt Tze George Town Penang: The Blue M...,20.61,4.5,featured_tours_and_tickets


In [12]:
# create table or view from spark dataframe
final_attr_spark_df.createOrReplaceTempView('final_attr_spark_df')

## Processing city

In [13]:
# create spark user defined function
clean_string_udf = functions.udf(lambda x: '_'.join(x.split(' ')).lower(), StringType())

In [14]:
# final_attr_spark_df = final_attr_spark_df.withColumn(
#     'city', clean_string_udf(final_attr_spark_df.city))

## Processing rating

In [15]:
# show null row from spark dataframe basec on specific column
final_attr_spark_df.where(final_attr_spark_df.rating.isNull()).show()
final_attr_spark_df.count()

+----------+----+-----+-------+--------+---------+----+-----+------+--------+
|activityId|city|state|country|latitude|longitude|name|price|rating|category|
+----------+----+-----+-------+--------+---------+----+-----+------+--------+
+----------+----+-----+-------+--------+---------+----+-----+------+--------+



4257

In [16]:
# set a constant or literal value to null fields (for showing, if needed update as required)
temp_spark_df = final_attr_spark_df.where(
    final_attr_spark_df.rating.isNull()).withColumn("rating",
                                                    functions.lit(-1))

final_attr_spark_df = final_attr_spark_df.where(
    final_attr_spark_df.rating.isNotNull()).union(temp_spark_df)

final_attr_spark_df.count()

4257

###### Recalculate for those field with -1 rating value based on category

In [17]:
attr_rating_spark_df = final_attr_spark_df

In [18]:
avg_rat_spark_df = spark.sql(
    "SELECT category, AVG(rating) as avg_rating FROM final_attr_spark_df WHERE rating != -1 GROUP BY category"
)
avg_rat_spark_df.show()

+--------------------+------------------+
|            category|        avg_rating|
+--------------------+------------------+
|air_helicopter_ba...|               5.0|
|   classes_workshops|              3.75|
|   tours_sightseeing| 4.103448275862069|
|featured_tours_an...| 4.483333333333333|
|walking_biking_tours| 4.688679245283019|
|        water_sports|              4.13|
| food_wine_nightlife| 4.208333333333333|
|    shopping_fashion|               4.5|
|day_trips_excursions|              4.22|
|transfers_ground_...|4.5523255813953485|
|shows_concerts_sp...|3.3333333333333335|
|  outdoor_activities| 4.443661971830986|
|   local_experiences|            4.6875|
|         theme_parks| 3.526315789473684|
|cruises_sailing_w...|              3.75|
|luxury_special_oc...| 4.141025641025641|
|    shore_excursions| 4.090909090909091|
|     family_friendly|3.8275862068965516|
|multiday_extended...|             4.375|
|sightseeing_ticke...| 4.038461538461538|
+--------------------+------------

In [19]:
def roundTo5(x, base=.5):
    return float(round(x / base) * base)
round_to_5_udf = functions.udf(lambda x: roundTo5(x), DoubleType())

In [20]:
avg_rat_spark_df = avg_rat_spark_df.withColumn(
    'updated_rating', round_to_5_udf(avg_rat_spark_df.avg_rating))
avg_rat_spark_df.show()

+--------------------+------------------+--------------+
|            category|        avg_rating|updated_rating|
+--------------------+------------------+--------------+
|air_helicopter_ba...|               5.0|           5.0|
|   classes_workshops|              3.75|           4.0|
|   tours_sightseeing| 4.103448275862069|           4.0|
|featured_tours_an...| 4.483333333333333|           4.5|
|walking_biking_tours| 4.688679245283019|           4.5|
|        water_sports|              4.13|           4.0|
| food_wine_nightlife| 4.208333333333333|           4.0|
|    shopping_fashion|               4.5|           4.5|
|day_trips_excursions|              4.22|           4.0|
|transfers_ground_...|4.5523255813953485|           4.5|
|shows_concerts_sp...|3.3333333333333335|           3.5|
|  outdoor_activities| 4.443661971830986|           4.5|
|   local_experiences|            4.6875|           4.5|
|         theme_parks| 3.526315789473684|           3.5|
|cruises_sailing_w...|         

In [21]:
attr_rating_spark_df = attr_rating_spark_df.join(avg_rat_spark_df,
                                                 ['category'], 'left')
attr_rating_spark_df.count()

4257

In [22]:
attr_rating_spark_df = attr_rating_spark_df.withColumn(
    "rating",
    functions.when(attr_rating_spark_df["rating"] == -1,
                   attr_rating_spark_df["updated_rating"]).otherwise(
                       attr_rating_spark_df["rating"])).drop(
                           'updated_rating', 'avg_rating')

In [23]:
# show null row from spark dataframe basec on specific column
attr_rating_spark_df.count()
display(attr_rating_spark_df.where(attr_rating_spark_df.rating.isNull()).toPandas())

Unnamed: 0,category,activityId,city,state,country,latitude,longitude,name,price,rating
0,weddings_honeymoons,19530324,Kuala Lumpur,Wilayah Persekutuan,Malaysia,3.156802,101.69717,Things to do in Kuala Lumpur - Book a photoshoot,593.43,
1,weddings_honeymoons,19267760,Langkawi,Langkawi District,Malaysia,6.351074,99.75824,Private Explorer Boat - Langkawi Geopark Islan...,679.97,
2,weddings_honeymoons,19267759,Langkawi,Langkawi District,Malaysia,6.351074,99.75824,Private Day Cruise,4945.27,
3,weddings_honeymoons,15342175,Port Klang,Selangor,Malaysia,2.999852,101.39283,3 Nights 4 Days Honeymoon in Malaysia,618.16,
4,weddings_honeymoons,17335319,Batu Ferringhi,Penang Island,Malaysia,5.471783,100.24629,Beach Wedding,24726.36,


In [24]:
attr_rating_spark_df.createOrReplaceTempView('final_attr_spark_df')

###### Recalculate for those field with NaN rating value based on city

In [25]:
avg_rat_spark_df = spark.sql(
    "SELECT city, AVG(rating) as avg_rating FROM final_attr_spark_df WHERE rating is not NULL GROUP BY city"
)
avg_rat_spark_df.show()

+-------------+-----------------+
|         city|       avg_rating|
+-------------+-----------------+
|  Kuala Tahan|              4.5|
|     Bongawan|              5.0|
|   Georgetown|              4.5|
|       Sepang|4.468421052631579|
|       Ampang|              2.5|
|    Cherating|            4.425|
|  Kuala Kurau|              4.0|
|    Singapore|4.409090909090909|
|     Malaysia|4.323529411764706|
|   Katunayake|              4.0|
|      Salalah|              4.0|
|      Beijing|             4.75|
|       Melaka|4.271028037383178|
|    Tamparuli|              4.5|
|Petaling Jaya|         4.390625|
|  George Town| 4.27536231884058|
|  Kota Tinggi|              4.0|
|    Penampang|              4.0|
|      Sematan|              4.0|
|  Bayan Lepas|              4.6|
+-------------+-----------------+
only showing top 20 rows



In [26]:
def roundTo5(x, base=.5):
    return float(round(x / base) * base)
round_to_5_udf = functions.udf(lambda x: roundTo5(x), DoubleType())

In [27]:
avg_rat_spark_df = avg_rat_spark_df.withColumn(
    'updated_rating', round_to_5_udf(avg_rat_spark_df.avg_rating))
avg_rat_spark_df.show()

+-------------+-----------------+--------------+
|         city|       avg_rating|updated_rating|
+-------------+-----------------+--------------+
|  Kuala Tahan|              4.5|           4.5|
|     Bongawan|              5.0|           5.0|
|   Georgetown|              4.5|           4.5|
|       Sepang|4.468421052631579|           4.5|
|       Ampang|              2.5|           2.5|
|    Cherating|            4.425|           4.5|
|  Kuala Kurau|              4.0|           4.0|
|    Singapore|4.409090909090909|           4.5|
|     Malaysia|4.323529411764706|           4.5|
|   Katunayake|              4.0|           4.0|
|      Salalah|              4.0|           4.0|
|      Beijing|             4.75|           5.0|
|       Melaka|4.271028037383178|           4.5|
|    Tamparuli|              4.5|           4.5|
|Petaling Jaya|         4.390625|           4.5|
|  George Town| 4.27536231884058|           4.5|
|  Kota Tinggi|              4.0|           4.0|
|    Penampang|     

In [28]:
attr_rating_spark_df = attr_rating_spark_df.join(avg_rat_spark_df, ['city'],
                                                 'left')
attr_rating_spark_df.count()

4257

In [29]:
attr_rating_spark_df = attr_rating_spark_df.withColumn(
    "rating",
    functions.when(attr_rating_spark_df.rating.isNull(),
                   attr_rating_spark_df["updated_rating"]).otherwise(
                       attr_rating_spark_df["rating"])).drop(
                           'updated_rating', 'avg_rating')

In [30]:
# show null row from spark dataframe basec on specific column
attr_rating_spark_df.count()
display(attr_rating_spark_df.where(attr_rating_spark_df.rating.isNull()).toPandas())

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating


In [31]:
display(attr_rating_spark_df.toPandas())

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating
0,Bongawan,cruises_sailing_water_tours,17683780,Sabah,Malaysia,5.507984,115.87458,Sky-Mirror Photographing and Proboscis Monkey...,164.84,5.0
1,Georgetown,sightseeing_tickets_passes,12990313,Demerara-Mahaica,Guyana,6.8,-58.1667,Colonial Penang Museum Admission Ticket,29.26,4.5
2,Kuala Tahan,multiday_extended_tours,14911433,Jerantut District,Malaysia,4.38195,102.40177,"6d 5n Tour Paradise Perhentian, Redang Island ...",3877.92,4.5
3,Kuala Tahan,transfers_ground_transport,19253545,Jerantut District,Malaysia,4.38195,102.40177,Chauffeur: Taman Negara to Singapore Transfer ...,489.86,4.5
4,Kuala Tahan,transfers_ground_transport,19253546,Jerantut District,Malaysia,4.38195,102.40177,Chauffeur: Taman Negara to Singapore Transfer ...,1524.04,4.5
5,Sepang,multiday_extended_tours,18910585,Sepang District,Malaysia,2.69093,101.75114,Luxurious Package to Malaysia with cheap price,4116.94,4.5
6,Sepang,multiday_extended_tours,17404431,Sepang District,Malaysia,2.69093,101.75114,A Passage Through Time,7195.37,4.5
7,Sepang,transfers_ground_transport,15340261,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport Arrival Transfer,71.29,4.5
8,Sepang,transfers_ground_transport,19281537,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport To Kuala Lumpur City Hotels,56.66,4.5
9,Sepang,transfers_ground_transport,19709350,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport To Malacca City Hotels EN...,185.45,4.5


In [32]:
attr_rating_spark_df.createOrReplaceTempView('final_attr_spark_df')
final_attr_spark_df = attr_rating_spark_df

## Processing price

In [33]:
attr_price_spark_df = final_attr_spark_df

In [34]:
avg_price_spark_df = spark.sql(
    "SELECT city, AVG(price) as avg_price FROM final_attr_spark_df GROUP BY city"
)
avg_price_spark_df.show()

+-------------+------------------+
|         city|         avg_price|
+-------------+------------------+
|     Bongawan|            164.84|
|   Georgetown|             29.26|
|  Kuala Tahan|1963.9399999999998|
|       Sepang| 625.9777894736839|
|       Ampang|214.29500000000002|
|    Cherating| 379.7414999999999|
|  Kuala Kurau|            278.17|
|     Malaysia| 768.8152941176468|
|    Singapore|1902.5433333333333|
|   Katunayake|            280.23|
|      Beijing|            226.66|
|      Salalah|             214.3|
|       Melaka| 354.8621495327104|
|    Tamparuli|            185.45|
|Petaling Jaya| 265.3947500000002|
|  George Town| 328.2397101449276|
|  Kota Tinggi|             24.73|
|    Penampang|            232.84|
|      Sematan|           2357.25|
|  Bayan Lepas|206.87600000000003|
+-------------+------------------+
only showing top 20 rows



In [35]:
round_price_udf = functions.udf(lambda x: round(x, 2), DoubleType())

In [36]:
avg_price_spark_df = avg_price_spark_df.withColumn(
    'updated_price', round_price_udf(avg_price_spark_df["avg_price"]))
avg_price_spark_df.show()

+-------------+------------------+-------------+
|         city|         avg_price|updated_price|
+-------------+------------------+-------------+
|     Bongawan|            164.84|       164.84|
|   Georgetown|             29.26|        29.26|
|  Kuala Tahan|1963.9399999999998|      1963.94|
|       Sepang| 625.9777894736839|       625.98|
|       Ampang|214.29500000000002|        214.3|
|    Cherating| 379.7414999999999|       379.74|
|  Kuala Kurau|            278.17|       278.17|
|     Malaysia| 768.8152941176468|       768.82|
|    Singapore|1902.5433333333333|      1902.54|
|   Katunayake|            280.23|       280.23|
|      Beijing|            226.66|       226.66|
|      Salalah|             214.3|        214.3|
|       Melaka| 354.8621495327104|       354.86|
|    Tamparuli|            185.45|       185.45|
|Petaling Jaya| 265.3947500000002|       265.39|
|  George Town| 328.2397101449276|       328.24|
|  Kota Tinggi|             24.73|        24.73|
|    Penampang|     

In [37]:
attr_price_spark_df = attr_price_spark_df.join(avg_price_spark_df,
                                               ['city'],
                                               'left_outer')
attr_price_spark_df.count()

4257

In [38]:
attr_price_spark_df = attr_price_spark_df.withColumn(
    "price",
    functions.when(attr_price_spark_df["price"] == -1,
                   attr_price_spark_df["updated_price"]).otherwise(
                       attr_price_spark_df["price"])).drop(
                           'updated_price', 'avg_price')

In [39]:
# show null row from spark dataframe basec on specific column
attr_price_spark_df.where(attr_price_spark_df.price.isNull()).show()
attr_price_spark_df.count()

+----+--------+----------+-----+-------+--------+---------+----+-----+------+
|city|category|activityId|state|country|latitude|longitude|name|price|rating|
+----+--------+----------+-----+-------+--------+---------+----+-----+------+
+----+--------+----------+-----+-------+--------+---------+----+-----+------+



4257

In [40]:
display(attr_price_spark_df.toPandas())

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating
0,Bongawan,cruises_sailing_water_tours,17683780,Sabah,Malaysia,5.507984,115.87458,Sky-Mirror Photographing and Proboscis Monkey...,164.84,5.0
1,Georgetown,sightseeing_tickets_passes,12990313,Demerara-Mahaica,Guyana,6.8,-58.1667,Colonial Penang Museum Admission Ticket,29.26,4.5
2,Kuala Tahan,multiday_extended_tours,14911433,Jerantut District,Malaysia,4.38195,102.40177,"6d 5n Tour Paradise Perhentian, Redang Island ...",3877.92,4.5
3,Kuala Tahan,transfers_ground_transport,19253545,Jerantut District,Malaysia,4.38195,102.40177,Chauffeur: Taman Negara to Singapore Transfer ...,489.86,4.5
4,Kuala Tahan,transfers_ground_transport,19253546,Jerantut District,Malaysia,4.38195,102.40177,Chauffeur: Taman Negara to Singapore Transfer ...,1524.04,4.5
5,Sepang,multiday_extended_tours,18910585,Sepang District,Malaysia,2.69093,101.75114,Luxurious Package to Malaysia with cheap price,4116.94,4.5
6,Sepang,multiday_extended_tours,17404431,Sepang District,Malaysia,2.69093,101.75114,A Passage Through Time,7195.37,4.5
7,Sepang,transfers_ground_transport,15340261,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport Arrival Transfer,71.29,4.5
8,Sepang,transfers_ground_transport,19281537,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport To Kuala Lumpur City Hotels,56.66,4.5
9,Sepang,transfers_ground_transport,19709350,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport To Malacca City Hotels EN...,185.45,4.5


In [41]:
attr_price_spark_df.createOrReplaceTempView('final_attr_spark_df')
final_attr_spark_df = attr_price_spark_df

## Processing geo coordinates

In [63]:
attr_geo_spark_df = final_attr_spark_df

In [64]:
avg_geo_spark_df = spark.sql(
    "SELECT city, AVG(latitude) as avg_lat, AVG(longitude) as avg_lng FROM final_attr_spark_df GROUP BY city"
)
avg_geo_spark_df.show()

+-------------+------------------+------------------+
|         city|           avg_lat|           avg_lng|
+-------------+------------------+------------------+
|     Bongawan|          5.507984|         115.87458|
|   Georgetown|               6.8|          -58.1667|
|  Kuala Tahan|           4.38195|         102.40177|
|       Sepang|2.6909300000000034|101.75114000000015|
|       Ampang|          3.156205|         101.75031|
|    Cherating|          4.133792|103.38025999999995|
|  Kuala Kurau|          5.018007|         100.43176|
|     Malaysia| 3.480286999999999|102.47243000000002|
|    Singapore|1.2858009999999995|103.85111000000005|
|   Katunayake|           7.15966|          79.87023|
|      Beijing|         39.909336|         116.39452|
|      Salalah|          17.01493|          54.09302|
|       Melaka|2.2068470000000033|102.25326999999989|
|    Tamparuli|           6.13333|         116.26667|
|Petaling Jaya|3.1036300000000066| 101.6477799999997|
|  George Town| 5.4119380000

In [65]:
attr_geo_spark_df = attr_geo_spark_df.join(avg_geo_spark_df, ['city'],
                                           'left_outer')
attr_geo_spark_df.count()

4257

In [66]:
attr_geo_spark_df = attr_geo_spark_df.withColumn(
    'latitude',
    functions.when(attr_geo_spark_df['latitude'].isNull(),
                   attr_geo_spark_df['avg_lat']).otherwise(
                       attr_geo_spark_df['latitude'])).withColumn(
                           'longitude',
                           functions.when(
                               attr_geo_spark_df['longitude'].isNull(),
                               attr_geo_spark_df['avg_lng']).otherwise(
                                   attr_geo_spark_df['longitude'])).drop(
                                       attr_geo_spark_df['avg_lat']).drop(
                                           attr_geo_spark_df['avg_lng'])

In [67]:
# show null row from spark dataframe basec on specific column
display(
    attr_geo_spark_df.where(attr_geo_spark_df.latitude.isNull()).toPandas())
display(
    attr_geo_spark_df.where(attr_geo_spark_df.longitude.isNull()).toPandas())
attr_geo_spark_df.count()

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating
0,Pulau Tioman,shore_excursions,19709623,Rompin District,Malaysia,,,Scuba in Malaysia,144.24,4.0
1,Pulau Tioman,water_sports,19773031,Rompin District,Malaysia,,,Scuba Diving Course Open Water,1203.35,4.0


Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating
0,Pulau Tioman,shore_excursions,19709623,Rompin District,Malaysia,,,Scuba in Malaysia,144.24,4.0
1,Pulau Tioman,water_sports,19773031,Rompin District,Malaysia,,,Scuba Diving Course Open Water,1203.35,4.0


4257

###### Get the geocode for empty latitude and longitude fields

In [68]:
import googlemaps
import os

In [69]:
def get_geocode(place):
    gmaps = googlemaps.Client(key=os.environ['GOOGLE_API_KEY'])
    place = gmaps.places(place)
    geocode = {
        'lat': place["results"][0]["geometry"]["location"]["lat"],
        'lng': place["results"][0]["geometry"]["location"]["lng"]
    }
    return geocode

In [70]:
get_lat_udf = functions.udf(lambda x: get_geocode(x)['lat'], StringType())
get_lng_udf = functions.udf(lambda x: get_geocode(x)['lng'], StringType())

In [71]:
temp_geo_spark_df = attr_geo_spark_df.where(attr_geo_spark_df.latitude.isNull())
temp_geo_spark_df.count()
display(temp_geo_spark_df.toPandas())

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating
0,Pulau Tioman,shore_excursions,19709623,Rompin District,Malaysia,,,Scuba in Malaysia,144.24,4.0
1,Pulau Tioman,water_sports,19773031,Rompin District,Malaysia,,,Scuba Diving Course Open Water,1203.35,4.0


In [72]:
temp_geo_spark_df = temp_geo_spark_df.where(
    temp_geo_spark_df.latitude.isNull()).withColumn(
        "latitude", get_lat_udf(temp_geo_spark_df.city))

temp_geo_spark_df = temp_geo_spark_df.where(
    temp_geo_spark_df.longitude.isNull()).withColumn(
        "longitude", get_lng_udf(temp_geo_spark_df.city))

In [73]:
temp_geo_spark_df = temp_geo_spark_df.toPandas()
temp_geo_spark_df.head()

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating
0,Pulau Tioman,shore_excursions,19709623,Rompin District,Malaysia,2.7902494,104.1698463,Scuba in Malaysia,144.24,4.0
1,Pulau Tioman,water_sports,19773031,Rompin District,Malaysia,2.7902494,104.1698463,Scuba Diving Course Open Water,1203.35,4.0


In [75]:
temp_geo_spark_df = spark.createDataFrame(temp_geo_spark_df)

In [76]:
attr_geo_spark_df = attr_geo_spark_df.where(attr_geo_spark_df.latitude.isNotNull()).union(temp_geo_spark_df)
attr_geo_spark_df.count()

4257

In [77]:
# show null row from spark dataframe basec on specific column
display(
    attr_geo_spark_df.where(attr_geo_spark_df.latitude.isNull()).toPandas())
display(
    attr_geo_spark_df.where(attr_geo_spark_df.longitude.isNull()).toPandas())
attr_geo_spark_df.count()

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating


Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating


4257

In [78]:
display(attr_geo_spark_df.toPandas())

Unnamed: 0,city,category,activityId,state,country,latitude,longitude,name,price,rating
0,Bongawan,cruises_sailing_water_tours,17683780,Sabah,Malaysia,5.507984,115.87458,Sky-Mirror Photographing and Proboscis Monkey...,164.84,5.0
1,Georgetown,sightseeing_tickets_passes,12990313,Demerara-Mahaica,Guyana,6.8,-58.1667,Colonial Penang Museum Admission Ticket,29.26,4.5
2,Kuala Tahan,multiday_extended_tours,14911433,Jerantut District,Malaysia,4.38195,102.40177,"6d 5n Tour Paradise Perhentian, Redang Island ...",3877.92,4.5
3,Kuala Tahan,transfers_ground_transport,19253545,Jerantut District,Malaysia,4.38195,102.40177,Chauffeur: Taman Negara to Singapore Transfer ...,489.86,4.5
4,Kuala Tahan,transfers_ground_transport,19253546,Jerantut District,Malaysia,4.38195,102.40177,Chauffeur: Taman Negara to Singapore Transfer ...,1524.04,4.5
5,Sepang,multiday_extended_tours,18910585,Sepang District,Malaysia,2.69093,101.75114,Luxurious Package to Malaysia with cheap price,4116.94,4.5
6,Sepang,multiday_extended_tours,17404431,Sepang District,Malaysia,2.69093,101.75114,A Passage Through Time,7195.37,4.5
7,Sepang,transfers_ground_transport,15340261,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport Arrival Transfer,71.29,4.5
8,Sepang,transfers_ground_transport,19281537,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport To Kuala Lumpur City Hotels,56.66,4.5
9,Sepang,transfers_ground_transport,19709350,Sepang District,Malaysia,2.69093,101.75114,Kuala Lumpur Airport To Malacca City Hotels EN...,185.45,4.5


In [79]:
attr_geo_spark_df.createOrReplaceTempView('final_attr_spark_df')
final_attr_spark_df = attr_geo_spark_df

## Saving Attraction Details

In [80]:
final_attr_spark_df.coalesce(8).write.parquet(parquet_path + 'etl/attractions',
                                              mode='overwrite')

In [81]:
final_attr_spark_df.coalesce(1).write.format('json').save(
    parquet_path + 'etl/attractions/json', mode='overwrite')

# Attraction Review

In [5]:
# define path
tripad_attr_path = ROOT_DIR + '/crawler/datasets/tripadvisor_dataset/attractions/'
parquet_path = ROOT_DIR + '/crawler/datasets/tripadvisor_dataset/spark/'

In [6]:
# read spark dataframe from parquet
attr_review_spark_df = spark.read.parquet(parquet_path + 'tripad_attr_review').repartition(160)
attr_review_spark_df.count()

7146

In [7]:
attr_review_spark_df = attr_review_spark_df.select(
    attr_review_spark_df.activityId.cast("string"), attr_review_spark_df.rating,
    attr_review_spark_df.text.alias('review'),
    attr_review_spark_df.publishedDate.alias('review_date'),
    attr_review_spark_df.username.alias('user'), attr_review_spark_df.userId)

In [8]:
attr_review_df = attr_review_spark_df.toPandas()
attr_review_df['user_id'] = attr_review_df.index

In [9]:
display(attr_review_df)

Unnamed: 0,activityId,rating,review,review_date,user,userId,user_id
0,11449677,4,Very good Bus tour with a great guide/speaker ...,2018-02-12,Vincent W,BB0401B4A7E786BCB80F6A34F9CA7CD5,0
1,12880465,5,This was our second visit to KL and we had tic...,2018-10-19,DarrenCBristol,37060EBD1EB824EBB6E8E70215D27C70,1
2,16722683,5,Cheap way to see main highlights of KL. Hop on...,2016-10-27,BudiMohdYunus,08C1EDB33313D195676C1334F800272C,2
3,12468919,5,"Zoo is not my usual favourite, but I must say ...",2018-09-29,amyrosylily,7048960540A76440C6BCA16D4B0D649B,3
4,12650625,5,It was an enjoyable ride with great scenary wi...,2019-06-07,LakeGardenEcoRide,27721A880DA255F34D797EDAF03D28E5,4
5,11454341,5,We booked the jet skiing experience the day be...,2018-02-10,alic1988,3D56FCDC833D0F6E4CE6410A56964CB9,5
6,16722683,5,We went in the hop on off bus on a very hot da...,2016-09-29,Bubbles W,1BC5DBF5897A853A5A3A7F06B1D60B44,6
7,16722683,2,Having travelled on a number of Hop on Hop off...,2015-05-18,rhondda1,07972311330CF222252B246D4E8DE087,7
8,12943485,5,Its was very challenging and fun! The staff ar...,2019-10-08,njlaaaa,08538AF0DDAE4466DA16EBC29ABC32F0,8
9,17404880,4,Had a lot of fun even for first timer ! Suitab...,2019-10-07,Zettyzulkifli,89EC1E138DAEBFF018D741F86CD84260,9


In [10]:
attr_review_spark_df = spark.createDataFrame(attr_review_df)

In [11]:
attr_review_spark_df.createOrReplaceTempView('attr_review_spark_df')

In [12]:
user_rev_count = spark.sql("SELECT user, COUNT(*) as rev_count FROM attr_review_spark_df GROUP BY user ORDER BY rev_count DESC")
user_rev_count.count()

6684

In [13]:
print("Reviews are available for {attr_count} attractions.".format(
    attr_count=len(
        attr_review_spark_df.select('activityId').distinct().collect())))
print(
    "Matrix will be highly sparse as the maximum number of reviews provided by an user is {val}."
    .format(val=user_rev_count.select('rev_count').limit(1).collect()[0][0]))

Reviews are available for 696 attractions.
Matrix will be highly sparse as the maximum number of reviews provided by an user is 12.


## Saving Attraction Reviews

In [14]:
attr_review_spark_df.coalesce(8).write.parquet(parquet_path +
                                               'etl/attraction_reviews',
                                               mode='overwrite')

In [15]:
attr_review_spark_df.coalesce(1).write.format('json').save(
    parquet_path + 'etl/attraction_reviews/json', mode='overwrite')