In [1]:
# import project directory helper
import os, sys, inspect
currentDir = os.getcwd()
parentDir = os.path.dirname(currentDir)
sys.path.insert(0, parentDir)

In [2]:
# import pyspark packages
from pyspark import SparkContext as sc
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession, functions, types
from pyspark.sql.types import *

In [3]:
# import util packages
from IPython.display import display
import pandas as pd

In [4]:
# sc = sc(appName="attraction")
# sqlContext = SQLContext(sc)

spark = SparkSession.builder.appName("attraction").getOrCreate()

In [123]:
tripadvisor_attr_href_cat = parentDir + '/crawler/json/tripadvisor_attr_href_cat.json'

attr_href_cat_df = pd.read_json(tripadvisor_attr_href_cat, orient='records')

# Get activityId from dataframe
attr_href_cat_df['activityId'] = attr_href_cat_df['href'].str.extract(
    r'd(\d+)', expand=True)
attr_href_cat_df['locationId'] = attr_href_cat_df['href'].str.extract(
    r'g(\d+)', expand=True)
attr_href_cat_df_cl = ['activityId', 'locationId', 'href',
                       'category']  # columns order
attr_href_cat_df = attr_href_cat_df.reindex(columns=attr_href_cat_df_cl)
attr_href_cat_df.head()

Unnamed: 0,activityId,locationId,href,category
0,12962337,298570,/AttractionProductReview-g298570-d12962337-Mal...,"[featured_tours_and_tickets, luxury_special_oc..."
1,12905660,298570,/AttractionProductReview-g298570-d12905660-Pet...,"[featured_tours_and_tickets, sightseeing_ticke..."
2,11807013,298570,/AttractionProductReview-g298570-d11807013-Dis...,"[featured_tours_and_tickets, private_custom_to..."
3,12471375,298570,/AttractionProductReview-g298570-d12471375-Gen...,"[featured_tours_and_tickets, luxury_special_oc..."
4,12469580,298570,/AttractionProductReview-g298570-d12469580-Hal...,"[featured_tours_and_tickets, luxury_special_oc..."


In [124]:
# showing and removing duplicate value
print('Duplicated values:')
print(attr_href_cat_df[attr_href_cat_df['activityId'].duplicated(keep=False)])
print()
print('Length before removing duplicated values: ' +
      str(len(attr_href_cat_df)))

# # dropping duplicate values
attr_href_cat_df = attr_href_cat_df.drop_duplicates(subset='activityId',
                                                    keep=False)
print('Length after removing duplicated values: ' + str(len(attr_href_cat_df)))

# dropping nan values
attr_href_cat_df = attr_href_cat_df.dropna()
print('Length after removing NaN values: ' + str(len(attr_href_cat_df)))

Duplicated values:
     activityId locationId                                               href  \
375         NaN        NaN  /AttractionProductReview?partner=Viator&produc...   
1338        NaN        NaN  /AttractionProductReview?partner=Viator&produc...   
1944        NaN        NaN  /AttractionProductReview?partner=Viator&produc...   

                                       category  
375    [classes_workshops, food_wine_nightlife]  
1338                       [outdoor_activities]  
1944  [private_custom_tours, tours_sightseeing]  

Length before removing duplicated values: 4260
Length after removing duplicated values: 4257
Length after removing NaN values: 4257


In [125]:
attr_href_cat_df_spark = spark.createDataFrame(attr_href_cat_df)
attr_href_cat_df_spark.show()

+----------+----------+--------------------+--------------------+
|activityId|locationId|                href|            category|
+----------+----------+--------------------+--------------------+
|  12962337|    298570|/AttractionProduc...|[featured_tours_a...|
|  12905660|    298570|/AttractionProduc...|[featured_tours_a...|
|  11807013|    298570|/AttractionProduc...|[featured_tours_a...|
|  12471375|    298570|/AttractionProduc...|[featured_tours_a...|
|  12469580|    298570|/AttractionProduc...|[featured_tours_a...|
|  16722683|    298570|/AttractionProduc...|[featured_tours_a...|
|  11454341|   1096277|/AttractionProduc...|[featured_tours_a...|
|  11812460|    298570|/AttractionProduc...|[featured_tours_a...|
|  11992275|   1096277|/AttractionProduc...|[featured_tours_a...|
|  11991123|    660694|/AttractionProduc...|[featured_tours_a...|
|  11482502|    298570|/AttractionProduc...|[featured_tours_a...|
|  19350513|    635527|/AttractionProduc...|[featured_tours_a...|
|  1147497

In [126]:
tripad_location = parentDir + '/crawler/datasets/tripadvisor_dataset/tripad_location.json'
tripad_location_df_spark = spark.read.json(tripad_location)

In [129]:
tripad_location_df_spark.select(
    tripad_location_df_spark.location_id,
    tripad_location_df_spark.name,
    tripad_location_df_spark.latitude,
    tripad_location_df_spark.longitude).show()

+-----------+--------------------+--------+---------+
|location_id|                name|latitude|longitude|
+-----------+--------------------+--------+---------+
|     298570|        Kuala Lumpur|3.156802|101.69717|
|     298307|       Kota Kinabalu|5.979383|116.07349|
|     660694|       Penang Island|5.388071|100.29352|
|    1096277|                Kuah|6.326672|99.843025|
|     635527|      Batu Ferringhi|5.471783|100.24629|
|     298303|         George Town|5.411938|100.32664|
|     298313|       Petaling Jaya| 3.10363|101.64778|
|     298306|               Sabah|5.176992|117.13721|
|     298277|               Johor|1.889306|103.42438|
|     298291|              Pahang| 3.62074|102.59239|
|     298308|             Sarawak|2.310777|112.50697|
|     293938| Bandar Seri Begawan|  4.8919|114.94044|
|     298298|                Ipoh| 4.59725|101.12015|
|     303997|            Sandakan|5.838944|118.11561|
|     294265|           Singapore|1.285801|103.85111|
|     298283|            Lan

In [112]:
tripad_activity = parentDir + '/crawler/datasets/tripadvisor_dataset/tripad_activity.json'
tripad_activity_df_spark = spark.read.json(tripad_activity)

In [None]:
tripad_activity_df_spark

In [109]:
tripad_activity_df_spark = tripad_activity_df_spark.drop(
    'productCarousel', 'highlights', 'whatToExpect', 'importantInformation',
    'additionalInfo', 'cancellationPolicy',
    'obfuscatedViatorCommerceLink', '__COMMENT')

In [118]:
tripad_activity_df_spark.select(
    tripad_activity_df_spark.productHeader.activityId,
    tripad_activity_df_spark.productHeader.productTitle,
    tripad_activity_df_spark.productHeader.reviewRating,
    tripad_activity_df_spark.tourPlanner.numericPrice).show()

+------------------------+--------------------------+--------------------------+------------------------+
|productHeader.activityId|productHeader.productTitle|productHeader.reviewRating|tourPlanner.numericPrice|
+------------------------+--------------------------+--------------------------+------------------------+
|                12962337|      Malaysia Countrys...|                       4.5|                   41.36|
|                12905660|      Petronas Twin Tow...|                       4.0|                  119.96|
|                11807013|      Discover the Best...|                       4.5|                  142.04|
|                12471375|      Genting Highlands...|                       4.5|                  103.41|
|                12469580|      Half-Day Kuala Lu...|                       4.0|                   43.43|
|                16722683|      KL Hop On Hop Off...|                       3.0|                   53.28|
|                11476707|      Tunku Abdul Ra

In [71]:
tripad_activity_df_spark.productHeader.productId

Column<b'productHeader[productId]'>

In [54]:
tripad_activity_df_spark

DataFrame[additionalInfo: string, cancellationPolicy: string, highlights: array<struct<encodedValue:string,originalValue:string>>, importantInformation: struct<departurePoint:string,departureTime:string,duration:string,exclusions:string,inclusions:string,returnDetails:string>, obfuscatedViatorCommerceLink: string, overview: struct<amenities:struct<additionalLanguages:string,additionalLanguagesCount:bigint,duration:string,freeCancellation:string,hotelPickup:string,instantConfirmation:string,locale:string,primaryLanguages:string,supplierName:string,supplierUrl:string,supplierUrlIsObfuscated:boolean,voucher:string>,locale:string,productText:string>, productCarousel: struct<isTravelersChoice:boolean,mediaList:array<struct<mediaSource:string,mediaType:string,mediaUrl:string,title:string>>>, productItinerary: struct<endsAtStartPoint:boolean,pickupAdditionalInfo:string,pickupTypeEnum:string,productItineraryItems:array<struct<admissionInclusionTypeEnum:string,description:string,durationInMinut

In [32]:
tripad_activity_df_spark.count()

4122

In [33]:
tripad_activity_df_spark.write.parquet(parentDir + '/crawler/datasets/tripadvisor_dataset/spark/tripad_act',mode='overwrite')

In [35]:
attr_act_parquet_path = parentDir + '/crawler/datasets/tripadvisor_dataset/spark/tripad_act'

attr_act_df = spark.read.parquet(attr_act_parquet_path)
attr_act_df.createOrReplaceTempView('attr_act_df')

In [37]:
attr_act_df.count()

4122