In [1]:
# import project directory helper
import os, sys
# Project root. Defaults to the original development location but can be
# overridden with the FYP_ROOT_DIR environment variable so the notebook also
# runs on machines where the project lives elsewhere.
ROOT_DIR = os.path.abspath(
    os.environ.get('FYP_ROOT_DIR', '/home/hduser/document/jupyter/FYP/'))
# Keep the project root first on the import path, but only once: re-running
# this cell must not stack duplicate entries on sys.path.
if ROOT_DIR in sys.path:
    sys.path.remove(ROOT_DIR)
sys.path.insert(0, ROOT_DIR)

In [2]:
# import util packages
import html
import math
import re

import ipywidgets as w
import pandas as pd
from IPython.display import display

In [3]:
# import pyspark packages
# set the kafka dependencies before create spark context or session
# import os
# os.environ[
#     'PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.4.4,org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.4 pyspark-shell'
from pyspark.sql import SparkSession
# from pyspark.sql.types import *

In [4]:
# Obtain the Spark session for this notebook under the application name
# 'attraction', then echo it as the cell output.
spark = (
    SparkSession.builder
    .appName('attraction')
    .getOrCreate()
)
spark

In [5]:
# Dataset locations under the project root. spark_warehouse_dir keeps its
# trailing slash because a later cell appends a relative path to it.
ds_dir = f'{ROOT_DIR}/crawler/datasets/tripadvisor_dataset/attractions/'
spark_warehouse_dir = (
    f'{ROOT_DIR}/crawler/datasets/tripadvisor_dataset/attractions/spark-warehouse/'
)

In [6]:
# Load the attractions table from the parquet output under
# spark_warehouse_dir ('etl/attractions') as a Spark DataFrame.
final_attr_spark_df = spark.read.parquet(spark_warehouse_dir + 'etl/attractions')

In [7]:
# Register the DataFrame as a temp view so spark.sql() can query it by name.
final_attr_spark_df.createOrReplaceTempView('final_attr_spark_df')

In [8]:
# Lowest and highest attraction price across the whole table (one result row
# with columns min_price and max_price).
attr_price_spark_df = spark.sql(
    "SELECT MIN(price) as min_price, MAX(price) as max_price FROM final_attr_spark_df"
)

In [9]:
# Collect the min/max price result into pandas; the budget slider reads it.
attr_price_df = attr_price_spark_df.toPandas()

In [10]:
# Distinct attraction categories, collected into a pandas DataFrame.
attr_category_df = (
    final_attr_spark_df
    .select('category')
    .distinct()
    .toPandas()
)

In [11]:
# Distinct cities, collected into a pandas DataFrame.
attr_city_df = (
    final_attr_spark_df
    .select('city')
    .distinct()
    .toPandas()
)

# User Details and Preferences

In [12]:
# Widgets collecting the traveller's name, destination, travel dates and budget.
w_uname = w.Text(placeholder='Liho', description="User Name")

# Pre-select Kuala Lumpur when the dataset contains it; otherwise fall back to
# the first available city so the Dropdown does not reject an unknown value.
city_options = attr_city_df.city.tolist()
default_city = 'Kuala Lumpur'
if default_city not in city_options:
    default_city = city_options[0] if city_options else None
# Dropdown has no `placeholder` trait, so that argument is not passed.
w_destination = w.Dropdown(options=city_options,
                           value=default_city,
                           description="Destination")

# Prices are fractional, while IntRangeSlider works on integers. Round the
# bounds outwards (floor the minimum, ceil the maximum) so the default range
# still covers the cheapest and the most expensive attraction.
budget_min = math.floor(attr_price_df.min_price[0])
budget_max = math.ceil(attr_price_df.max_price[0])
w_budget = w.IntRangeSlider(
    min=budget_min,
    max=budget_max,
    step=10,
    value=[budget_min, budget_max],
    description="Budget")
v1 = w.VBox([w_uname, w_destination])

w_start_date = w.DatePicker(description='Start Date', disabled=False)
w_end_date = w.DatePicker(description='End Date', disabled=False)
v2 = w.VBox([w_start_date, w_end_date])

# Lay the three groups out side by side and render the form.
out = w.HBox([v1, v2, w_budget])
display(out)

HBox(children=(VBox(children=(Text(value='', description='User Name', placeholder='Liho'), Dropdown(descriptio…

In [13]:
# Category identifiers as a plain Python list.
attr_category_list = attr_category_df['category'].tolist()

In [14]:
# Human-readable button labels: underscores become spaces, words are title-cased.
attr_category_title_list = [
    category.replace('_', ' ').title() for category in attr_category_list
]

In [15]:
# Maps a category key (lower-case, underscore-separated) to its rating slider.
w_cat_rating = {}


def btn_cat_rating(btn):
    """Click handler: show a 0-5 rating slider for the clicked category.

    The category key is derived from the button's description (spaces replaced
    by underscores, lower-cased). A category that already has a slider is
    ignored. While fewer than five categories have a slider, a reminder with
    the number still missing is printed.
    """
    label = btn.description
    category_key = label.replace(' ', '_').lower()

    if category_key not in w_cat_rating:
        print(label)
        rating_slider = w.IntSlider(min=0, max=5, step=1, description='Rate')
        display(rating_slider)
        w_cat_rating[category_key] = rating_slider

        remaining = 5 - len(w_cat_rating)
        if remaining > 0:
            print(f"Rate {remaining} more!\n")

In [16]:
# One full-width button per category title, all sharing the same layout.
button_layout = w.Layout(width='100%', height='50px', border="1px solid black")
button_items = [
    w.Button(description=cat, layout=button_layout)
    for cat in attr_category_title_list
]

# Attach the rating handler with a plain loop: on_click() is called purely for
# its side effect, so collecting its results in a list serves no purpose.
for item in button_items:
    item.on_click(btn_cat_rating)

# Arrange the buttons in a four-column grid.
buttons = w.GridBox(button_items,
                    layout=w.Layout(grid_template_columns="repeat(4, 24%)"))

print("Select and rate at least 5 categories and rate them: ")
display(buttons)

Select and rate at least 5 categories and rate them: 


GridBox(children=(Button(description='Air Helicopter Balloon Tours', layout=Layout(border='1px solid black', h…

Theme Parks


IntSlider(value=0, description='Rate', max=5)

Rate 4 more!

Transfers Ground Transport


IntSlider(value=0, description='Rate', max=5)

Rate 3 more!

Outdoor Activities


IntSlider(value=0, description='Rate', max=5)

Rate 2 more!



In [26]:
# Snapshot the current widget state into plain Python values.
uname = w_uname.value.lower().replace(' ', '_')
destination = w_destination.value

# The slider value is a (low, high) pair; both bounds are used as floats.
budget_low, budget_high = map(float, w_budget.value)

start_date, end_date = w_start_date.value, w_end_date.value

# Category key -> rating currently set on that category's slider.
cat_rating = {
    category: float(slider.value)
    for category, slider in w_cat_rating.items()
}

# Model Recommendation and Filtering

In [95]:
from attraction_recc import *

In [96]:
# Settings handed to get_recc() for this (small, fast) run.
hyperparameter = dict(
    rows=5000,
    epochs=20,
    batch_size=8,
    alpha=0.01,
    H=128,
)
# Heavier alternative configuration, kept for reference:
# hyperparameter = dict(
#     rows=40000,
#     epochs=50,
#     batch_size=16,
#     alpha=0.01,
#     H=128,
# )

In [97]:
# Run the recommender with the user's category ratings and the settings above.
# get_recc is not defined in this notebook (presumably provided by the
# attraction_recc star import — verify); it returns three values unpacked here.
filename, user, rbm_att = get_recc(spark, cat_rating, hyperparameter)

Reading the data from /home/hduser/document/jupyter/FYP/crawler/datasets/tripadvisor_dataset/attractions/spark-warehouse/etl/attractions
Reading the data from /home/hduser/document/jupyter/FYP/crawler/datasets/tripadvisor_dataset/attractions/spark-warehouse/etl/attraction_reviews
Extracting 5000 rows from ratings
Preprocessing the dataset
Similar User: 2499
Model restored


Unnamed: 0,activityId,Recommendation Score
0,11449623,0.000131
1,11449646,0.000127
2,11449648,0.000104
3,11449649,0.000117
4,11449657,9.1e-05


In [98]:
# Pass the recommender output together with the user's budget bounds and
# destination to filter_df; the attraction table is handed over as a pandas
# DataFrame. filter_df is external to this notebook (presumably from
# attraction_recc — verify), so its exact filtering rules are not visible here.
with_url = filter_df(spark, filename, user, budget_low, budget_high,
                     destination, final_attr_spark_df.toPandas())

In [99]:
# Produce the final recommendation payload for the selected travel dates.
# get_recc_final is external to this notebook (presumably from attraction_recc
# — verify).
final = get_recc_final(with_url, start_date, end_date)


Item no.: 1 --> Item name = Half-Day Penang Countryside Cycling Tour
Evaluating...
Getting URLs without downloading images...
Image URL: https://media-cdn.tripadvisor.com/media/attractions-splice-spp-540x360/06/70/63/fd.jpg
Printed url without downloading

Errors: 0


Item no.: 1 --> Item name = Penang Tour: Penang Hill and Kek Lok Si Temple
Evaluating...
Getting URLs without downloading images...
Image URL: https://media-cdn.tripadvisor.com/media/attractions-splice-spp-540x360/06/71/a4/2f.jpg
Printed url without downloading

Errors: 0


Item no.: 1 --> Item name = Penang City Sightseeing Tour
Evaluating...
Getting URLs without downloading images...
Image URL: http://www.myhoponhopoff.com/pg/images/citytour-details.jpg
Printed url without downloading

Errors: 0


Item no.: 1 --> Item name = Pinang Peranakan Museum Admission Ticket
Evaluating...
Getting URLs without downloading images...
Image URL: http://www.pinangperanakanmansion.com.my/images/banner_entrance.jpg
Printed url without 

In [121]:
# Inspect the recommendation payload that the itinerary cells read from.
final

{'timeofday': ['Morning', 'Morning', 'Evening', 'Evening'],
 'name': ['Half-Day Penang Countryside Cycling Tour',
  'Penang Tour: Penang Hill and Kek Lok Si Temple',
  'Penang City Sightseeing Tour',
  'Pinang Peranakan Museum Admission Ticket'],
 'location': [['5.411938', '100.32664'],
  ['5.411938', '100.32664'],
  ['5.411938', '100.32664'],
  ['5.411938', '100.32664']],
 'price': [115.81, 253.61, 343.86, 32.6],
 'rating': [5.0, 4.0, 4.5, 5.0],
 'category': ['featured_tours_and_tickets',
  'private_custom_tours',
  'recommended_experiences',
  'sightseeing_tickets_passes'],
 'image': ['https://media-cdn.tripadvisor.com/media/attractions-splice-spp-540x360/06/70/63/fd.jpg',
  'https://media-cdn.tripadvisor.com/media/attractions-splice-spp-540x360/06/71/a4/2f.jpg',
  'http://www.myhoponhopoff.com/pg/images/citytour-details.jpg',
  'http://www.pinangperanakanmansion.com.my/images/banner_entrance.jpg']}

# Final Recommendation Results

In [100]:
# Number of trip days, counting both the start and the end date.
# Fail early with a clear message instead of an obscure TypeError (date not
# picked) or a silently empty itinerary (dates picked in the wrong order).
if start_date is None or end_date is None:
    raise ValueError("Select both a start date and an end date first.")
if end_date < start_date:
    raise ValueError("The end date must not be earlier than the start date.")
days = (end_date - start_date).days + 1
days

1

In [101]:
# Labels used when rendering the itinerary tabs.
time = [
    'MORNING',
    'EVENING',
]
fields = [
    'NAME',
    'CATEGORY',
    'LOCATION',
    'PRICE',
    'RATING',
]
recommendations = [
    'Recommendation 1:',
    'Recommendation 2:',
]

In [102]:
from ipywidgets import HBox, VBox, widgets

In [103]:
# Flex layout for a horizontal row of cards.
box_layout = w.Layout(
    display='flex',
    flex_flow='row',
    align_items='stretch',
    justify_content='space-between',
)
# Flex layout for a single card: a vertical stack taking 75% width.
column_layout = w.Layout(
    display='flex',
    flex_flow='column',
    width='75%',
    justify_content='space-between',
)
# Collects one widget per trip day.
tab = list()

In [104]:
# Build one tab per trip day. Each day shows up to four recommendation cards
# arranged as a grid: columns follow `time` (morning, evening), rows follow
# `recommendations` (first pick, second pick).


def _styled(text, color='black'):
    """Return ``text`` as bold, coloured HTML with special characters escaped."""
    return f"<b><font color='{color}'>{html.escape(str(text))}</font></b>"


def _recommendation_card(items, idx, recommendation_label, time_label=None):
    """Build the card (a VBox) describing entry ``idx`` of ``items``.

    ``items`` is the recommendation payload: a dict of parallel lists keyed by
    'name', 'category', 'location', 'price', 'rating' and 'image'.
    ``time_label`` is rendered as an extra heading line when given.
    """
    location = items['location'][idx]
    details = [
        # Names keep their original casing so proper nouns are not lower-cased.
        items['name'][idx].replace('_', ' '),
        items['category'][idx].replace('_', ' ').capitalize(),
        f"({location[0]},{location[1]})",
        items['price'][idx],
        items['rating'][idx],
    ]

    children = []
    if time_label is not None:
        children.append(widgets.HTML(value=_styled(time_label, 'orange')))
    children.append(widgets.HTML(value=_styled(recommendation_label, 'purple')))
    # The URL comes from scraped data: escape it before placing it in an attribute.
    image_url = html.escape(str(items['image'][idx]), quote=True)
    children.append(widgets.HTML(value="<img src='{}'>".format(image_url)))
    # `fields` and `details` are in the same order: name, category, location,
    # price, rating.
    children.extend(
        widgets.HTML(description=field, value=_styled(detail))
        for field, detail in zip(fields, details))
    return VBox(children=children, layout=column_layout)


_ITEMS_PER_DAY = len(time) * len(recommendations)

# Start from an empty list so re-running this cell does not duplicate tabs.
tab = []
_total_items = len(final['name'])
for day in range(days):
    first = day * _ITEMS_PER_DAY
    # Indices of this day's entries that actually exist in `final`.
    available = range(first, min(first + _ITEMS_PER_DAY, _total_items))

    rows = []
    for rank, recommendation_label in enumerate(recommendations):
        cards = []
        for slot, time_label in enumerate(time):
            # Entries are stored grouped by time of day: both morning picks
            # first, then both evening picks.
            idx = first + slot * len(recommendations) + rank
            if idx not in available:
                continue
            cards.append(
                _recommendation_card(
                    final,
                    idx,
                    recommendation_label,
                    # Only the top card of each column shows the time heading.
                    time_label if rank == 0 else None,
                ))
        if cards:
            rows.append(HBox(children=cards, layout=box_layout))

    if not rows:
        # Fewer recommendations than trip days: show a note instead of failing.
        rows.append(
            widgets.HTML(value=_styled("No recommendations available for this day.")))
    tab.append(VBox(children=rows))

In [105]:
# Wrap the per-day widgets in a Tab container and label the tabs "Day 1", "Day 2", ...
tab_recc = widgets.Tab(children=tab)
for day_index in range(len(tab_recc.children)):
    tab_recc.set_title(day_index, f'Day {day_index + 1}')

In [106]:
# Render the per-day itinerary tabs.
display(tab_recc)

Tab(children=(VBox(children=(HBox(children=(VBox(children=(HTML(value="<b><font color='orange'>MORNING</b>"), …

In [122]:
# Scratch value for trying out the underscore-to-title conversion.
test = 'kuala_lumpur'

In [125]:
# Scratch check: underscores become spaces, then each word is title-cased.
test.replace('_', ' ').title()

'Kuala Lumpur'