# Booking prices analysis

<br>

## Basic info:   
   * **goal**: Verify if the given hotel price limit for a particular city is accurate to the market prices 
   * **author**: Slawomir Drzymala   
   * **code**: <a href="http://www.gooogle.com" target="_blank">github/sdrzymala</a>
   * **last update date**: 2019-10-27

## Description:
To achieve the given goal, the following steps have been taken:
   1. Get hotel/apartments/etc (property) details and prices for given city for the same length of stay for different days 
   1. Specify the office location and calculate the distance between the office and each property
   1. Analyze the data and check the distribution of price per person per night
      * in total
      * excluding outliers (most luxurious properties)
      * for hotels only
      * for trusted properties only (with review score > 5)
      * for properties that are within the walking distance to the office
      * all together



# 1.1 Import libraries and configure jupyter

In [1]:
#!jupyter nbextension enable --py widgetsnbextension
# import
import logging, sys
#logging.disable(sys.maxsize)
import datetime
import pyodbc
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
import math

import matplotlib.pyplot as plt
import gmaps
import gmaps.datasets
import plotly.express as px
import plotly
import plotly.graph_objects as go
import plotly.utils as pu
from plotly.subplots import make_subplots
import json
import ipywidgets as widgets
from ipywidgets import *
import shapely.geometry
import re
import uuid
import plotly.io as pio
import time
from ipywidgets.embed import embed_minimal_html
import IPython
%matplotlib inline
plotly.offline.init_notebook_mode(connected=True)

# 1.2 Get data from other notebooks

In [2]:
# Read data: restore the variables that were persisted with %store elsewhere
# (per the section heading, by the other notebooks): the search definitions,
# the search and distance results, the two analysis limits and the Google Maps
# API key.

%store -r df_search
%store -r df_search_result
%store -r df_distance_result
%store -r price_per_person_per_night_limit
%store -r walking_distance_limit_meters
%store -r gmaps_api_key


# 1.3 Configure google maps 

In [3]:
# assign the Google Maps API key (restored with %store -r in the cell above)
gmaps.configure(api_key=gmaps_api_key) 

# 1.4 Configure other parameters

In [4]:
# Constants shared by the cells below.

# Palette used to color the individual box-plot traces.
chart_colors = [
    '#1f77b4',  # muted blue
    '#ff7f0e',  # safety orange
    '#2ca02c',  # cooked asparagus green
    '#d62728',  # brick red
    '#9467bd',  # muted purple
    '#8c564b',  # chestnut brown
    '#e377c2',  # raspberry yogurt pink
    '#7f7f7f',  # middle gray
    '#bcbd22',  # curry yellow-green
    '#17becf'   # blue-teal
]

# Number of distinct properties in the search results, kept as a string
# because it is only ever interpolated into titles and texts.
total_no_properties = str(len(df_search_result['booking_property_id'].unique()))

# Both report files are named after the first city found in the searches.
_report_city = df_search['city_name'].unique()[0]
gmaps_report_file_name = _report_city + "_map.html"
output_file_name = _report_city + ".html"


# 1.5 Prepare other functions

In [5]:
class HtmlRenderer:
    """Wrap a ready-made HTML string so it can be shown as rich notebook output."""

    def __init__(self, text):
        # HTML markup that is handed back unchanged by _repr_html_.
        self.text = text

    def _repr_html_(self):
        """Return the stored markup (the rich-display hook looked up by IPython)."""
        return self.text

# 2.1 Introduction to analysis

In [6]:
# Build and display the HTML introduction: what was searched, when, for which
# criteria, how many properties were found and which limits are being verified.

# NOTE: df_search_output is an alias of df_search, not a copy, so the
# 'booking_dates' column added below is written to df_search as well.
df_search_output = df_search


def _first_unique(frame, column):
    """Return the first distinct value found in ``frame[column]`` as a string."""
    return str(frame[column].unique()[0])


new_line = "<br>"
title = "<h1>" + "Booking searches analysis" + "</h1>"

description = "<p>" + \
              "On <b>" + \
              _first_unique(df_search_output, 'search_date') + \
              "</b> the booking.com website was searched to create the analysis of prices in the city of <b>" + \
              _first_unique(df_search_output, 'city_name') + \
              "</b> ( <b>" + \
              _first_unique(df_search_output, 'country') + \
              "</b> ) for given criteries:" + \
              "<ul>" + \
              "<li>" + "Number of nights: <b>" + _first_unique(df_search_output, 'no_nights') + "</b></li>" + \
              "<li>" + "Number of adults: <b>" + _first_unique(df_search_output, 'no_adults') + "</b></li>" + \
              "<li>" + "Number of rooms: <b>" + _first_unique(df_search_output, 'no_rooms') + "</b></li>" + \
              "<li>" + "Is business trip: <b>" + _first_unique(df_search_output, 'is_business_trip') + "</b></li>" + \
              "</ul>" + \
              "</p>"

# One human-readable sentence per search (stay dates and booking lead time).
booking_searches_title = "<p>" + "The given searches has been done:"
df_search_output['booking_dates'] = "from <b>" + \
                                    df_search_output['check_in_date'].apply(lambda x: x.strftime('%Y-%m-%d')) + \
                                    "</b> to <b>" + \
                                    df_search_output['check_out_date'].apply(lambda x: x.strftime('%Y-%m-%d')) + \
                                    "</b> which is <b>" + \
                                    df_search_output['no_days_before_travel'].apply(str) + \
                                    "</b> number of days in advance."
booking_searches = list(df_search_output["booking_dates"].unique())
booking_searches_list = "<ul>" + "".join(["<li>" + search + " </li> " for search in booking_searches]) + "</ul>"

# Deliberately NOT named 'stats': that name is bound to scipy.stats by the
# imports cell and assigning a string to it would overwrite the module.
collected_properties_info = "<p>As the results the price and not only informations of <b>" + total_no_properties + "</b> properties in total has been collected</p>"

# Only the first destination found in the distance results is described here.
destinations = "To get a better insight the following destination has been defined: <b>" + \
               _first_unique(df_distance_result, 'destination_name') + "</b>" + \
               ", location = (<b>" + df_distance_result['destination_coordinates_long'].apply(str).unique()[0] + \
               ", " + df_distance_result['destination_coordinates_lat'].apply(str).unique()[0] + ")</b>"

parameters_information = "<p>Please note that the following parameters has been setup:" + \
                         "<ul>" + \
                         "<li>Price per person per night limit: <b>" + str(price_per_person_per_night_limit) + "</b></li>" + \
                         "<li>Walking distance limit (meters): <b>" + str(walking_distance_limit_meters) + "</b></li>" + \
                         "</ul>" + \
                         "</p>"

prices_information = "<p>Please also note that the prices below are shown in <b>" + _first_unique(df_search_output, 'currency') + "</b></p>"

# Assemble the fragments in reading order and render them as HTML.
output = "".join([
    title,
    description,
    booking_searches_title + booking_searches_list,
    collected_properties_info,
    destinations,
    parameters_information,
    prices_information,
])

rendered = HtmlRenderer(output)
display(rendered)


# 3.1 Overview of the data - number of properties

In [7]:
# Bar chart: number of distinct properties per property type (no filtering).

# All properties are shown, so this equals the overall total.
no_properties = str(len(df_search_result['booking_property_id'].unique()))


title = "<b>Number of properties over property type</b> \
        <br>Show all properties. Number of properties = {0} out of {1} \
        <br>\
        ".format(no_properties, total_no_properties)


df = df_search_result \
        .groupby('property_type') \
        .agg({
                'booking_property_id' : ['nunique'],
                'total_price' : ['mean'],
                'price_per_person_per_night': ['mean'],
                'review_score' : ['mean'],
                'no_reviews' : ['mean']
        }) \
        .reset_index()

# Flatten the two-level (column, aggregate) header into single names such as
# 'total_price_mean'; the group key therefore becomes 'property_type_'.
# Iterating the MultiIndex directly yields the same tuples as .ravel() did,
# without relying on the deprecated ndarray-returning Index.ravel().
df.columns = ["_".join(column).strip() for column in df.columns]

# Round the averages so that the hover labels stay readable.
_mean_columns = [
    'total_price_mean',
    'price_per_person_per_night_mean',
    'review_score_mean',
    'no_reviews_mean',
]
df[_mean_columns] = df[_mean_columns].round(2)

fig = px.bar(df,
             x='property_type_',
             y='booking_property_id_nunique',
             template='none',
             text='booking_property_id_nunique',
             hover_data=df.columns,
             title = title,
             labels = {'property_type_': 'property type','booking_property_id_nunique':'distinct count'}
    )

# Anchor the title to the left edge of the paper.
fig.update_layout(
    title=go.layout.Title(
        text=title,
        xref="paper",
        x=0
    )
)

fig.show()

# 3.2 Distribution of ppp per night (all)

In [8]:
# Box plots of the price per person per night, one box per property type,
# for all properties, with the price limit drawn as a horizontal line.

no_properties = str(len(df_search_result['booking_property_id'].unique()))

# "Where: Hotel (12), Hostel (3), ..." - distinct property count per type,
# appended to the chart title.
_property_type_counts = df_search_result.groupby('property_type')['booking_property_id'].nunique()
property_type_names = "Where: " + ", ".join(
    "{0} ({1})".format(property_type, row_count)
    for property_type, row_count in _property_type_counts.items()
)

title = "<b>Price per person per night distribution over property type</b> \
        <br>Show all properties. Number of properties = {0} out of {1} \
        ".format(no_properties, total_no_properties) + \
        "<br>" + property_type_names

property_types = np.sort(df_search_result["property_type"].unique())

layout = go.Layout(
    title=title,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    yaxis=dict(
        gridcolor='rgb(245, 245, 240)',
        zerolinecolor='rgb(0, 0, 0)',
        showgrid=True,
        zeroline=True
        ),
    )

fig = go.Figure(
    layout = layout
    )

for position, property_type in enumerate(property_types):
    fig.add_trace(
        go.Box(
            y=df_search_result.loc[
                df_search_result['property_type'] == property_type,
                "price_per_person_per_night"
            ],
            name=property_type,
            boxmean=True,
            boxpoints="all",
            # Cycle through the palette: zipping with chart_colors would
            # silently drop every property type beyond the number of colors.
            marker_color=chart_colors[position % len(chart_colors)]
        )
    )

# Add constant line with price per person limit; the boxes sit at the
# category positions 0..n-1, so the line spans from -0.5 to n-0.5.
no_property_types = len(property_types)
fig.update_layout(
    shapes=[
        # Line Horizontal
        go.layout.Shape(
            type="line",
            x0=-0.5,
            x1=no_property_types-0.5,
            y0=price_per_person_per_night_limit,
            y1=price_per_person_per_night_limit,
            line=dict(
                color="LightSeaGreen",
                dash="dashdot",
            )
        )
    ]
)

fig.show()

# 3.3 Distribution of ppp per night (without outliers)

In [9]:
# Box plots of the price per person per night, one box per property type,
# after excluding price outliers (rows whose total-price Z-Score is 1 or more).

df = df_search_result[df_search_result['total_price_price_outlier_zscore'] < 1]

# "Where: Hotel (12), Hostel (3), ..." - distinct property count per type,
# appended to the chart title.
_property_type_counts = df.groupby('property_type')['booking_property_id'].nunique()
property_type_names = "Where: " + ", ".join(
    "{0} ({1})".format(property_type, row_count)
    for property_type, row_count in _property_type_counts.items()
)

no_properties = str(len(df['booking_property_id'].unique()))

title = "<b>Price per person per night distribution over property type</b> \
        <br>Exclude outliers (based on Z-Score). Number of properties = {0} out of {1} \
        ".format(no_properties, total_no_properties) + \
        "<br>" + property_type_names

property_types = np.sort(df["property_type"].unique())

layout = go.Layout(
    title=title,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    yaxis=dict(
        gridcolor='rgb(245, 245, 240)',
        zerolinecolor='rgb(0, 0, 0)',
        showgrid=True,
        zeroline=True
        )
    )

fig = go.Figure(
    layout = layout
    )

for position, property_type in enumerate(property_types):
    fig.add_trace(
        go.Box(
            y=df.loc[df['property_type'] == property_type, "price_per_person_per_night"],
            name=property_type,
            boxmean='sd',
            boxpoints="all",
            # Cycle through the palette: zipping with chart_colors would
            # silently drop every property type beyond the number of colors.
            marker_color=chart_colors[position % len(chart_colors)]
        )
    )

# Add constant line with price per person limit; the boxes sit at the
# category positions 0..n-1, so the line spans from -0.5 to n-0.5.
no_property_types = len(property_types)
fig.update_layout(
    shapes=[
        # Line Horizontal
        go.layout.Shape(
            type="line",
            x0=-0.5,
            x1=no_property_types-0.5,
            y0=price_per_person_per_night_limit,
            y1=price_per_person_per_night_limit,
            line=dict(
                color="LightSeaGreen",
                dash="dashdot",
            )
        )
    ]
)

fig.show()

# 3.4 Distribution of ppp per night (without outliers) zoom in

In [10]:
# Same chart as the outlier-free distribution, but with the Y axis limited to
# a multiple of the price limit so that the area around the limit is readable.

df = df_search_result[df_search_result['total_price_price_outlier_zscore'] < 1]

no_properties = str(len(df['booking_property_id'].unique()))

title = "<b>Price per person per night distribution over property type</b> \
        <br>Exclude outliers (based on Z-Score) and zoom-in. Please note that no other exclusion has been made \
        <br>and only the range of Y-as has been changed. Number of properties = {0} out of {1} \
        ".format(no_properties, total_no_properties)

# The Y axis runs from 0 to this many times the price limit.
how_many_multiply_by_price_per_person_per_night_limit_to_range = 3

property_types = np.sort(df["property_type"].unique())

layout = go.Layout(
    title=title,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    yaxis=dict(
        gridcolor='rgb(245, 245, 240)',
        zerolinecolor='rgb(0, 0, 0)',
        showgrid=True,
        zeroline=True,
        range=(0,how_many_multiply_by_price_per_person_per_night_limit_to_range*price_per_person_per_night_limit)
        ),
    )

fig = go.Figure(
    layout = layout
    )

for position, property_type in enumerate(property_types):
    fig.add_trace(
        go.Box(
            y=df.loc[df['property_type'] == property_type, "price_per_person_per_night"],
            name=property_type,
            boxmean=True,
            boxpoints="all",
            # Cycle through the palette: zipping with chart_colors would
            # silently drop every property type beyond the number of colors.
            marker_color=chart_colors[position % len(chart_colors)]
        )
    )

# Add constant line with price per person limit; the boxes sit at the
# category positions 0..n-1, so the line spans from -0.5 to n-0.5.
no_property_types = len(property_types)
fig.update_layout(
    shapes=[
        # Line Horizontal
        go.layout.Shape(
            type="line",
            x0=-0.5,
            x1=no_property_types-0.5,
            y0=price_per_person_per_night_limit,
            y1=price_per_person_per_night_limit,
            line=dict(
                color="LightSeaGreen",
                dash="dashdot",
            )
        )
    ]
)

fig.show()

# 3.5 Distribution of ppp per night (without outliers) per search date

In [11]:
# Box plots of the price per person per night, one box per search (booking
# dates), after excluding price outliers; Y axis limited to a multiple of the
# price limit.

df = df_search_result[df_search_result['total_price_price_outlier_zscore'] < 1]

no_properties = str(len(df['booking_property_id'].unique()))

title = "<b>Price per person per night distribution over property type</b> \
        <br>Exclude outliers (based on Z-Score). Distribution per search date.  \
        <br>Number of properties = {0} out of {1} \
        <br>\
        ".format(no_properties, total_no_properties)

# The Y axis runs from 0 to this many times the price limit.
how_many_multiply_by_price_per_person_per_night_limit_to_range = 3

# Kept in order of first appearance (not sorted).
booking_dates = df["booking_dates"].unique()

layout = go.Layout(
    title=title,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    yaxis=dict(
        gridcolor='rgb(245, 245, 240)',
        zerolinecolor='rgb(0, 0, 0)',
        showgrid=True,
        zeroline=True,
        range=(0,how_many_multiply_by_price_per_person_per_night_limit_to_range*price_per_person_per_night_limit)
        )
    )

fig = go.Figure(
    layout = layout
    )

for position, booking_date in enumerate(booking_dates):
    fig.add_trace(
        go.Box(
            y=df.loc[df['booking_dates'] == booking_date, "price_per_person_per_night"],
            name=booking_date,
            boxmean=True,
            boxpoints="all",
            # Cycle through the palette: zipping with chart_colors would
            # silently drop every search beyond the number of colors.
            marker_color=chart_colors[position % len(chart_colors)]
        )
    )

# Add constant line with price per person limit; the boxes sit at the
# category positions 0..n-1, so the line spans from -0.5 to n-0.5.
# (The variable name is historical: here it counts searches, not property types.)
no_property_types = len(booking_dates)
fig.update_layout(
    shapes=[
        # Line Horizontal
        go.layout.Shape(
            type="line",
            x0=-0.5,
            x1=no_property_types-0.5,
            y0=price_per_person_per_night_limit,
            y1=price_per_person_per_night_limit,
            line=dict(
                color="LightSeaGreen",
                dash="dashdot",
            )
        )
    ]
)

fig.show()

# 3.6 Distribution of ppp per night (without outliers) include ratings

In [12]:
# Bubble chart: mean price per person per night for every combination of
# review-score bin and property type (outliers excluded). Bubbles are green
# when the mean is within the price limit and orange otherwise.

_df = df_search_result[df_search_result['total_price_price_outlier_zscore'] < 1]

no_properties = str(len(_df['booking_property_id'].unique()))

title = "<b>Price per person per night distribution over property type</b> \
        <br>Exclude outliers (based on Z-Score). Distribution per property rate (binned). \
        <br>Green if below limit and orange if above. Number of properties = {0} out of {1} \
        <br>\
        ".format(no_properties, total_no_properties)

df = _df \
        .groupby(['review_score_bin', 'property_type']) \
        .agg({
                'booking_property_id' : ['nunique'],
                'price_per_person_per_night': ['mean'],
                'review_score' : ['mean'],
                'no_reviews' : ['mean']
        }) \
        .reset_index()

# Flatten the two-level (column, aggregate) header; the group keys become
# 'review_score_bin_' and 'property_type_'.
df.columns = ["_".join(column).strip() for column in df.columns]

# Reference list of every review-score bin label, so that bins without any
# property still appear on the X axis.
# NOTE(review): assumes df_search_result['review_score_bin'] holds the string
# form of the same pd.cut intervals - confirm against the notebook that builds it.
bins = pd.DataFrame({"review_score_bin": range(0, 13, 2)})
bins['review_score_bin_'] = pd.cut(bins['review_score_bin'], [float("-inf"),0, 2, 4, 6, 8, 10,np.inf], include_lowest=True).astype(str)

df = pd.merge(df, bins, on='review_score_bin_', how='outer')

# Give the placeholder rows of empty bins neutral values. This is done with a
# single DataFrame.fillna call: column-level fillna(..., inplace=True) is
# chained assignment and does not update df under pandas Copy-on-Write.
df = df.fillna({
    "booking_property_id_nunique": 0,
    "price_per_person_per_night_mean": 0,
    "review_score_mean": 0,
    "no_reviews_mean": 0,
    "property_type_": 'N/A',
})

# Drop the helper column brought in by the merge and strip the trailing
# underscores from the two group-key columns.
df = df.drop(columns=['review_score_bin']).rename(columns={
    'review_score_bin_': 'review_score_bin',
    'property_type_': 'property_type',
})

df['price_per_person_per_night_mean'] = df['price_per_person_per_night_mean'].round(0)
df['review_score_mean'] = df['review_score_mean'].round(2)
df['no_reviews_mean'] = df['no_reviews_mean'].round(2)

df['price_per_person_per_night_mean_lower_than_limit'] = np.where(df['price_per_person_per_night_mean'] <= price_per_person_per_night_limit, "yes", "no")

# Order the rows by the lower edge of each bin label, e.g. "(4.0, 6.0]" -> 4.0.
# float() parses "-inf" itself; mapping it to 0 made the first two bins tie and
# left their relative order on the X axis to chance.
df['review_score_bin_sort'] = df['review_score_bin'].apply(lambda label: float(str(label).split(',')[0].replace('(','')))
df = df.sort_values(by=['review_score_bin_sort'], kind='mergesort')
df = df.drop(columns=['review_score_bin_sort']).reset_index(drop=True)

fig = px.scatter(df,
                 x="review_score_bin",
                 y="property_type",
                 size="price_per_person_per_night_mean",
                 text="price_per_person_per_night_mean",
                 size_max=100,
                 template='none',
                 hover_data=df.columns,
                 title = 'Mean of price pppn per review score (bin) and property type',
                 labels = {'review_score_bin': 'review score (bin)','property_type':'property type'},
                 color = "price_per_person_per_night_mean_lower_than_limit",
                 color_discrete_map = {
                         "yes" : "rgb(132,222,2)", #green
                         "no" : "rgb(255,126,0)" # orange
                     }
                )

layout = go.Layout(
    title=dict(
            text=title,
            x=0
        ),
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    xaxis=dict(
        # Keep the bins in the sorted order established above.
        categoryorder = "array",
        categoryarray = list(df.review_score_bin.unique()),
        gridcolor='rgb(245, 245, 240)',
        showgrid=True,
        ),
    )

fig.update_layout(layout)

fig.show()


# 3.7.1 Create function to show points on the map 

In [13]:
def get_distance_radius(lat, lon, radius, dummy_projection_ratio):
    """
    Approximate a circle around a point with short gmaps line segments.

    https://github.com/pbugnion/gmaps/issues/206
    Inspired by:
    https://stackoverflow.com/questions/7477003/calculating-new-longitude-latitude-from-old-n-meters

    lat, lon: coordinates of the circle center, in degrees
    radius: radius of the circle in m
    dummy_projection_ratio: when non-zero, the north-south offset of every
        point is divided by this value (a crude stand-in for a real
        projection correction); 0 leaves the offset untouched

    returns: a list of 360 blue gmaps Lines (one per degree) forming a closed outline
    """
    r_earth = 6378 * 1000  # 6378km * 1000m/km
    num_points = 360
    weight = 2

    circle_coord_list = []
    for theta in np.arange(0, 360, 360/num_points):
        angle = math.radians(theta)
        dx = math.cos(angle) * radius
        if dummy_projection_ratio == 0:
            dy = math.sin(angle) * radius
        else:
            # TODO: this is a stopgap, handle the projection in a correct way...
            dy = math.sin(angle)/dummy_projection_ratio * radius

        # Convert the metric offsets to degrees. NOTE: the longitude offset is
        # not scaled by the latitude here.
        circle_coord_list.append((
            lat + (dy / r_earth) * (180 / math.pi),
            lon + (dx / r_earth) * (180 / math.pi),
        ))

    # Join every point with its successor; the last point wraps around to the
    # first one so that the outline is closed.
    successors = circle_coord_list[1:] + circle_coord_list[:1]
    return [
        gmaps.Line(start, end, stroke_weight=weight, stroke_color='blue')
        for start, end in zip(circle_coord_list, successors)
    ]

# 3.7.2 Create map with all properties

In [14]:


# Build the Google Maps figure with the destination(s), a reference circle
# around each destination and one colored dot per property, then export it to
# a standalone HTML file.

# Radius (in meters) of the reference circle drawn around each destination.
map_radius_limit_meters = 2500

# One row per distinct destination (name and coordinates).
destination_list = df_distance_result[['destination_name','destination_coordinates_lat', 'destination_coordinates_long']].drop_duplicates()

fig = gmaps.figure(layout={
        'height': '600px',
        'padding': '3px',
        'border': '1px solid black'
})

# Blue dot plus reference circle for every destination.
for _, destination in destination_list.iterrows():

    fig.add_layer(
            gmaps.symbol_layer(
                [(destination["destination_coordinates_lat"], destination["destination_coordinates_long"])],
                fill_color="blue",
                stroke_color="blue",
                scale=4,
                info_box_content="",
                hover_text="Destination: {0}".format(destination["destination_name"])
            )
    )

    circle_lines = get_distance_radius(
            destination["destination_coordinates_lat"],
            destination["destination_coordinates_long"],
            map_radius_limit_meters,
            1.5
    )
    fig.add_layer(gmaps.drawing_layer(circle_lines))

# One dot per row of the distance results: green when the mean price per
# person per night is within the limit, red otherwise.
for _, property_row in df_distance_result.iterrows():

    hover_text = "Booking property ID: {0} \nProperty name: {1} \nAverage price per person per night (mean): {2} \n".format(str(property_row["booking_property_id"]), str(property_row["property_name"]), str(property_row["price_per_person_per_night_mean"]))

    # Append the walking duration and distance to every destination.
    for _, destination in destination_list.iterrows():
        walk = df_distance_result[
            (df_distance_result["destination_name"] == destination["destination_name"])
            & (df_distance_result["booking_property_id"] == property_row["booking_property_id"])
        ]
        hover_text = hover_text + "\nWalking to [{0}]: {1} minutes, {2} m".format(destination["destination_name"], walk["walking_duration_minutes"].values[0], walk["walking_distance_meters"].values[0])

    color_based_on_price_limit = "green" if property_row["price_per_person_per_night_mean"] <= price_per_person_per_night_limit else "red"

    fig.add_layer(
        gmaps.symbol_layer(
            [(property_row["property_coordinates_lat"], property_row["property_coordinates_long"])],
            fill_color=color_based_on_price_limit,
            stroke_color=color_based_on_price_limit,
            scale=4,
            info_box_content="",
            hover_text=hover_text
        )
    )


# Background on exporting the map with embed_minimal_html (kept as comments:
# the former bare string literal contained invalid escape sequences such as
# "\ A", which raise warnings on current Python versions):
# https://translate.google.com/translate?hl=&sl=auto&tl=en&u=https%3A%2F%2Fwww.linkedin.com%2Fpulse%2Fresolvendo-o-problema-da-exporta%C3%A7%C3%A3o-de-mapas-do-google-arthur-giani-%2F
# In my case using Python 3.7 via Anaconda, the 'embed.py' file can be found in
# the following directory where Anaconda was installed:
#
#     C:\Anaconda3\Lib\site-packages\ipywidgets
#
# If you did not install the embed_minimal_html package at the beginning of the
# project, the directory will not be found.
# If you use Python without Anaconda, this directory will be available in the same way.
# When accessing the .py file, note that the script_escape_re function is
# between lines 230 and 241 of the script.

# The figure is not displayed here; it is written to a standalone HTML file.
embed_minimal_html(gmaps_report_file_name, views=[fig])
# NOTE(review): presumably gives the export time to finish before the file is
# used - confirm whether this pause is still needed.
time.sleep(5)


# 3.7.3 Show map with all properties
<b>Wait a moment until the map has loaded.</b> If the map does not appear, double-click on the empty area and wait again... <b>(this might take some time)</b>
    
<br>
<b>Hover over a point to see the details about the property, including the estimated walking duration and distance</b>
<br>
Please note that the blue circle is for reference only; due to a projection issue it might not match the distance well. This affects only how the circle is drawn: the distances themselves are calculated using the Google Maps API and should be very accurate.
<br>

Blue dot - destination (office) location   
Red dots = properties that have the average price per person per night above given limit   
Green dots = properties that have the average price per person per night below given limit   

In [15]:
from IPython.display import IFrame

# Embed the map file named by gmaps_report_file_name in an inline frame; as the
# last expression of the cell, the IFrame becomes the cell's output.
IFrame(gmaps_report_file_name, width='100%', height=650) 

# 3.8 Distribution of ppp per night (without outliers) exclude low ratings and distant properties

In [16]:


# Box plots of the price per person per night, one box per property type,
# restricted to properties that are not price outliers, lie within the walking
# distance limit and have a review score above 5.

df = df_search_result[
             (df_search_result['total_price_price_outlier_zscore'] < 1)
             & (df_search_result['is_within_walking_distance_limit_x'] == True)
             & (df_search_result['review_score'] > 5)
        ]

# "Where: Hotel (12), Hostel (3), ..." - distinct property count per type,
# appended to the chart title.
_property_type_counts = df.groupby('property_type')['booking_property_id'].nunique()
property_type_names = "Where: " + ", ".join(
    "{0} ({1})".format(property_type, row_count)
    for property_type, row_count in _property_type_counts.items()
)

no_properties = str(len(df['booking_property_id'].unique()))

title = "<b>Price per person per night distribution over property type</b> \
        <br>Exclude outliers (based on Z-Score), exclude anything outside defined walking distance \
        <br>and exclude properties with reviews lower than 5. Number of properties = {0} out of {1} \
        ".format(no_properties, total_no_properties) + \
        "<br>" + property_type_names

property_types = np.sort(df["property_type"].unique())

# Unlike the zoomed-in charts, the Y axis range is left to plotly here.
layout = go.Layout(
    title=title,
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    showlegend=False,
    yaxis=dict(
        gridcolor='rgb(245, 245, 240)',
        zerolinecolor='rgb(0, 0, 0)',
        showgrid=True,
        zeroline=True,
        ),
    )

fig = go.Figure(
    layout = layout
    )

for position, property_type in enumerate(property_types):
    fig.add_trace(
        go.Box(
            y=df.loc[df['property_type'] == property_type, "price_per_person_per_night"],
            name=property_type,
            boxmean=True,
            boxpoints="all",
            # Cycle through the palette: zipping with chart_colors would
            # silently drop every property type beyond the number of colors.
            marker_color=chart_colors[position % len(chart_colors)]
        )
    )

# Add constant line with price per person limit; the boxes sit at the
# category positions 0..n-1, so the line spans from -0.5 to n-0.5.
no_property_types = len(property_types)
fig.update_layout(
    shapes=[
        # Line Horizontal
        go.layout.Shape(
            type="line",
            x0=-0.5,
            x1=no_property_types-0.5,
            y0=price_per_person_per_night_limit,
            y1=price_per_person_per_night_limit,
            line=dict(
                color="LightSeaGreen",
                dash="dashdot",
            )
        )
    ]
)

fig.show()

# 4. Outcome and findings

In [17]:
# Build and display the HTML summary: price statistics of the hotels that pass
# all filters, and how the current price limit compares to their mean/median.

total_no_properties = len(df_search_result["booking_property_id"].unique())

# Not an outlier, within walking distance, review score above 5, hotels only.
df = df_search_result[
             (df_search_result['total_price_price_outlier_zscore'] < 1)
             & (df_search_result['is_within_walking_distance_limit_x'] == True)
             & (df_search_result['review_score'] > 5)
             & (df_search_result['property_type'] == 'Hotel')
        ]

no_properties = str(len(df['booking_property_id'].unique()))

_stat_names = ['mean', 'min', 'max', 'median', 'std']
if df.empty:
    # Nothing matched: keep the shape of df_stats but mark every value undefined.
    df_stats = pd.DataFrame({"index": _stat_names, "price_per_person_per_night": np.nan})
else:
    df_stats = df \
            .agg({
                    'price_per_person_per_night': _stat_names,
            }) \
            .reset_index()

# Statistic name -> value, so each figure is looked up by name only once.
_price_stats = df_stats.set_index("index")["price_per_person_per_night"]


def _format_price(value):
    """Return ``value`` rounded to a whole number, or 'n/a' when it is undefined.

    The standard deviation is undefined for a single row and every statistic is
    undefined when no row matched; int(round(nan)) would raise in those cases.
    """
    return "n/a" if pd.isna(value) else str(int(round(value, 0)))


def _limit_as_percentage_of(reference):
    """Return the price limit as a whole percentage of the rounded ``reference``.

    Returns None when ``reference`` is undefined or rounds to zero.
    """
    if pd.isna(reference) or int(round(reference, 0)) == 0:
        return None
    return int(round(price_per_person_per_night_limit / int(round(reference, 0)) * 100, 0))


def _describe_limit(limit_percentage):
    """Turn e.g. 80 (limit is 80% of the reference) into '20% smaller than'."""
    if limit_percentage is None:
        return "not comparable to"
    if limit_percentage == 100:
        return "equal to"
    return "{0}% {1} than".format(
        abs(100 - limit_percentage),
        'smaller' if limit_percentage < 100 else 'greater'
    )


new_line = "<br>"
title = "<h1>" + "Booking searches analysis" + "</h1>"

paragraph = "".join([
    "<p>The current limit for <b>" + str(df_search_output['city_name'].unique()[0]) + "</b> is <b>" + str(price_per_person_per_night_limit) + "</b>",
    "<br><br>Taking into the consideration most recent data and the given conditions:",
    "<ul>",
    "<li>" + "Ignoring the outliers - properties that are much more expensive than the others (based on ZScore)" + "</li>",
    "<li>" + "Ignoring properties that are not in the defined walking distance (" + str(walking_distance_limit_meters) + " meters) to the office location" + "</li>",
    "<li>" + "Ignoring properties with review score lower than 5" + "</li>",
    "<li>" + "Ignoring all the other types of the property apart from Hotels" + "</li>",
    "</ul>",
    "<br> Out of <b>" + str(total_no_properties) + "</b> there is only <b>" + str(no_properties) + "</b> properties that match all of the criteries and the price statistics are as follow:",
    "<ul>",
    "<li>" + "Min: ~<b>" + _format_price(_price_stats['min']) + "</b></li>",
    "<li>" + "Mean: ~<b>" + _format_price(_price_stats['mean']) + "</b></li>",
    "<li>" + "Median: ~<b>" + _format_price(_price_stats['median']) + "</b></li>",
    "<li>" + "Standard deviation: ~<b>" + _format_price(_price_stats['std']) + "</b></li>",
    "<li>" + "Max: ~<b>" + _format_price(_price_stats['max']) + "</b></li>",
    "</ul>",
    "</p>",
])

# The limit expressed as a percentage OF the mean / median (80 means the limit
# is 80% of the reference, i.e. 20% smaller than it).
price_per_person_per_night_limit_vs_mean = _limit_as_percentage_of(_price_stats['mean'])
price_per_person_per_night_limit_vs_median = _limit_as_percentage_of(_price_stats['median'])

# Properties that still qualify once the price limit itself is applied too.
no_properties_matching_all_criterias = len(
    df[df['price_per_person_per_night'] <= price_per_person_per_night_limit]["booking_property_id"].unique()
)

# The sentence states the DIFFERENCE between the limit and the mean/median;
# printing the ratio itself ("80% smaller") overstated the gap.
conclusion = "<p>" + "Consequently the <b>current limit ( " + str(price_per_person_per_night_limit) + " ) is " + \
            _describe_limit(price_per_person_per_night_limit_vs_mean) + " the mean ( " + _format_price(_price_stats['mean']) + " )" +  \
            " and " + _describe_limit(price_per_person_per_night_limit_vs_median) + " the median ( " + _format_price(_price_stats['median']) + " )</b>" + \
            " comparing to the current prices of the hotels in the neighbours of the office (taking into the consideration above additional conditions). " + \
            "<b>Furthermore there is " + str(no_properties_matching_all_criterias) + " properties that are matching the given criterias if we would include the given price limit.</b>" + \
            "</p>"

end = "<p>For better understanding please take a look at the charts presented above or contact us</p>"

output = "".join([title, paragraph, conclusion, end])

rendered = HtmlRenderer(output)
display(rendered)


# 5. Export analysis 

In [18]:
# Pause before exporting - presumably to let the notebook be saved first (TODO confirm).
time.sleep(5)
# Run nbconvert on 2-booking.ipynb to produce an HTML report named after
# output_file_name, using the hidecode.tplx template, no input/output prompts
# and a 600 s execution timeout; the command's output lines are captured in
# command_output.
command_output = !jupyter nbconvert --ExecutePreprocessor.timeout=600 --to html --template hidecode.tplx 2-booking.ipynb --no-prompt --output $output_file_name
