In [104]:
# include visualization modules
import numpy as np
import pandas as pd
import cufflinks as cf
import matplotlib.pyplot as plt
import pandas_bokeh as pb
import warnings
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# Put cufflinks into offline mode so DataFrame.iplot() renders without a plotly account.
cf.go_offline()
# `connected` is a boolean flag. The previous value was the string 'true', which
# only worked because any non-empty string is truthy (so would 'false'); pass the
# real boolean so the intent is unambiguous.
init_notebook_mode(connected=True)
# Send pandas_bokeh figures to the notebook output instead of a separate HTML file.
pb.output_notebook()
# NOTE(review): this silences every warning for the rest of the kernel session,
# including deprecation warnings from pandas and the plotting libraries.
warnings.filterwarnings("ignore")

In [105]:
# Load the listings for every city: one CSV for weekdays and one for weekends.
city_list = [
    "amsterdam", "athens", "barcelona", "berlin", "budapest",
    "lisbon", "london", "paris", "rome", "vienna",
]
# Both lists are positionally aligned with city_list (entry i belongs to city_list[i]).
weekdays_data = [pd.read_csv(f"./airbnb/{name}_weekdays.csv") for name in city_list]
weekends_data = [pd.read_csv(f"./airbnb/{name}_weekends.csv") for name in city_list]

In [106]:
# Query 1: (Comparing mean price among cities) Compare the mean room price of each
# city for weekdays, weekends, and both periods combined.
#
# Series.mean() replaces the builtin sum()/len() over a pandas Series: it is
# vectorized, and it skips missing values instead of letting a single NaN turn a
# whole city's mean into NaN.
price_weekdays_mean = [listings['realSum'].mean() for listings in weekdays_data]
price_weekends_mean = [listings['realSum'].mean() for listings in weekends_data]
# Combined mean over all listings of both periods (a listing-weighted mean, not
# the average of the two per-period means).
price_alltime_mean = [
    pd.concat([weekday_listings['realSum'], weekend_listings['realSum']]).mean()
    for weekday_listings, weekend_listings in zip(weekdays_data, weekends_data)
]
q1_dataframe = pd.DataFrame({
    'citys': city_list,
    'price_weekdays_mean': price_weekdays_mean,
    'price_weekends_mean': price_weekends_mean,
    'price_alltime_mean': price_alltime_mean
})
# Grouped bar chart, most expensive city (by combined mean) first.
p1 = q1_dataframe.sort_values(by='price_alltime_mean', ascending=False).plot_bokeh(
    kind = 'bar',
    x = 'citys',
    y = ['price_weekdays_mean', 'price_weekends_mean', 'price_alltime_mean'],
    xlabel = 'City Name',
    ylabel = 'Average price',
    title = 'Average room price statistics for some Euro cities'
)

In [107]:
# Query 2: (Check the satisfaction of customers towards rooms) Visualize the
# distribution of overall guest ratings per city as box plots.

# First pass: every rating of a city, weekday listings followed by weekend listings.
guest_satisfaction = {
    name: list(weekday_listings['guest_satisfaction_overall'])
          + list(weekend_listings['guest_satisfaction_overall'])
    for name, weekday_listings, weekend_listings in zip(city_list, weekdays_data, weekends_data)
}
# Wrapping each list in a Series lets columns of different length share one frame
# (shorter columns are padded with NaN).
q2_dataframe = pd.DataFrame({
    name: pd.Series(ratings) for name, ratings in guest_satisfaction.items()
})
p2_origin = q2_dataframe.iplot(kind="box", title="Original Rating distribution")

# Second pass: same layout, but keep only ratings of at least 80.
guest_satisfaction = {
    name: [r for r in weekday_listings['guest_satisfaction_overall'] if r >= 80]
          + [r for r in weekend_listings['guest_satisfaction_overall'] if r >= 80]
    for name, weekday_listings, weekend_listings in zip(city_list, weekdays_data, weekends_data)
}
q2_dataframe = pd.DataFrame({
    name: pd.Series(ratings) for name, ratings in guest_satisfaction.items()
})
p2_modified = q2_dataframe.iplot(kind="box", title="Modified Rating distribution")

In [108]:
# Query 3: (Relationship mining between attributes) For one-bedroom weekday
# listings priced at most 1000, look at how the price relates to (a) the distance
# to the metro and (b) the cleanliness rating.
#
# The selection is done with a boolean mask per city instead of a row-by-row
# Python loop with chained label lookups (df[col][j]); the mask is much faster and
# does not depend on each frame having a default 0..n-1 index.
# Only the weekday files are used here, as in the original analysis.
q3_columns = ['realSum', 'metro_dist', 'cleanliness_rating']
q3_selected = pd.concat(
    [
        listings.loc[(listings['bedrooms'] == 1) & (listings['realSum'] <= 1000), q3_columns]
        for listings in weekdays_data
    ],
    ignore_index=True,
)
price = q3_selected['realSum'].tolist()
# presumably converts kilometres to metres - TODO confirm the unit of metro_dist
metro_dist = (q3_selected['metro_dist'] * 1000).tolist()
# Prices bucketed by cleanliness rating. groupby yields the ratings in ascending
# order and keeps the prices of each bucket in their order of appearance.
cleanliness_groupby = {
    rating: prices.tolist()
    for rating, prices in q3_selected.groupby('cleanliness_rating')['realSum']
}
q3_1_dataframe = pd.DataFrame({
    'price': price,
    'metro_dist': metro_dist
})
# One column per rating; the Series wrapper pads shorter buckets with NaN.
q3_2_dataframe = pd.DataFrame({
    rating: pd.Series(prices) for rating, prices in cleanliness_groupby.items()
})
p3_1 = q3_1_dataframe.plot_bokeh.scatter(
    y="price",
    x="metro_dist",
    show_figure=True,
    size = 0.001
)
p3_2 = q3_2_dataframe.iplot(kind="box", title="Price distribution of different rate on room")

In [109]:
# Query 4: (Room Type Classification and Weight Comparison) Show the share of each
# room type for the first few cities and for all cities together (weekday files).
type_list = ["Private room", "Entire home/apt", "Shared room"]
# Number of individual cities that get their own pie; the last pie is the total.
n_cities_shown = 5

# Per-city counts of each room type (value_counts replaces the manual tallying).
city_status_list = [
    listings['room_type'].value_counts().to_dict() for listings in weekdays_data
]
# Counts over ALL cities, not only the ones shown individually.
total_dict = {}
for status_dict in city_status_list:
    for room_type, count in status_dict.items():
        total_dict[room_type] = total_dict.get(room_type, 0) + count

title_list = list(city_list[:n_cities_shown]) + ["Total"]
# .get(..., 0) keeps a city that has no listing of some type from raising KeyError;
# such a type simply gets a count of 0.
count_list_all = [
    [status_dict.get(room_type, 0) for room_type in type_list]
    for status_dict in city_status_list[:n_cities_shown]
]
count_list_all.append([total_dict.get(room_type, 0) for room_type in type_list])

# One small frame and one pie chart per entry of title_list; p4 is consumed by the
# grid layout in the next cell.
p4_dataframe, p4 = [], []
for i, counts in enumerate(count_list_all):
    p4_dataframe.append(pd.DataFrame({
        'room_type': type_list,
        'room_count': counts
    }))
    p = p4_dataframe[i].plot_bokeh.pie(
        x = 'room_type',
        y = 'room_count',
        colormap=['orange', 'green', 'grey'],
        title=title_list[i]
    )
    p4.append(p)

__x__values_original


__x__values_original


__x__values_original


__x__values_original


__x__values_original


__x__values_original


In [110]:
# Arrange the six room-type pies from the previous cell in a 2 x 3 grid.
top_row = [p4[0], p4[1], p4[2]]
bottom_row = [p4[3], p4[4], p4[5]]
pb.plot_grid([top_row, bottom_row], width=450)