In [None]:
import pandas as pd
import numpy as np
import os
import datetime
import calendar

import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

from scripts.data_analysis import prepare_data

In [None]:
# Business questions:
# 1) How do the prices differ from area to area in Airbnb listings?
# 2) Is there a cheapest time to go travel?
# 3) What is the most important contributor to a listing price?


def _strip_price_formatting(prices):
    """Return the prices as plain digit strings.

    Removes the '$' sign, the thousands separators and a trailing '.00'
    (e.g. '$1,250.00' -> '1250'). Missing values become empty strings.
    """
    cleaned = (
        prices
        .astype(str)
        .str.replace('$', '', regex=False)
        .str.replace(',', '', regex=False)
        # Anchor the cents suffix so a '.00' elsewhere in the text is never removed.
        .str.replace(r'\.00$', '', regex=True)
    )
    # astype(str) turns NaN into the text 'nan', so missing values have to be
    # blanked out based on the original column, not on the converted one.
    return cleaned.where(prices.notna(), '')


df = pd.read_csv(os.path.abspath("./data/listings.csv"))
df_cal = pd.read_csv(
    os.path.abspath("./data/calendar.csv"),
    dtype={'price': str, 'date': str},
)

# The integer cast fails loudly if a listing has a missing or non-integer price.
df['price'] = _strip_price_formatting(df.price).astype(int)
# Calendar prices stay strings; rows without a price are kept as ''.
df_cal.price = _strip_price_formatting(df_cal.price)
print(df.shape, df_cal.shape)


In [None]:
# Attach the neighbourhood of every listing to its calendar rows. The left
# join keeps all calendar rows, including those whose listing_id has no match.
listing_areas = df[['id', 'neighbourhood_cleansed']]
df_area = df_cal.merge(listing_areas, how='left', left_on='listing_id', right_on='id')

In [None]:
# Per-area summary: share of calendar rows marked unavailable ('f'), the
# listing prices of the area and the number of distinct listings.
# groupby skips rows whose neighbourhood is missing (possible after the left
# merge), and counting distinct listing ids does not assume that every listing
# has exactly 365 calendar rows.
booked_by_area = (
    df_area
    .assign(booked=df_area.available.eq('f'))
    .groupby('neighbourhood_cleansed')
    .agg(**{
        'Percentage of booked nights': ('booked', 'mean'),  # fraction in [0, 1]
        'Number of Airbnbs': ('listing_id', 'nunique'),
    })
)

# One list of listing prices per area; exploded later for the box plot.
prices_by_area = df.groupby('neighbourhood_cleansed')['price'].apply(list).rename('Price')

frac_df = (
    booked_by_area
    .join(prices_by_area)
    .rename_axis('Area')
    .reset_index()
    [['Area', 'Percentage of booked nights', 'Price', 'Number of Airbnbs']]
    .sort_values(by='Percentage of booked nights', ascending=False)
)

In [None]:
# Share of booked calendar nights per area, in frac_df order (most booked first).
# The chart is drawn once on an explicit axes; the area names stay on the
# y axis so every bar can be identified.
fig, ax = plt.subplots(figsize=(17, 15))
sns.barplot(data=frac_df, x='Percentage of booked nights', y='Area', orient='h', ax=ax)
ax.tick_params(left=False)  # hide the tick marks, keep the area labels
ax.set_xlabel('Fraction of nights booked')  # the column holds fractions in [0, 1]
ax.set_title('Booked nights per area')
plt.show()

In [None]:
# Distribution of listing prices per area: one row per (area, listing price).
# explode() leaves an object column, so cast it back to a numeric dtype
# before handing it to the plotting code.
df_price = frac_df.explode('Price').astype({'Price': float})

fig, ax = plt.subplots(figsize=(17, 15))
sns.boxplot(data=df_price, x='Price', y='Area', orient='h', ax=ax)
ax.tick_params(left=False)  # hide the tick marks, keep the area labels
ax.set_xlabel('Price [$]')
ax.set_title('Listing prices per area')
plt.show()

In [None]:
# Number of listings per area, in frac_df order.
# The chart is drawn once on an explicit axes; the area names stay on the
# y axis so every bar can be identified.
fig, ax = plt.subplots(figsize=(17, 15))
sns.barplot(data=frac_df, x='Number of Airbnbs', y='Area', orient='h', ax=ax)
ax.tick_params(left=False)  # hide the tick marks, keep the area labels
ax.set_title('Number of Airbnbs per area')
plt.show()

In [None]:
# Pairwise correlation between the listing price and a selection of
# listing, host, review and location columns, shown as a heat map.
corr_columns = [
    'price',
    'accommodates', 'bedrooms',
    'host_total_listings_count',
    'number_of_reviews', 'reviews_per_month',
    'review_scores_rating', 'review_scores_value', 'review_scores_cleanliness',
    'latitude', 'longitude',
]
corr = df.loc[:, corr_columns].corr()
sns.heatmap(corr)

In [None]:
# is there a cheapest time to go travel?
# Only calendar rows marked available ('t') are used.
df_available = df_area[df_area.available == 't'].copy()

# Parse all dates in one vectorized call, then translate the weekday number
# (Monday == 0) into its name via calendar.day_name.
weekday_names = dict(enumerate(calendar.day_name))
df_available['Weekday'] = (
    pd.to_datetime(df_available.date, format='%m/%d/%Y')
    .dt.weekday
    .map(weekday_names)
)
df_available['price'] = df_available.price.astype(int)

# Mean price per calendar date in a single pass; sort=False keeps the dates
# in the order in which they first appear in the data.
cal_price = (
    df_available
    .groupby('date', sort=False)['price']
    .mean()
    .reset_index()
)

In [None]:
# plot prices overview: mean price per weekday (top) and per date (bottom).
fig, axes = plt.subplots(2, figsize=(12, 10))
fig.suptitle('Available Airbnb prices')
weekday_ax, daily_ax = axes

sns.barplot(
    ax=weekday_ax,
    data=df_available,
    x='Weekday',
    y='price',
    # Show Monday..Sunday instead of the order in which the days occur in the data.
    order=list(calendar.day_name),
)
weekday_ax.set_ylabel('price [$]')

cal_price.plot(ax=daily_ax, x='date', y='price', rot=90)
daily_ax.set_ylabel('price [$]')
plt.show()

In [None]:
# Columns handed to prepare_data through its `vars` argument.
# The list is named num_cols so that the builtin vars() is not shadowed for
# the rest of the kernel session.
# NOTE(review): 'host_since' looks like a date column; presumably
# prepare_data converts it to a number - confirm.
num_cols = [
    'price',
    'host_total_listings_count',
    'accommodates',
    'number_of_reviews',
    'review_scores_rating',
    'reviews_per_month',
    'host_since'
]

# Columns handed to prepare_data as categorical columns.
cat_cols = [
    'host_response_time',
    'host_is_superhost',
    'neighbourhood_group_cleansed',
    'room_type',
    'bed_type',
    'instant_bookable',
    'cancellation_policy',
    'require_guest_phone_verification'
]

df_prep = prepare_data(df, vars=num_cols, cat_cols=cat_cols)

In [None]:
# Regression target: the listing price. Features: every other prepared column.
y = df_prep['price']
X = df_prep.drop(columns='price')

In [None]:
# fit a linear model on several random 80/20 train/test splits and collect
# the R2 score and the fitted coefficients of every split.
seeds = [12834, 43271, 6555, 2359, 42]
r2_scores = []
# Collected under its own name so that `importance` is only ever a DataFrame
# and re-running this cell starts from a clean state.
coef_records = {'Coefficient': [], 'Column name': []}
for seed in seeds:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=seed)
    lm_model = LinearRegression()
    lm_model.fit(X_train, y_train)
    lm_pred = lm_model.predict(X_test)
    coef_records['Coefficient'].append(lm_model.coef_)
    coef_records['Column name'].append(X_train.columns)
    lm_r2 = r2_score(y_test, lm_pred)
    r2_scores.append(lm_r2)

# One row per (split, feature). explode() yields object columns, so the
# coefficients are cast back to float before sorting and later averaging.
importance = (
    pd.DataFrame(coef_records)
    .explode(['Coefficient', 'Column name'])
    .astype({'Coefficient': float})
    .sort_values(by='Coefficient', ascending=False)
)
mean_r2 = sum(r2_scores) / len(r2_scores)

In [None]:
# Mean fitted coefficient per feature (averaged over the splits), largest first.
importance_groups = (
    importance
    .groupby('Column name')['Coefficient']
    .mean()
    .sort_values(ascending=False)
)

# Draw on an explicit axes instead of relying on the implicit current figure.
fig, ax = plt.subplots(figsize=(16, 6))
importance_groups.plot(kind='bar', ax=ax)
ax.set_ylabel('Mean coefficient')
ax.set_title('Linear model coefficients per feature')
plt.show()

In [None]:
# plot the predicted-true airbnb prices.
# y_test and lm_pred come from the last split fitted in the model loop, while
# mean_r2 is the average over all splits; the legend states this explicitly so
# the score is not mistaken for the R2 of the plotted points.
fig, ax = plt.subplots()
grid = np.linspace(y_test.min(), y_test.max(), len(y_test))
# Least-squares straight line through the (true, predicted) points.
z = np.poly1d(np.polyfit(y_test, lm_pred, 1))
ax.plot(
    grid,
    z(grid),
    'r--',
    label="Linear fit (mean R2 over {} splits: {:.2f})".format(len(r2_scores), mean_r2),
)
ax.scatter(y_test, lm_pred)
ax.set_title('Plot of fitted vs. true Airbnb prices')
ax.legend(loc="upper left")
ax.set_xlabel("Ground truth price (USD)")
ax.set_ylabel("Predicted price (USD)")
plt.show()