# Set-up

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import os, glob
import math
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
# Dashboard (global variables shared by the cells below)

# --- Data snapshots ---
# Available snapshots: "2019-12-11", "2020-01-10", "2020-02-18", "2020-03-17", "2020-05-14"
dataset_data = "2020-03-17"
dataset_data_plus2mth = "2020-05-14"  # snapshot whose reviews.csv is loaded

# --- Occupancy model parameters ---
review_tf = ("2019-12-15", "2020-03-15")  # Timeframe for which reviews are considered in calculating occupancy rate
review_rate = 0.5  # Assumed share of bookings that were followed up by a user review
occ_thr = 0.3  # Threshold for when a listing is deemed a "permanent rental"

# --- Display / plotting options ---
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_seq_items", 300)
sns.set(style="white")

In [3]:
# Import data_clean
# Loads the cleaned listings table from saves/ (presumably written by the 2_Clean notebook — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
data = pd.read_pickle("saves/data_clean.pkl")

In [4]:
# Import reviews.csv (from the later snapshot) and parse the review date column.
# pd.to_datetime is used instead of astype('datetime64[D]'): day resolution is not a
# supported pandas dtype on pandas 2.x, where that cast raises.
data_rev = pd.read_csv(f"data/{dataset_data_plus2mth}/reviews.csv")
data_rev["date"] = pd.to_datetime(data_rev["date"])
print(data_rev.shape)
data_rev.head(3)

(517826, 2)


Unnamed: 0,listing_id,date
0,2015,2016-04-11
1,2015,2016-04-15
2,2015,2016-04-26


# Feature Engineering

## General Features

**Change column content**

In [5]:
# Reduce cancellation_policy to 4 classes by collapsing the strict / super-strict variants.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# `data.cancellation_policy` mutates a temporary Series and is a no-op under pandas Copy-on-Write.
cancellation_policy_map = {
    "strict_14_with_grace_period": "strict",
    "super_strict_60": "super_strict",
    "super_strict_30": "super_strict",
}
data["cancellation_policy"] = data["cancellation_policy"].replace(cancellation_policy_map)

In [6]:
# Reduce property_type to 6 classes, as per Airbnb classification (see listing creation in pdf).
# One mapping is built and the column is assigned back once: chained
# `data.property_type.replace(..., inplace=True)` does not update `data` under pandas Copy-on-Write.
property_type_groups = {
    "Apartment": ["Condominium", "Loft", "Vacation home"],
    "Boutique hotel": ["Aparthotel", "Hostel", "Hotel", "Resort", "Serviced apartment"],
    "Bed and breakfast": ["Casa particular (Cuba)", "Farm stay", "Nature lodge", "Pension (South Korea)"],
    "House": ["Bungalow", "Cabin", "Chalet", "Cottage", "Dome house", "Earth house", "Houseboat", "Hut",
              "Lighthouse", "Tiny house", "Townhouse", "Villa"],
    "Secondary unit": ["Guesthouse", "Guest suite"],
    "Unique space": ["Barn", "Boat", "Bus", "Camper/RV", "Campsite", "Castle", "Cave", "Igloo", "Island",
                     "Plane", "Tent", "Tipi", "Train", "Treehouse", "Windmill", "Yurt"],
}
# Invert to {original type: target class}; no target class appears among the originals,
# so a single replace is equivalent to applying the groups one after another.
property_type_map = {
    original: target
    for target, originals in property_type_groups.items()
    for original in originals
}
data["property_type"] = data["property_type"].replace(property_type_map)

In [7]:
# Drop all listings that are not in the above 6 classes.
# .copy() makes the filtered frame independent, so the column assignments in later
# cells do not trigger SettingWithCopyWarning or write to a temporary.
kept_property_types = ["Apartment", "Boutique hotel", "Bed and breakfast", "House", "Secondary unit", "Unique space"]
data = data[data.property_type.isin(kept_property_types)].copy()

In [8]:
# Where "monthly_price" / "weekly_price" is 0, substitute 30x / 7x the nightly "price"
for period_column, nights in (("monthly_price", 30), ("weekly_price", 7)):
    is_unset = data[period_column] == 0
    data[period_column] = np.where(is_unset, data["price"] * nights, data[period_column])

**Convert binary features to 1/0**

In [9]:
# Convert t/f to 1/0 for various features.
# Each column is assigned back explicitly; `data.col.replace(..., inplace=True)` acts on a
# temporary Series and leaves `data` unchanged under pandas Copy-on-Write.
binary_flag_columns = ["host_is_superhost", "host_identity_verified", "is_location_exact", "instant_bookable"]
for column in binary_flag_columns:
    data[column] = data[column].replace(["t", "f"], [1, 0])

In [10]:
# Binarize availability_365: 1 if the value is non-zero, otherwise 0
is_available = data["availability_365"] != 0
data["availability_365"] = is_available.astype(int)

In [11]:
# Create a 1/0 "<column>_exist" flag per free-text column (1 when the text is not the empty string)
text_columns = [
    "description",
    "house_rules",
    "interaction",
    "neighborhood_overview",
    "notes",
    "space",
    "summary",
    "transit",
]
for column in text_columns:
    data[f"{column}_exist"] = np.where(data[column] != "", 1, 0)

**Create numerical features**

In [12]:
# Calculate "price_calc" (price for one person) from "price", "guests_included" and "extra_people":
# half of the extra-person fee is subtracted for every included guest beyond the first.
data["price_calc"] = data.price - 0.5*data.extra_people*(data.guests_included-1)
# Remove listings where "price_calc" ends up being <= 5. The .copy() keeps the filtered
# frame independent so later column assignments do not raise SettingWithCopyWarning.
data = data[data.price_calc > 5].copy()

In [13]:
# Calculate "price_extra_people" (price) for additional persons from "price", "guests_included",
# "extra_people" and "accommodates": the extra-guest cost is averaged over the (accommodates - 1)
# guests beyond the first.
extra_guest_cost = (data.extra_people*(data.accommodates-data.guests_included)
                    + 0.5*data.extra_people*(data.guests_included-1))
additional_guests = data.accommodates - 1
# For listings with accommodates == 1 the denominator is 0, giving NaN (0/0) or +/-inf (x/0).
# Both are mapped to 0; fillna alone would leave the infinities in the feature. The result is
# assigned directly because a chained fillna(..., inplace=True) does not reliably update `data`.
data["price_extra_people"] = (
    (extra_guest_cost / additional_guests)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

In [14]:
# "price_extra_fees": sum of "security_deposit" and "cleaning_fee" per listing
data["price_extra_fees"] = data["security_deposit"].add(data["cleaning_fee"])

In [15]:
# "descr_detail": number of free-text fields that are filled in (sum of the 1/0 "*_exist" flags),
# used as a measure for how well the listing is described
exist_flag_columns = [
    "description_exist",
    "house_rules_exist",
    "interaction_exist",
    "neighborhood_overview_exist",
    "notes_exist",
    "space_exist",
    "summary_exist",
    "transit_exist",
]
data["descr_detail"] = data[exist_flag_columns].sum(axis=1)

In [16]:
# "accommodates_per_bed": guests per bed, meant to de-correlate "accommodates", "beds" and "bedrooms"
# NOTE(review): a listing with beds == 0 would yield inf here — presumably excluded during cleaning; confirm.
data["accommodates_per_bed"] = data["accommodates"].div(data["beds"])

In [17]:
# "wk_mth_discount": mean of the relative monthly and weekly discounts versus paying the nightly "price"
undiscounted_month = data.price*30
undiscounted_week = data.price*7
monthly_discount = (undiscounted_month-data.monthly_price)/undiscounted_month
weekly_discount = (undiscounted_week-data.weekly_price)/undiscounted_week
data["wk_mth_discount"] = (monthly_discount + weekly_discount) / 2

In [18]:
# "first_review_days": whole days between the first review and the dataset snapshot date
snapshot_date = datetime.strptime(dataset_data, '%Y-%m-%d')
data["first_review_days"] = (snapshot_date - data.first_review).map(lambda delta: delta.days)

In [19]:
# "last_review_days": whole days between the last review and the dataset snapshot date
snapshot_date = datetime.strptime(dataset_data, '%Y-%m-%d')
data["last_review_days"] = (snapshot_date - data.last_review).map(lambda delta: delta.days)

**Create categorical features**

In [20]:
# Categorize listings by "state" (basic, moderate, luxurious)

In [21]:
# Word counts per free-text column ("<column>_len"), then "text_len" as their combined, normalized length
text_columns = [
    "description",
    "house_rules",
    "interaction",
    "neighborhood_overview",
    "notes",
    "space",
    "summary",
    "transit",
]
for column in text_columns:
    data[f"{column}_len"] = data[column].map(lambda text: len(text.split()))

# Scale each word count by its column maximum, average the 8 scaled counts ...
scaled_lengths = (data[f"{column}_len"] / data[f"{column}_len"].max() for column in text_columns)
data["text_len"] = sum(scaled_lengths) / 8
# ... and rescale so the longest listing text gets 1
data["text_len"] = data["text_len"] / data["text_len"].max()

In [22]:
# Bin "review_scores_rating" into "review_scores_class":
#   0 -> rating == 0, 1 -> <= 89, 2 -> <= 93, 3 -> <= 96, 4 -> <= 99, 5 -> everything else
rating = data["review_scores_rating"]
# np.select takes the first matching condition, mirroring an if/elif chain
data["review_scores_class"] = np.select(
    [rating == 0, rating <= 89, rating <= 93, rating <= 96, rating <= 99],
    [0, 1, 2, 3, 4],
    default=5,
)

In [23]:
# Bin "price_calc" into "price_class":
#   0 -> <= 30, 1 -> <= 45, 2 -> <= 70, 3 -> <= 100, 4 -> everything else
unit_price = data["price_calc"]
# np.select takes the first matching condition, mirroring an if/elif chain
data["price_class"] = np.select(
    [unit_price <= 30, unit_price <= 45, unit_price <= 70, unit_price <= 100],
    [0, 1, 2, 3],
    default=4,
)

**Convert text columns into meaningful information**

In [24]:
# TO-DO: data.description.sample(5)

**Create log/sqrt from existing features**

Now we will replace certain features that have relatively high skew (see 2_Clean) with their log or square-root transform

In [25]:
# "bathrooms_log": natural log of "bathrooms" (math.log raises for values <= 0)
data["bathrooms_log"] = data["bathrooms"].map(math.log)

In [26]:
# "calc_host_lst_count_sqrt_log": log of the square root of "calculated_host_listings_count"
data["calc_host_lst_count_sqrt_log"] = data["calculated_host_listings_count"].map(
    lambda count: math.log(math.sqrt(count))
)

In [27]:
# "first_review_days_sqrt": square root of "first_review_days"
data["first_review_days_sqrt"] = data["first_review_days"].map(math.sqrt)

In [28]:
# "last_review_days_sqrt": square root of "last_review_days"
data["last_review_days_sqrt"] = data["last_review_days"].map(math.sqrt)

In [29]:
# Square root of "minimum_nights".
# NOTE(review): the column is called "minimum_nights_log" although it holds a square root;
# the name is kept as-is because it is part of the exported dataset's column set.
data["minimum_nights_log"] = data["minimum_nights"].map(math.sqrt)

In [30]:
# "price_extra_fees_sqrt": square root of "price_extra_fees"
data["price_extra_fees_sqrt"] = data["price_extra_fees"].map(math.sqrt)

In [31]:
# "price_log": natural log of "price_calc"
data["price_log"] = data["price_calc"].map(math.log)

In [32]:
# "review_scores_rating_sqrt": reflected square-root transform of "review_scores_rating".
# Step 1: reflect the rating around its maximum and take the square root of the distance.
review_max = data.review_scores_rating.max()
reflected_sqrt = data.review_scores_rating.map(lambda rating: math.sqrt(review_max - rating))
# Step 2: reflect back so that higher ratings again map to higher feature values.
review_log_max = reflected_sqrt.max()
data["review_scores_rating_sqrt"] = review_log_max - reflected_sqrt

In [33]:
# "text_len_sqrt": square root of "text_len"
data["text_len_sqrt"] = data["text_len"].map(math.sqrt)

## Occupancy

Calculation of **occupancy rate** is inspired by the **San Francisco model**, which is also applied by [Inside AirBnB](http://insideairbnb.com/about.html):

- (**A**) Determine the **average length of stay for Berlin**
- (**B**) Calculate **reviews relevant for considered timeframe**
- (**C**) Determine **active months in timeframe** from price (not relevant if only 1 month)
- (**D**) Estimate **# of bookings in considered timeframe** using (**B**)
- (**E**) **Occupancy rate** = (**D**) × (**A**) / ((**C**) / relevant months × days in time span)

Read more about the core idea behind the calculations of the model [here](https://sfbos.org/sites/default/files/FileCenter/Documents/52601-BLA.ShortTermRentals.051315.pdf). Assumptions were adapted for the purpose of this analysis, mainly due to the core idea of considering only the two most recent years.

(**A**) Determine the **average length of stay for Berlin** (assumed 3 days in most cities)

For the purpose of this model, around **3 nights** are assumed as average length of stay in Berlin and used as basis for calculation, unless a higher minimum length is specified
- Back in 2016, [4.6](https://www.airbnbcitizen.com/wp-content/uploads/2016/04/airbnb-community-berlin-en.pdf) has been reported as the average length of stay
- Inside AirBnB uses 3 nights for cities where no current data is available, but uses [6.3 nights](http://insideairbnb.com/berlin/#) for its Berlin visualization

In [34]:
# Add "avg_nights" (assumed average length of stay) to the main dataframe:
#   a) maximum_nights <= 5  -> mean of minimum_nights and maximum_nights
#   b) minimum_nights > 3   -> minimum_nights
#   c) otherwise            -> 3 nights
# Computed column-wise with np.select (first matching condition wins, like an if/elif chain)
# instead of looking up every row by index label in a Python loop.
# NOTE(review): branch a) uses a cutoff of 5 while the default stay is 3 nights — confirm
# which value is intended; the logic is kept exactly as it was.
min_nights = data["minimum_nights"]
max_nights = data["maximum_nights"]
data["avg_nights"] = np.select(
    [max_nights <= 5, min_nights > 3],
    [(max_nights + min_nights) / 2, min_nights],
    default=3,
)

(**B**) Calculate **reviews in considered timeframe**

In [35]:
# Keep only reviews strictly inside the specified timeframe (see Dashboard) and count them per listing.
tf_start, tf_end = review_tf
in_timeframe = (data_rev.date > tf_start) & (data_rev.date < tf_end)
# The count column is named explicitly. pd.DataFrame(value_counts()) yields a column called
# "listing_id" on pandas 1.x but "count" on pandas >= 2, which breaks any rename that relies on it.
data_rev_count = (
    data_rev.loc[in_timeframe]
    .groupby("listing_id")
    .size()
    .rename("reviews_3mth")
    .to_frame()
)
data_rev_count.shape

(8352, 1)

In [36]:
# Attach the per-listing review count to "data" as "reviews_3mth" (0 for listings without reviews).
# The single count column is renamed by position, so this works whether value_counts()
# labelled it "listing_id" (pandas 1.x) or "count" (pandas >= 2).
data_rev_count.columns = ["reviews_3mth"]
# Aligning on the listing-id index is equivalent to a left merge on the index, but stays
# re-runnable (a repeated merge would create reviews_3mth_x / reviews_3mth_y columns).
data["reviews_3mth"] = data_rev_count["reviews_3mth"].reindex(data.index).fillna(0)

(**C**) Determine **active months and relevant months** from price

In [37]:
# Months in which a listing was online with a price. Only a single period is considered
# here, so both the relevant-month count and every listing's active-month count are fixed at 1.
relevant_mths = 1
data["active_months"] = 1

(**D**) Estimate **# of bookings in considered timeframe** by dividing (**B**) by an assumed 50% review rate (i.e. one review corresponds to two bookings)

In [38]:
# Estimate bookings as reviews / review_rate and replace NaN with 0.
# fillna is applied to the expression and assigned in one step: a chained
# `data.bookings_est.fillna(0, inplace=True)` does not update `data` under pandas Copy-on-Write.
data["bookings_est"] = (data["reviews_3mth"] / review_rate).fillna(0)

(**E**) **Occupancy rate** = (**D**) × (**A**) / ((**C**) / relevant months × days in time span)

In [39]:
# Occupancy rate = estimated booked nights / days the listing was available in the timeframe
booked_nights = data.bookings_est * data.avg_nights
# 90 is the length in days used for the review timeframe
available_days = data.active_months / relevant_mths * 90
data["occupancy_rate"] = booked_nights / available_days

**Modify occupancy rate**

In [40]:
# Cap occupancy at 100%: keep rates below 1, set everything else to 1
is_below_cap = data["occupancy_rate"] < 1
data["occupancy_rate"] = data["occupancy_rate"].where(is_below_cap, 1)

In [41]:
# Split occupancy into 2 classes according to threshold (splitting into temporary and permanent rentals):
# 0 when occupancy_rate is below occ_thr, 1 otherwise.
# The threshold comes from the Dashboard variable occ_thr rather than a hard-coded 0.3,
# so changing the dashboard setting actually changes the split.
data["occupancy_class"] = np.where(data["occupancy_rate"] < occ_thr, 0, 1)

In [42]:
# Show occupancy split
# Counts how many listings fall into each occupancy_class value (0 = below threshold, 1 = at/above).
data.occupancy_class.value_counts()

0    7485
1    3128
Name: occupancy_class, dtype: int64

## Final Check, Cleaning and Export

In [43]:
# Reorder the dataset's columns alphabetically (ascending)
alphabetical_columns = sorted(data.columns)
data = data.reindex(columns=alphabetical_columns)

In [44]:
# Drop further columns: raw inputs and intermediate helpers that have been replaced by
# engineered features, plus features removed for high correlation (see table below).
# "occupancy_class" is deliberately NOT listed, so it stays in the exported dataset.
# "description_exist" was listed twice; the duplicate entry has been removed.
columns_to_drop = [
    "active_months",
    "amenities",
    "am_coffee_machine",
    "am_cooking_basics",
    "am_parking",
    "availability_365",
    "avg_nights",
    "bathrooms",
    "beds",
    "bookings_est",
    "calculated_host_listings_count",
    "cleaning_fee",
    "descr_detail",
    "description",
    "description_exist",
    "description_len",
    "extra_people",
    "first_review",
    "first_review_days",
    "guests_included",
    "host_identity_verified",
    "house_rules",
    "house_rules_exist",
    "house_rules_len",
    "interaction",
    "interaction_exist",
    "interaction_len",
    "is_location_exact",
    "last_review",
    "last_review_days",
    "minimum_nights",
    "monthly_price",
    "name",
    "neighborhood_overview",
    "neighborhood_overview_exist",
    "neighborhood_overview_len",
    "notes",
    "notes_exist",
    "notes_len",
    "price",
    "price_calc",
    "price_extra_fees",
    "review_scores_rating",
    "reviews_3mth",
    "security_deposit",
    "space",
    "space_exist",
    "space_len",
    "summary",
    "summary_exist",
    "summary_len",
    "text_len",
    "transit",
    "transit_exist",
    "transit_len",
    "weekly_price",
]
# Assign the result instead of mutating in place, so the cell's effect is explicit
data = data.drop(columns=columns_to_drop)

| **DROPPED FEATURE** | **REASONING** |
| :----- | :----- |
| **am_coffee_machine** | high correlation (>0.3) with >5 other features |
| **am_parking** | high correlation (>0.3) with >5 other features |
| **availability_365** | high correlation (>0.3) with >5 other features |
| **descr_detail** | dropped in favour of **text_len** |
| *(remaining dropped features)* | TODO: document reasoning |


In [45]:
# Review datatypes (data.info()) (post-engineering)
# Prints each remaining column with its non-null count and dtype as a final sanity check.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10613 entries, 3176 to 42885615
Data columns (total 40 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   accommodates                  10613 non-null  int64  
 1   accommodates_per_bed          10613 non-null  float64
 2   am_balcony                    10613 non-null  float64
 3   am_breakfast                  10613 non-null  float64
 4   am_child_friendly             10613 non-null  float64
 5   am_elevator                   10613 non-null  float64
 6   am_essentials                 10613 non-null  float64
 7   am_nature_and_views           10613 non-null  float64
 8   am_pets_allowed               10613 non-null  float64
 9   am_private_entrance           10613 non-null  float64
 10  am_smoking_allowed            10613 non-null  float64
 11  am_tv                         10613 non-null  float64
 12  am_white_goods                10613 non-null  float64


In [46]:
# Display engineered dataset
# Print the (rows, columns) shape, then show the first 3 rows as the cell's output.
print(data.shape)
data.head(3)

(10613, 40)


Unnamed: 0_level_0,accommodates,accommodates_per_bed,am_balcony,am_breakfast,am_child_friendly,am_elevator,am_essentials,am_nature_and_views,am_pets_allowed,am_private_entrance,am_smoking_allowed,am_tv,am_white_goods,availability_90,bathrooms_log,bedrooms,calc_host_lst_count_sqrt_log,cancellation_policy,first_review_days_sqrt,host_is_superhost,instant_bookable,last_review_days_sqrt,latitude,longitude,maximum_nights,minimum_nights_log,neighbourhood_cleansed,occupancy_class,occupancy_rate,price_class,price_extra_fees_sqrt,price_extra_people,price_log,property_type,review_scores_class,review_scores_rating_sqrt,room_type,text_len_sqrt,wk_mth_discount,zipcode
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1
3176,4,2.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,1.0,0.0,strict,62.633857,0,0,16.248077,52.535,13.41758,1125,7.874008,Prenzlauer Berg Südwest,0,0.0,3,20.0,16.666667,4.382027,Apartment,2,6.298521,Entire home/apt,0.771052,0.23545,zip_10405
3309,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,45,0.0,1.0,0.0,strict,49.081565,0,0,17.058722,52.49885,13.34906,35,2.645751,Schöneberg-Nord,0,0.0,0,16.733201,0.0,3.332205,Apartment,1,5.627647,Private room,0.863851,0.197024,zip_10777
6883,2,2.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0,0.0,1.0,0.0,moderate,60.687725,0,0,5.477226,52.51171,13.45477,90,1.732051,Frankfurter Allee Süd FK,0,0.0,4,6.244998,0.0,4.828314,Apartment,4,7.944272,Entire home/apt,0.627994,0.471181,zip_10243


**Export data_engineered**

In [47]:
# Export dataset for further use in 4_Predictive Modeling
# Pickle keeps the column dtypes and the index, unlike a plain CSV export.
data.to_pickle("saves/data_engineered.pkl")

In [48]:
# Alternative: Export with to_csv and save dtypes separately
#data.to_csv(r'saves/data_engineered.csv', index = True)
#data.dtypes.to_frame('types').to_csv('saves/types_engineered.csv')