In [96]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.cluster import KMeans

# Load the processed listings.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the warning echoed under this cell
# is presumably a mixed-type DtypeWarning -- confirm it disappears.
airbnb_df = pd.read_csv("../data/processed/airbnb_listings.csv", low_memory=False)
airbnb_df.head(5)

  airbnb_df = pd.read_csv("../data/processed/airbnb_listings.csv")


Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,country,...,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,193.0,10.0,9.0,2021-10-19,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,28.0,30.0,45.0,2022-05-21,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,unconfirmed,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,124.0,3.0,0.0,,0.0,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,74.0,30.0,270.0,2019-07-05,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,41.0,10.0,9.0,2018-11-19,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [97]:
# Number of missing values in every column.
missing_per_column = airbnb_df.isna().sum()
missing_per_column

id                                     0
name                                 270
host_id                                0
host_identity_verified                 0
host_name                            404
neighbourhood_group                    0
neighbourhood                          0
lat                                    0
long                                   0
country                                0
country_code                           0
instant_bookable                       0
cancellation_policy                    0
room_type                              0
construction_year                    214
price                                  0
service_fee                            0
minimum_nights                         0
number_of_reviews                      0
last_review                        15832
reviews_per_month                      0
review_rate_number                     0
calculated_host_listings_count         0
availability_365                       0
house_rules     

In [98]:
"""
    Derive name_length: the number of characters in the listing name
    (a missing name counts as 0). The raw name column is removed afterwards.
"""

listing_names = airbnb_df["name"].fillna("")
airbnb_df["name_length"] = listing_names.map(len)
airbnb_df = airbnb_df.drop(columns=["name"])
airbnb_df["name_length"].head(5)

0    34
1    21
2    35
3     0
4    48
Name: name_length, dtype: int64

In [99]:
"""
    Encode host_identity_verified as an integer flag:
    "verified" -> 1, "unconfirmed" -> 0.
"""

verification_codes = {"verified": 1, "unconfirmed": 0}
airbnb_df["host_identity_verified"] = airbnb_df["host_identity_verified"].map(verification_codes)
airbnb_df["host_identity_verified"].unique()

array([0, 1])

In [100]:
"""
    Encode the boolean instant_bookable column as an integer flag:
    True -> 1, False -> 0.
"""

bookable_codes = {True: 1, False: 0}
airbnb_df["instant_bookable"] = airbnb_df["instant_bookable"].map(bookable_codes)
airbnb_df["instant_bookable"].unique()

array([0, 1])

In [101]:
"""
    One-hot encode cancellation_policy: the column is replaced by one integer
    indicator column per policy value, each prefixed with "policy".
"""

policy_columns = ["policy_flexible", "policy_moderate", "policy_strict"]
airbnb_df = pd.get_dummies(
    airbnb_df,
    columns=["cancellation_policy"],
    prefix="policy",
    dtype=int,
)
airbnb_df[policy_columns].head(5)

Unnamed: 0,policy_flexible,policy_moderate,policy_strict
0,0,0,1
1,0,1,0
2,1,0,0
3,0,1,0
4,0,1,0


In [102]:
"""
    One-hot encode room_type. An empty prefix and separator are passed, so each
    new integer indicator column is named exactly after its room type.
"""

room_type_columns = ["Entire home/apt", "Hotel room", "Private room", "Shared room"]
airbnb_df = pd.get_dummies(airbnb_df, columns=["room_type"], prefix="", prefix_sep="", dtype=int)
airbnb_df[room_type_columns].head(5)

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room
0,0,0,1,0
1,1,0,0,0
2,0,0,1,0
3,1,0,0,0
4,1,0,0,0


In [103]:
"""
    Normalise neighbourhood_group: lower-case every value, then replace the
    misspelled variants with the correct spelling so duplicates collapse.
"""

# misspelling -> canonical name
spelling_fixes = {"brookln": "brooklyn", "manhatan": "manhattan"}

# NOTE(review): the displayed unique values still include 'williamsburg', which
# this map does not touch -- confirm whether it should be merged into another group.
airbnb_df["neighbourhood_group"] = (
    airbnb_df["neighbourhood_group"]
    .str.lower()
    .replace(spelling_fixes)
)
airbnb_df["neighbourhood_group"].unique()

array(['brooklyn', 'manhattan', 'queens', 'williamsburg', 'staten island',
       'bronx'], dtype=object)

In [104]:
"""
    One-hot encode neighbourhood_group into integer indicator columns
    prefixed with "neighbourhood_group".
"""

airbnb_df = pd.get_dummies(
    airbnb_df,
    columns=["neighbourhood_group"],
    prefix="neighbourhood_group",
    dtype=int,
)

In [105]:
"""
    Target-encode neighbourhood: neigh_encoded holds, for every row, the mean
    price of all listings that share the row's neighbourhood.
"""

# NOTE(review): the mean includes the row's own price and is computed over the
# whole frame; if price is the modelling target this may leak -- confirm.
price_by_neighbourhood = airbnb_df.groupby("neighbourhood")["price"]
airbnb_df["neigh_encoded"] = price_by_neighbourhood.transform("mean")
airbnb_df["neigh_encoded"].head(5)

0    611.188011
1    609.853143
2    626.977210
3    632.416740
4    627.555125
Name: neigh_encoded, dtype: float64

In [106]:
"""
    Group nearby listings into clusters based on coordinates.
    Each listing gets the integer label of its KMeans cluster in location_cluster.
"""

# KMeans is imported in the notebook's top import cell.
coords = airbnb_df[["lat", "long"]]

# n_init is pinned explicitly: its default changed in scikit-learn 1.4
# (10 -> "auto"), and 1.2/1.3 emit a FutureWarning when it is left unset, so
# relying on the default makes the clustering depend on the installed version.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
airbnb_df["location_cluster"] = kmeans.fit_predict(coords)
airbnb_df["location_cluster"].head(5)

0    9
1    0
2    7
3    1
4    8
Name: location_cluster, dtype: int32

In [107]:
# How many listings fall into each location cluster.
cluster_sizes = airbnb_df["location_cluster"].value_counts()
cluster_sizes

location_cluster
0    25156
1    19411
6    13017
8    12893
9    10183
7     7876
3     6941
2     3417
4     2196
5      968
Name: count, dtype: int64

In [108]:
# Temporal feature extraction
# Unparseable dates become NaT instead of raising.
airbnb_df["last_review"] = pd.to_datetime(airbnb_df["last_review"], errors="coerce")

# Recency is measured against the newest review in the data rather than the
# wall clock: the feature is then identical on every run, and it can never be
# negative, so the -1 "no review" sentinel below stays unambiguous.
snapshot_date = airbnb_df["last_review"].max()

review_date = airbnb_df["last_review"].dt
temporal_features = {
    "days_since_last_review": (snapshot_date - airbnb_df["last_review"]).dt.days,
    "last_review_year": review_date.year,
    "last_review_month": review_date.month,
    "last_review_dayofweek": review_date.dayofweek,
}

# Listings without a (valid) last_review get -1 in every temporal feature.
for feature_name, feature_values in temporal_features.items():
    airbnb_df[feature_name] = feature_values.fillna(-1)

In [109]:
# Domain-specific feature engineering
REFERENCE_YEAR = 2025  # year that property_age is measured against

airbnb_df = airbnb_df.assign(
    # share of the year the listing is available
    availability_ratio=airbnb_df["availability_365"].div(365),
    # a minimum stay of 0 is treated as 1 to avoid dividing by zero
    price_per_min_stay=airbnb_df["price"].div(airbnb_df["minimum_nights"].replace(0, 1)),
    # unknown construction year -> -1
    property_age=airbnb_df["construction_year"].rsub(REFERENCE_YEAR).fillna(-1),
    # 1 when the text field is present, 0 when it is missing
    has_house_rules=airbnb_df["house_rules"].notna().astype(int),
    has_license=airbnb_df["license"].notna().astype(int),
    total_cost=airbnb_df["price"].add(airbnb_df["service_fee"]),
    popularity_score=airbnb_df["review_rate_number"].mul(airbnb_df["number_of_reviews"]),
)

In [110]:
# Show the column labels of the frame after the feature engineering above.
airbnb_df.columns

Index(['id', 'host_id', 'host_identity_verified', 'host_name', 'neighbourhood',
       'lat', 'long', 'country', 'country_code', 'instant_bookable',
       'construction_year', 'price', 'service_fee', 'minimum_nights',
       'number_of_reviews', 'last_review', 'reviews_per_month',
       'review_rate_number', 'calculated_host_listings_count',
       'availability_365', 'house_rules', 'license', 'name_length',
       'policy_flexible', 'policy_moderate', 'policy_strict',
       'Entire home/apt', 'Hotel room', 'Private room', 'Shared room',
       'neighbourhood_group_bronx', 'neighbourhood_group_brooklyn',
       'neighbourhood_group_manhattan', 'neighbourhood_group_queens',
       'neighbourhood_group_staten island', 'neighbourhood_group_williamsburg',
       'neigh_encoded', 'location_cluster', 'days_since_last_review',
       'last_review_year', 'last_review_month', 'last_review_dayofweek',
       'availability_ratio', 'price_per_min_stay', 'property_age',
       'has_house_rules',

In [111]:
# Advanced Engineering
# Price of each listing relative to the median price of its neighbourhood.
# The per-group medians come from the built-in "median" transform followed by a
# single vectorised division, instead of a Python lambda called once per group.
neighbourhood_median_price = airbnb_df.groupby("neighbourhood")["price"].transform("median")
airbnb_df["price_relative_to_neighbourhood"] = airbnb_df["price"] / neighbourhood_median_price

# Review count divided by (host listing count + 1).
airbnb_df["avg_reviews_per_listing"] = airbnb_df["number_of_reviews"] / (
    airbnb_df["calculated_host_listings_count"] + 1
)

In [112]:
# Missing-value counts, skipping the first 20 columns (positional slice).
missing_per_column = airbnb_df.isna().sum()
missing_per_column.iloc[20:]

house_rules                           51842
license                              102056
name_length                               0
policy_flexible                           0
policy_moderate                           0
policy_strict                             0
Entire home/apt                           0
Hotel room                                0
Private room                              0
Shared room                               0
neighbourhood_group_bronx                 0
neighbourhood_group_brooklyn              0
neighbourhood_group_manhattan             0
neighbourhood_group_queens                0
neighbourhood_group_staten island         0
neighbourhood_group_williamsburg          0
neigh_encoded                             0
location_cluster                          0
days_since_last_review                    0
last_review_year                          0
last_review_month                         0
last_review_dayofweek                     0
availability_ratio              

In [113]:
# Remove these identifier, location and free-text columns from the frame.
columns_to_drop = [
    "id",
    "host_id",
    "neighbourhood",
    "host_name",
    "lat",
    "long",
    "country",
    "country_code",
    "house_rules",
    "license",
]
airbnb_df = airbnb_df.drop(columns=columns_to_drop)

In [None]:
# Write the transformed frame to CSV without the index column.
output_path = Path("../data/transformed/airbnb_listings.csv")
# to_csv does not create missing directories, so make sure the target folder exists.
output_path.parent.mkdir(parents=True, exist_ok=True)
airbnb_df.to_csv(output_path, index=False)

: 