In [None]:
import pandas as pd
import numpy as np
import sqlite3
import os

# Location of the SQLite database, relative to the notebook's working directory.
db_path = "../data/airbnb.db"

# Every listing column plus the borough / neighbourhood name of its
# neighbourhood. The LEFT JOIN keeps listings whose neighbourhood_id has no
# matching row in `neighbourhood` (their borough / name come back as NULL).
base_query = """
SELECT 
    l.*,
    n.borough,
    n.neighbourhood_name
FROM listing l
LEFT JOIN neighbourhood n
    ON l.neighbourhood_id = n.neighbourhood_id
"""

# sqlite3.connect() creates a new, empty database file when the path does not
# exist, which would only surface later as a confusing "no such table" error.
if not os.path.isfile(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

# Using a sqlite3 connection as a context manager only commits or rolls back
# the transaction; it does not close the connection. Close it explicitly so
# the file handle is released even if the query fails.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(base_query, conn)
finally:
    conn.close()

print(f"Rows returned from DB: {df.shape[0]}")
df.head()

Rows returned from DB: 20631


Unnamed: 0,listing_id,neighbourhood_id,city,host_id,host_name,host_since,host_is_superhost,room_type,property_type,accommodates,...,availability_365,estimated_revenue,first_review,last_review,review_scores_rating,instant_bookable,calculated_host_listings_count,reviews_per_month,borough,neighbourhood_name
0,2595,115,NYC,2845,Jennifer,2008-09-09,0,Entire home/apt,Entire rental unit,1,...,289,0.0,2009-11-21,2022-06-21,4.68,0,3,0.24,Manhattan,Midtown
1,3344,268,Washington DC,4957,A.J.,2008-12-10,0,Entire home/apt,Entire condo,2,...,362,0.0,2009-05-09,2016-08-31,5.0,0,2,0.05,,"Downtown, Chinatown, Penn Quarters, Mount Vern..."
2,3686,276,Washington DC,4645,Vita,2008-11-26,0,Private room,Private room in home,1,...,298,0.0,2010-11-01,2023-08-30,4.64,0,1,0.47,,Historic Anacostia
3,3781,240,Boston,4804,Frank,2008-12-03,1,Entire home/apt,Entire rental unit,2,...,326,0.0,2015-07-10,2024-08-09,4.96,0,1,0.21,,East Boston
4,3943,271,Washington DC,5059,Vasa,2008-12-12,1,Private room,Private room in townhouse,2,...,331,19434.0,2009-05-10,2025-05-27,4.86,0,5,2.78,,"Edgewood, Bloomingdale, Truxton Circle, Eckington"


In [None]:
# Name of the price column; it is only recorded here, not used in this cell.
target_col = "price"

# Columns removed before feature engineering, grouped by the reason their
# list names give (identifiers, leakage-prone values, free-text identifiers).
id_cols = ["listing_id", "neighbourhood_id", "host_id"]
leaky_cols = ["estimated_revenue"]
text_id_cols = ["host_name"]
drop_cols = [*id_cols, *leaky_cols, *text_id_cols]

# errors="ignore" skips any listed column that is absent from the frame.
df_model = df.drop(columns=drop_cols, errors="ignore")

# Independent copy, so the feature columns added later do not touch df_model.
df_features = df_model.copy()

df_features.head()

Unnamed: 0,city,host_since,host_is_superhost,room_type,property_type,accommodates,bedrooms,beds,bathrooms,bathrooms_text,...,number_of_reviews,availability_365,first_review,last_review,review_scores_rating,instant_bookable,calculated_host_listings_count,reviews_per_month,borough,neighbourhood_name
0,NYC,2008-09-09,0,Entire home/apt,Entire rental unit,1,0,1,1.0,1 bath,...,47,289,2009-11-21,2022-06-21,4.68,0,3,0.24,Manhattan,Midtown
1,Washington DC,2008-12-10,0,Entire home/apt,Entire condo,2,1,3,1.0,1 bath,...,10,362,2009-05-09,2016-08-31,5.0,0,2,0.05,,"Downtown, Chinatown, Penn Quarters, Mount Vern..."
2,Washington DC,2008-11-26,0,Private room,Private room in home,1,1,2,1.0,1 shared bath,...,84,298,2010-11-01,2023-08-30,4.64,0,1,0.47,,Historic Anacostia
3,Boston,2008-12-03,1,Entire home/apt,Entire rental unit,2,1,1,1.0,1 bath,...,26,326,2015-07-10,2024-08-09,4.96,0,1,0.21,,East Boston
4,Washington DC,2008-12-12,1,Private room,Private room in townhouse,2,1,1,1.0,1 private bath,...,546,331,2009-05-10,2025-05-27,4.86,0,5,2.78,,"Edgewood, Bloomingdale, Truxton Circle, Eckington"


In [None]:
# Log-scaled price; log1p keeps a price of 0 finite.
df_features["log_price"] = np.log1p(df_features["price"])

# Capacity columns with 0 turned into NaN, so the ratios below come out as
# NaN rather than inf when a listing reports zero guests / beds / bedrooms.
accom, beds, bedrooms = (
    df_features[capacity_col].replace(0, np.nan)
    for capacity_col in ("accommodates", "beds", "bedrooms")
)
for ratio_col, denominator in (
    ("price_per_accommodate", accom),
    ("price_per_bed", beds),
    ("price_per_bedroom", bedrooms),
):
    df_features[ratio_col] = df_features["price"] / denominator

# Availability over the next 365 days, as a day count and as a share, plus
# the complementary "not available" count and share.
df_features["available_days_365"] = df_features["availability_365"]
df_features["availability_rate_365"] = df_features["available_days_365"].div(365.0)
df_features["blocked_or_booked_days_365"] = df_features["available_days_365"].rsub(365)
df_features["blocked_or_booked_rate_365"] = df_features["blocked_or_booked_days_365"].div(365.0)

# Log-scaled review volume. Negative review rates are floored at 0 first.
df_features["log_number_of_reviews"] = np.log1p(df_features["number_of_reviews"])
rpm = df_features["reviews_per_month"].clip(lower=0)
df_features["log_reviews_per_month"] = np.log1p(rpm)

# Alias of availability_rate_365 under a second column name.
df_features["availability_ratio"] = df_features["availability_rate_365"]

# Binary flags. A missing rating or review rate compares False, so it maps to 0.
df_features["is_high_rating"] = df_features["review_scores_rating"].ge(4.8).astype(int)
df_features["is_active_host"] = df_features["reviews_per_month"].gt(0).astype(int)

df_features.head()

Unnamed: 0,city,host_since,host_is_superhost,room_type,property_type,accommodates,bedrooms,beds,bathrooms,bathrooms_text,...,price_per_bedroom,available_days_365,availability_rate_365,blocked_or_booked_days_365,blocked_or_booked_rate_365,log_number_of_reviews,log_reviews_per_month,availability_ratio,is_high_rating,is_active_host
0,NYC,2008-09-09,0,Entire home/apt,Entire rental unit,1,0,1,1.0,1 bath,...,,289,0.791781,76,0.208219,3.871201,0.215111,0.791781,0,1
1,Washington DC,2008-12-10,0,Entire home/apt,Entire condo,2,1,3,1.0,1 bath,...,150.0,362,0.991781,3,0.008219,2.397895,0.04879,0.991781,1,1
2,Washington DC,2008-11-26,0,Private room,Private room in home,1,1,2,1.0,1 shared bath,...,60.0,298,0.816438,67,0.183562,4.442651,0.385262,0.816438,0,1
3,Boston,2008-12-03,1,Entire home/apt,Entire rental unit,2,1,1,1.0,1 bath,...,125.0,326,0.893151,39,0.106849,3.295837,0.19062,0.893151,1,1
4,Washington DC,2008-12-12,1,Private room,Private room in townhouse,2,1,1,1.0,1 private bath,...,79.0,331,0.906849,34,0.093151,6.304449,1.329724,0.906849,1,1


In [None]:
# Guest-capacity buckets. pd.cut intervals are right-closed by default, so
# (0, 2] -> "1-2", (2, 4] -> "3-4", ...; accommodates == 0 stays unbucketed.
cap_bins = [0, 2, 4, 6, 8, 12, np.inf]
cap_labels = ["1-2", "3-4", "5-6", "7-8", "9-12", "13+"]

df_features["capacity_bucket"] = pd.cut(
    df_features["accommodates"],
    bins=cap_bins,
    labels=cap_labels
)

# Host tenure in years, measured up to the most recent review in the data.
# Unparseable host_since values become NaT and therefore NaN host_years.
df_features["host_since_dt"] = pd.to_datetime(
    df_features["host_since"], errors="coerce"
)
# Parse the dates before taking the maximum so the comparison is
# chronological and missing values are skipped, instead of relying on
# string ordering of the raw column.
ref_date = pd.to_datetime(df_features["last_review"], errors="coerce").max()
df_features["host_years"] = (
    (ref_date - df_features["host_since_dt"]).dt.days / 365.25
)

# Portfolio-size buckets (right-closed): (0, 1] -> "1", (1, 3] -> "2-3", ...,
# (50, inf) -> "50+".
df_features["host_listings_bucket"] = pd.cut(
    df_features["calculated_host_listings_count"],
    bins=[0, 1, 3, 10, 50, np.inf],
    labels=["1", "2-3", "4-10", "11-50", "50+"]
)

# Rating buckets use left-closed intervals so the edges match the labels and
# the `is_high_rating` flag (rating >= 4.8): exactly 4.0 is no longer
# labelled "<4.0" and exactly 4.8 lands in "4.8-5.0". Non-positive ratings
# are masked first so they remain unbucketed (NaN), as they were before.
rating = df_features["review_scores_rating"]
df_features["rating_bucket"] = pd.cut(
    rating.where(rating > 0),
    bins=[0, 4.0, 4.5, 4.8, 5.1],
    labels=["<4.0", "4.0-4.5", "4.5-4.8", "4.8-5.0"],
    right=False,
)

# Remove results of a previous run of this cell so that re-running it does
# not create suffixed duplicates (_x / _y) in the merge below.
neigh_feature_cols = [
    "neigh_avg_price", "neigh_median_price", "neigh_listing_count",
    "price_minus_neigh_mean", "price_over_neigh_mean",
    "price_minus_neigh_median", "price_over_neigh_median"
]
df_features = df_features.drop(columns=neigh_feature_cols, errors="ignore")

# Neighbourhood names are not unique across cities, and neighbourhood_id has
# already been dropped, so a neighbourhood is identified by (city, name).
# Grouping by name alone would pool prices of same-named neighbourhoods in
# different cities. Rows with a missing key get NaN statistics.
neigh_keys = ["city", "neighbourhood_name"]

neigh_stats = (
    df_features
    .groupby(neigh_keys)["price"]
    .agg(
        neigh_avg_price="mean",
        neigh_median_price="median",
        neigh_listing_count="size",
    )
    .reset_index()
)

df_features = df_features.merge(neigh_stats, on=neigh_keys, how="left")

# Position of each listing's price relative to its neighbourhood, as a
# difference and as a ratio, against both the mean and the median.
# NOTE(review): these columns and the neighbourhood statistics are derived
# from `price` over all rows, including the listing itself. Confirm they are
# not fed to a model that predicts price.
df_features["price_minus_neigh_mean"] = (
    df_features["price"] - df_features["neigh_avg_price"]
)
df_features["price_over_neigh_mean"] = (
    df_features["price"] / df_features["neigh_avg_price"]
)
df_features["price_minus_neigh_median"] = (
    df_features["price"] - df_features["neigh_median_price"]
)
df_features["price_over_neigh_median"] = (
    df_features["price"] / df_features["neigh_median_price"]
)

df_features.head()


Unnamed: 0,city,host_since,host_is_superhost,room_type,property_type,accommodates,bedrooms,beds,bathrooms,bathrooms_text,...,host_years,host_listings_bucket,rating_bucket,neigh_avg_price,neigh_median_price,neigh_listing_count,price_minus_neigh_mean,price_over_neigh_mean,price_minus_neigh_median,price_over_neigh_median
0,NYC,2008-09-09,0,Entire home/apt,Entire rental unit,1,0,1,1.0,1 bath,...,17.059548,2-3,4.5-4.8,324.81558,281.0,629,-84.81558,0.738881,-41.0,0.854093
1,Washington DC,2008-12-10,0,Entire home/apt,Entire condo,2,1,3,1.0,1 bath,...,16.807666,2-3,4.8-5.0,262.678322,270.0,143,-112.678322,0.571041,-120.0,0.555556
2,Washington DC,2008-11-26,0,Private room,Private room in home,1,1,2,1.0,1 shared bath,...,16.845996,1,4.5-4.8,150.518519,98.0,27,-90.518519,0.398622,-38.0,0.612245
3,Boston,2008-12-03,1,Entire home/apt,Entire rental unit,2,1,1,1.0,1 bath,...,16.826831,1,4.8-5.0,183.130435,157.5,138,-58.130435,0.682574,-32.5,0.793651
4,Washington DC,2008-12-12,1,Private room,Private room in townhouse,2,1,1,1.0,1 private bath,...,16.80219,4-10,4.8-5.0,128.83165,119.0,297,-49.83165,0.613203,-40.0,0.663866


In [None]:
# City-level context: listing count, superhost share, mean rating and mean
# review rate, one row per city.
city_env = (
    df_features
    .groupby("city")
    .agg(
        city_listing_count=("city", "size"),
        city_superhost_rate=("host_is_superhost", "mean"),
        city_avg_rating=("review_scores_rating", "mean"),
        city_avg_reviews_per_month=("reviews_per_month", "mean"),
    )
    .reset_index()
)

# Share of entire-home listings per city. The flag is always derived from
# room_type, so the result does not depend on whether an `is_entire_home`
# column already exists in df_features.
entire_home_flag = df_features["room_type"].eq("Entire home/apt").astype(int)
entire_share = (
    entire_home_flag
    .groupby(df_features["city"])
    .mean()
    .rename("city_entire_home_share")
    .reset_index()
)
city_env = city_env.merge(entire_share, on="city", how="left")

city_env["log_city_listing_count"] = np.log1p(city_env["city_listing_count"])

# Drop city features left over from a previous run of this cell before
# merging; otherwise a re-run would produce suffixed duplicates (_x / _y).
city_feature_cols = [c for c in city_env.columns if c != "city"]
df_features = (
    df_features
    .drop(columns=city_feature_cols, errors="ignore")
    .merge(city_env, on="city", how="left")
)

df_features.head()


Unnamed: 0,city,host_since,host_is_superhost,room_type,property_type,accommodates,bedrooms,beds,bathrooms,bathrooms_text,...,price_minus_neigh_mean,price_over_neigh_mean,price_minus_neigh_median,price_over_neigh_median,city_listing_count,city_superhost_rate,city_avg_rating,city_avg_reviews_per_month,city_entire_home_share,log_city_listing_count
0,NYC,2008-09-09,0,Entire home/apt,Entire rental unit,1,0,1,1.0,1 bath,...,-84.81558,0.738881,-41.0,0.854093,14122,0.391092,4.735708,1.086862,0.557287,9.55556
1,Washington DC,2008-12-10,0,Entire home/apt,Entire condo,2,1,3,1.0,1 bath,...,-112.678322,0.571041,-120.0,0.555556,3985,0.52522,4.781591,2.241325,0.803262,8.290544
2,Washington DC,2008-11-26,0,Private room,Private room in home,1,1,2,1.0,1 shared bath,...,-90.518519,0.398622,-38.0,0.612245,3985,0.52522,4.781591,2.241325,0.803262,8.290544
3,Boston,2008-12-03,1,Entire home/apt,Entire rental unit,2,1,1,1.0,1 bath,...,-58.130435,0.682574,-32.5,0.793651,2524,0.433439,4.731446,1.910008,0.701664,7.833996
4,Washington DC,2008-12-12,1,Private room,Private room in townhouse,2,1,1,1.0,1 private bath,...,-49.83165,0.613203,-40.0,0.663866,3985,0.52522,4.781591,2.241325,0.803262,8.290544


In [None]:
# One 0/1 indicator column per room_type value listed here.
room_type_flags = {
    "is_entire_home": "Entire home/apt",
    "is_private_room": "Private room",
    "is_shared_room": "Shared room",
    "is_hotel_room": "Hotel room",
}
for flag_col, room_label in room_type_flags.items():
    df_features[flag_col] = df_features["room_type"].eq(room_label).astype(int)

# Keep the 8 most frequent property types as they are; every other value
# (including a missing one) is collapsed into "Other".
top_props = df_features["property_type"].value_counts().nlargest(8).index
is_top_prop = df_features["property_type"].isin(top_props)
df_features["property_type_grouped"] = np.where(
    is_top_prop, df_features["property_type"], "Other"
)

df_features.head()


Unnamed: 0,city,host_since,host_is_superhost,room_type,property_type,accommodates,bedrooms,beds,bathrooms,bathrooms_text,...,city_superhost_rate,city_avg_rating,city_avg_reviews_per_month,city_entire_home_share,log_city_listing_count,is_entire_home,is_private_room,is_shared_room,is_hotel_room,property_type_grouped
0,NYC,2008-09-09,0,Entire home/apt,Entire rental unit,1,0,1,1.0,1 bath,...,0.391092,4.735708,1.086862,0.557287,9.55556,1,0,0,0,Entire rental unit
1,Washington DC,2008-12-10,0,Entire home/apt,Entire condo,2,1,3,1.0,1 bath,...,0.52522,4.781591,2.241325,0.803262,8.290544,1,0,0,0,Entire condo
2,Washington DC,2008-11-26,0,Private room,Private room in home,1,1,2,1.0,1 shared bath,...,0.52522,4.781591,2.241325,0.803262,8.290544,0,1,0,0,Private room in home
3,Boston,2008-12-03,1,Entire home/apt,Entire rental unit,2,1,1,1.0,1 bath,...,0.433439,4.731446,1.910008,0.701664,7.833996,1,0,0,0,Entire rental unit
4,Washington DC,2008-12-12,1,Private room,Private room in townhouse,2,1,1,1.0,1 private bath,...,0.52522,4.781591,2.241325,0.803262,8.290544,0,1,0,0,Private room in townhouse


In [None]:
# Destination for the engineered feature table.
out_dir = "../data/processed"
out_path = "../data/processed/listing_features.csv"

# Create the output folder if needed, then write the frame without its index.
os.makedirs(out_dir, exist_ok=True)
df_features.to_csv(out_path, index=False)

# Short report of what was written.
print(f"Features saved to: {out_path}")
print(f"Shape: {df_features.shape}")
print(f"\nColumns: {list(df_features.columns)}")


Features saved to: ../data/processed/listing_features.csv
Shape: (20631, 59)

Columns: ['city', 'host_since', 'host_is_superhost', 'room_type', 'property_type', 'accommodates', 'bedrooms', 'beds', 'bathrooms', 'bathrooms_text', 'latitude', 'longitude', 'price', 'number_of_reviews', 'availability_365', 'first_review', 'last_review', 'review_scores_rating', 'instant_bookable', 'calculated_host_listings_count', 'reviews_per_month', 'borough', 'neighbourhood_name', 'log_price', 'price_per_accommodate', 'price_per_bed', 'price_per_bedroom', 'available_days_365', 'availability_rate_365', 'blocked_or_booked_days_365', 'blocked_or_booked_rate_365', 'log_number_of_reviews', 'log_reviews_per_month', 'availability_ratio', 'is_high_rating', 'is_active_host', 'capacity_bucket', 'host_since_dt', 'host_years', 'host_listings_bucket', 'rating_bucket', 'neigh_avg_price', 'neigh_median_price', 'neigh_listing_count', 'price_minus_neigh_mean', 'price_over_neigh_mean', 'price_minus_neigh_median', 'price_ov