In [14]:
# Feature Factory: Official Feature Engineering Notebook
# 
# This notebook is the single source of truth for feature engineering.
# It reads from the SQLite database and creates listing_features.csv
# which is then consumed by model.ipynb for training.
#
# Pipeline: DB → features.ipynb → listing_features.csv → model.ipynb

import pandas as pd
import numpy as np
import sqlite3
import os

# 1. Load the raw listing rows from the SQLite database
db_path = "../data/airbnb.db"

# sqlite3.connect() silently creates a new, empty database file when the path
# does not exist; the query would then fail with a misleading "no such table"
# error and leave a stray file behind. Fail early with a clear message instead.
if not os.path.exists(db_path):
    raise FileNotFoundError(f"SQLite database not found: {db_path}")

# Listings that have a price, inner-joined to their neighbourhood so that
# borough and neighbourhood_name are available as location columns.
# Listings without a matching neighbourhood row are dropped by the JOIN.
query = """
SELECT 
    l.listing_id,
    l.price,
    n.borough,
    n.neighbourhood_name,
    
    -- basic size / capacity
    l.accommodates,
    l.bedrooms,
    l.beds,
    l.bathrooms,
    
    -- host / quality info
    l.host_is_superhost,
    l.number_of_reviews,
    l.review_scores_rating,
    l.availability_365,
    l.reviews_per_month,
    l.estimated_revenue,
    
    -- categorical info for later
    l.room_type,
    l.property_type
    
FROM listing l
JOIN neighbourhood n 
    ON l.neighbourhood_id = n.neighbourhood_id
WHERE l.price IS NOT NULL
"""

# Using a sqlite3 connection as a context manager only commits / rolls back the
# transaction; it does NOT close the connection. Close it explicitly so that
# re-running this cell does not leak open handles on the database file.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

print(f"Rows returned from DB: {df.shape[0]}")
df.head()

Rows returned from DB: 14436


Unnamed: 0,listing_id,price,borough,neighbourhood_name,accommodates,bedrooms,beds,bathrooms,host_is_superhost,number_of_reviews,review_scores_rating,availability_365,reviews_per_month,estimated_revenue,room_type,property_type
0,2595,240.0,Manhattan,Midtown,1,0,1,1.0,0,47,4.68,289,0.24,0.0,Entire home/apt,Entire rental unit
1,6848,96.0,Brooklyn,Williamsburg,3,2,1,1.0,1,195,4.59,285,0.98,17280.0,Entire home/apt,Entire rental unit
2,6872,59.0,Manhattan,East Harlem,1,1,1,1.0,0,1,5.0,83,0.02,0.0,Private room,Private room in condo
3,6990,73.0,Manhattan,East Harlem,1,2,2,1.0,0,249,4.88,186,1.28,17520.0,Private room,Private room in rental unit
4,7097,216.0,Brooklyn,Fort Greene,2,1,2,1.0,1,423,4.89,0,2.21,55080.0,Private room,Private room in guest suite


In [15]:
# 2. Start the feature table
# Columns carried over from the raw query result, listed in output order.
feature_columns = [
    # ID and target
    "listing_id",
    "price",
    # Location features
    "borough",
    "neighbourhood_name",
    # Size / capacity
    "accommodates",
    "bedrooms",
    "beds",
    "bathrooms",
    # Host / demand info
    "host_is_superhost",
    "number_of_reviews",
    "review_scores_rating",
    "availability_365",
    "reviews_per_month",
    "estimated_revenue",
    # Categorical listing types
    "room_type",
    "property_type",
]

# Numeric columns that are cast to float. listing_id, host_is_superhost and
# the text columns keep whatever dtype the query result gave them.
float_columns = [
    "price",
    "accommodates",
    "bedrooms",
    "beds",
    "bathrooms",
    "number_of_reviews",
    "review_scores_rating",
    "availability_365",
    "reviews_per_month",
    "estimated_revenue",
]

# Select and cast in one step; the result is a new frame independent of `df`.
listing_features = df[feature_columns].astype({name: float for name in float_columns})

listing_features.head()

Unnamed: 0,listing_id,price,borough,neighbourhood_name,accommodates,bedrooms,beds,bathrooms,host_is_superhost,number_of_reviews,review_scores_rating,availability_365,reviews_per_month,estimated_revenue,room_type,property_type
0,2595,240.0,Manhattan,Midtown,1.0,0.0,1.0,1.0,0,47.0,4.68,289.0,0.24,0.0,Entire home/apt,Entire rental unit
1,6848,96.0,Brooklyn,Williamsburg,3.0,2.0,1.0,1.0,1,195.0,4.59,285.0,0.98,17280.0,Entire home/apt,Entire rental unit
2,6872,59.0,Manhattan,East Harlem,1.0,1.0,1.0,1.0,0,1.0,5.0,83.0,0.02,0.0,Private room,Private room in condo
3,6990,73.0,Manhattan,East Harlem,1.0,2.0,2.0,1.0,0,249.0,4.88,186.0,1.28,17520.0,Private room,Private room in rental unit
4,7097,216.0,Brooklyn,Fort Greene,2.0,1.0,2.0,1.0,1,423.0,4.89,0.0,2.21,55080.0,Private room,Private room in guest suite


In [16]:
# 3. Engineered features
# NOTE(review): 3.1-3.4 are derived from `price` itself. If `price` is the
# prediction target, confirm the consumer of the CSV does not use them as
# model inputs.
price = listing_features["price"]

# 3.1-3.3 Price per accommodate / bedroom / bed.
# A zero count is replaced by NaN first, so the ratio becomes NaN instead of inf.
per_unit_features = (
    ("accommodates", "price_per_accommodate"),
    ("bedrooms", "price_per_bedroom"),
    ("beds", "price_per_bed"),
)
for unit_column, feature_name in per_unit_features:
    unit_count = listing_features[unit_column].replace(0, np.nan)
    listing_features[feature_name] = price / unit_count

# 3.4 Log price: log(1 + price) compresses large price differences
listing_features["log_price"] = np.log1p(price)

# 3.5 Log number of reviews: log(1 + n), since review counts are long-tailed
review_counts = listing_features["number_of_reviews"]
listing_features["log_number_of_reviews"] = np.log1p(review_counts)

# 3.6 Availability ratio: available days as a fraction of a 365-day year
listing_features["availability_ratio"] = listing_features["availability_365"].div(365.0)

# 3.7 High rating flag: 1 if rating >= 4.8, else 0 (a missing rating compares False -> 0)
listing_features["is_high_rating"] = listing_features["review_scores_rating"].ge(4.8).astype(int)

# 3.8 Active host flag: 1 if reviews_per_month > 0, else 0 (missing compares False -> 0)
listing_features["is_active_host"] = listing_features["reviews_per_month"].gt(0).astype(int)

listing_features.head()

Unnamed: 0,listing_id,price,borough,neighbourhood_name,accommodates,bedrooms,beds,bathrooms,host_is_superhost,number_of_reviews,...,room_type,property_type,price_per_accommodate,price_per_bedroom,price_per_bed,log_price,log_number_of_reviews,availability_ratio,is_high_rating,is_active_host
0,2595,240.0,Manhattan,Midtown,1.0,0.0,1.0,1.0,0,47.0,...,Entire home/apt,Entire rental unit,240.0,,240.0,5.484797,3.871201,0.791781,0,1
1,6848,96.0,Brooklyn,Williamsburg,3.0,2.0,1.0,1.0,1,195.0,...,Entire home/apt,Entire rental unit,32.0,48.0,96.0,4.574711,5.278115,0.780822,0,1
2,6872,59.0,Manhattan,East Harlem,1.0,1.0,1.0,1.0,0,1.0,...,Private room,Private room in condo,59.0,59.0,59.0,4.094345,0.693147,0.227397,1,1
3,6990,73.0,Manhattan,East Harlem,1.0,2.0,2.0,1.0,0,249.0,...,Private room,Private room in rental unit,73.0,36.5,36.5,4.304065,5.521461,0.509589,1,1
4,7097,216.0,Brooklyn,Fort Greene,2.0,1.0,2.0,1.0,1,423.0,...,Private room,Private room in guest suite,108.0,216.0,108.0,5.379897,6.049733,0.0,1,1


In [17]:
# 4. Save features to CSV
out_path = "../data/processed/listing_features.csv"

# Create the target directory (derived from the output path) if it is missing.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# Write without the DataFrame index so the CSV contains only feature columns.
listing_features.to_csv(out_path, index=False)

print(f"Features saved to: {out_path}")
out_path


Features saved to: ../data/processed/listing_features.csv


'../data/processed/listing_features.csv'