In [19]:
from math import sqrt

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [20]:
# Read the engineered listing features into `df`.
# The CSV is written by notebooks/features.ipynb from the SQLite database.
features_csv = "../data/processed/listing_features.csv"
df = pd.read_csv(features_csv)

print(f"Loaded {len(df)} rows from listing_features.csv")

# Cell output: summary statistics of the target column.
df.loc[:, "price"].describe()

Loaded 20631 rows from listing_features.csv


count    20631.000000
mean       184.845136
std        138.551476
min         10.000000
25%         93.000000
50%        147.000000
75%        229.000000
max       1000.000000
Name: price, dtype: float64

In [21]:
# Data cleaning: filter outliers (same as model2.ipynb)
rows_before = len(df)

# Keep reasonable prices only (bounds are inclusive).
price_low, price_high = 10, 1000
df = df[df["price"].between(price_low, price_high)]

# Inclusive (low, high) bounds for the key numeric columns. A row survives
# only if every one of these columns falls inside its range.
numeric_bounds = {
    "accommodates": (1, 10),
    "bedrooms": (0, 8),
    "beds": (0, 10),
    "bathrooms": (0, 5),
    "review_scores_rating": (1, 5),
    "availability_365": (0, 365),
}
in_range = np.logical_and.reduce([
    df[column].between(low, high)
    for column, (low, high) in numeric_bounds.items()
])
df = df[in_range]

# host_years and reviews_per_month are filtered only when the feature file
# actually contains them.
optional_bounds = {
    "host_years": (0, 20),
    "reviews_per_month": (0, 20),
}
for column, (low, high) in optional_bounds.items():
    if column in df.columns:
        df = df[df[column].between(low, high)]

print(f"Rows before cleaning: {rows_before}")
print(f"Rows after cleaning:  {len(df)}")

# Cell output: price distribution after filtering.
df["price"].describe()

Rows before cleaning: 20631
Rows after cleaning:  20631


count    20631.000000
mean       184.845136
std        138.551476
min         10.000000
25%         93.000000
50%        147.000000
75%        229.000000
max       1000.000000
Name: price, dtype: float64

In [22]:
# Target vector: the listing price column of the cleaned frame.
# The matching feature matrix X is assembled in the next cell, once the
# feature groups have been defined.
y = df.loc[:, "price"]

In [23]:
# Feature groups. Anything derived from the price is kept out of the model
# inputs to avoid data leakage.
target_col = "price"

# Categorical columns (city included)
categorical_features = [
    "city",
    "borough",
    "neighbourhood_name",
    "room_type",
    "property_type_grouped",
    "capacity_bucket",
    "host_listings_bucket",
    "rating_bucket",
]

# Binary flag columns
binary_features = [
    "host_is_superhost",
    "instant_bookable",
    "is_entire_home",
    "is_private_room",
    "is_shared_room",
    "is_hotel_room",
]

# Price-derived leakage columns: these must never be fed to the model
price_leak_cols = [
    "price_per_accommodate",
    "price_per_bed",
    "price_per_bedroom",
    "price_minus_neigh_mean",
    "price_over_neigh_mean",
    "price_minus_neigh_median",
    "price_over_neigh_median",
    "neigh_avg_price",
    "neigh_median_price",
    "estimated_revenue",
]

# Everything that must not end up in the automatically collected numeric group
exclude_cols = [
    target_col, "log_price",                                          # targets
    *categorical_features,
    *binary_features,
    *price_leak_cols,
    "host_since", "first_review", "last_review",                      # date strings
    "property_type", "bathrooms_text", "host_since_dt", "host_name",  # raw text
    "city_listing_count", "city_avg_rating",                          # city features we don't want
]

# numeric_features = every numeric column of df that is not excluded,
# in the frame's own column order
excluded_lookup = set(exclude_cols)
numeric_features = [
    column
    for column, dtype in df.dtypes.items()
    if column not in excluded_lookup and pd.api.types.is_numeric_dtype(dtype)
]

print(f"Numeric features ({len(numeric_features)}):")
print(numeric_features)
print(f"\nCategorical features ({len(categorical_features)}):")
print(categorical_features)
print(f"\nBinary features ({len(binary_features)}):")
print(binary_features)

# Feature matrix: only the safe feature groups (no price-derived columns),
# ordered categorical -> binary -> numeric.
feature_columns = categorical_features + binary_features + numeric_features
X = df.loc[:, feature_columns]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Baseline 1: predict the training-set mean price for every test listing.
global_mean = y_train.mean()
print("Global mean price:", global_mean)

# Constant float prediction vector, one entry per test row.
y_pred_global = np.full(y_test.shape, global_mean, dtype=float)

rmse_global = root_mean_squared_error(y_test, y_pred_global)
print("Global mean baseline RMSE:", rmse_global)

Numeric features (26):
['accommodates', 'bedrooms', 'beds', 'bathrooms', 'latitude', 'longitude', 'number_of_reviews', 'availability_365', 'review_scores_rating', 'calculated_host_listings_count', 'reviews_per_month', 'available_days_365', 'availability_rate_365', 'blocked_or_booked_days_365', 'blocked_or_booked_rate_365', 'log_number_of_reviews', 'log_reviews_per_month', 'availability_ratio', 'is_high_rating', 'is_active_host', 'host_years', 'neigh_listing_count', 'city_superhost_rate', 'city_avg_reviews_per_month', 'city_entire_home_share', 'log_city_listing_count']

Categorical features (8):
['city', 'borough', 'neighbourhood_name', 'room_type', 'property_type_grouped', 'capacity_bucket', 'host_listings_bucket', 'rating_bucket']

Binary features (6):
['host_is_superhost', 'instant_bookable', 'is_entire_home', 'is_private_room', 'is_shared_room', 'is_hotel_room']

Training set size: 16504
Test set size: 4127
Global mean price: 184.17074648570045
Global mean baseline RMSE: 143.0870102

In [24]:
# Baseline 2: predict each test listing's price as the mean training price
# of its neighbourhood.
# Rows are looked up in the original df through the split's index labels.
train_indices = X_train.index
test_indices = X_test.index

# Per-neighbourhood mean price, computed from training rows only.
train_rows = df.loc[train_indices, ["neighbourhood_name", "price"]]
neigh_means = train_rows.groupby("neighbourhood_name")["price"].mean()

# Neighbourhoods with no training mean fall back to the global training mean.
test_neighbourhoods = df.loc[test_indices, "neighbourhood_name"]
y_pred_neigh = test_neighbourhoods.map(neigh_means).fillna(global_mean)

rmse_neigh = root_mean_squared_error(y_test, y_pred_neigh)
print("Neighbourhood mean baseline RMSE:", rmse_neigh)

Neighbourhood mean baseline RMSE: 124.57374477100998


In [25]:
# Linear Regression Model
# Preprocessing is split by column kind:
# - numeric + binary columns: impute missing values with the median, then standardize
# - categorical columns: impute missing values with the most frequent value,
#   then one-hot encode
num_features = numeric_features + binary_features
cat_features = categorical_features

numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # handle_unknown="ignore": a category seen only at predict time is encoded
    # as all zeros instead of raising an error
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its transformer
preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

# Full pipeline: preprocessing + linear regression
linreg_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("regressor", LinearRegression())
])

# Fit on the training data, then predict on the held-out test data
linreg_model.fit(X_train, y_train)
y_pred_lr = linreg_model.predict(X_test)

# Evaluate on the test set. RMSE uses root_mean_squared_error, the same helper
# as both baselines, so every RMSE in the comparison is computed the same way.
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Global mean baseline RMSE:      ", rmse_global)
print("Neighbourhood mean baseline RMSE:", rmse_neigh)
print("Linear regression RMSE:         ", rmse_lr)
print("Linear regression MAE:          ", mae_lr)
print("Linear regression R^2:          ", r2_lr)

Global mean baseline RMSE:       143.08701026598013
Neighbourhood mean baseline RMSE: 124.57374477100998
Linear regression RMSE:          94.61117696759183
Linear regression MAE:           63.26836005160132
Linear regression R^2:           0.5625528138666831


In [26]:
# Random Forest Regressor (tree-based, non-linear model)
# Uses the same preprocessing recipe as linear regression, but on an unfitted
# copy. Passing `preprocess` itself would put one shared transformer object in
# both pipelines, so fitting this model would refit the transformer that
# linreg_model relies on.
rf_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("regressor", RandomForestRegressor(
        n_estimators=200,
        max_depth=None,        # let trees grow fully
        n_jobs=-1,             # use all CPU cores
        random_state=42        # fixed seed for reproducible forests
    ))
])

# Fit on the training data, then predict on the held-out test data
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Evaluate on the test set. RMSE uses root_mean_squared_error, the same helper
# as both baselines, so every RMSE in the comparison is computed the same way.
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest RMSE:", rmse_rf)
print("Random Forest MAE: ", mae_rf)
print("Random Forest R^2: ", r2_rf)

Random Forest RMSE: 75.69445785138063
Random Forest MAE:  46.60657487278895
Random Forest R^2:  0.71999299101695


In [27]:
# Side-by-side report of the test-set metrics computed in the earlier cells.
# Each section header starts with "\n" so it is preceded by a blank line.
report_lines = [
    "\n=== Model comparison on test set ===",
    f"Global mean baseline RMSE:        {rmse_global:.2f}",
    f"Neighbourhood mean baseline RMSE: {rmse_neigh:.2f}",
    f"Linear Regression RMSE:           {rmse_lr:.2f}",
    f"Random Forest RMSE:               {rmse_rf:.2f}",
    "\nMAE scores:",
    f"Linear Regression MAE:            {mae_lr:.2f}",
    f"Random Forest MAE:                {mae_rf:.2f}",
    "\nR^2 scores:",
    f"Linear Regression R^2:            {r2_lr:.3f}",
    f"Random Forest R^2:                {r2_rf:.3f}",
]
print("\n".join(report_lines))



=== Model comparison on test set ===
Global mean baseline RMSE:        143.09
Neighbourhood mean baseline RMSE: 124.57
Linear Regression RMSE:           94.61
Random Forest RMSE:               75.69

MAE scores:
Linear Regression MAE:            63.27
Random Forest MAE:                46.61

R^2 scores:
Linear Regression R^2:            0.563
Random Forest R^2:                0.720
