In [135]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

In [136]:
# Read the engineered feature table produced by the feature factory.
# The CSV is generated by notebooks/features.ipynb from the SQLite database.
df = pd.read_csv(filepath_or_buffer="../data/processed/listing_features.csv")

# Report how many rows were read, then summarise the raw target column.
print(f"Loaded {df.shape[0]} rows from listing_features.csv")
df["price"].describe()

Loaded 14436 rows from listing_features.csv


count    14436.000000
mean       315.920477
std       2321.743611
min         10.000000
25%         91.000000
50%        149.000000
75%        237.000000
max      50052.000000
Name: price, dtype: float64

In [137]:
# Keep only listings whose price lies in [20, 1000], both bounds inclusive.
# Rows with a missing price fail both comparisons and are dropped as well.
df = df.loc[df["price"].ge(20) & df["price"].le(1000)]

# Re-check the target distribution after trimming the extremes.
df["price"].describe()

count    14236.000000
mean       187.866044
std        147.111421
min         22.000000
25%         90.000000
50%        147.000000
75%        230.000000
max       1000.000000
Name: price, dtype: float64

In [138]:
# Target vector: the listing price. The matching feature matrix X is built in
# the next cell, once the feature lists have been defined.
y = df.loc[:, "price"]

In [139]:
# Model inputs. Price-derived columns are deliberately left out so that the
# target cannot leak into the features.
numeric_features = [
    "accommodates", "bedrooms", "beds", "bathrooms",
    "number_of_reviews", "review_scores_rating",
    "availability_365", "reviews_per_month",
    "log_number_of_reviews", "availability_ratio",
]

categorical_features = [
    "borough", "neighbourhood_name",
    "host_is_superhost", "room_type", "property_type",
    "is_high_rating", "is_active_host",
]

# Feature matrix: numeric columns first, then categorical, in list order.
X = df.loc[:, numeric_features + categorical_features]

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Baseline 1: predict the training-set mean price for every test listing.
global_mean = y_train.mean()
print("Global mean price:", global_mean)

# Constant float prediction vector, one entry per test row.
y_pred_global = np.full(len(y_test), global_mean, dtype=float)

rmse_global = root_mean_squared_error(y_test, y_pred_global)
print("Global mean baseline RMSE:", rmse_global)

Training set size: 11388
Test set size: 2848
Global mean price: 187.49420442571127
Global mean baseline RMSE: 146.05705276456985


In [140]:
# Baseline 2: Neighbourhood mean price
# Every test listing is predicted as the mean training price of its
# neighbourhood. The row labels of the split are reused to look the listings
# up in df.
train_indices = X_train.index
test_indices = X_test.index

# Means are computed from training rows only, so no test prices enter the
# baseline. Rows and columns are selected in one .loc call instead of chaining
# a second [] lookup onto a full-row selection, which avoids copying every
# column just to read two of them.
neigh_means = (
    df.loc[train_indices, ["neighbourhood_name", "price"]]
    .groupby("neighbourhood_name")["price"]
    .mean()
)

# Test listings whose neighbourhood has no training mean come back from map()
# as NaN; those fall back to the global training mean.
y_pred_neigh = (
    df.loc[test_indices, "neighbourhood_name"]
    .map(neigh_means)
    .fillna(global_mean)
)

rmse_neigh = root_mean_squared_error(y_test, y_pred_neigh)
print("Neighbourhood mean baseline RMSE:", rmse_neigh)

Neighbourhood mean baseline RMSE: 128.68244689850806


In [141]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# numeric_features, categorical_features, X_train/X_test and y_train/y_test all
# come from the feature-definition / train-test split cell earlier in the
# notebook. The feature lists leave out every price-derived column to avoid
# data leakage:
#   - price_per_accommodate, price_per_bedroom, price_per_bed
#   - log_price
#   - estimated_revenue

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill gaps with the column median.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Route each column group through its own branch.
preprocess = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
])

# End-to-end model: preprocessing followed by ordinary linear regression.
linreg_model = Pipeline([
    ("preprocess", preprocess),
    ("regressor", LinearRegression()),
])

# Train on the training split, then score on the held-out test split.
linreg_model.fit(X_train, y_train)
y_pred_lr = linreg_model.predict(X_test)
rmse_lr = root_mean_squared_error(y_test, y_pred_lr)

# Show the linear model next to both baselines.
print("Global mean baseline RMSE:      ", rmse_global)
print("Neighbourhood mean baseline RMSE:", rmse_neigh)
print("Linear regression RMSE:         ", rmse_lr)

Global mean baseline RMSE:       146.05705276456985
Neighbourhood mean baseline RMSE: 128.68244689850806
Linear regression RMSE:          103.3552766916125


In [142]:
# Random Forest Regressor (tree-based, non-linear model)
#
# The preprocessor is cloned so this pipeline fits its own unfitted copy with
# the same parameters. A Pipeline without caching fits the step objects it was
# given, so passing the very same `preprocess` instance that linreg_model
# holds would make fitting either pipeline overwrite the other's fitted
# preprocessor state.
rf_model = Pipeline(steps=[
    ("preprocess", clone(preprocess)),
    ("regressor", RandomForestRegressor(
        n_estimators=200,
        max_depth=None,        # let trees grow fully (you can tune this)
        min_samples_leaf=2,
        n_jobs=-1,             # use all CPU cores
        random_state=42        # fixed seed so the forest is reproducible
    ))
])

# Fit on training data
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate on the held-out test split
rmse_rf = root_mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest RMSE:", rmse_rf)
print("Random Forest R^2:", r2_rf)

Random Forest RMSE: 92.32315626273984
Random Forest R^2: 0.6003806054970098


In [143]:
# Side-by-side test-set RMSE for both baselines and both fitted models.
# Each argument becomes one output line via sep="\n".
print(
    "\n=== Model comparison on test set ===",
    f"Global mean baseline RMSE:        {rmse_global:.2f}",
    f"Neighbourhood mean baseline RMSE: {rmse_neigh:.2f}",
    f"Linear Regression RMSE:           {rmse_lr:.2f}",
    f"Random Forest RMSE:               {rmse_rf:.2f}",
    sep="\n",
)

# R^2 for the two fitted models; the linear-regression score is computed here
# because no earlier cell stored it.
print(
    "\nR^2 scores:",
    f"Linear Regression R^2:            {r2_score(y_test, y_pred_lr):.3f}",
    f"Random Forest R^2:                {r2_rf:.3f}",
    sep="\n",
)



=== Model comparison on test set ===
Global mean baseline RMSE:        146.06
Neighbourhood mean baseline RMSE: 128.68
Linear Regression RMSE:           103.36
Random Forest RMSE:               92.32

R^2 scores:
Linear Regression R^2:            0.499
Random Forest R^2:                0.600
