In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error
import numpy as np

In [20]:
# Read the processed listing features. The path is relative, so it is resolved
# against the kernel's current working directory.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/listing_features.csv",
)

# Summary statistics of the price column before any filtering is applied.
df.loc[:, "price"].describe()

count    21328.000000
mean       680.526819
std       4480.453282
min         10.000000
25%         89.000000
50%        154.000000
75%        279.000000
max      50104.000000
Name: price, dtype: float64

In [21]:
# Keep only listings priced within [20, 1000]; both bounds are inclusive.
# Rows with a missing price fail both comparisons and are dropped as well.
df = df.loc[(df["price"] >= 20) & (df["price"] <= 1000)]

# Re-check the price distribution after trimming.
df.loc[:, "price"].describe()

count    20736.000000
mean       203.730131
std        165.994987
min         22.000000
25%         87.000000
50%        150.000000
75%        261.000000
max       1000.000000
Name: price, dtype: float64

In [22]:
# Hold out 20% of the filtered listings for evaluation. The fixed random_state
# makes the shuffle, and therefore the split, reproducible across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

# Target vectors used by the baselines.
y_train = train_df["price"]
y_test = test_df["price"]

# Sanity check: the two partitions together must account for every row.
assert len(train_df) + len(test_df) == len(df)

# Keep this as the final expression of the cell: a notebook only displays the
# value of the last expression, so placing it mid-cell silently discards it.
len(train_df), len(test_df)

In [23]:
# Baseline 1: predict the training-set mean price for every test listing.
# The mean is taken from y_train only, so the test targets do not influence it.
global_mean = y_train.mean()

# One constant float prediction per test row.
y_pred_global = np.full(shape=len(y_test), fill_value=global_mean, dtype=float)
rmse_global = root_mean_squared_error(y_test, y_pred_global)

print("Global mean price:", global_mean)
print("Global mean baseline RMSE:", rmse_global)

Global mean price: 203.70707740535326
Global mean baseline RMSE: 166.63082548652102


In [24]:
# Baseline 2: predict each test listing's price as the mean training price of
# its neighbourhood.
neigh_means = train_df["price"].groupby(train_df["neighbourhood_name"]).mean()

# Look up the per-neighbourhood mean for every test row. Neighbourhoods that
# have no mean in the training data map to NaN, so fall back to the global
# training mean for those rows.
y_pred_neigh = (
    test_df["neighbourhood_name"]
    .map(neigh_means)
    .fillna(global_mean)
)

rmse_neigh = root_mean_squared_error(y_test, y_pred_neigh)
print("Neighbourhood mean baseline RMSE:", rmse_neigh)

Neighbourhood mean baseline RMSE: 137.71205353325914
