## Imports

In [38]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import re
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Processing data

In [39]:
# Read the resale transactions from disk, then print a per-column summary
# (dtype and non-null count) so the schema is visible before any processing.
df = pd.read_csv(filepath_or_buffer="resale_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 223415 entries, 0 to 223414
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   month                223415 non-null  object 
 1   town                 223415 non-null  object 
 2   flat_type            223415 non-null  object 
 3   block                223415 non-null  object 
 4   street_name          223415 non-null  object 
 5   storey_range         223415 non-null  object 
 6   floor_area_sqm       223415 non-null  float64
 7   flat_model           223415 non-null  object 
 8   lease_commence_date  223415 non-null  int64  
 9   remaining_lease      223415 non-null  object 
 10  resale_price         223415 non-null  float64
dtypes: float64(2), int64(1), object(8)
memory usage: 18.7+ MB


In [40]:
# Turn the year-month strings in "month" into pandas datetime values.
df["month"] = df["month"].pipe(pd.to_datetime, format="%Y-%m")
def lease_to_years(s):
    """Convert a remaining-lease description into a fractional number of years.

    Parameters
    ----------
    s : str, number, or missing value
        Text containing "<n> year(s)" and/or "<n> month(s)", for example
        "61 years 04 months". A bare number (70 or "70") is taken to be a
        count of whole years.

    Returns
    -------
    float
        Years plus months / 12. NaN when ``s`` is missing or holds no
        recognisable duration.
    """
    if pd.isna(s):
        return np.nan

    s = str(s).strip().lower()

    years = re.search(r"(\d+)\s*year", s)
    months = re.search(r"(\d+)\s*month", s)

    if years is None and months is None:
        # No unit word found. Accept a plain number as years; anything else is
        # reported as NaN rather than silently becoming a 0-year lease.
        bare = re.fullmatch(r"\d+(?:\.\d+)?", s)
        return float(bare.group(0)) if bare else np.nan

    y = int(years.group(1)) if years else 0
    m = int(months.group(1)) if months else 0

    return y + m / 12

# Derive a numeric lease column by converting each "remaining_lease" entry.
df["remaining_lease_years"] = df["remaining_lease"].map(lease_to_years)

## Baseline Linear Regression

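Fit a linear regression that predicts the log of the resale price from town, flat type, storey range, floor area, and remaining lease. Because the target is log(price), the MAE and RMSE reported below are in log-price units.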

In [41]:
# The regression target is the natural log of the resale price.
df["log_price"] = np.log(df["resale_price"])

# Features (model inputs)
X = df[["town","flat_type","storey_range","floor_area_sqm", "remaining_lease_years"]].copy()

# Target (the value being predicted)
y = df[["log_price"]]

cat_cols = ["town","flat_type","storey_range"]
num_cols = ["floor_area_sqm", "remaining_lease_years"]

# One-hot encode the categorical columns and pass the numeric ones through
# unchanged. handle_unknown="ignore" keeps predict() from failing on a
# category that was not present in the training split.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
        ("num", "passthrough", num_cols)
    ]
)

model = Pipeline(steps=[("preprocess", preprocess),("regressor", LinearRegression())])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)
pred = model.predict(X_test)

# Both metrics are in log-price units because the target is log_price.
mae = mean_absolute_error(y_test, pred)
# Take the square root explicitly: the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))

print("MAE:", mae)
print("RMSE:", rmse)

MAE: 0.16838644912959766
RMSE: 0.1997412587241161




## Baseline model

What are the main drivers of price? A simple OLS regression of log(price) on floor area, flat type, and remaining lease gives a first answer.

In [42]:
# Baseline OLS: log price explained by floor area, flat type (categorical)
# and remaining lease, fitted on the full data set.
model = smf.ols(
    formula="log_price ~ floor_area_sqm + C(flat_type) + remaining_lease_years",
    data=df,
).fit()

# Print the full regression table (coefficients, standard errors, fit statistics).
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.464
Model:                            OLS   Adj. R-squared:                  0.464
Method:                 Least Squares   F-statistic:                 2.416e+04
Date:                Mon, 19 Jan 2026   Prob (F-statistic):               0.00
Time:                        15:46:55   Log-Likelihood:                -11006.
No. Observations:              223415   AIC:                         2.203e+04
Df Residuals:                  223406   BIC:                         2.212e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep

Simple interpretation

A 2-room flat is associated with a 0.0991 higher log(price) than a 1-room flat (the baseline category), holding all other variables constant.
Equivalently, since $e^{0.0991} - 1 \approx 0.104$, a 2-room flat is associated with about 10.4% higher resale price compared to a 1-room flat, holding other variables constant.

## Model + all other factors

Adding town and storey range to the baseline model shows whether location and floor level are significant predictors of price once floor area, flat type, and remaining lease are controlled for.

In [43]:
# Extended OLS: the baseline terms plus town and storey range as
# categorical regressors, fitted on the full data set.
model = smf.ols(
    formula="log_price ~ floor_area_sqm + C(town) + C(flat_type) + C(storey_range) + remaining_lease_years",
    data=df,
).fit()

# Print the full regression table (coefficients, standard errors, fit statistics).
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:              log_price   R-squared:                       0.667
Model:                            OLS   Adj. R-squared:                  0.667
Method:                 Least Squares   F-statistic:                     9119.
Date:                Mon, 19 Jan 2026   Prob (F-statistic):               0.00
Time:                        15:46:59   Log-Likelihood:                 42103.
No. Observations:              223415   AIC:                        -8.411e+04
Df Residuals:                  223365   BIC:                        -8.359e+04
Df Model:                          49                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
Intercep