In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset read later in this
# notebook lives under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


Imports & Data Loading

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor


In [None]:
# Project root: the working directory, or its parent when the notebook is
# launched from inside a "notebooks" folder.
BASE_DIR = Path.cwd()
if BASE_DIR.name == "notebooks":
    BASE_DIR = BASE_DIR.parent

# Location of the processed table relative to a project root.
TABULAR_RELATIVE_PATH = Path("data") / "processed" / "train_tabular.csv"

# Candidate project roots, in priority order: the mounted Google Drive copy
# (the Colab setup this notebook was written for), then the local checkout.
DRIVE_PROJECT_DIR = Path("/content/drive/MyDrive/satellite-property-valuation")
candidate_paths = [
    DRIVE_PROJECT_DIR / TABULAR_RELATIVE_PATH,
    BASE_DIR / TABULAR_RELATIVE_PATH,
]

# Use the first candidate that exists. If none does, fall back to the Drive
# path so read_csv raises FileNotFoundError for the same path as before.
data_path = next((p for p in candidate_paths if p.exists()), candidate_paths[0])
tab_df = pd.read_csv(data_path)

tab_df.head()


Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,price_group
0,268643,4,2.25,1810,9240,2.0,0,0,3,7,1810,0,1961,0,98055,47.4362,-122.187,1660,9240,1
1,245000,3,2.5,1600,2788,2.0,0,0,4,7,1600,0,1992,0,98031,47.4034,-122.187,1720,3605,0
2,200000,4,2.5,1720,8638,2.0,0,0,3,8,1720,0,1994,0,98003,47.2704,-122.313,1870,7455,0
3,352499,2,2.25,1240,705,2.0,0,0,3,7,1150,90,2009,0,98027,47.5321,-122.073,1240,750,3
4,232000,3,2.0,1280,13356,1.0,0,0,3,7,1280,0,1994,0,98042,47.3715,-122.074,1590,8071,0


Dataset Shape

In [None]:
# Report (rows, columns) of the loaded table.
print(f"Dataset shape: {tab_df.shape}")


Dataset shape: (16209, 20)


Feature–Target Split

In [None]:
target_col = "price"

# Columns that must not be used as predictors because they encode the target.
# NOTE(review): `price_group` appears to be a binned version of `price` (it
# rises with price in the head() preview), so keeping it leaks the label into
# the features and inflates validation scores — confirm how it was derived.
leakage_cols = ["price_group"]

# errors="ignore" keeps this cell working on tables that lack `price_group`;
# a missing target column still fails on the lookup below.
features = tab_df.drop(columns=[target_col, *leakage_cols], errors="ignore")
target = tab_df[target_col]

# Guard against the target (or its proxy) slipping back into the feature set.
assert not set(features.columns) & {target_col, *leakage_cols}


Log-Transform

In [None]:
# log1p(price) = log(1 + price); predictions are mapped back to the price
# scale later with np.expm1.
target_log = np.log1p(target)


Why Log-Transform the Target

Property prices exhibit strong right skewness, with a small number of very expensive houses.

Training directly on raw prices:

Over-penalizes large errors

Biases the model toward high-value properties

Applying a log(1 + price) transformation:

Reduces skewness

Stabilizes variance

Improves numerical optimization

Keeps RMSE interpretable in price units once predictions are inverse-transformed with expm1

Train–Validation Split

In [None]:
# 80/20 train/validation split; the fixed seed makes the partition repeatable.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(features, target_log, **split_options)


XGBoost Regressor (Baseline Model)

In [None]:
# Gradient-boosted tree baseline. XGBRegressor is imported with the other
# dependencies in the import cell at the top of the notebook.
xgb_reg = XGBRegressor(
    n_estimators=300,       # number of boosting rounds
    learning_rate=0.05,     # shrinkage applied to each round
    max_depth=6,            # maximum depth of each tree
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of columns sampled per tree
    random_state=42,        # fixed seed so the subsampling is repeatable
    n_jobs=-1,              # use all available cores
)


Why this setup works

Captures non-linear relationships

Moderate tree depth limits overfitting

Subsampling improves generalization

Commonly used as a strong structured-data baseline

Model Training

In [None]:
# Fit on the training split; y_train holds log1p(price), so the model learns
# in log space.
xgb_reg.fit(X_train, y_train)


Validation Predictions

In [None]:
# Ground truth for the validation rows, mapped back from log space to prices.
price_true = np.expm1(y_valid)

# The model predicts log1p(price); expm1 undoes that transform.
log_preds = xgb_reg.predict(X_valid)
price_preds = np.expm1(log_preds)


Evaluation Metrics

In [None]:
# Both metrics are computed on the original price scale, not in log space.
r2 = r2_score(price_true, price_preds)
mse = mean_squared_error(price_true, price_preds)
rmse = np.sqrt(mse)

# One print call, one metric per line.
print(f"RMSE: {rmse:,.2f}", f"R²  : {r2:.4f}", sep="\n")

RMSE: 88,521.20
R²  : 0.9376


The tabular model explains ~94% of price variance (R² ≈ 0.94 on the validation split), indicating that structured features such as:

Property size

Location (lat/long, zipcode)

Quality indicators

already contain substantial predictive power.

The remaining error likely originates from visual and spatial context, such as:

Street layout

Green spaces

Density of nearby buildings

Water proximity and coastline structure