# **Tabular Model**

In [1]:
# Show estimators as plain-text reprs when a cell displays them
# (e.g. the fitted XGBRegressor output further down).
from sklearn import set_config
set_config(display="text")

## Load data

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Relative path to the processed tabular training set.
TRAIN_TABULAR_CSV = "../data/processed/train_tabular.csv"

# Read the whole table into memory, then preview the first rows as the cell output.
df = pd.read_csv(TRAIN_TABULAR_CSV)
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,268643,4,2.25,1810,9240,2.0,0,0,3,7,1810,0,1961,0,98055,47.4362,-122.187,1660,9240
1,245000,3,2.5,1600,2788,2.0,0,0,4,7,1600,0,1992,0,98031,47.4034,-122.187,1720,3605
2,200000,4,2.5,1720,8638,2.0,0,0,3,8,1720,0,1994,0,98003,47.2704,-122.313,1870,7455
3,352499,2,2.25,1240,705,2.0,0,0,3,7,1150,90,2009,0,98027,47.5321,-122.073,1240,750
4,232000,3,2.0,1280,13356,1.0,0,0,3,7,1280,0,1994,0,98042,47.3715,-122.074,1590,8071


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(16209, 19)

In [5]:
# Column labels available in the loaded frame (target `price` plus the raw features).
df.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')

### Drop non-predictive columns

In [6]:
# The prediction target, and the raw features deliberately excluded from the model inputs.
TARGET_COLUMN = "price"
EXCLUDED_FEATURES = ["zipcode", "sqft_above", "sqft_basement"]

# y: target series; X: every remaining column. drop() returns a new frame, so df is left intact.
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN, *EXCLUDED_FEATURES])

In [7]:
# lat and long already encode location far more precisely, so DROP zipcode.
# As a raw integer it is an area label rather than a quantity, so it adds noise and can confuse the Neural Network.

# sqft_living is exactly sqft_above + sqft_basement. 
# So we can drop those two and keep sqft_living as a single feature.

In [8]:
# Preview the feature matrix after the drop.
X.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,4,2.25,1810,9240,2.0,0,0,3,7,1961,0,47.4362,-122.187,1660,9240
1,3,2.5,1600,2788,2.0,0,0,4,7,1992,0,47.4034,-122.187,1720,3605
2,4,2.5,1720,8638,2.0,0,0,3,8,1994,0,47.2704,-122.313,1870,7455
3,2,2.25,1240,705,2.0,0,0,3,7,2009,0,47.5321,-122.073,1240,750
4,3,2.0,1280,13356,1.0,0,0,3,7,1994,0,47.3715,-122.074,1590,8071


In [9]:
# Remaining feature names; price, zipcode, sqft_above and sqft_basement should be absent.
X.columns

Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'grade', 'yr_built', 'yr_renovated',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [10]:
# Sanity check: df.drop() was not in-place, so the original df still carries every column.
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,268643,4,2.25,1810,9240,2.0,0,0,3,7,1810,0,1961,0,98055,47.4362,-122.187,1660,9240
1,245000,3,2.5,1600,2788,2.0,0,0,4,7,1600,0,1992,0,98031,47.4034,-122.187,1720,3605
2,200000,4,2.5,1720,8638,2.0,0,0,3,8,1720,0,1994,0,98003,47.2704,-122.313,1870,7455
3,352499,2,2.25,1240,705,2.0,0,0,3,7,1150,90,2009,0,98027,47.5321,-122.073,1240,750
4,232000,3,2.0,1280,13356,1.0,0,0,3,7,1280,0,1994,0,98042,47.3715,-122.074,1590,8071


### Log Transform Price

In [11]:
# House prices are right-skewed
# Large outliers dominate loss functions
# Log-transforming the target can help stabilize variance and make patterns more linear

In [12]:
# log1p(price) = log(1 + price). Predictions are mapped back to prices with np.expm1
# before the price-scale metrics are computed.
y_log = np.log1p(y)

In [13]:
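# Log transform:
#     Stabilizes variance across cheap and expensive houses
#     Makes RMSE reflect relative (percentage-like) error instead of being dominated by the most expensive homes
#     Typically gives the model a less skewed, easier-to-fit target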

## Train / Validation Split

In [14]:
# Hold out a fixed fraction of rows for validation; the seed keeps the split reproducible.
# The target passed in is the log-transformed price, so both y outputs are on the log scale.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_val, y_train_log, y_val_log = train_test_split(
    X, y_log, test_size=VAL_FRACTION, random_state=SPLIT_SEED
)

## Model Training

### Import XGBoost

In [15]:
from xgboost import XGBRegressor

### Define Model

In [16]:
# Baseline hyperparameters, gathered in one mapping so they are easy to inspect or tweak.
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,  # fixed seed for the row/column subsampling
    "n_jobs": -1,
}

# Unfitted regressor; training happens in a later cell.
xgb_model = XGBRegressor(**xgb_params)


In [17]:
# Deep enough to capture non-linear patterns
# Not so deep that it overfits
# Standard, well-tested baseline configuration

### Train Model

In [18]:
# Fit on the training split against the log-transformed target.
xgb_model.fit(X_train, y_train_log)

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.8, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=0.05, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=6,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=300,
             n_jobs=-1, num_parallel_tree=None, ...)

### Predict on Validation Set

In [19]:
# Predictions are on the log1p(price) scale, because the model was fit on y_train_log.
y_val_pred_log = xgb_model.predict(X_val)

### Convert predictions back to price scale

In [20]:
# expm1 inverts the earlier log1p, returning predictions and ground truth to the price scale.
y_val_pred = np.expm1(y_val_pred_log)
y_val_true = np.expm1(y_val_log)

## Computing metrics

In [21]:
# Log-scale metrics: error and fit measured on log1p(price), the scale the model was trained on.
# mean_squared_error, r2_score and np are already imported in the notebook's import cell,
# so they are not re-imported here.
rmse_log = np.sqrt(mean_squared_error(y_val_log, y_val_pred_log))
r2_log = r2_score(y_val_log, y_val_pred_log)

print(f"Tabular RMSE (log-price): {rmse_log:.4f}")
print(f"Tabular R2 (log-price): {r2_log:.4f}")


Tabular RMSE (log-price): 0.1654
Tabular R2 (log-price): 0.9009


In [22]:
# Price-scale metrics: RMSE and R² computed after mapping both arrays back with expm1.
# mean_squared_error, r2_score and np are already imported in the notebook's import cell,
# so they are not re-imported here.
rmse = np.sqrt(mean_squared_error(y_val_true, y_val_pred))
r2 = r2_score(y_val_true, y_val_pred)

print(f"RMSE (actual price): ${rmse:,.0f}")
print(f"R2: {r2:.4f}")

RMSE (actual price): $116,640
R2: 0.8916


In [23]:
# Using only raw tabular features (lat, long, sqft_living, grade, bathrooms,
# floors, waterfront, etc. — zipcode was dropped) already explains ~89% of the price variance on the validation set.

# Part of the remaining ~11% is plausibly driven by neighborhood-level visual cues that are not
# in the tabular dataset — things like green cover, road density, waterfront
# proximity, and surrounding development.

# By using satellite images, we can extract these features with a CNN and fuse
# them with the tabular data, which should improve prediction accuracy and R²