# Multimodal Regression for Property Valuation

This notebook:
- Loads tabular housing data
- Loads satellite image embeddings
- Merges both modalities
- Trains regression models
- Evaluates performance


In [66]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


In [67]:
df = pd.read_excel("../data/train.xlsx")

df.head()


Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,9117000170,20150505T000000,268643,4,2.25,1810,9240,2.0,0,0,...,7,1810,0,1961,0,98055,47.4362,-122.187,1660,9240
1,6700390210,20140708T000000,245000,3,2.5,1600,2788,2.0,0,0,...,7,1600,0,1992,0,98031,47.4034,-122.187,1720,3605
2,7212660540,20150115T000000,200000,4,2.5,1720,8638,2.0,0,0,...,8,1720,0,1994,0,98003,47.2704,-122.313,1870,7455
3,8562780200,20150427T000000,352499,2,2.25,1240,705,2.0,0,0,...,7,1150,90,2009,0,98027,47.5321,-122.073,1240,750
4,7760400350,20141205T000000,232000,3,2.0,1280,13356,1.0,0,0,...,7,1280,0,1994,0,98042,47.3715,-122.074,1590,8071


In [68]:
y = np.log1p(df["price"])   # log transform improves stability


In [69]:
tab_cols = [
    'bedrooms','bathrooms','sqft_living','sqft_lot','floors',
    'waterfront','view','condition','grade','sqft_above',
    'sqft_basement','yr_built','yr_renovated',
    'lat','long'
]

X_tab = df[tab_cols]


In [70]:
X_tab

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long
0,4,2.25,1810,9240,2.0,0,0,3,7,1810,0,1961,0,47.4362,-122.187
1,3,2.50,1600,2788,2.0,0,0,4,7,1600,0,1992,0,47.4034,-122.187
2,4,2.50,1720,8638,2.0,0,0,3,8,1720,0,1994,0,47.2704,-122.313
3,2,2.25,1240,705,2.0,0,0,3,7,1150,90,2009,0,47.5321,-122.073
4,3,2.00,1280,13356,1.0,0,0,3,7,1280,0,1994,0,47.3715,-122.074
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,3,1.50,1000,6914,1.0,0,0,3,7,1000,0,1947,0,47.7144,-122.319
16205,3,2.50,3087,5002,2.0,0,0,3,8,3087,0,2014,0,47.2974,-122.349
16206,3,2.50,2120,4780,2.0,0,0,3,7,2120,0,2004,0,47.6810,-122.032
16207,1,0.75,380,15000,1.0,0,0,3,5,380,0,1963,0,47.4810,-122.323


In [71]:
df_img = pd.read_parquet("../data/img_feature/image_features.parquet")
df_img.head()

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,1000102,0.257129,0.320597,0.120155,0.344228,0.136882,0.121618,0.060441,0.152371,0.190521,...,0.429108,0.263669,0.042307,0.096816,0.003009,0.092518,0.152732,0.018077,0.140195,0.156745
1,1001200050,0.073292,0.248591,0.077239,0.235161,0.168075,0.228327,0.041731,0.101655,0.162084,...,0.354346,0.185244,0.045582,0.007816,0.004493,0.136605,0.060961,0.04741,0.013306,0.073361
2,1003000175,0.263289,0.253705,0.11324,0.680229,0.375511,0.315341,0.044904,0.178718,0.086665,...,0.574743,0.704541,0.139124,0.077753,0.0,0.08983,0.321267,0.015184,0.033403,0.079402
3,100300280,0.578832,0.278685,0.401034,0.709438,0.232942,0.571633,0.262911,0.214601,0.269513,...,0.013319,0.04561,0.008362,0.251824,0.004043,0.17783,0.402788,0.06906,0.24923,0.025955
4,100300530,0.402126,0.339667,0.379926,0.591612,0.288472,0.181848,0.224858,0.233257,0.199456,...,0.025757,0.071166,0.0,0.034269,0.0,0.226515,0.141304,0.034364,0.070205,0.06077


In [72]:
df_img.shape

(16110, 2049)

In [73]:
print(df['id'].dtype)
print(df_img['id'].dtype)

int64
object


In [74]:
df["id"] = df["id"].astype(str)
df_img["id"] = df_img["id"].astype(str)


In [75]:
df_merged = df.merge(df_img, on="id", how="inner")

df_merged.shape


(16209, 2069)

In [76]:
df_merged

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,9117000170,20150505T000000,268643,4,2.25,1810,9240,2.0,0,0,...,0.060832,0.197148,0.005389,0.106641,0.000000,0.362716,0.017366,0.016099,0.054685,0.005876
1,6700390210,20140708T000000,245000,3,2.50,1600,2788,2.0,0,0,...,0.222785,0.033474,0.001999,0.161575,0.000000,0.254332,0.214466,0.048806,0.000127,0.091100
2,7212660540,20150115T000000,200000,4,2.50,1720,8638,2.0,0,0,...,0.537664,0.214857,0.006505,0.376466,0.136128,0.152024,0.216121,0.016846,0.394611,0.238134
3,8562780200,20150427T000000,352499,2,2.25,1240,705,2.0,0,0,...,0.267290,0.063320,0.032388,0.174089,0.002263,0.066675,0.196765,0.089156,0.277090,0.310584
4,7760400350,20141205T000000,232000,3,2.00,1280,13356,1.0,0,0,...,0.444486,0.458603,0.604391,0.312128,0.130964,0.060813,0.345420,0.128399,0.440191,0.239602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,5272200045,20141113T000000,378000,3,1.50,1000,6914,1.0,0,0,...,0.115836,0.201739,0.014101,0.002647,0.000185,0.075929,0.228097,0.063749,0.098495,0.161787
16205,9578500790,20141111T000000,399950,3,2.50,3087,5002,2.0,0,0,...,0.447218,0.189082,0.002992,0.645481,0.146737,0.152827,0.114365,0.228244,0.453144,0.218650
16206,7202350480,20140930T000000,575000,3,2.50,2120,4780,2.0,0,0,...,0.365593,0.087172,0.124831,0.201408,0.034993,0.021933,0.113571,0.107071,0.323724,0.763456
16207,1723049033,20140620T000000,245000,1,0.75,380,15000,1.0,0,0,...,0.114406,0.119242,0.000000,0.148863,0.000000,0.336662,0.152129,0.019048,0.057890,0.088658


In [77]:
X_img

Unnamed: 0,sqft_living15,sqft_lot15,0,1,2,3,4,5,6,7,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,1660,9240,0.069994,0.290391,0.156399,0.472710,0.469350,0.325156,0.112747,0.104530,...,0.060832,0.197148,0.005389,0.106641,0.000000,0.362716,0.017366,0.016099,0.054685,0.005876
1,1720,3605,0.259561,0.345883,0.115698,0.302584,0.337284,0.253996,0.134438,0.001832,...,0.222785,0.033474,0.001999,0.161575,0.000000,0.254332,0.214466,0.048806,0.000127,0.091100
2,1870,7455,0.445148,0.131718,0.364544,0.643274,0.065708,0.547754,0.028139,0.743246,...,0.537664,0.214857,0.006505,0.376466,0.136128,0.152024,0.216121,0.016846,0.394611,0.238134
3,1240,750,0.576097,0.201570,0.483533,0.336757,0.007107,0.398880,0.012243,0.342272,...,0.267290,0.063320,0.032388,0.174089,0.002263,0.066675,0.196765,0.089156,0.277090,0.310584
4,1590,8071,0.138230,0.082702,0.113726,0.366163,0.076401,0.253645,0.075423,0.020905,...,0.444486,0.458603,0.604391,0.312128,0.130964,0.060813,0.345420,0.128399,0.440191,0.239602
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16204,1000,6947,0.204772,0.379732,0.164697,0.085016,0.030824,0.039148,0.022015,0.178544,...,0.115836,0.201739,0.014101,0.002647,0.000185,0.075929,0.228097,0.063749,0.098495,0.161787
16205,2927,5183,0.284716,0.048989,0.015262,0.133309,0.149946,0.075291,0.337124,0.170051,...,0.447218,0.189082,0.002992,0.645481,0.146737,0.152827,0.114365,0.228244,0.453144,0.218650
16206,1690,2650,0.677522,0.012031,0.072849,0.038986,0.054745,0.026570,0.047431,0.267906,...,0.365593,0.087172,0.124831,0.201408,0.034993,0.021933,0.113571,0.107071,0.323724,0.763456
16207,1170,15000,0.257962,0.116049,0.176640,0.435463,0.456163,0.238442,0.038409,0.083506,...,0.114406,0.119242,0.000000,0.148863,0.000000,0.336662,0.152129,0.019048,0.057890,0.088658


In [78]:
X_tab = df_merged[tab_cols]
X_tab
X_img = df_merged.drop(columns=["id", "date","zipcode","price"] + tab_cols)
X = np.hstack([X_tab.values, X_img.values])

In [79]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [80]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)


### Baseline Model (Tabular Only) – For Comparison

In [81]:
X_tab_only = df[tab_cols]
X_tr, X_v, y_tr, y_v = train_test_split(
    X_tab_only, y, test_size=0.2, random_state=42
)

model_tab = RandomForestRegressor(n_estimators=300, random_state=42)
model_tab.fit(X_tr, y_tr)


In [85]:
pred = model_tab.predict(X_v)

rmse = np.sqrt(mean_squared_error(np.expm1(y_v),
                                  np.expm1(pred)))
r2 = r2_score(np.expm1(y_v),
              np.expm1(pred))

rmse.round(2), r2.round(2)

(131092.76, 0.86)

### Multimodal Model (Tabular + Image)

In [None]:
model_multi = RandomForestRegressor(
    n_estimators=400,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

model_multi.fit(X_train, y_train)


In [None]:
pred = model_multi.predict(X_val)

rmse = np.sqrt(mean_squared_error(np.expm1(y_val),
                                  np.expm1(pred)))
mae = mean_absolute_error(np.expm1(y_val),
                          np.expm1(pred))
r2 = r2_score(np.expm1(y_val),
              np.expm1(pred))

rmse, mae, r2
