In [3]:
import os
import numpy as np
import pandas as pd
from PIL import Image

import torch
import torchvision.models as models
import torchvision.transforms as transforms

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib


In [4]:
DATA_DIR = "../data"
IMG_DIR = os.path.join(DATA_DIR, "images")

df = pd.read_csv(os.path.join(DATA_DIR, "housing.csv"))
df.head()


Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [15]:
df = df.iloc[:21].reset_index(drop=True)


In [16]:
X_tabular = df.drop("median_house_value", axis=1)
y = df["median_house_value"]


In [17]:
X_tabular = pd.get_dummies(X_tabular)

# Scale numerical features
scaler = StandardScaler()
X_tabular_scaled = scaler.fit_transform(X_tabular)


In [18]:
cnn = models.resnet18(pretrained=True)
cnn = torch.nn.Sequential(*list(cnn.children())[:-1])  
cnn.eval()




Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Con

In [19]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])


In [22]:
image_features = []

for i in range(len(df)):
    img_path = os.path.join(IMG_DIR, f"{i}.jpeg")
    image = Image.open(img_path).convert("RGB")
    image = transform(image).unsqueeze(0)

    with torch.no_grad():
        features = cnn(image)
    
    image_features.append(features.squeeze().numpy())

image_features = np.array(image_features)
print("Image feature shape:", image_features.shape)


Image feature shape: (21, 512)


In [25]:
X_final = np.hstack([X_tabular_scaled, image_features])
print("Final feature shape:", X_final.shape)


Final feature shape: (21, 521)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42
)


In [27]:
model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

model.fit(X_train, y_train)


In [28]:
preds = model.predict(X_test)

mae = mean_absolute_error(y_test, preds)
rmse = np.sqrt(mean_squared_error(y_test, preds))

print("Mean Absolute Error (MAE):", mae)
print("Root Mean Squared Error (RMSE):", rmse)


Mean Absolute Error (MAE): 102145.9
Root Mean Squared Error (RMSE): 119514.29081557569


In [29]:
joblib.dump(model, "../models/multimodal_model.pkl")


['../models/multimodal_model.pkl']

# This project implemented a multimodal learning approach for housing price prediction by combining tabular features from the California Housing dataset with visual features extracted from house images using a pretrained ResNet18 CNN. Image embeddings were fused with normalized tabular features and used to train a Random Forest regression model. The model was evaluated using MAE and RMSE, demonstrating the effectiveness of multimodal feature fusion for regression tasks.