In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the project
# folder can be addressed as an ordinary filesystem path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
Mounted at /content/drive


In [2]:
# Root folder of the project on the mounted Drive; the image and output
# directories used by the following cells are built from this path.
PROJECT_ROOT = "/content/drive/MyDrive/property_valuation_project"

In [3]:
import os

# Versioned location of the input imagery and the folder that receives outputs.
IMAGE_VERSION = "v2"
IMAGE_DIR = f"{PROJECT_ROOT}/data/images_{IMAGE_VERSION}"
OUTPUT_DIR = f"{PROJECT_ROOT}/outputs"

# Create both folders when missing; folders that already exist are left as-is.
for folder in (IMAGE_DIR, OUTPUT_DIR):
    os.makedirs(folder, exist_ok=True)

print("Folders ready:", IMAGE_DIR, OUTPUT_DIR, sep="\n")


Folders ready:
/content/drive/MyDrive/property_valuation_project/data/images_v2
/content/drive/MyDrive/property_valuation_project/outputs
Folders ready:
/content/drive/MyDrive/property_valuation_project/data/images_v2
/content/drive/MyDrive/property_valuation_project/outputs


In [4]:
# List the image folder once and reuse the result: every os.listdir call is a
# separate scan of the Drive-backed directory.
image_dir_listing = os.listdir(IMAGE_DIR)
len(image_dir_listing), image_dir_listing[:5]

(7979,
 ['4139440460.0.png',
  '7663700030.0.png',
  '1323059143.0.png',
  '1338800280.0.png',
  '290000055.0.png'])

Data augmentation techniques were not applied, as the convolutional neural network was employed solely as a fixed feature extractor. Additionally, aerial imagery exhibits strong spatial orientation semantics, and geometric transformations such as rotations and flips could distort geographic context.

In [5]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input


In [6]:
# Fix TensorFlow's global random seed for the rest of the notebook.
SEED = 42
tf.random.set_seed(SEED)

In [7]:
# Report the working folders and how many entries the image folder holds.
run_summary = (
    ("Image directory:", IMAGE_DIR),
    ("Output directory:", OUTPUT_DIR),
    ("Number of images:", len(os.listdir(IMAGE_DIR))),
)
for caption, shown in run_summary:
    print(caption, shown)


Image directory: /content/drive/MyDrive/property_valuation_project/data/images_v2
Output directory: /content/drive/MyDrive/property_valuation_project/outputs
Number of images: 7979


In [8]:
# ImageNet-pretrained EfficientNet-B0 without its classification head; with
# pooling="avg" the model output is the globally average-pooled feature map.
backbone_kwargs = {"weights": "imagenet", "include_top": False, "pooling": "avg"}
cnn_model = EfficientNetB0(**backbone_kwargs)

# Used for inference only: mark the whole model as non-trainable.
cnn_model.trainable = False
cnn_model.summary()


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step


In [9]:
# Side length (in pixels) that images are resized to before entering the CNN.
IMG_SIZE = 224

def load_and_preprocess_image(img_path):
    """Load one image file as a model-ready batch of size 1.

    The image is converted to RGB, resized to IMG_SIZE x IMG_SIZE, cast to
    float32, given a leading batch axis and passed through EfficientNet's
    ``preprocess_input``.

    Args:
        img_path: Path of the image file to read.

    Returns:
        The result of ``preprocess_input`` applied to a float32 array of
        shape (1, IMG_SIZE, IMG_SIZE, 3).
    """
    # The context manager releases the file handle as soon as the pixels have
    # been copied out, instead of leaving that to garbage collection.
    with Image.open(img_path) as source:
        resized = source.convert("RGB").resize((IMG_SIZE, IMG_SIZE))
    batch = np.expand_dims(np.array(resized, dtype=np.float32), axis=0)
    return preprocess_input(batch)


In [10]:
import os
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from tqdm import tqdm


In [11]:
def green_ratio(img):
    """Return the fraction of pixels whose green channel is strictly the largest.

    Args:
        img: Image array indexed as (row, column, channel) with the channels
            in R, G, B order.

    Returns:
        Mean of the per-pixel boolean mask ``G > R and G > B``.
    """
    scaled = img.astype("float32") / 255.0
    r, g, b = (scaled[:, :, channel] for channel in range(3))
    return np.mean(np.logical_and(g > r, g > b))

def water_ratio(img):
    """Return the fraction of pixels whose blue channel is strictly the largest.

    Args:
        img: Image array indexed as (row, column, channel) with the channels
            in R, G, B order.

    Returns:
        Mean of the per-pixel boolean mask ``B > R and B > G``.
    """
    scaled = img.astype("float32") / 255.0
    r, g, b = (scaled[:, :, channel] for channel in range(3))
    return np.mean(np.logical_and(b > r, b > g))

def edge_density(img):
    """Return the fraction of pixels that the Canny detector marks as edges.

    Args:
        img: RGB image array accepted by ``cv2.cvtColor``.

    Returns:
        Mean of the boolean mask of non-zero entries in the Canny edge map.
    """
    lower_threshold, upper_threshold = 100, 200  # thresholds passed to cv2.Canny
    grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    edge_map = cv2.Canny(grayscale, lower_threshold, upper_threshold)
    return (edge_map > 0).mean()


In [12]:
# Per-image accumulators. Every list below receives exactly one entry per image
# that was processed successfully, so they stay aligned by position.
image_embeddings = []  # never filled here; kept because a later cell reads its length
image_ids = []
green_vals = []
water_vals = []
edge_vals = []

cnn_mean_vals = []
cnn_std_vals  = []
cnn_max_vals  = []
cnn_min_vals  = []

# Sorted so the processing order does not depend on the directory listing order.
image_files = sorted(
    f for f in os.listdir(IMAGE_DIR)
    if f.endswith(".png")
)

for img_name in tqdm(image_files):
    try:
        # File names look like "<id>.0.png"; going through float drops the ".0".
        property_id = int(float(img_name.replace(".png", "")))
        img_path = os.path.join(IMAGE_DIR, img_name)

        # CNN embedding, reduced to four summary statistics further down.
        img_cnn = load_and_preprocess_image(img_path)
        features = cnn_model(img_cnn, training=False).numpy().flatten()

        # Raw RGB pixels at the same resolution as the CNN input; the context
        # manager closes the file handle once the pixels are copied out.
        with Image.open(img_path) as raw_file:
            img_raw = np.array(raw_file.convert("RGB").resize((IMG_SIZE, IMG_SIZE)))

        # Compute everything that can fail BEFORE touching the accumulators, so
        # a failure never leaves the lists with different lengths.
        cnn_stats = (features.mean(), features.std(), features.max(), features.min())
        semantic_stats = (
            green_ratio(img_raw),
            water_ratio(img_raw),
            edge_density(img_raw),
        )
    except Exception as e:
        # Best effort: report the image and move on to the next one.
        print("Skipping:", img_name, "Reason:", e)
        continue

    image_ids.append(property_id)

    cnn_mean_vals.append(cnn_stats[0])
    cnn_std_vals.append(cnn_stats[1])
    cnn_max_vals.append(cnn_stats[2])
    cnn_min_vals.append(cnn_stats[3])

    green_vals.append(semantic_stats[0])
    water_vals.append(semantic_stats[1])
    edge_vals.append(semantic_stats[2])

    if len(image_ids) % 500 == 0:
        print(f"Processed {len(image_ids)} images...")


  6%|▋         | 500/7979 [08:47<1:19:58,  1.56it/s]

Processed 500 images...


 13%|█▎        | 1000/7979 [13:27<56:25,  2.06it/s]

Processed 1000 images...


 19%|█▉        | 1500/7979 [18:20<58:07,  1.86it/s]

Processed 1500 images...


 25%|██▌       | 2000/7979 [22:56<50:08,  1.99it/s]

Processed 2000 images...


 31%|███▏      | 2500/7979 [27:23<44:05,  2.07it/s]

Processed 2500 images...


 38%|███▊      | 3000/7979 [32:02<42:04,  1.97it/s]

Processed 3000 images...


 44%|████▍     | 3500/7979 [36:42<47:24,  1.57it/s]

Processed 3500 images...


 50%|█████     | 4000/7979 [41:31<34:32,  1.92it/s]

Processed 4000 images...


 56%|█████▋    | 4500/7979 [46:04<30:15,  1.92it/s]

Processed 4500 images...


 63%|██████▎   | 5000/7979 [50:37<32:24,  1.53it/s]

Processed 5000 images...


 69%|██████▉   | 5500/7979 [55:13<21:07,  1.96it/s]

Processed 5500 images...


 75%|███████▌  | 6000/7979 [59:46<22:38,  1.46it/s]

Processed 6000 images...


 81%|████████▏ | 6500/7979 [1:04:19<13:48,  1.78it/s]

Processed 6500 images...


 88%|████████▊ | 7000/7979 [1:09:00<10:35,  1.54it/s]

Processed 7000 images...


 94%|█████████▍| 7500/7979 [1:13:35<04:02,  1.98it/s]

Processed 7500 images...


100%|██████████| 7979/7979 [1:18:19<00:00,  1.70it/s]


In [13]:
# Assemble one row per processed image from the parallel feature lists.
feature_columns = {
    "id": image_ids,
    "cnn_mean": cnn_mean_vals,
    "cnn_std": cnn_std_vals,
    "cnn_max": cnn_max_vals,
    "cnn_min": cnn_min_vals,
    "green_ratio": green_vals,
    "water_ratio": water_vals,
    "edge_density": edge_vals,
}

# built_up is defined as the complement of the green ratio.
image_features_df = pd.DataFrame(feature_columns).assign(
    built_up=lambda frame: 1 - frame["green_ratio"]
)

image_features_df.shape

(7979, 9)

(7979, 9)

In [14]:
# Z-score the hand-crafted image features (column mean 0, sample std 1).
# A column with no spread (std of 0, or NaN when there is only one row) is only
# centred, so it cannot fill the feature table with NaN/inf values.
for col in ["green_ratio", "water_ratio", "edge_density", "built_up"]:
    centered = image_features_df[col] - image_features_df[col].mean()
    spread = image_features_df[col].std()
    if pd.isna(spread) or spread == 0:
        image_features_df[col] = centered
    else:
        image_features_df[col] = centered / spread

In [15]:
# Interaction term: element-wise product of the green-ratio and edge-density columns.
green_col = image_features_df["green_ratio"]
edge_col = image_features_df["edge_density"]
image_features_df["green_edge_interaction"] = green_col * edge_col

In [16]:
# Sanity check: lengths of the per-image lists, in the order they are named here.
tuple(
    len(collected)
    for collected in (image_embeddings, image_ids, green_vals, water_vals, edge_vals)
)


(0, 7979, 7979, 7979, 7979)

In [18]:
# Write the feature table to the output folder, tagged with the image version.
features_csv_path = f"{OUTPUT_DIR}/image_features_{IMAGE_VERSION}.csv"
image_features_df.to_csv(features_csv_path, index=False)
print("Saved image_features.csv with compressed CNN + semantic features")



Saved image_features.csv with compressed CNN + semantic features


## CNN-Based Visual Feature Extraction

Satellite imagery was processed using a hybrid visual feature extraction strategy designed to balance representational power, interpretability, and robustness under limited supervision.

A pretrained EfficientNet-B0 convolutional neural network was employed as a fixed feature extractor. The classification head was removed and global average pooling was applied to generate high-level spatial representations for each aerial image. All network weights were frozen to prevent overfitting, as the task involves regression and the visual dataset size remains moderate relative to typical image classification benchmarks.

Rather than directly using the full 1280-dimensional CNN embeddings, the activations were statistically compressed into four descriptive summary features (mean, standard deviation, maximum, and minimum activation). This compression reduces redundancy and noise while producing compact features better suited for tree-based regression models such as XGBoost.

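To complement the abstract CNN representations, domain-specific semantic visual features were engineered directly from raw RGB imagery. These include vegetation coverage (green ratio: the share of pixels whose green channel is dominant), water coverage (water ratio: the share of pixels whose blue channel is dominant), road and parcel structure (edge density: the share of pixels marked as edges by a Canny detector), and built-up density (approximated as one minus the green ratio). These four features were standardized to zero mean and unit variance. An additional interaction feature, the product of the standardized green ratio and edge density, was included to capture the joint effect of greenery and structural density and to better represent suburban neighborhood patterns.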

The final visual representation for each property consists of a small set of compact CNN descriptors and interpretable environmental indicators. This design enables effective multimodal fusion with tabular housing attributes while maintaining model transparency and robustness.