In [1]:
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model
import numpy as np
from PIL import Image
import os
from tqdm import tqdm

In [2]:
# EfficientNet-B0 with ImageNet weights and without its classification head,
# so the network stops at the final convolutional feature map.
base_model = EfficientNetB0(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)

# Average over the spatial axes so every image is described by a single vector.
pooling = GlobalAveragePooling2D()
x = pooling(base_model.output)
model = Model(inputs=base_model.input, outputs=x)

# The network is only used for inference here, so freeze all of its weights.
model.trainable = False


In [3]:
def load_image(path, size=(224, 224)):
    """Read an image file and return it as a preprocessed float32 array.

    Args:
        path: Location of the image file on disk.
        size: Target ``(width, height)`` handed to ``Image.resize``. Defaults
            to 224x224, the input size the EfficientNet model was built with.

    Returns:
        ``np.ndarray`` of dtype float32 holding the resized RGB image after
        ``preprocess_input`` has been applied.
    """
    # The context manager closes the underlying file as soon as the pixels are
    # decoded, rather than leaving the handle open until garbage collection.
    with Image.open(path) as source:
        rgb = source.convert("RGB").resize(size)
    pixels = np.array(rgb, dtype=np.float32)
    return preprocess_input(pixels)

In [4]:
# Folder with the training images.
IMAGE_DIR = "images/train"
# Folder that receives one .npy embedding per image. It is named to match the
# folder the embedding-loading cell reads ("images/embeddings_train").
OUT_DIR = "images/embeddings_train"
os.makedirs(OUT_DIR, exist_ok=True)

# Number of images sent through the model per predict call.
BATCH_SIZE = 16


In [5]:
# Keep only regular, non-hidden files: entries such as ".DS_Store" or an
# ".ipynb_checkpoints" directory would make Image.open fail.
image_files = sorted(
    entry
    for entry in os.listdir(IMAGE_DIR)
    if not entry.startswith(".") and os.path.isfile(os.path.join(IMAGE_DIR, entry))
)

batch = []
names = []

for i, img_name in enumerate(tqdm(image_files)):
    batch.append(load_image(os.path.join(IMAGE_DIR, img_name)))
    names.append(img_name)

    # Run the model when a full batch is collected, and also on the very last
    # image so the final (possibly smaller) batch is not lost.
    if len(batch) == BATCH_SIZE or i == len(image_files) - 1:
        embeddings = model.predict(np.array(batch), verbose=0)

        for name, emb in zip(names, embeddings):
            # Replace whatever extension the image has with ".npy" so the
            # saved file keeps the image's stem for every image format,
            # not only for ".png".
            stem = os.path.splitext(name)[0]
            np.save(os.path.join(OUT_DIR, stem + ".npy"), emb)

        batch, names = [], []

100%|██████████| 16110/16110 [15:28<00:00, 17.35it/s]


In [None]:
X = []
ids = []

# Sorted order keeps the rows of X and the entries of ids aligned and reproducible.
for file in sorted(os.listdir(OUT_DIR)):
    # Skip anything that is not a saved embedding (same filter as load_embeddings_dict).
    if not file.endswith(".npy"):
        continue
    X.append(np.load(os.path.join(OUT_DIR, file)))
    # Strip only the trailing extension to recover the image stem.
    ids.append(os.path.splitext(file)[0])

# One row per embedding file, written out as a single matrix.
X = np.array(X)
np.save("train_embeddings_using_efficientnetb0.npy", X)


Let's do the same for the test images.

In [7]:
# Number of images sent through the model per predict call.
BATCH_SIZE = 16

# Point the extraction at the test images and give them their own output folder.
IMAGE_DIR = "images/test"
OUT_DIR = "images/embeddings_test"
os.makedirs(OUT_DIR, exist_ok=True)


In [8]:
# Keep only regular, non-hidden files: entries such as ".DS_Store" or an
# ".ipynb_checkpoints" directory would make Image.open fail.
image_files = sorted(
    entry
    for entry in os.listdir(IMAGE_DIR)
    if not entry.startswith(".") and os.path.isfile(os.path.join(IMAGE_DIR, entry))
)

batch = []
names = []

for i, img_name in enumerate(tqdm(image_files)):
    batch.append(load_image(os.path.join(IMAGE_DIR, img_name)))
    names.append(img_name)

    # Run the model when a full batch is collected, and also on the very last
    # image so the final (possibly smaller) batch is not lost.
    if len(batch) == BATCH_SIZE or i == len(image_files) - 1:
        embeddings = model.predict(np.array(batch), verbose=0)

        for name, emb in zip(names, embeddings):
            # Replace whatever extension the image has with ".npy" so the
            # saved file keeps the image's stem for every image format,
            # not only for ".png".
            stem = os.path.splitext(name)[0]
            np.save(os.path.join(OUT_DIR, stem + ".npy"), emb)

        batch, names = [], []

100%|██████████| 5396/5396 [04:13<00:00, 21.25it/s]


In [None]:
X1 = []
ids1 = []

# Sorted order keeps the rows of X1 and the entries of ids1 aligned and reproducible.
for file in sorted(os.listdir(OUT_DIR)):
    # Skip anything that is not a saved embedding (same filter as load_embeddings_dict).
    if not file.endswith(".npy"):
        continue
    X1.append(np.load(os.path.join(OUT_DIR, file)))
    # Strip only the trailing extension to recover the image stem.
    ids1.append(os.path.splitext(file)[0])

# One row per embedding file, written out as a single matrix.
X1 = np.array(X1)
np.save("test_embeddings_using_efficientnetb0.npy", X1)


Let's combine the tabular data with the generated embeddings.


In [9]:
import pandas as pd

# Read the tabular train and test splits from the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))


In [2]:
def load_embeddings_dict(folder):
    """Load every ``.npy`` file in ``folder`` into a dictionary.

    Args:
        folder: Directory holding one saved NumPy array per file.

    Returns:
        dict mapping each file name, minus its trailing ``.npy``, to the array
        loaded from that file. Entries are inserted in sorted file-name order;
        files with any other extension are ignored.
    """
    suffix = ".npy"
    emb_dict = {}
    # Sorting makes the dictionary order independent of the filesystem.
    for f in sorted(os.listdir(folder)):
        if f.endswith(suffix):
            # Slice off only the trailing suffix; str.replace would also
            # remove ".npy" from the middle of a name.
            emb_dict[f[: -len(suffix)]] = np.load(os.path.join(folder, f))
    return emb_dict


In [3]:
# Build one {file stem: embedding} dictionary per split.
train_emb, test_emb = (
    load_embeddings_dict(emb_folder)
    for emb_folder in ("images/embeddings_train", "images/embeddings_test")
)


In [6]:
os.makedirs("data/processed", exist_ok=True)

# Element-wise average of all training embeddings; it stands in for the image
# vector of test rows that have no embedding of their own.
mean_embedding = np.stack([*train_emb.values()]).mean(axis=0)
np.save("data/processed/mean_embedding.npy", mean_embedding)

In [10]:
# Parse the date column of both splits into pandas datetimes.
for frame in (train_data, test_data):
    frame["date"] = pd.to_datetime(frame["date"])

In [12]:
# Split the parsed date into calendar components, applied identically to both splits.
for frame in (train_data, test_data):
    stamp = frame["date"].dt
    frame["year"] = stamp.year
    frame["month"] = stamp.month
    frame["day"] = stamp.day
    # Day of the week as an integer: 0 = Monday ... 6 = Sunday.
    frame["day_of_week"] = stamp.dayofweek


In [16]:
# Start from every column of the training frame and display the list.
tabular_cols = train_data.columns.tolist()
tabular_cols

['id',
 'date',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15',
 'year',
 'month',
 'day',
 'day_of_week']

In [18]:
# Exclude the identifier, the raw timestamp and the target from the feature
# columns. Selecting by name keeps this cell safe to re-run; a positional
# `del tabular_cols[0:3]` would strip three more columns on every execution.
tabular_cols = [col for col in train_data.columns if col not in ("id", "date", "price")]

In [20]:
# Number of tabular feature columns left once id, date and price are removed.
len(tabular_cols)

22

In [21]:
X_train = []
y_train = []

# Convert the tabular columns to float32 once for the whole frame instead of
# building an object Series per row with iterrows(), which is much slower.
train_tab = train_data[tabular_cols].to_numpy(dtype="float32")
train_row_ids = train_data["id"].astype(str)

for rid, tab, price in zip(train_row_ids, train_tab, train_data["price"]):
    img = train_emb.get(rid)
    if img is None:
        continue  # no image embedding for this id: drop the row

    # Feature vector = tabular values followed by the image embedding.
    X_train.append(np.concatenate([tab, img]))
    y_train.append(price)


In [22]:
# Pack the collected rows and targets into float32 arrays.
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32)

In [24]:
X_test = []
test_ids = []

# Convert the tabular columns to float32 once for the whole frame instead of
# building an object Series per row with iterrows(), which is much slower.
test_tab = test_data[tabular_cols].to_numpy(dtype="float32")
test_row_ids = test_data["id"].astype(str)

for rid, tab in zip(test_row_ids, test_tab):
    # Every test row is kept: rows without an image embedding fall back to
    # the mean training embedding.
    img = test_emb.get(rid, mean_embedding)

    X_test.append(np.concatenate([tab, img]))
    test_ids.append(rid)

In [25]:
# Pack the collected rows into a float32 matrix and the ids into an array.
X_test = np.asarray(X_test, dtype=np.float32)
test_ids = np.asarray(test_ids)

In [27]:
# (rows kept, tabular feature count + image embedding length)
X_train.shape

(16209, 1302)

In [28]:
# Write every processed array to its own .npy file under data/processed.
for out_path, array in (
    ("data/processed/X_train.npy", X_train),
    ("data/processed/y_train.npy", y_train),
    ("data/processed/X_test.npy", X_test),
    ("data/processed/test_ids.npy", test_ids),
):
    np.save(out_path, array)