# **Test SET Prediction**

In [1]:
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms


In [2]:
# Load test data
test_df = pd.read_excel("../data/raw/test2.xlsx")

print(test_df.shape)
test_df.head()


(5404, 20)


Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2591820310,20141006T000000,4,2.25,2070,8893,2.0,0,0,4,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,7974200820,20140821T000000,5,3.0,2900,6730,1.0,0,0,5,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
2,7701450110,20140815T000000,4,2.5,3770,10893,2.0,0,2,3,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
3,9522300010,20150331T000000,3,3.5,4560,14608,2.0,0,2,3,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
4,9510861140,20140714T000000,3,2.5,2550,5376,2.0,0,0,3,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050


In [3]:
print(test_df[["lat", "long"]].describe())
print("Unique IDs:", test_df["id"].nunique())


               lat         long
count  5404.000000  5404.000000
mean     47.558091  -122.213575
std       0.139228     0.143023
min      47.155900  -122.515000
25%      47.465800  -122.328000
50%      47.570150  -122.231000
75%      47.677325  -122.124000
max      47.777500  -121.315000
Unique IDs: 5396


In [4]:
test_df.isnull().sum()

id               0
date             0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [5]:
output_path = "../data/processed/test_with_images.csv"

test_df.to_csv(output_path, index=False)

print(f"Saved test_with_images.csv to {output_path}")


Saved test_with_images.csv to ../data/processed/test_with_images.csv


In [6]:
from pathlib import Path

IMAGE_DIR = Path("../data/test_images")

test_df["id"] = test_df["id"].astype(float)
test_df["image_path"] = test_df["id"].apply(
    lambda x: IMAGE_DIR / f"{x}.png"
)

test_df["image_exists"] = test_df["image_path"].apply(lambda x: x.exists())
print(test_df["image_exists"].value_counts())


image_exists
True    5404
Name: count, dtype: int64


In [7]:
dup_ids = test_df["id"][test_df["id"].duplicated()]
print("Number of duplicate rows:", len(dup_ids))
print("Unique duplicate IDs:", dup_ids.unique())


Number of duplicate rows: 8
Unique duplicate IDs: [4.20240008e+09 3.30300013e+09 9.21150062e+09 7.50402131e+09
 3.96930003e+09 7.85342011e+09 9.82820046e+09 6.14300002e+09]


In [8]:
before = len(test_df)

test_df = test_df.drop_duplicates(subset="id").reset_index(drop=True)

after = len(test_df)

print(f"Rows before deduplication: {before}")
print(f"Rows after deduplication:  {after}")
print("Unique IDs:", test_df['id'].nunique())


Rows before deduplication: 5404
Rows after deduplication:  5396
Unique IDs: 5396


In [9]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image

class TestImageDataset(Dataset):
    def __init__(self, dataframe, transform):
        self.df = dataframe.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        img_path = self.df.loc[idx, "image_path"]
        img = Image.open(img_path).convert("RGB")
        img = self.transform(img)
        return img


In [10]:
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


In [11]:
test_image_dataset = TestImageDataset(
    dataframe=test_df,
    transform=image_transform
)

test_image_loader = DataLoader(
    test_image_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=0
)

len(test_image_dataset)


5396

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device


device(type='cpu')

In [13]:
from torchvision import models
import torch.nn as nn

cnn_model = models.resnet18(pretrained=True)
cnn_model.fc = nn.Identity()  # 512-d embeddings

cnn_model = cnn_model.to(device)
cnn_model.eval()




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [14]:
import torch
import numpy as np
from tqdm import tqdm

test_embeddings = []

with torch.no_grad():
    for batch in tqdm(test_image_loader, desc="Extracting test embeddings"):
        batch = batch.to(device)
        emb = cnn_model(batch)
        emb = emb.view(emb.size(0), -1)
        test_embeddings.append(emb.cpu().numpy())

X_test_img = np.vstack(test_embeddings)

print("Test image embeddings shape:", X_test_img.shape)


Extracting test embeddings: 100%|██████████| 169/169 [02:10<00:00,  1.30it/s]

Test image embeddings shape: (5396, 512)





In [15]:
print(test_df.shape)
print(X_test_img.shape)


(5396, 22)
(5396, 512)


In [16]:
test_df.head()

Unnamed: 0,id,date,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,image_path,image_exists
0,2591820000.0,20141006T000000,4,2.25,2070,8893,2.0,0,0,4,...,0,1986,0,98058,47.4388,-122.162,2390,7700,../data/test_images/2591820310.0.png,True
1,7974201000.0,20140821T000000,5,3.0,2900,6730,1.0,0,0,5,...,1070,1977,0,98115,47.6784,-122.285,2370,6283,../data/test_images/7974200820.0.png,True
2,7701450000.0,20140815T000000,4,2.5,3770,10893,2.0,0,2,3,...,0,1997,0,98006,47.5646,-122.129,3710,9685,../data/test_images/7701450110.0.png,True
3,9522300000.0,20150331T000000,3,3.5,4560,14608,2.0,0,2,3,...,0,1990,0,98034,47.6995,-122.228,4050,14226,../data/test_images/9522300010.0.png,True
4,9510861000.0,20140714T000000,3,2.5,2550,5376,2.0,0,0,3,...,0,2004,0,98052,47.6647,-122.083,2250,4050,../data/test_images/9510861140.0.png,True


In [17]:
import joblib

pca = joblib.load("../models/image_pca.joblib")
print("PCA loaded.")


PCA loaded.


In [18]:
X_test_img_pca = pca.transform(X_test_img)
print("X_test_img_pca shape:", X_test_img_pca.shape)


X_test_img_pca shape: (5396, 128)


In [19]:
def compute_green_cover_ratio(img):
    img = np.array(img)
    r, g, b = img[:,:,0], img[:,:,1], img[:,:,2]
    green_mask = (g > r) & (g > b) & (g > 60)
    return green_mask.mean()

def compute_road_density_ratio(img):
    img = np.array(img)
    gray = np.mean(img, axis=2)
    road_mask = gray > 160
    return road_mask.mean()

def compute_built_up_density_ratio(img):
    img = np.array(img)
    gray = np.mean(img, axis=2)
    built_mask = (gray > 80) & (gray < 160)
    return built_mask.mean()


In [20]:
green_list = []
road_list = []
built_list = []

for path in tqdm(test_df["image_path"], desc="Computing neighborhood features"):
    img = Image.open(path).convert("RGB")

    green_list.append(compute_green_cover_ratio(img))
    road_list.append(compute_road_density_ratio(img))
    built_list.append(compute_built_up_density_ratio(img))

test_df["green_cover_ratio"] = green_list
test_df["road_density_ratio"] = road_list
test_df["built_up_density_ratio"] = built_list


Computing neighborhood features: 100%|██████████| 5396/5396 [00:11<00:00, 452.22it/s]


In [21]:
print(test_df[
    ["green_cover_ratio", "road_density_ratio", "built_up_density_ratio"]
].describe())

print("Missing values:")
print(test_df[
    ["green_cover_ratio", "road_density_ratio", "built_up_density_ratio"]
].isnull().sum())


       green_cover_ratio  road_density_ratio  built_up_density_ratio
count        5396.000000         5396.000000             5396.000000
mean            0.272366            0.186950                0.295787
std             0.090347            0.120387                0.080941
min             0.016708            0.003006                0.042496
25%             0.211304            0.089756                0.242962
50%             0.262894            0.171524                0.291641
75%             0.325012            0.263393                0.343369
max             0.800781            0.682648                0.807037
Missing values:
green_cover_ratio         0
road_density_ratio        0
built_up_density_ratio    0
dtype: int64


In [22]:
tabular_features = [
    "bedrooms", "bathrooms", "sqft_living",
    "floors", "waterfront", "view",
    "condition", "grade",
    "sqft_living15", "sqft_lot15",
    "lat", "long"
]

X_test_tab = test_df[tabular_features].values
print("X_test_tab shape:", X_test_tab.shape)


X_test_tab shape: (5396, 12)


In [23]:
X_test_env = test_df[
    ["green_cover_ratio", "road_density_ratio", "built_up_density_ratio"]
].values

print("X_test_env shape:", X_test_env.shape)


X_test_env shape: (5396, 3)


In [24]:
import numpy as np

X_test_fused = np.hstack([
    X_test_tab,
    X_test_img_pca,
    X_test_env
])

print("X_test_fused shape:", X_test_fused.shape)


X_test_fused shape: (5396, 143)


In [26]:
from xgboost import XGBRegressor

fusion_model = XGBRegressor()
fusion_model.load_model("../models/fusion_xgb.json")

print("Fusion model loaded.")

Fusion model loaded.


In [31]:
print(fusion_model)

XGBRegressor(base_score='1.3050385E1', booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None,
             n_jobs=None, num_parallel_tree=None, ...)


In [27]:
y_test_pred_log = fusion_model.predict(X_test_fused)
print("Predictions (log) shape:", y_test_pred_log.shape)


Predictions (log) shape: (5396,)


In [28]:
y_test_pred_price = np.expm1(y_test_pred_log)

print(
    "Predicted price range:",
    y_test_pred_price.min(),
    y_test_pred_price.max()
)


Predicted price range: 88713.695 2809165.0


In [29]:
submission_df = pd.DataFrame({
    "id": test_df["id"].values,
    "predicted_price": y_test_pred_price
})

submission_df.head()


Unnamed: 0,id,predicted_price
0,2591820000.0,356218.1
1,7974201000.0,934747.3
2,7701450000.0,1030662.0
3,9522300000.0,1772473.0
4,9510861000.0,732046.8


In [30]:
submission_df.to_csv(
    "../data/processed/23119058_final.csv",
    index=False
)

print("Saved 23119058_final.csv")


Saved 23119058_final.csv
