**PREPROCESSING**

In [None]:
# Mount Google Drive at /content/drive; later cells read images from and write
# embeddings / model weights to paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
TARGET = "price"

# Load the raw training table.
df = pd.read_csv("train_cdc.csv")

# Capture the ids (as strings) before the column is dropped; they are the keys
# used further down to look up each house's image embedding.
house_ids = df["id"].astype(str).values

# Append the natural log of the target as "log_price", then discard the raw
# target together with the id / date / zipcode columns, which are not used as
# tabular features.
df = df.assign(log_price=np.log(df[TARGET])).drop(
    columns=[TARGET, "id", "date", "zipcode"]
)

In [None]:
# Separate the feature matrix from the regression target (log of the price).
X = df.drop(columns=["log_price"])
y = df["log_price"].values

# Column lists consumed by the ColumnTransformer: int64/float64 columns are
# treated as numeric features, object columns as categorical ones.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

print("Numerical columns:", num_cols)
print("Categorical columns:", cat_cols)

# 80/20 train/validation split. house_ids is split alongside X and y so every
# row keeps the id needed to find its image embedding.
X_train, X_val, y_train, y_val, ids_train, ids_val = train_test_split(
    X,
    y,
    house_ids,
    test_size=0.2,
    random_state=42,
)

Numerical columns: ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'lat', 'long', 'sqft_living15', 'sqft_lot15']
Categorical columns: []


In [None]:
# Standardize numeric columns and one-hot encode categorical ones; categories
# unseen during fitting are ignored at transform time.
tabular_steps = [
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
]
preprocessor = ColumnTransformer(transformers=tabular_steps)

# Fit on the training split only, then apply the fitted transform to validation.
X_train_proc = preprocessor.fit_transform(X_train)
X_val_proc = preprocessor.transform(X_val)

# Width of the processed tabular feature matrix.
_, input_dim = X_train_proc.shape

**MODEL TRAINING**

In [None]:
from torchvision import models, transforms
from PIL import Image
import os

In [None]:
from torchvision import models
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ImageNet-pretrained Inception v3, used here purely as a frozen feature extractor.
inception = models.inception_v3(
    weights=models.Inception_V3_Weights.IMAGENET1K_V1
)

# Replace both classification layers (main and auxiliary) with identities so the
# network returns pooled features instead of class logits.
for classifier_owner in (inception, inception.AuxLogits):
    classifier_owner.fc = nn.Identity()

# Inference mode on the selected device.
inception = inception.eval().to(device)

# No gradients are needed for the backbone.
for weight in inception.parameters():
    weight.requires_grad = False


Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth


100%|██████████| 104M/104M [00:00<00:00, 119MB/s] 


In [None]:
# Channel statistics the ImageNet-pretrained torchvision weights were trained with.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Preprocessing applied to every image before it is fed to Inception v3.
# The pretrained weights expect ImageNet-normalized input (the model's internal
# `transform_input` step assumes it), so scaling to [0, 1] alone is not enough.
# NOTE: embeddings cached on Drive that were computed without the Normalize step
# must be re-extracted so that train and test embeddings stay consistent.
img_transform = transforms.Compose([
    transforms.Resize((299, 299)),   # Inception v3 input resolution
    transforms.ToTensor(),           # values in [0,1]
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

In [None]:
# Drive folder scanned for training images; file names (minus extension) are
# used as house ids by build_image_path_dict.
IMAGE_DIR = "/content/drive/MyDrive/satell_images_cdc_zoom15/"

def build_image_path_dict(image_dir):
    """
    Map house ids to image file paths for every image in a directory.

    Only files whose name ends (case-insensitively) in .jpg, .png or .jpeg are
    considered. The house id is the part of the file name before the first dot.

    Args:
        image_dir: directory to scan (not recursive).

    Returns: {house_id: image_path}
    """
    image_suffixes = (".jpg", ".png", ".jpeg")
    return {
        entry.split(".")[0]: os.path.join(image_dir, entry)
        for entry in os.listdir(image_dir)
        if entry.lower().endswith(image_suffixes)
    }
# Index every training image under IMAGE_DIR by house id and report how many were found.
image_path_dict = build_image_path_dict(IMAGE_DIR)
print("Total images found:", len(image_path_dict))


Total images found: 16110


In [None]:
def extract_embeddings_batched(
    image_path_dict,
    batch_size=32
):
    """
    Run the frozen Inception model over images and collect one embedding per id.

    Args:
        image_path_dict: {house_id: image_path}; images are processed in the
            dict's iteration order.
        batch_size: number of images stacked into a single forward pass.

    Returns:
        {house_id: numpy row of the model output for that image}.
    """
    all_ids = list(image_path_dict.keys())
    image_embeddings = {}

    batch_starts = range(0, len(all_ids), batch_size)
    for start in tqdm(batch_starts, desc="Extracting embeddings"):
        chunk_ids = all_ids[start:start + batch_size]

        # Load each image as RGB and apply the shared preprocessing pipeline.
        tensors = [
            img_transform(Image.open(image_path_dict[hid]).convert("RGB"))
            for hid in chunk_ids
        ]
        batch = torch.stack(tensors).to(device)

        # Forward pass without gradient tracking.
        # presumably (B, 2048), matching the ImageHead input width; confirm
        with torch.no_grad():
            features = inception(batch)

        features = features.cpu().numpy()

        # Row j of the output belongs to the j-th id of this chunk.
        image_embeddings.update(zip(chunk_ids, features))

    return image_embeddings


In [None]:
# Compute an embedding for every indexed training image.
image_embeddings = extract_embeddings_batched(
    image_path_dict,
    batch_size=32  # ideal for T4
)
# The whole {id: embedding} dict is handed to np.save; the cell that reloads this
# file reads it back with allow_pickle=True and .item() to recover the dict.
np.save("/content/drive/MyDrive/image_embeddings.npy", image_embeddings)
print("Saved embeddings:", len(image_embeddings))

Extracting embeddings: 100%|██████████| 504/504 [02:43<00:00,  3.08it/s]


Saved embeddings: 16110


In [None]:
from scipy.sparse import issparse

class FusionDataset(Dataset):
    """
    Dataset pairing tabular features, image embeddings and targets row by row.

    Args:
        X_tab: processed tabular features (dense array or scipy sparse matrix).
        ids: one embedding key per row of X_tab, in row order.
        y: regression targets, one per row.
        image_embeddings: {id: embedding vector}; every id in `ids` must be
            present (a missing id raises KeyError).

    Each item is a (tabular_features, image_embedding, target) tuple of
    float32 tensors.
    """

    def __init__(self, X_tab, ids, y, image_embeddings):
        # Sparse matrices are densified before conversion to a tensor.
        dense_tab = X_tab.toarray() if issparse(X_tab) else X_tab
        self.X_tab = torch.tensor(dense_tab, dtype=torch.float32)

        # Stack the per-id embeddings in the same order as the tabular rows.
        stacked_img = np.vstack([image_embeddings[key] for key in ids])
        self.X_img = torch.tensor(stacked_img, dtype=torch.float32)

        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        tab_row = self.X_tab[idx]
        img_row = self.X_img[idx]
        target = self.y[idx]
        return tab_row, img_row, target

In [None]:
# Reload the cached {id: embedding} dict; it was saved as a pickled object, so
# allow_pickle=True plus .item() is required to get the dict back.
EMBEDDINGS_PATH = "/content/drive/MyDrive/image_embeddings.npy"
image_embeddings = np.load(EMBEDDINGS_PATH, allow_pickle=True).item()

# Scale every embedding to unit L2 norm.
image_embeddings = {
    hid: vec / np.linalg.norm(vec)
    for hid, vec in image_embeddings.items()
}

# Datasets combine the processed tabular rows with the embedding of the matching id.
train_ds = FusionDataset(X_train_proc, ids_train, y_train, image_embeddings)
val_ds = FusionDataset(X_val_proc, ids_val, y_val, image_embeddings)

# Only the training loader shuffles; validation keeps row order.
train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False)

In [None]:
class TabularFCNet(nn.Module):
    """
    Fully connected regressor over the processed tabular features.

    Two hidden blocks (Linear -> BatchNorm1d -> ReLU -> Dropout(0.3)) of widths
    512 and 256, followed by a single-output linear layer.

    Args:
        input_dim: number of tabular input features.
    """

    def __init__(self, input_dim):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in (512, 256):
            layers += [
                nn.Linear(in_features, width),
                nn.BatchNorm1d(width),
                nn.ReLU(),
                nn.Dropout(0.3),
            ]
            in_features = width
        layers.append(nn.Linear(in_features, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Drop the trailing singleton dimension: (batch, 1) -> (batch,).
        prediction = self.net(x)
        return prediction.squeeze(1)


class ImageHead(nn.Module):
    """
    Projects a 2048-dimensional image embedding down to 64 non-negative features.

    Layout: Linear(2048, 512) -> BatchNorm1d -> ReLU -> Dropout(0.5)
            -> Linear(512, 64) -> ReLU.
    """

    def __init__(self):
        super().__init__()
        hidden_block = [
            nn.Linear(2048, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.5),
        ]
        output_block = [nn.Linear(512, 64), nn.ReLU()]
        self.net = nn.Sequential(*hidden_block, *output_block)

    def forward(self, x):
        features = self.net(x)
        return features


class TabularHead(nn.Module):
    """
    Single Linear + ReLU layer mapping the tabular features to 40 features.

    Args:
        input_dim: number of tabular input features.
    """

    def __init__(self, input_dim):
        super().__init__()
        projection = nn.Linear(input_dim, 40)
        self.net = nn.Sequential(projection, nn.ReLU())

    def forward(self, x):
        features = self.net(x)
        return features


class FusionRegressor(nn.Module):
    """
    Late-fusion regressor over image and tabular features.

    The image embedding goes through ImageHead (64 features) and the tabular
    row through TabularHead (40 features); both are concatenated (image first)
    and mapped to a single scalar by a linear layer.

    Args:
        tab_input_dim: number of tabular input features.
    """

    def __init__(self, tab_input_dim):
        super().__init__()
        self.image_head = ImageHead()
        self.tabular_head = TabularHead(tab_input_dim)
        fused_width = 64 + 40  # ImageHead output + TabularHead output
        self.regressor = nn.Linear(fused_width, 1)

    def forward(self, x_tab, x_img):
        # Image features come first in the concatenation.
        branches = [self.image_head(x_img), self.tabular_head(x_tab)]
        fused = torch.cat(branches, dim=1)
        output = self.regressor(fused)
        # (batch, 1) -> (batch,)
        return output.squeeze(1)


In [None]:
# Tabular-only baseline network, sized to the processed feature matrix.
tab_model = TabularFCNet(X_train_proc.shape[1]).to(device)

In [None]:


# Stage 1: fit the tabular baseline on log_price with Adam and MSE loss.
optimizer_tab = torch.optim.Adam(tab_model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

for epoch in range(198):
    tab_model.train()
    total_loss = 0

    # The loader also yields the image embedding; it is ignored in this stage.
    for x_tab, _, y in train_loader:
        x_tab, y = x_tab.to(device), y.to(device)

        optimizer_tab.zero_grad()
        y_pred = tab_model(x_tab)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer_tab.step()

        total_loss += loss.item()

    # Mean of the per-batch training losses for this epoch (no validation here).
    print(f"[Tab] Epoch {epoch+1} | Loss: {total_loss/len(train_loader):.4f}")


[Tab] Epoch 1 | Loss: 33.1910
[Tab] Epoch 2 | Loss: 1.4414
[Tab] Epoch 3 | Loss: 1.2948
[Tab] Epoch 4 | Loss: 1.2044
[Tab] Epoch 5 | Loss: 1.1062
[Tab] Epoch 6 | Loss: 1.0920
[Tab] Epoch 7 | Loss: 1.0077
[Tab] Epoch 8 | Loss: 0.9827
[Tab] Epoch 9 | Loss: 0.9773
[Tab] Epoch 10 | Loss: 0.9379
[Tab] Epoch 11 | Loss: 0.8830
[Tab] Epoch 12 | Loss: 0.8974
[Tab] Epoch 13 | Loss: 0.8434
[Tab] Epoch 14 | Loss: 0.8288
[Tab] Epoch 15 | Loss: 0.8181
[Tab] Epoch 16 | Loss: 0.7751
[Tab] Epoch 17 | Loss: 0.7627
[Tab] Epoch 18 | Loss: 0.7541
[Tab] Epoch 19 | Loss: 0.7496
[Tab] Epoch 20 | Loss: 0.7124
[Tab] Epoch 21 | Loss: 0.7261
[Tab] Epoch 22 | Loss: 0.7154
[Tab] Epoch 23 | Loss: 0.6856
[Tab] Epoch 24 | Loss: 0.6763
[Tab] Epoch 25 | Loss: 0.6649
[Tab] Epoch 26 | Loss: 0.6500
[Tab] Epoch 27 | Loss: 0.6319
[Tab] Epoch 28 | Loss: 0.6423
[Tab] Epoch 29 | Loss: 0.6322
[Tab] Epoch 30 | Loss: 0.6096
[Tab] Epoch 31 | Loss: 0.5997
[Tab] Epoch 32 | Loss: 0.5986
[Tab] Epoch 33 | Loss: 0.5741
[Tab] Epoch 34 | L

In [None]:
# Freeze the trained tabular baseline: eval mode (dropout off, BatchNorm uses
# running statistics) and no gradients for any of its parameters.
tab_model.eval()
for p in tab_model.parameters():
    p.requires_grad = False

In [None]:
# Fusion network over tabular features + image embeddings, sized to the processed feature matrix.
fusion_model = FusionRegressor(X_train_proc.shape[1]).to(device)

In [None]:

from sklearn.metrics import r2_score
import numpy as np

# Stage 2: train the fusion model to predict the residual of the tabular
# baseline, i.e. (log_price - tab_model prediction), with Adam + weight decay.
criterion = nn.MSELoss()
optimizer_fusion = torch.optim.Adam(fusion_model.parameters(), lr=5e-4, weight_decay = 1e-4)

for epoch in range(200):
    # ---------- TRAIN ----------
    fusion_model.train()
    train_loss = 0.0

    for x_tab, x_img, y in train_loader:
        x_tab = x_tab.to(device)
        x_img = x_img.to(device)
        y = y.to(device)

        # Residual target; computed under no_grad so the baseline is not updated.
        with torch.no_grad():
            y_tab = tab_model(x_tab)
            residual = y - y_tab

        optimizer_fusion.zero_grad()
        r_pred = fusion_model(x_tab, x_img)
        loss = criterion(r_pred, residual)
        loss.backward()
        optimizer_fusion.step()

        train_loss += loss.item()

    # Mean of the per-batch residual losses for this epoch.
    train_loss /= len(train_loader)

    # ---------- VALIDATION (R² on FINAL prediction) ----------
    fusion_model.eval()
    val_preds, val_targets = [], []

    with torch.no_grad():
        for x_tab, x_img, y in val_loader:
            x_tab = x_tab.to(device)
            x_img = x_img.to(device)

            y_tab = tab_model(x_tab)
            r_pred = fusion_model(x_tab, x_img)

            # Final prediction = baseline prediction + predicted residual.
            y_final = y_tab + r_pred

            val_preds.append(y_final.cpu())
            # y is never moved to the device here, so it can be collected as-is.
            val_targets.append(y)

    val_preds = torch.cat(val_preds).numpy()
    val_targets = torch.cat(val_targets).numpy()

    # R² is measured in log-price space, against the combined prediction.
    val_r2 = r2_score(val_targets, val_preds)

    print(
        f"[Epoch {epoch+1:02d}] "
        f"Fusion Train Loss: {train_loss:.4f} | "
        f"Val R² (FINAL): {val_r2:.4f}"
    )


[Epoch 01] Fusion Train Loss: 0.0356 | Val R² (FINAL): 0.8822
[Epoch 02] Fusion Train Loss: 0.0290 | Val R² (FINAL): 0.8837
[Epoch 03] Fusion Train Loss: 0.0278 | Val R² (FINAL): 0.8853
[Epoch 04] Fusion Train Loss: 0.0274 | Val R² (FINAL): 0.8868
[Epoch 05] Fusion Train Loss: 0.0265 | Val R² (FINAL): 0.8869
[Epoch 06] Fusion Train Loss: 0.0260 | Val R² (FINAL): 0.8876
[Epoch 07] Fusion Train Loss: 0.0258 | Val R² (FINAL): 0.8885
[Epoch 08] Fusion Train Loss: 0.0253 | Val R² (FINAL): 0.8857
[Epoch 09] Fusion Train Loss: 0.0250 | Val R² (FINAL): 0.8886
[Epoch 10] Fusion Train Loss: 0.0247 | Val R² (FINAL): 0.8872
[Epoch 11] Fusion Train Loss: 0.0244 | Val R² (FINAL): 0.8891
[Epoch 12] Fusion Train Loss: 0.0242 | Val R² (FINAL): 0.8881
[Epoch 13] Fusion Train Loss: 0.0242 | Val R² (FINAL): 0.8875
[Epoch 14] Fusion Train Loss: 0.0242 | Val R² (FINAL): 0.8897
[Epoch 15] Fusion Train Loss: 0.0239 | Val R² (FINAL): 0.8890
[Epoch 16] Fusion Train Loss: 0.0238 | Val R² (FINAL): 0.8886
[Epoch 1

In [None]:
def predict_final(tab_model, fusion_model, loader):
    """
    Produce combined predictions (baseline + residual) for every batch of a loader.

    Args:
        tab_model: tabular baseline, called as tab_model(x_tab).
        fusion_model: residual model, called as fusion_model(x_tab, x_img).
        loader: iterable yielding (x_tab, x_img, y) batches.

    Returns:
        (preds, targets) as numpy arrays, concatenated in loader order.
    """
    # Both models are switched to eval mode before inference.
    for model in (tab_model, fusion_model):
        model.eval()

    pred_chunks, target_chunks = [], []

    with torch.no_grad():
        for x_tab, x_img, y in loader:
            x_tab, x_img = x_tab.to(device), x_img.to(device)

            # Baseline prediction plus the predicted residual.
            combined = tab_model(x_tab) + fusion_model(x_tab, x_img)

            pred_chunks.append(combined.cpu())
            target_chunks.append(y)

    return torch.cat(pred_chunks).numpy(), torch.cat(target_chunks).numpy()


In [None]:
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

# Score the combined (baseline + residual) predictions on the validation split.
preds, targets = predict_final(tab_model, fusion_model, val_loader)

# Both metrics are computed on log_price values, not on raw prices.
rmse = np.sqrt(mean_squared_error(targets, preds))
r2 = r2_score(targets, preds)

print("Final RMSE:", rmse)
print("Final R2:", r2)

Final RMSE: 0.1787359293154698
Final R2: 0.884232759475708


In [None]:
# Persist only the fusion model's state_dict (weights) to Drive, not the whole module object.
torch.save(fusion_model.state_dict(), "/content/drive/MyDrive/fusion_model.pth")


In [None]:
# Persist only the tabular baseline's state_dict (weights) to Drive, not the whole module object.
torch.save(tab_model.state_dict(), "/content/drive/MyDrive/tabular_model.pth")

**PREDICTION**

In [None]:
# Paths of the state_dicts written by the training section.
tab_model_path = "/content/drive/MyDrive/tabular_model.pth"
fusion_model_path = "/content/drive/MyDrive/fusion_model.pth"

# Restore the weights into the already-constructed model objects; this cell
# requires tab_model and fusion_model to exist in the kernel.
# map_location places the loaded tensors on the current device.
tab_model.load_state_dict(torch.load(tab_model_path, map_location=device))
fusion_model.load_state_dict(torch.load(fusion_model_path, map_location=device))

<All keys matched successfully>

In [None]:
# NOTE: rebinds IMAGE_DIR (previously the training image folder) to the test
# image folder; cells that rely on the training folder must not be re-run after this.
IMAGE_DIR = '/content/drive/MyDrive/satell_images_cdc_zoom15_test'

In [None]:
# 1. Read the test table; keep the ids (as strings) for image lookup and for
#    the submission file.
test_df = pd.read_csv("test_cdc.csv")
test_house_ids = test_df["id"].astype(str).values

# 2. Drop the same non-feature columns as in training and reuse the
#    already-fitted preprocessor (no re-fitting on test data).
X_test = test_df.drop(columns=["id", "date", "zipcode"])
X_test_proc = preprocessor.transform(X_test)

# 3. Index the test images and keep only those whose id appears in the test table.
test_image_path_dict = build_image_path_dict(IMAGE_DIR)

test_image_map = {}
for hid in test_house_ids:
    if hid in test_image_path_dict:
        test_image_map[hid] = test_image_path_dict[hid]

print(f"Extracting embeddings for {len(test_image_map)} test images...")
test_embeddings = extract_embeddings_batched(test_image_map, batch_size=32)

# Scale each embedding to unit L2 norm, as was done for the training embeddings.
test_embeddings = {
    hid: vec / np.linalg.norm(vec)
    for hid, vec in test_embeddings.items()
}

# 4. Create a Prediction Dataset (Modified FusionDataset to handle missing 'y')
class TestFusionDataset(Dataset):
    """
    Target-less counterpart of the training dataset, used for prediction.

    Args:
        X_tab: processed tabular features (dense array or scipy sparse matrix).
        ids: one embedding key per row of X_tab, in row order.
        image_embeddings: {id: embedding vector}; ids without an entry get a
            2048-dimensional zero vector instead of raising.

    Each item is a (tabular_features, image_embedding) pair of float32 tensors.
    """

    def __init__(self, X_tab, ids, image_embeddings):
        dense_tab = X_tab.toarray() if issparse(X_tab) else X_tab
        self.X_tab = torch.tensor(dense_tab, dtype=torch.float32)

        # Collect one embedding per row, substituting zeros where none exists.
        rows = []
        for key in ids:
            if key in image_embeddings:
                rows.append(image_embeddings[key])
            else:
                rows.append(np.zeros(2048))
        self.X_img = torch.tensor(np.vstack(rows), dtype=torch.float32)

    def __len__(self):
        return len(self.X_tab)

    def __getitem__(self, idx):
        tab_row = self.X_tab[idx]
        img_row = self.X_img[idx]
        return tab_row, img_row

# Wrap the test inputs in a dataset/loader; order is preserved (no shuffling)
# so predictions line up with test_house_ids.
test_ds = TestFusionDataset(X_test_proc, test_house_ids, test_embeddings)
test_loader = DataLoader(test_ds, batch_size=64, shuffle=False)

# Inference: both models in eval mode, no gradient tracking.
for model in (tab_model, fusion_model):
    model.eval()

all_preds = []
with torch.no_grad():
    for x_tab, x_img in tqdm(test_loader, desc="Predicting"):
        x_tab = x_tab.to(device)
        x_img = x_img.to(device)

        # Log-price prediction = tabular baseline + image-based residual.
        y_final_log = tab_model(x_tab) + fusion_model(x_tab, x_img)

        # Undo the log transform to return to the original price scale.
        all_preds.append(torch.exp(y_final_log).cpu())

# Assemble the submission: one predicted price per test id, in file order.
test_predictions = torch.cat(all_preds).numpy()
submission = pd.DataFrame(
    {
        "id": test_house_ids,
        "price": test_predictions,
    }
)

submission.to_csv("test_predictions.csv", index=False)
print("Predictions saved to test_predictions.csv")

Extracting embeddings for 5396 test images...


Extracting embeddings: 100%|██████████| 169/169 [02:36<00:00,  1.08it/s]
Predicting: 100%|██████████| 85/85 [00:00<00:00, 479.27it/s]

Predictions saved to test_predictions.csv





In [None]:
# Sanity check: reload the saved submission and display its (rows, columns) shape.
df_pred = pd.read_csv("test_predictions.csv")
df_pred.shape

(5404, 2)