## Multimodal Catboost (using PCA)

In [1]:
!pip install catboost --q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd

# Load data
df = pd.read_csv("train.csv")
emb = np.load("embeddings_final.npy")

TARGET = "price"
DROP_COLS = ["id", "date"]

df["renovated_post_2000"] = np.where(
df["yr_renovated"] > 0,
"yes",
"no"
)

df.drop(columns=["yr_renovated"], inplace=True)


CAT_COLS = [
    "view",
    "condition",
    "waterfront",
    "floors",
    "renovated_post_2000",
]

# Cast categoricals to string (CatBoost requirement)
for col in CAT_COLS:
    df[col] = df[col].astype(str)

y = df[TARGET]
X_tab = df.drop(columns=[TARGET] + DROP_COLS)


In [None]:
from sklearn.model_selection import train_test_split

X_tab_tr, X_tab_te, emb_tr, emb_te, y_tr, y_te = train_test_split(
    X_tab,
    emb,
    y,
    test_size=0.2,
    random_state=42
)


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
emb_tr_scaled = scaler.fit_transform(emb_tr)
emb_te_scaled = scaler.transform(emb_te)


In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=128, random_state=42)
emb_tr_pca = pca.fit_transform(emb_tr_scaled)
emb_te_pca = pca.transform(emb_te_scaled)

print("PCA explained variance:", pca.explained_variance_ratio_.sum())


PCA explained variance: 0.8699722


In [None]:
emb_tr_df = pd.DataFrame(
    emb_tr_pca,
    columns=[f"img_pca_{i}" for i in range(emb_tr_pca.shape[1])]
)

emb_te_df = pd.DataFrame(
    emb_te_pca,
    columns=[f"img_pca_{i}" for i in range(emb_te_pca.shape[1])]
)

X_tr = pd.concat([X_tab_tr.reset_index(drop=True), emb_tr_df], axis=1)
X_te = pd.concat([X_tab_te.reset_index(drop=True), emb_te_df], axis=1)


In [None]:
cat_features = [
    X_tr.columns.get_loc(col)
    for col in CAT_COLS
]

In [None]:
from catboost import CatBoostRegressor

model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42,
    verbose=200
)

model.fit(
    X_tr, y_tr,
    eval_set=(X_te, y_te),
    cat_features=cat_features,
    use_best_model=True
)


0:	learn: 354900.2539120	test: 347626.7814804	best: 347626.7814804 (0)	total: 307ms	remaining: 15m 21s
200:	learn: 110557.4713177	test: 131473.4282432	best: 131473.4282432 (200)	total: 32s	remaining: 7m 26s
400:	learn: 89688.7534140	test: 120367.7324321	best: 120367.7324321 (400)	total: 1m	remaining: 6m 30s
600:	learn: 76513.1284840	test: 116529.7224736	best: 116529.7224736 (600)	total: 1m 29s	remaining: 5m 56s
800:	learn: 67860.5789016	test: 115918.7807015	best: 115891.2342418 (797)	total: 1m 59s	remaining: 5m 28s
1000:	learn: 60798.6875342	test: 115401.5407492	best: 115401.5407492 (1000)	total: 2m 29s	remaining: 4m 58s
1200:	learn: 54716.2122960	test: 114954.7628484	best: 114943.7194737 (1194)	total: 2m 59s	remaining: 4m 28s
1400:	learn: 49519.4714494	test: 114771.0315237	best: 114755.3150175 (1393)	total: 3m 28s	remaining: 3m 58s
1600:	learn: 45061.5691968	test: 114632.2710506	best: 114616.0859052 (1590)	total: 3m 57s	remaining: 3m 27s
1800:	learn: 40997.7261205	test: 114514.0976336

<catboost.core.CatBoostRegressor at 0x7d30f3dae5a0>

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

y_pred = model.predict(X_te)

rmse = np.sqrt(mean_squared_error(y_te, y_pred))
r2 = r2_score(y_te, y_pred)

print(f"Test RMSE: {rmse:.4f}")
print(f"Test R²:   {r2:.4f}")


Test RMSE: 114444.5768
Test R²:   0.8956


## Multimodal Regressor 1

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [None]:
TARGET = "price"
LOG_TARGET = "log_price"
DROP_COLS = ["id", "date"]

CAT_COLS = [
    "view",
    "condition",
    "waterfront",
    "floors",
    "renovated_post_2000",
]

df[LOG_TARGET] = np.log1p(df[TARGET])

for col in CAT_COLS:
    df[col] = df[col].astype(str)

X = df.drop(columns=[TARGET, LOG_TARGET] + DROP_COLS)
y = df[LOG_TARGET]


In [None]:
df = pd.read_csv("train.csv")
emb = np.load("embeddings_final.npy")  # ALIGNED embeddings

In [None]:
TARGET = "price"
LOG_TARGET = "log_price"
DROP_COLS = ["id", "date"]

CAT_COLS = [
    "view",
    "condition",
    "waterfront",
    "floors",
    "renovated_post_2000",
]


In [None]:
df[LOG_TARGET] = np.log1p(df[TARGET])

y = df[LOG_TARGET]
X_tab = df.drop(columns=[TARGET, LOG_TARGET] + DROP_COLS)

In [None]:
X_tab_tr, X_tab_te, emb_tr, emb_te, y_tr, y_te = train_test_split(
    X_tab,
    emb,
    y,
    test_size=0.2,
    random_state=42
)


In [None]:
scaler = StandardScaler()
emb_tr_scaled = scaler.fit_transform(emb_tr)
emb_te_scaled = scaler.transform(emb_te)

In [None]:
pca = PCA(n_components=128, random_state=42)
emb_tr_pca = pca.fit_transform(emb_tr_scaled)
emb_te_pca = pca.transform(emb_te_scaled)

print("PCA explained variance:", pca.explained_variance_ratio_.sum())

PCA explained variance: 0.8699722


In [None]:
class MultimodalDataset(Dataset):
    def __init__(self, X_tab, X_img, y):
        self.X_tab = torch.tensor(X_tab.values, dtype=torch.float32)
        self.X_img = torch.tensor(X_img, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X_tab[idx], self.X_img[idx], self.y[idx]


In [None]:
class MultimodalRegressor(nn.Module):
    def __init__(self, tab_dim, img_dim):
        super().__init__()

        self.tab_net = nn.Sequential(
            nn.LayerNorm(tab_dim),
            nn.Linear(tab_dim, 256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.SiLU()
        )

        self.img_net = nn.Sequential(
            nn.LayerNorm(img_dim),
            nn.Linear(img_dim, 512),
            nn.SiLU(),
            nn.Dropout(0.4),
            nn.Linear(512, 256),
            nn.SiLU()
        )

        self.head = nn.Sequential(
            nn.Linear(128 + 256, 256),
            nn.SiLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1)
        )

    def forward(self, x_tab, x_img):
        t = self.tab_net(x_tab)
        i = self.img_net(x_img)
        x = torch.cat([t, i], dim=1)
        return self.head(x).squeeze(1)


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"

model = MultimodalRegressor(
    tab_dim=X_tab_tr.shape[1],
    img_dim=emb_tr_pca.shape[1]
).to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=3e-4,
    weight_decay=1e-4
)

criterion = nn.MSELoss()


In [None]:
train_ds = MultimodalDataset(X_tab_tr, emb_tr_pca, y_tr)
val_ds   = MultimodalDataset(X_tab_te, emb_te_pca, y_te)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_ds, batch_size=512, shuffle=False)


In [None]:
def train_epoch(model, loader):
    model.train()
    losses = []

    for xt, xi, y in loader:
        xt, xi, y = xt.to(device), xi.to(device), y.to(device)

        optimizer.zero_grad()
        preds = model(xt, xi)
        loss = criterion(preds, y)
        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    return np.mean(losses)


def eval_epoch(model, loader):
    model.eval()
    preds, targets = [], []

    with torch.no_grad():
        for xt, xi, y in loader:
            xt, xi = xt.to(device), xi.to(device)
            out = model(xt, xi)
            preds.append(out.cpu())
            targets.append(y)

    preds = torch.cat(preds)
    targets = torch.cat(targets)

    rmse = torch.sqrt(criterion(preds, targets)).item()
    r2 = r2_score(targets.numpy(), preds.numpy())

    return rmse, r2


In [None]:
best_rmse = float("inf")
patience = 10
wait = 0

for epoch in range(1, 1000):
    train_loss = train_epoch(model, train_loader)
    val_rmse, val_r2 = eval_epoch(model, val_loader)

    print(
        f"Epoch {epoch:03d} | "
        f"Train MSE {train_loss:.4f} | "
        f"Val RMSE {val_rmse:.4f} | "
        f"Val R² {val_r2:.4f}"
    )

    if val_rmse < best_rmse:
        best_rmse = val_rmse
        wait = 0
        torch.save(model.state_dict(), "best_nn_model.pt")
    else:
        wait += 1
        if wait >= patience:
            print("Early stopping")
            break


Epoch 001 | Train MSE 89.2457 | Val RMSE 2.5352 | Val R² -22.2912
Epoch 002 | Train MSE 2.7307 | Val RMSE 0.7142 | Val R² -0.8483
Epoch 003 | Train MSE 1.7215 | Val RMSE 0.6044 | Val R² -0.3237
Epoch 004 | Train MSE 1.5326 | Val RMSE 0.5657 | Val R² -0.1597
Epoch 005 | Train MSE 1.4463 | Val RMSE 0.5326 | Val R² -0.0281
Epoch 006 | Train MSE 1.2947 | Val RMSE 0.5225 | Val R² 0.0105
Epoch 007 | Train MSE 1.2236 | Val RMSE 0.5198 | Val R² 0.0209
Epoch 008 | Train MSE 1.0968 | Val RMSE 0.4944 | Val R² 0.1143
Epoch 009 | Train MSE 1.0522 | Val RMSE 0.4819 | Val R² 0.1585
Epoch 010 | Train MSE 1.0184 | Val RMSE 0.4821 | Val R² 0.1579
Epoch 011 | Train MSE 0.9766 | Val RMSE 0.4758 | Val R² 0.1795
Epoch 012 | Train MSE 0.9300 | Val RMSE 0.4631 | Val R² 0.2228
Epoch 013 | Train MSE 0.8826 | Val RMSE 0.4650 | Val R² 0.2164
Epoch 014 | Train MSE 0.8433 | Val RMSE 0.4541 | Val R² 0.2528
Epoch 015 | Train MSE 0.8149 | Val RMSE 0.4472 | Val R² 0.2753
Epoch 016 | Train MSE 0.7912 | Val RMSE 0.4477 |

In [None]:
model.load_state_dict(torch.load("best_nn_model.pt"))
model.eval()

with torch.no_grad():
    log_preds = model(
        torch.tensor(X_tab_te.values, dtype=torch.float32).to(device),
        torch.tensor(emb_te_pca, dtype=torch.float32).to(device)
    ).cpu().numpy()

price_preds = np.expm1(log_preds)
price_true = np.expm1(y_te.values)

rmse_price = np.sqrt(mean_squared_error(price_true, price_preds))
r2_price = r2_score(price_true, price_preds)

print("FINAL PRICE RMSE:", rmse_price)
print("FINAL PRICE R²:", r2_price)


FINAL PRICE RMSE: 206753.6735429027
FINAL PRICE R²: 0.6593548894910731


In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv("train.csv")
emb_new = np.load("/content/image_embeddings.npy")

print(df.shape[0], emb_new.shape[0])


16209 16209


In [None]:
emb_df = pd.read_csv("embeddings.csv")

def extract_idx(x):
    return int(str(x).split("_")[0])

row_ids = emb_df.iloc[:, 0].apply(extract_idx).values


In [None]:
assert set(row_ids) == set(range(len(df)))


In [None]:
emb_aligned = np.zeros_like(emb_new)

for i, rid in enumerate(row_ids):
    emb_aligned[rid] = emb_new[i]

np.save("new_embeddings_aligned.npy", emb_aligned)


## Multimodal Catboost with 90% var PCA

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score, mean_squared_error

from catboost import CatBoostRegressor


In [None]:
df = pd.read_csv("train.csv")

df["renovated_post_2000"] = np.where(
df["yr_renovated"] > 0,
"yes",
"no"
)

TARGET = "price"
LOG_TARGET = "log_price"
DROP_COLS = ["id", "date"]



CAT_COLS = [
    "view",
    "condition",
    "waterfront",
    "floors",
    "renovated_post_2000",
]

# categorical -> string (CatBoost requirement)
for c in CAT_COLS:
    df[c] = df[c].astype(str)

# log target
df[LOG_TARGET] = np.log1p(df[TARGET])


In [None]:
emb = np.load("/content/image_embeddings.npy")

In [None]:
X_tab = df.drop(columns=[TARGET, LOG_TARGET] + DROP_COLS)
y = df[LOG_TARGET]

In [None]:
X_tab_tr, X_tab_val, emb_tr, emb_val, y_tr, y_val = train_test_split(
    X_tab,
    emb,
    y,
    test_size=0.2,
    random_state=42
)


In [None]:
emb_scaler = StandardScaler()
emb_tr = emb_scaler.fit_transform(emb_tr)
emb_val = emb_scaler.transform(emb_val)

pca = PCA(n_components=0.90, random_state=42)
emb_tr = pca.fit_transform(emb_tr)
emb_val = pca.transform(emb_val)

print("PCA components used:", pca.n_components_)
print("PCA variance retained:", pca.explained_variance_ratio_.sum())


PCA components used: 237
PCA variance retained: 0.90033203


In [None]:
emb_cols = [f"img_pca_{i}" for i in range(emb_tr.shape[1])]

X_tr = pd.concat(
    [X_tab_tr.reset_index(drop=True),
     pd.DataFrame(emb_tr, columns=emb_cols)],
    axis=1
)

X_val = pd.concat(
    [X_tab_val.reset_index(drop=True),
     pd.DataFrame(emb_val, columns=emb_cols)],
    axis=1
)


In [None]:
cat_features = [
    X_tr.columns.get_loc(c)
    for c in CAT_COLS
    if c in X_tr.columns
]

model = CatBoostRegressor(
    iterations=3000,
    learning_rate=0.03,
    depth=8,
    loss_function="RMSE",
    eval_metric="RMSE",
    task_type="GPU",     # 🔥 GPU ENABLED
    devices="0",
    random_seed=42,
    verbose=200
)


In [None]:
model.fit(
    X_tr, y_tr,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    use_best_model=True
)


0:	learn: 0.5121065	test: 0.5139197	best: 0.5139197 (0)	total: 142ms	remaining: 7m 4s
200:	learn: 0.1750969	test: 0.1820105	best: 0.1820105 (200)	total: 15.8s	remaining: 3m 40s
400:	learn: 0.1662595	test: 0.1756170	best: 0.1756170 (400)	total: 27s	remaining: 2m 55s
600:	learn: 0.1586181	test: 0.1718496	best: 0.1718496 (600)	total: 41s	remaining: 2m 43s
800:	learn: 0.1525459	test: 0.1701199	best: 0.1701198 (799)	total: 55.6s	remaining: 2m 32s
1000:	learn: 0.1477681	test: 0.1691262	best: 0.1691260 (998)	total: 1m 9s	remaining: 2m 18s
1200:	learn: 0.1436744	test: 0.1683421	best: 0.1683402 (1197)	total: 1m 22s	remaining: 2m 3s
1400:	learn: 0.1387271	test: 0.1674843	best: 0.1674698 (1393)	total: 1m 37s	remaining: 1m 50s
1600:	learn: 0.1339933	test: 0.1672166	best: 0.1672166 (1600)	total: 1m 52s	remaining: 1m 38s
1800:	learn: 0.1303802	test: 0.1670072	best: 0.1670072 (1800)	total: 2m 7s	remaining: 1m 25s
2000:	learn: 0.1269955	test: 0.1666852	best: 0.1666790 (1994)	total: 2m 23s	remaining: 1

<catboost.core.CatBoostRegressor at 0x7948ec181550>

In [None]:
log_preds = model.predict(X_val)
r2_log = r2_score(y_val, log_preds)
rmse_log = np.sqrt(mean_squared_error(y_val, log_preds))

print("LOG R²:", r2_log)
print("LOG RMSE:", rmse_log)


LOG R²: 0.8998553395046286
LOG RMSE: 0.16623864987367876


In [None]:
price_preds = np.expm1(log_preds)
price_true = np.expm1(y_val.values)

rmse_price = np.sqrt(mean_squared_error(price_true, price_preds))
r2_price = r2_score(price_true, price_preds)

print("FINAL PRICE RMSE:", rmse_price)
print("FINAL PRICE R²:", r2_price)


FINAL PRICE RMSE: 111620.53024993911
FINAL PRICE R²: 0.9007149513933973


## Multimodal Meta Learner

In [2]:
!pip install catboost --q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [14]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import Ridge

from catboost import CatBoostRegressor


In [4]:
df = pd.read_csv("train.csv")
emb = np.load("embeddings_aligned.npy")  # aligned


In [5]:

df["renovated_post_2000"] = np.where(df["yr_renovated"] > 0, "yes", "no")

df.drop(columns=["yr_renovated"], inplace=True)

CAT_COLS = [
"view",
"condition",
"waterfront",
"floors",
"renovated_post_2000",

]

TARGET = "price"
LOG_TARGET = "log_price"
DROP_COLS = ["id", "date"]

In [6]:
df[LOG_TARGET] = np.log1p(df[TARGET])

for c in CAT_COLS:
    df[c] = df[c].astype(str)

X_tab = df.drop(columns=[TARGET, LOG_TARGET] + DROP_COLS)
y = df[LOG_TARGET].values


In [7]:
class ResidualBlock(nn.Module):
    def __init__(self, dim, dropout=0.2):
        super().__init__()
        self.fc1 = nn.Linear(dim, dim)
        self.fc2 = nn.Linear(dim, dim)
        self.norm1 = nn.LayerNorm(dim)
        self.norm2 = nn.LayerNorm(dim)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        r = x
        x = self.fc1(x)
        x = self.norm1(x)
        x = F.silu(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.norm2(x)
        return F.silu(x + r)


class ImageRegressorStrong(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.LayerNorm(512),
            nn.SiLU(),
            nn.Dropout(0.3),
        )
        self.res1 = ResidualBlock(512)
        self.res2 = ResidualBlock(512)
        self.head = nn.Sequential(
            nn.Linear(512, 128),
            nn.LayerNorm(128),
            nn.SiLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1)
        )

    def forward(self, x):
        x = self.proj(x)
        x = self.res1(x)
        x = self.res2(x)
        return self.head(x).squeeze(1)


In [8]:
N_SPLITS = 5
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_tab = np.zeros(len(df))
oof_img = np.zeros(len(df))

device = "cuda" if torch.cuda.is_available() else "cpu"


In [15]:
epochs_img = 30

for fold, (tr_idx, val_idx) in enumerate(tqdm(kf.split(X_tab), total=N_SPLITS, desc="K-Fold")):
    print(f"\nFold {fold+1}/{N_SPLITS}")

    # ---------- TABULAR (CatBoost) ----------
    X_tr, X_val = X_tab.iloc[tr_idx], X_tab.iloc[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    cat_features = [X_tr.columns.get_loc(c) for c in CAT_COLS]

    cb = CatBoostRegressor(
        iterations=3000,
        learning_rate=0.03,
        depth=8,
        loss_function="RMSE",
        task_type="GPU",
        devices="0",
        random_seed=42,
        verbose=300
    )

    cb.fit(
        X_tr, y_tr,
        eval_set=(X_val, y_val),
        cat_features=cat_features,
        use_best_model=True
    )

    oof_tab[val_idx] = cb.predict(X_val)

    # ---------- IMAGE MODEL ----------
    X_img_tr = torch.tensor(emb[tr_idx], dtype=torch.float32).to(device)
    X_img_val = torch.tensor(emb[val_idx], dtype=torch.float32).to(device)
    y_tr_t = torch.tensor(y_tr, dtype=torch.float32).to(device)

    img_model = ImageRegressorStrong(emb.shape[1]).to(device)
    opt = torch.optim.AdamW(img_model.parameters(), lr=3e-4, weight_decay=1e-4)
    loss_fn = nn.MSELoss()

    for _ in tqdm(range(epochs_img), desc="Image NN", leave=False):
      img_model.train()
      opt.zero_grad()
      preds = img_model(X_img_tr)
      loss = loss_fn(preds, y_tr_t)
      loss.backward()
      opt.step()

    img_model.eval()
    with torch.no_grad():
        oof_img[val_idx] = img_model(X_img_val).cpu().numpy()


K-Fold:   0%|          | 0/5 [00:00<?, ?it/s]


Fold 1/5
0:	learn: 0.5120258	test: 0.5138818	best: 0.5138818 (0)	total: 82.5ms	remaining: 4m 7s
300:	learn: 0.1694133	test: 0.1766888	best: 0.1766888 (300)	total: 10.6s	remaining: 1m 35s
600:	learn: 0.1585911	test: 0.1694051	best: 0.1694051 (600)	total: 20s	remaining: 1m 19s
900:	learn: 0.1518871	test: 0.1660992	best: 0.1660959 (898)	total: 30s	remaining: 1m 9s
1200:	learn: 0.1483521	test: 0.1651513	best: 0.1651430 (1180)	total: 39.1s	remaining: 58.6s
1500:	learn: 0.1460256	test: 0.1646522	best: 0.1646434 (1417)	total: 47.5s	remaining: 47.5s
1800:	learn: 0.1436826	test: 0.1642707	best: 0.1642538 (1779)	total: 57.5s	remaining: 38.3s
2100:	learn: 0.1418668	test: 0.1640004	best: 0.1639959 (2093)	total: 1m 7s	remaining: 28.9s
2400:	learn: 0.1406551	test: 0.1639500	best: 0.1639395 (2267)	total: 1m 16s	remaining: 19.2s
2700:	learn: 0.1396509	test: 0.1638129	best: 0.1638028 (2682)	total: 1m 24s	remaining: 9.35s
2999:	learn: 0.1387676	test: 0.1636639	best: 0.1636589 (2995)	total: 1m 33s	remai

Image NN:   0%|          | 0/30 [00:00<?, ?it/s]


Fold 2/5
0:	learn: 0.5141298	test: 0.5051863	best: 0.5051863 (0)	total: 35.4ms	remaining: 1m 46s
300:	learn: 0.1691034	test: 0.1728812	best: 0.1728812 (300)	total: 10.7s	remaining: 1m 35s
600:	learn: 0.1587306	test: 0.1654488	best: 0.1654441 (599)	total: 20.7s	remaining: 1m 22s
900:	learn: 0.1538365	test: 0.1628747	best: 0.1628747 (900)	total: 30.7s	remaining: 1m 11s
1200:	learn: 0.1510309	test: 0.1617975	best: 0.1617975 (1200)	total: 39.3s	remaining: 58.9s
1500:	learn: 0.1482115	test: 0.1608840	best: 0.1608840 (1500)	total: 49.7s	remaining: 49.6s
1800:	learn: 0.1457386	test: 0.1602089	best: 0.1602084 (1797)	total: 1m	remaining: 40s
2100:	learn: 0.1440751	test: 0.1597503	best: 0.1597503 (2100)	total: 1m 10s	remaining: 30s
2400:	learn: 0.1428472	test: 0.1594564	best: 0.1594564 (2400)	total: 1m 20s	remaining: 20.2s
2700:	learn: 0.1416271	test: 0.1592784	best: 0.1592647 (2690)	total: 1m 30s	remaining: 10s
2999:	learn: 0.1404174	test: 0.1591434	best: 0.1591326 (2994)	total: 1m 40s	remaini

Image NN:   0%|          | 0/30 [00:00<?, ?it/s]


Fold 3/5
0:	learn: 0.5081461	test: 0.5293123	best: 0.5293123 (0)	total: 68.5ms	remaining: 3m 25s
300:	learn: 0.1715598	test: 0.1786010	best: 0.1786010 (300)	total: 9.82s	remaining: 1m 28s
600:	learn: 0.1599679	test: 0.1700241	best: 0.1700241 (600)	total: 19.3s	remaining: 1m 17s
900:	learn: 0.1559548	test: 0.1677527	best: 0.1677525 (899)	total: 28.4s	remaining: 1m 6s
1200:	learn: 0.1519397	test: 0.1658030	best: 0.1657998 (1194)	total: 38.5s	remaining: 57.6s
1500:	learn: 0.1497539	test: 0.1649209	best: 0.1649209 (1500)	total: 47.3s	remaining: 47.2s
1800:	learn: 0.1480116	test: 0.1645083	best: 0.1645073 (1796)	total: 57.2s	remaining: 38.1s
2100:	learn: 0.1463863	test: 0.1640251	best: 0.1640200 (2091)	total: 1m 6s	remaining: 28.5s
2400:	learn: 0.1451745	test: 0.1637070	best: 0.1637026 (2398)	total: 1m 15s	remaining: 18.8s
2700:	learn: 0.1440631	test: 0.1634367	best: 0.1634241 (2687)	total: 1m 23s	remaining: 9.22s
2999:	learn: 0.1434042	test: 0.1631841	best: 0.1631841 (2999)	total: 1m 31s	

Image NN:   0%|          | 0/30 [00:00<?, ?it/s]


Fold 4/5
0:	learn: 0.5130830	test: 0.5092303	best: 0.5092303 (0)	total: 35.3ms	remaining: 1m 46s
300:	learn: 0.1692880	test: 0.1783023	best: 0.1783023 (300)	total: 10.1s	remaining: 1m 30s
600:	learn: 0.1572945	test: 0.1695896	best: 0.1695896 (600)	total: 20.5s	remaining: 1m 22s
900:	learn: 0.1519351	test: 0.1669328	best: 0.1669282 (899)	total: 31.6s	remaining: 1m 13s
1200:	learn: 0.1484617	test: 0.1654208	best: 0.1654176 (1197)	total: 42.7s	remaining: 1m 3s
1500:	learn: 0.1460496	test: 0.1645492	best: 0.1645432 (1497)	total: 52.4s	remaining: 52.3s
1800:	learn: 0.1440274	test: 0.1640144	best: 0.1640101 (1798)	total: 1m 3s	remaining: 42.2s
2100:	learn: 0.1425950	test: 0.1635621	best: 0.1635615 (2096)	total: 1m 13s	remaining: 31.6s
2400:	learn: 0.1416697	test: 0.1633866	best: 0.1633789 (2398)	total: 1m 23s	remaining: 20.8s
2700:	learn: 0.1406954	test: 0.1632229	best: 0.1632143 (2688)	total: 1m 34s	remaining: 10.4s
2999:	learn: 0.1399914	test: 0.1630895	best: 0.1630895 (2999)	total: 1m 44

Image NN:   0%|          | 0/30 [00:00<?, ?it/s]


Fold 5/5
0:	learn: 0.5143827	test: 0.5042450	best: 0.5042450 (0)	total: 35.2ms	remaining: 1m 45s
300:	learn: 0.1695314	test: 0.1794867	best: 0.1794865 (299)	total: 10.3s	remaining: 1m 32s
600:	learn: 0.1587196	test: 0.1724691	best: 0.1724675 (599)	total: 18.9s	remaining: 1m 15s
900:	learn: 0.1531640	test: 0.1694614	best: 0.1694552 (898)	total: 28.8s	remaining: 1m 7s
1200:	learn: 0.1487583	test: 0.1674280	best: 0.1674280 (1200)	total: 39s	remaining: 58.4s
1500:	learn: 0.1456265	test: 0.1663365	best: 0.1663365 (1500)	total: 49.8s	remaining: 49.7s
1800:	learn: 0.1430271	test: 0.1656308	best: 0.1656308 (1800)	total: 1m	remaining: 40s
2100:	learn: 0.1411434	test: 0.1650486	best: 0.1650486 (2100)	total: 1m 9s	remaining: 29.6s
2400:	learn: 0.1394986	test: 0.1645603	best: 0.1645603 (2400)	total: 1m 19s	remaining: 19.9s
2700:	learn: 0.1381879	test: 0.1643235	best: 0.1643120 (2697)	total: 1m 30s	remaining: 9.97s
2999:	learn: 0.1370028	test: 0.1639905	best: 0.1639897 (2997)	total: 1m 40s	remaini

Image NN:   0%|          | 0/30 [00:00<?, ?it/s]

In [26]:
meta_X = np.vstack([oof_tab, oof_img]).T
meta = Ridge(alpha=1.0)
meta.fit(meta_X, y)

meta_preds = meta.predict(meta_X)

In [48]:
print("STACKED LOG R²:", r2_score(y, meta_preds))

price_true = np.expm1(y)
price_pred = np.expm1(meta_preds)

print("STACKED PRICE R²:", r2_score(price_true, price_pred))
print("STACKED PRICE RMSE:", np.sqrt(mean_squared_error(price_true, price_pred)))


STACKED LOG R²: 0.9036610918850141
STACKED PRICE R²: 0.8954970178700414
STACKED PRICE RMSE: 116471.45984162705


In [28]:
meta.coef_

array([0.99988947, 0.01005528])

# Conclusion: Use tabular only data as it is already pretty rich in nature

Final Model Training and Prediction on 'test.csv'

In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [30]:
# Load training data
df = pd.read_csv("train.csv")

# Feature engineering
df["renovated_post_2000"] = np.where(
    df["yr_renovated"] > 0,
    "yes",
    "no"
)
df.drop(columns=["yr_renovated"], inplace=True)

CAT_COLS = [
    "view",
    "condition",
    "waterfront",
    "floors",
    "renovated_post_2000",
]

for c in CAT_COLS:
    df[c] = df[c].astype(str)

TARGET = "price"
DROP_COLS = ["id", "date"]

X = df.drop(columns=[TARGET] + DROP_COLS)
y = np.log1p(df[TARGET])


In [33]:
# Train / validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [40]:
# Model
cat_model = CatBoostRegressor(
    iterations=1500,
    depth=8,
    learning_rate=0.05,
    loss_function="RMSE",
    random_seed=42,
    verbose=300
)

cat_model.fit(
    X_train,
    y_train,
    cat_features=CAT_COLS,
    eval_set=(X_val, y_val),
    use_best_model=True
)


0:	learn: 0.5049103	test: 0.5067194	best: 0.5067194 (0)	total: 25.6ms	remaining: 38.4s
300:	learn: 0.1477156	test: 0.1649844	best: 0.1649844 (300)	total: 7.41s	remaining: 29.5s
600:	learn: 0.1296163	test: 0.1616623	best: 0.1616502 (579)	total: 13.6s	remaining: 20.3s
900:	learn: 0.1183797	test: 0.1609566	best: 0.1609315 (878)	total: 21.2s	remaining: 14.1s
1200:	learn: 0.1091065	test: 0.1607081	best: 0.1606962 (1178)	total: 27.6s	remaining: 6.87s
1499:	learn: 0.1015701	test: 0.1610842	best: 0.1606763 (1215)	total: 35s	remaining: 0us

bestTest = 0.1606762985
bestIteration = 1215

Shrink model to first 1216 iterations.


<catboost.core.CatBoostRegressor at 0x7925c058c680>

In [43]:
# Validation metrics
val_preds = cat_model.predict(X_val)
print("VAL LOG RMSE:", np.sqrt(mean_squared_error(y_val, val_preds)))
print("VAL LOG R²:", r2_score(y_val, val_preds))

# Price-space (optional)
price_preds = np.expm1(val_preds)
price_true = np.expm1(y_val)

print("VAL PRICE RMSE:", np.sqrt(mean_squared_error(price_true, price_preds)))
print("VAL PRICE R²:", r2_score(price_true, price_preds))


VAL LOG RMSE: 0.16067629368861616
VAL LOG R²: 0.9064449138306332
VAL PRICE RMSE: 110263.66030382177
VAL PRICE R²: 0.9031141172536263


In [36]:
# Load test data
df_test = pd.read_csv("test.csv")

# Apply SAME feature engineering
df_test["renovated_post_2000"] = np.where(
    df_test["yr_renovated"] > 0,
    "yes",
    "no"
)
df_test.drop(columns=["yr_renovated"], inplace=True)

for c in CAT_COLS:
    df_test[c] = df_test[c].astype(str)

X_test_final = df_test.drop(columns=DROP_COLS)


In [44]:
cat_model_full = CatBoostRegressor(
    iterations=1400,
    depth=8,
    learning_rate=0.05,
    loss_function="RMSE",
    random_seed=42,
    verbose=300
)

cat_model_full.fit(
    X,
    y,
    cat_features=CAT_COLS
)


0:	learn: 0.5061924	total: 55.7ms	remaining: 1m 17s
300:	learn: 0.1484618	total: 7.58s	remaining: 27.7s
600:	learn: 0.1323976	total: 15.7s	remaining: 20.9s
900:	learn: 0.1211266	total: 22.8s	remaining: 12.6s
1200:	learn: 0.1121763	total: 31.1s	remaining: 5.16s
1399:	learn: 0.1069909	total: 36.6s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x792464a5c650>

In [45]:
log_test_preds = cat_model_full.predict(X_test_final)
test_price_preds = np.expm1(log_test_preds)

In [47]:
submission = pd.DataFrame({
    "id": df_test["id"],
    "predicted_price": test_price_preds
})

submission.to_csv("final_prediction.csv", index=False)

print("Saved submission.csv")


Saved submission.csv
