In [None]:
import numpy as np
import pandas as pd

from sklearn.compose import TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import xgboost as xgb

In [None]:
# Read both CSV files into DataFrames: the labelled data first, then the
# file that is scored at the end of the notebook.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/data_with_embeddings.csv",
        "/content/validation_data_with_embeddings.csv",
    )
)

In [None]:
# Quick sanity check: dataset dimensions, then per-column missing-value counts.
print(train_df.shape, test_df.shape)
train_df.isna().sum()

(16209, 533) (5404, 532)


Unnamed: 0,0
id,0
date,0
price,0
bedrooms,0
bathrooms,0
...,...
img_emb_507,0
img_emb_508,0
img_emb_509,0
img_emb_510,0


In [None]:
# Model the price on a log(1 + x) scale; `target` names the column holding it.
target = "log_price"
train_df[target] = np.log1p(train_df["price"])

In [None]:
# Non-image columns fed to the model alongside the image embeddings.
tabular_cols = [
    "bedrooms",
    "bathrooms",
    "sqft_living",
    "sqft_lot",
    "floors",
    "waterfront",
    "view",
    "condition",
    "grade",
    "sqft_above",
    "sqft_basement",
    "yr_built",
    "yr_renovated",
    "zipcode",
    "lat",
    "long",
    "sqft_living15",
    "sqft_lot15",
]

In [None]:
# Every column whose name carries the image-embedding prefix, in frame order.
embedding_cols = list(
    filter(lambda col: col.startswith("img_emb_"), train_df.columns)
)
print(len(embedding_cols))  # should be 512

512


In [None]:
# Target and the two feature blocks (tabular, image embeddings) of the labelled data.
y = train_df[target]
X_tab, X_img = train_df[tabular_cols], train_df[embedding_cols]

# Matching feature blocks for the file to be scored; no target is read from it.
X_tab_test, X_img_test = test_df[tabular_cols], test_df[embedding_cols]

# Hold out 20% of the rows. Splitting all three objects in a single call keeps
# their rows aligned with each other.
(
    X_tab_tr, X_tab_val,
    X_img_tr, X_img_val,
    y_tr, y_val,
) = train_test_split(X_tab, X_img, y, test_size=0.2, random_state=42)

In [None]:
# Compress the image embeddings to 60 principal components. The projection is
# learned on the training split only and then applied unchanged to the
# validation split and to the scoring data.
pca = PCA(n_components=60, random_state=42)
X_img_tr_pca = pca.fit_transform(X_img_tr)
X_img_val_pca, X_img_test_pca = (
    pca.transform(block) for block in (X_img_val, X_img_test)
)

print(X_img_tr_pca.shape)

(12967, 60)


In [None]:
# Join the tabular features and the PCA-reduced embeddings column-wise,
# tabular columns first, for each of the three data sets.
X_train = np.concatenate((X_tab_tr.to_numpy(), X_img_tr_pca), axis=1)
X_val = np.concatenate((X_tab_val.to_numpy(), X_img_val_pca), axis=1)
X_test = np.concatenate((X_tab_test.to_numpy(), X_img_test_pca), axis=1)

In [None]:
# Gradient-boosted regressor on the combined features. The hold-out split is
# passed as eval_set and its RMSE is logged every 50 boosting rounds.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 6,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "reg:squarederror",
    "random_state": 42,
    "n_jobs": -1,
}
model = xgb.XGBRegressor(**xgb_params)

model.fit(X_train, y_tr, eval_set=[(X_val, y_val)], verbose=50)

[0]	validation_0-rmse:0.50516
[50]	validation_0-rmse:0.19152
[100]	validation_0-rmse:0.17291
[150]	validation_0-rmse:0.17008
[200]	validation_0-rmse:0.16956
[250]	validation_0-rmse:0.16934
[300]	validation_0-rmse:0.16893
[350]	validation_0-rmse:0.16880
[400]	validation_0-rmse:0.16888
[450]	validation_0-rmse:0.16922
[499]	validation_0-rmse:0.16948


In [None]:
# Rebuild the feature matrix from every labelled row (no hold-out).
X_tab_full = train_df[tabular_cols]
X_img_full = train_df[embedding_cols]

# NOTE: fit_transform re-fits the shared `pca` object on the full data, so from
# this cell onward `pca` is no longer the projection that produced
# X_img_tr_pca / X_img_val_pca. Every later pca.transform call uses this
# re-fitted version.
X_img_full_pca = pca.fit_transform(X_img_full)
X_full = np.hstack([X_tab_full.values, X_img_full_pca])

# Same hyperparameters as the validation model `model`.
final_model = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

# `y` is the log_price column selected earlier. `final_model` is assigned
# again in a later cell, which replaces the model fitted here.
final_model.fit(X_full, y)

In [None]:
# Score the hold-out predictions. `model` was fitted on log_price, so all
# three metrics are on the log scale.
val_preds = model.predict(X_val)

mse = mean_squared_error(y_val, val_preds)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, val_preds)
r2 = r2_score(y_val, val_preds)

print(f"RMSE (log): {rmse:.4f}")
print(f"MAE  (log): {mae:.4f}")
print(f"R²        : {r2:.4f}")

RMSE (log): 0.1695
MAE  (log): 0.1227
R²        : 0.8959


In [None]:
# Name every model input: tabular columns first, then one name per PCA
# component, which is the column order used when X_train was assembled.
tabular_feature_names = tabular_cols
pca_feature_names = [f"pca_{i}" for i in range(pca.n_components)]
all_feature_names = tabular_feature_names + pca_feature_names

# Pair each name with the importance reported by the validation model.
importance_data = model.feature_importances_
importance_df = pd.DataFrame(
    {'feature': all_feature_names, 'importance': importance_data}
)

# Sum the importances within each feature group.
is_tabular = importance_df['feature'].isin(tabular_feature_names)
is_image = importance_df['feature'].isin(pca_feature_names)
tabular_importance = importance_df.loc[is_tabular, 'importance'].sum()
image_importance = importance_df.loc[is_image, 'importance'].sum()

total_importance = tabular_importance + image_importance
print("Total Tabular Importance :", tabular_importance)
print("Total Image Importance   :", image_importance)
print("Image Contribution (%)   :",
      100 * image_importance / total_importance)

Total Tabular Importance : 0.85462534
Total Image Importance   : 0.14537473
Image Contribution (%)   : 14.537471


In [None]:
# Confirm that log_price inverts back to the original price. The raw `price`
# column is left untouched: overwriting it with expm1(log1p(price)) would only
# replace the source values with a floating-point round-trip of themselves.
np.testing.assert_allclose(np.expm1(train_df["log_price"]), train_df["price"])

In [None]:
# Show the log-scale and original-scale price side by side for the first rows.
train_df.loc[:, ["log_price", "price"]].head()

Unnamed: 0,log_price,price
0,12.501142,268643.0
1,12.409018,245000.0
2,12.206078,200000.0
3,12.772806,352499.0
4,12.354497,232000.0


In [None]:
# Inputs for the final fit: every labelled row, with the price in its original
# (non-log) units as the target.
y_full = train_df["price"]
X_tab_full, X_img_full = train_df[tabular_cols], train_df[embedding_cols]

# Project the embeddings with the existing `pca` (transform only, no re-fit),
# then join them to the tabular columns, tabular first.
X_img_full_pca = pca.transform(X_img_full)
X_full = np.concatenate((X_tab_full.to_numpy(), X_img_full_pca), axis=1)

In [None]:
# Final model, fitted on all labelled rows.
#
# The hold-out evaluation earlier in the notebook was run on a model trained on
# log1p(price), so the final model learns on that same scale; otherwise the
# reported validation metrics would not describe the model that is submitted.
# TransformedTargetRegressor applies log1p to y before fitting and expm1 to
# every prediction, so `final_model.fit` still takes prices and
# `final_model.predict` still returns prices in their original units.
final_model = TransformedTargetRegressor(
    regressor=xgb.XGBRegressor(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=42,
        n_jobs=-1,
    ),
    func=np.log1p,
    inverse_func=np.expm1,
)

final_model.fit(X_full, y_full)

In [None]:
# Assemble the scoring matrix the same way as the training matrix: tabular
# columns, then the embeddings projected through `pca` (transform only).
X_tab_test, X_img_test = test_df[tabular_cols], test_df[embedding_cols]
X_img_test_pca = pca.transform(X_img_test)
X_test_final = np.concatenate((X_tab_test.to_numpy(), X_img_test_pca), axis=1)

In [None]:
# Predict on the scoring matrix with the final model.
price_preds = final_model.predict(X_test_final)

In [None]:
# Floor the predictions at zero; there is no upper bound.
price_preds = np.maximum(price_preds, 0)

In [None]:
# One row per scored record: its id plus the predicted price, written to CSV
# without the DataFrame index.
submission = test_df[["id"]].assign(predicted_price=price_preds)

submission.to_csv("final_predictions.csv", index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,id,predicted_price
0,2591820310,388363.3
1,7974200820,853775.9
2,7701450110,1188968.0
3,9522300010,2056309.0
4,9510861140,735494.8


In [None]:
# Summary statistics of the submission, as a range check on the predictions.
submission.describe()

Unnamed: 0,id,predicted_price
count,5404.0,5404.0
mean,4593891000.0,546917.9
std,2882493000.0,367445.7
min,1000102.0,127936.7
25%,2123814000.0,326131.6
50%,3904902000.0,458414.2
75%,7338000000.0,646187.7
max,9842300000.0,4943568.0
