In [17]:
!pip install xgboost --quiet

In [18]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [19]:
df = pd.read_excel("../data/train.xlsx")

In [20]:
df_img = pd.read_parquet("../data/img_feature/image_features.parquet")
df["id"] = df["id"].astype(str)
df_img["id"] = df_img["id"].astype(str)



df_merged = df.merge(df_img, on="id", how="inner")

In [21]:
y = np.log1p(df_merged["price"])

In [22]:
tab_cols = [
'bedrooms','bathrooms','sqft_living','sqft_lot','floors',
'condition','grade','view','waterfront',
'sqft_above','sqft_basement','yr_built','yr_renovated',
'lat','long'
]

X_tab = df_merged[tab_cols]

In [23]:
cnn_cols = [c for c in df_merged.columns
if c not in tab_cols + ["id","date","price"]]

X_cnn = df_merged[cnn_cols]

print("Tabular:", X_tab.shape)
print("CNN :", X_cnn.shape)

Tabular: (16209, 15)
CNN : (16209, 2051)


In [24]:
X_tab_train, X_tab_val, X_cnn_train, X_cnn_val, y_train, y_val = train_test_split(
X_tab, X_cnn, y, test_size=0.2, random_state=42
)

In [25]:
scaler = StandardScaler()
X_cnn_train = scaler.fit_transform(X_cnn_train)
X_cnn_val = scaler.transform(X_cnn_val)

In [26]:
pca = PCA(n_components=100, random_state=42)
X_cnn_train_pca = pca.fit_transform(X_cnn_train)
X_cnn_val_pca = pca.transform(X_cnn_val)

print("CNN after PCA:", X_cnn_train_pca.shape)

CNN after PCA: (12967, 100)


In [27]:
X_train = np.hstack([X_tab_train.values, X_cnn_train_pca])
X_val = np.hstack([X_tab_val.values, X_cnn_val_pca])

print("Final train shape:", X_train.shape)

Final train shape: (12967, 115)


In [28]:
model = xgb.XGBRegressor(
n_estimators=900,
learning_rate=0.03,
max_depth=7,
min_child_weight=1,
subsample=0.8,
colsample_bytree=0.8,
objective="reg:squarederror",
random_state=42,
n_jobs=-1
)

In [29]:
model.fit(X_train, y_train)

In [30]:
y_pred = model.predict(X_val)

rmse = mean_squared_error(y_val, y_pred) ** 0.5
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print("Multimodal XGBoost")
print("RMSE:", round(rmse, 4))
print("MAE :", round(mae, 4))
print("R2 :", round(r2, 4))

Multimodal XGBoost
RMSE: 0.1647
MAE : 0.1162
R2 : 0.9014
