In [None]:
import os

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# PATHS
# All locations are relative to the notebook's working directory.

# Inputs
CSV_PATH = "Data/train(1)(train(1)).csv"
FEATURE_PATH = "features/resnet50_features.npy"

# Outputs
MULTI_FEATURE_PATH = "Data/multimodal_features.csv"
MODEL_DIR = "models"
MODEL_PATH = f"{MODEL_DIR}/xgb_multimodal.json"

# Create the model directory up front; exist_ok makes re-running this cell safe.
os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
# LOAD DATA

df = pd.read_csv(CSV_PATH)
image_features = np.load(FEATURE_PATH)

# Report the raw shapes before any reduction so both inputs can be inspected.
print("Tabular shape:", df.shape)
print("Image features shape:", image_features.shape)

# Fail fast, before the PCA fit: the tabular rows and the CNN feature rows are
# stacked side by side in a later cell, so they must line up one-to-one.
assert len(df) == image_features.shape[0], (
    f"Row mismatch: {len(df)} tabular rows vs "
    f"{image_features.shape[0]} image feature rows"
)

# Compress the CNN embeddings to 100 principal components.
# NOTE(review): the PCA is fit on every row, including the rows a later cell
# holds out for validation, so validation metrics may be slightly optimistic.
pca = PCA(n_components=100, random_state=42)
image_features = pca.fit_transform(image_features)

print("Reduced CNN features:", image_features.shape)

In [None]:
# TARGET

y = df["price"].values

# TABULAR FEATURES
# Remove the target plus the "date" and "id" columns; errors="ignore" lets the
# drop succeed even when one of them is absent from the frame.
non_feature_cols = ["price", "date", "id"]
X_tab = df.drop(columns=non_feature_cols, errors="ignore")

# CONCAT TABULAR + IMAGE
# Column-wise join: tabular columns first, then the reduced image features.
X = np.concatenate([X_tab.values, image_features], axis=1)
print("Final feature matrix:", X.shape)

In [None]:
# SAVE MULTIMODAL FEATURES

# Column labels: the tabular names as-is, then one generated name per image feature.
n_cnn_features = image_features.shape[1]
tabular_cols = X_tab.columns.tolist()
cnn_cols = [f"cnn_feat_{i}" for i in range(n_cnn_features)]

# Wrap the combined matrix in a labelled frame and append the target as the last column.
multimodal_df = pd.DataFrame(X, columns=[*tabular_cols, *cnn_cols]).assign(price=y)

multimodal_df.to_csv(MULTI_FEATURE_PATH, index=False)
print(f"\n Multimodal feature CSV saved at: {MULTI_FEATURE_PATH}")

In [None]:
# TRAIN / VALID SPLIT
# Split BEFORE scaling so the validation rows never influence the scaler's
# statistics. The same test_size and random_state are applied to the same
# number of rows, so the row assignment is unchanged.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)


# FEATURE SCALING
# Learn mean/std from the training rows only, then apply that same transform
# to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Kept for any other cell that still reads X_scaled: the full matrix
# transformed with the training-fitted scaler.
X_scaled = scaler.transform(X)

In [None]:
# XGBOOST MODEL

# Hyperparameters gathered in one mapping so they are easy to scan and tweak.
xgb_params = dict(
    n_estimators=600,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)
model = xgb.XGBRegressor(**xgb_params)

print("\n Training XGBoost...")
# Fit on the training split only; the validation split is scored in a later cell.
model.fit(X_train, y_train)

In [None]:
# EVALUATION

# Score the fitted model on the validation split.
y_pred = model.predict(X_val)

# RMSE is the square root of the mean squared error; R^2 comes from r2_score.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
r2 = r2_score(y_val, y_pred)

print("\n===== RESULTS =====")
print("RMSE:", rmse)
# The superscript two is written as an escape so the label cannot be
# mis-encoded again (it previously rendered as "RÂ²").
print("R\u00b2 Score:", r2)

In [None]:
# SAVE MODEL

# Persist the underlying Booster obtained from the fitted regressor to MODEL_PATH.
booster = model.get_booster()
booster.save_model(MODEL_PATH)
print(f"\n Model saved at: {MODEL_PATH}")