In [1]:
import numpy as np
from sklearn.decomposition import PCA


In [4]:

# Load the fused embedding matrix from disk and print its shape as a sanity check.
fused = np.load('fused_x75ktrain.npy')
print("Original shape:", fused.shape)

Original shape: (75000, 512)


In [5]:
# Apply PCA for dimensionality reduction: project the fused embeddings onto
# 256 principal components (random_state fixed for reproducibility).
pca = PCA(n_components=256, random_state=42)
fused_reduced = pca.fit_transform(fused)

print("Reduced shape:", fused_reduced.shape)

# Save reduced embeddings. The path is held in one variable so the
# confirmation message always names the file that was actually written.
reduced_path = 'fused_x_reduced_image.npy'
np.save(reduced_path, fused_reduced)
print(f"Saved reduced embeddings as '{reduced_path}'")

Reduced shape: (75000, 256)
Saved reduced embeddings as 'fused_x_reduced.npy'


In [6]:

import os
from pathlib import Path
from typing import List


import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

# XGBoost import
from xgboost import XGBRegressor


In [7]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error (SMAPE), in percent.

    Each pair contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    Pairs whose denominator is zero (both values are exactly 0) are left out
    of the average.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values. Both are converted to float arrays so
        that integer (in particular unsigned) inputs cannot wrap around when
        subtracted.

    Returns
    -------
    float
        Mean of the per-pair ratios times 100, or ``0.0`` when no pair has a
        non-zero denominator (there is nothing to average).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # avoid division by zero
    mask = denominator != 0
    if not mask.any():
        # Averaging an empty selection would return nan with a RuntimeWarning.
        return 0.0
    abs_error = np.abs(y_true - y_pred)
    return float(np.mean(abs_error[mask] / denominator[mask]) * 100)

In [8]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np
import os

def train_and_evaluate(X: np.ndarray, y: np.ndarray, out_dir: str = 'outputs', seed: int = 42):
    """Train an XGBoost regressor on a hold-out split and report test metrics.

    The data is split 80/20 (``random_state=seed``), features are standardized
    with a ``StandardScaler`` fitted on the training part only, and an
    ``XGBRegressor`` is fitted while logging RMSE on both the training and the
    test part. No early stopping is configured, so the test part passed in
    ``eval_set`` is used for progress logging only.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one row per sample.
    y : np.ndarray
        Target values, aligned with the rows of ``X``.
    out_dir : str
        Directory (created if missing) that receives ``xgb_regressor.joblib``,
        ``scaler.joblib``, ``y_test.npy`` and ``y_pred.npy``.
    seed : int
        Seed used for both the train/test split and the model.

    Returns
    -------
    dict
        Test-set metrics under the keys ``'mae'``, ``'rmse'``, ``'r2'`` and
        ``'smape'``.

    Notes
    -----
    Relies on the notebook-level ``smape`` function being defined.
    """
    os.makedirs(out_dir, exist_ok=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # Fit the scaler on the training rows only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # XGBoost regressor with training progress
    model = XGBRegressor(
        n_estimators=2000,
        max_depth=6,
        learning_rate=0.04,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=3,
        gamma=0.1,
        reg_alpha=0.01,
        reg_lambda=1,
        random_state=seed,
        n_jobs=-1,
        verbosity=1,
        eval_metric="rmse"
    )

    # validation_0 = training part, validation_1 = held-out part in the log.
    model.fit(
        X_train_scaled, y_train,
        eval_set=[(X_train_scaled, y_train), (X_test_scaled, y_test)],
        verbose=10
    )

    y_pred = model.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    smape_val = smape(y_test, y_pred)  # smape() is defined in an earlier notebook cell

    print("\nEvaluation on test set:")
    print(f"MAE: {mae:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"R2: {r2:.6f}")
    print(f"SMAPE: {smape_val:.2f}%")

    # save model + scaler
    joblib.dump(model, os.path.join(out_dir, 'xgb_regressor.joblib'))
    joblib.dump(scaler, os.path.join(out_dir, 'scaler.joblib'))

    # save predictions and true values
    np.save(os.path.join(out_dir, 'y_test.npy'), y_test)
    np.save(os.path.join(out_dir, 'y_pred.npy'), y_pred)

    print(f"Saved model and scaler to {out_dir}")
    # Every printed metric is also returned, including SMAPE.
    return {'mae': mae, 'rmse': rmse, 'r2': r2, 'smape': smape_val}


In [9]:
# Run settings shared by the training call: the random seed and the directory
# that receives the saved model, scaler and prediction arrays.
seed: int = 42
out_dir: str = 'outputs'

In [10]:
# Path of the catalog CSV that provides the regression target column.
csv_file: str = "catalog_75ktrain.csv"

In [11]:

# Read the catalog CSV; on_bad_lines='skip' drops malformed rows instead of raising.
# NOTE(review): any skipped row would change the row count relative to the
# saved embeddings — confirm that no rows are actually dropped.
df = pd.read_csv(csv_file, on_bad_lines='skip')


In [12]:
 # Extract the regression target y: the 'price_log' column as a float array (nothing is written to disk here).
y = df['price_log'].values.astype(float)


In [13]:
# Display the shape of the target vector as a quick row-count check.
y.shape

(75000,)

In [14]:
# Load the feature matrix from 'fused_x_reduced_image.npy', the same path the
# PCA cell passes to np.save for the reduced embeddings.
fuse = np.load('fused_x_reduced_image.npy')

In [15]:
# Train and evaluate on the reduced embeddings and targets; the returned
# metrics dict is kept in `metrics`.
metrics = train_and_evaluate(fuse, y, out_dir=out_dir, seed=seed)

[0]	validation_0-rmse:0.93468	validation_1-rmse:0.94764
[10]	validation_0-rmse:0.89533	validation_1-rmse:0.91259
[20]	validation_0-rmse:0.86931	validation_1-rmse:0.89079
[30]	validation_0-rmse:0.84964	validation_1-rmse:0.87547
[40]	validation_0-rmse:0.83389	validation_1-rmse:0.86359
[50]	validation_0-rmse:0.82082	validation_1-rmse:0.85450
[60]	validation_0-rmse:0.80980	validation_1-rmse:0.84760
[70]	validation_0-rmse:0.79975	validation_1-rmse:0.84197
[80]	validation_0-rmse:0.79062	validation_1-rmse:0.83701
[90]	validation_0-rmse:0.78199	validation_1-rmse:0.83259
[100]	validation_0-rmse:0.77390	validation_1-rmse:0.82860
[110]	validation_0-rmse:0.76620	validation_1-rmse:0.82477
[120]	validation_0-rmse:0.75885	validation_1-rmse:0.82152
[130]	validation_0-rmse:0.75179	validation_1-rmse:0.81846
[140]	validation_0-rmse:0.74523	validation_1-rmse:0.81568
[150]	validation_0-rmse:0.73888	validation_1-rmse:0.81295
[160]	validation_0-rmse:0.73251	validation_1-rmse:0.81057
[170]	validation_0-rmse:0