* Cleans catalog text and extracts key structured fields like value, unit, and brand.

* Encodes categorical features using one-hot and target encoding for both brand and unit columns.

* Generates numerical embeddings for product descriptions using a SentenceTransformer model (all-MiniLM-L6-v2) on GPU.

* Extracts visual embeddings for product images using a pretrained ResNet50 CNN from timm.

* Image embeddings are cached to .npy files to avoid recomputation in later runs.

* Combines all features — text embeddings, numeric values, categorical encodings, and image embeddings — into one feature matrix.

* Trains an XGBoost regressor (XGBRegressor) with GPU acceleration to predict log-transformed prices.

* Evaluates model performance using SMAPE (Symmetric Mean Absolute Percentage Error).

* Performs a train-validation split to check for overfitting and generalization.

* Saves the final trained model to Google Drive for reuse or deployment.

* The image-augmented model slightly improves SMAPE over the non-image baseline, from 10.40% to 10.23% (the 10.23% figure is measured on the full training set), while keeping generalization stable, showing that visual cues contribute additional useful signal for price prediction.
* Includes an overfitting check (train SMAPE 10.09% vs. validation SMAPE 12.37%); the model generalizes well.

In [None]:
!pip install -q sentence-transformers xgboost timm torchvision

In [None]:
# Mount Google Drive (Colab only) so that the dataset, cache and output paths
# under /content/drive used by the rest of the notebook resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re
import os
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer
import torch
import joblib
from sklearn.model_selection import train_test_split
from PIL import Image
import torchvision.transforms as T
import timm
from tqdm import tqdm

In [None]:
# Input locations on the mounted Drive.
train_path = '/content/drive/MyDrive/DataSets/train.csv'
test_path = '/content/drive/MyDrive/DataSets/test.csv'
image_folder = '/content/drive/MyDrive/DataSets/images/'
# Folder holding one cached .npy embedding per image; created up front so that
# saving into it later cannot fail on a missing directory.
cache_folder = '/content/drive/MyDrive/DataSets/image_embeddings/'
os.makedirs(cache_folder, exist_ok=True)

In [None]:
# Load both splits with the Python parsing engine.
# NOTE(review): on_bad_lines='skip' silently drops malformed rows. For the test
# split a dropped row means a missing prediction, so compare the printed shapes
# against the expected row counts.
train = pd.read_csv(train_path, sep=',', quotechar='"', engine='python', on_bad_lines='skip')
test = pd.read_csv(test_path, sep=',', quotechar='"', engine='python', on_bad_lines='skip')

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (75000, 4)
Test shape: (75000, 3)


In [None]:
def clean_text(text):
    """Lowercase *text* and squeeze every whitespace run into a single space.

    Missing values (anything ``pd.isna`` flags) become the empty string, and
    leading/trailing whitespace is removed from the result.
    """
    if pd.isna(text):
        return ""
    lowered = text.lower()
    single_line = lowered.replace('\n', ' ')
    squeezed = re.sub(r'\s+', ' ', single_line)
    return squeezed.strip()

# Add the normalised description column to both splits.
for frame in (train, test):
    frame['clean_text'] = frame['catalog_content'].apply(clean_text)

In [None]:
def extract_value_unit(text):
    """Pull the numeric "Value" and the alphabetic "Unit" out of a catalog entry.

    Parameters
    ----------
    text : str or missing
        Raw catalog text. Anything that is not a string (``None``, ``NaN``)
        yields ``(None, None)`` instead of raising inside ``re.search``.

    Returns
    -------
    tuple
        ``(value, unit)``. ``value`` is a float or ``None``; ``unit`` is the
        lower-cased unit word or ``None``. Both searches are case-insensitive
        and each uses the first occurrence found in the text.
    """
    if not isinstance(text, str):
        return None, None

    value, unit = None, None
    value_match = re.search(r'Value[: ]+\s*([\d\.]+)', text, re.IGNORECASE)
    if value_match:
        try:
            value = float(value_match.group(1))
        except ValueError:
            # The pattern also accepts runs such as "1.2.3" or "." that are not numbers.
            value = None
    unit_match = re.search(r'Unit[: ]+\s*([a-zA-Z]+)', text, re.IGNORECASE)
    if unit_match:
        unit = unit_match.group(1).lower()
    return value, unit

# Store the parsed (value, unit) pair as two new columns on each split.
for frame in (train, test):
    parsed = frame['catalog_content'].apply(lambda x: pd.Series(extract_value_unit(x)))
    frame[['value_num', 'unit']] = parsed

In [None]:
def extract_brand(text):
    """Return the text before the first comma, trimmed and lower-cased.

    Missing input, or text with nothing ahead of the first comma, gives ``None``.
    """
    if pd.isna(text):
        return None
    head, _, _ = text.partition(',')
    if not head:
        return None
    return head.strip().lower()

# Derive the brand column for both splits from the raw catalog text.
for frame in (train, test):
    frame['brand'] = frame['catalog_content'].apply(extract_brand)

In [None]:
TOP_K_BRANDS = 50
BRAND_TE_FOLDS = 5  # number of folds for the out-of-fold target encoding below

# --- One-hot encoding -------------------------------------------------------
# Keep the TOP_K_BRANDS most frequent training brands; every other brand
# (including a missing one) is pooled into 'other'.
top_brands = train['brand'].value_counts().nlargest(TOP_K_BRANDS).index.tolist()
top_brand_set = set(top_brands)  # constant-time membership test for the row-wise apply
train_top_brands = train['brand'].apply(lambda x: x if x in top_brand_set else 'other')
test_top_brands = test['brand'].apply(lambda x: x if x in top_brand_set else 'other')

# dummy_na=True is kept so the column layout stays exactly as before.
train_brands_onehot = pd.get_dummies(train_top_brands, prefix='brand', dummy_na=True)
test_brands_onehot = pd.get_dummies(test_top_brands, prefix='brand', dummy_na=True)
# Align the test columns to the training layout.
test_brands_onehot = test_brands_onehot.reindex(columns=train_brands_onehot.columns, fill_value=0)

# --- Target encoding --------------------------------------------------------
# For the training rows the brand mean price is computed out-of-fold: a row's own
# price must not feed its own feature. With the in-sample mean, a brand that occurs
# once would carry the exact target value into the model.
brand_fold_ids = np.random.default_rng(42).integers(0, BRAND_TE_FOLDS, size=len(train))
brand_target_oof = np.full(len(train), np.nan)
for fold in range(BRAND_TE_FOLDS):
    held_out = brand_fold_ids == fold
    fold_means = train.loc[~held_out].groupby('brand')['price'].mean().to_dict()
    brand_target_oof[held_out] = train.loc[held_out, 'brand'].map(fold_means).to_numpy(dtype=float)
# Brands unseen in the other folds (and missing brands) fall back to the overall mean price.
train['brand_target'] = pd.Series(brand_target_oof, index=train.index).fillna(train['price'].mean())

# Test rows are encoded with the brand means of the full training set.
brand_target_mean = train.groupby('brand')['price'].mean().to_dict()
test['brand_target'] = test['brand'].map(brand_target_mean).fillna(train['price'].mean())


In [None]:
TOP_K_UNITS = 20
UNIT_TE_FOLDS = 5  # number of folds for the out-of-fold target encoding below

# --- One-hot encoding -------------------------------------------------------
# Keep the TOP_K_UNITS most frequent training units; every other unit
# (including a missing one) is pooled into 'other'.
top_units = train['unit'].value_counts().nlargest(TOP_K_UNITS).index.tolist()
top_unit_set = set(top_units)  # constant-time membership test for the row-wise apply
train_top_units = train['unit'].apply(lambda x: x if x in top_unit_set else 'other')
test_top_units = test['unit'].apply(lambda x: x if x in top_unit_set else 'other')

# dummy_na=True is kept so the column layout stays exactly as before.
train_units_onehot = pd.get_dummies(train_top_units, prefix='unit', dummy_na=True)
test_units_onehot = pd.get_dummies(test_top_units, prefix='unit', dummy_na=True)
# Align the test columns to the training layout.
test_units_onehot = test_units_onehot.reindex(columns=train_units_onehot.columns, fill_value=0)

# --- Target encoding --------------------------------------------------------
# For the training rows the unit mean price is computed out-of-fold so that a
# row's own price never contributes to its own feature.
unit_fold_ids = np.random.default_rng(42).integers(0, UNIT_TE_FOLDS, size=len(train))
unit_target_oof = np.full(len(train), np.nan)
for fold in range(UNIT_TE_FOLDS):
    held_out = unit_fold_ids == fold
    fold_means = train.loc[~held_out].groupby('unit')['price'].mean().to_dict()
    unit_target_oof[held_out] = train.loc[held_out, 'unit'].map(fold_means).to_numpy(dtype=float)
# Units unseen in the other folds (and missing units) fall back to the overall mean price.
train['unit_target'] = pd.Series(unit_target_oof, index=train.index).fillna(train['price'].mean())

# Test rows are encoded with the unit means of the full training set.
unit_target_mean = train.groupby('unit')['price'].mean().to_dict()
test['unit_target'] = test['unit'].map(unit_target_mean).fillna(train['price'].mean())


In [None]:
# Categorical feature matrix per split, columns in this order:
# brand one-hot | brand target mean | unit one-hot | unit target mean.
train_cat_parts = [
    train_brands_onehot.to_numpy(),
    train['brand_target'].to_numpy()[:, None],
    train_units_onehot.to_numpy(),
    train['unit_target'].to_numpy()[:, None],
]
X_train_cat = np.concatenate(train_cat_parts, axis=1).astype(np.float32)

test_cat_parts = [
    test_brands_onehot.to_numpy(),
    test['brand_target'].to_numpy()[:, None],
    test_units_onehot.to_numpy(),
    test['unit_target'].to_numpy()[:, None],
]
X_test_cat = np.concatenate(test_cat_parts, axis=1).astype(np.float32)

In [None]:
# Single numeric feature: the parsed value, with missing entries replaced by 0.
train_value_filled = train[['value_num']].fillna(0)
test_value_filled = test[['value_num']].fillna(0)
X_train_num = train_value_filled.to_numpy(dtype=np.float32)
X_test_num = test_value_filled.to_numpy(dtype=np.float32)

In [None]:
# Sentence embeddings for the cleaned catalog text (GPU when one is available).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)
embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Same encoding options for both splits.
encode_options = dict(batch_size=64, show_progress_bar=True, device=device)
train_sentences = train['clean_text'].tolist()
test_sentences = test['clean_text'].tolist()
X_train_text = embedder.encode(train_sentences, **encode_options)
X_test_text = embedder.encode(test_sentences, **encode_options)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

In [None]:
# Tabular design matrix per split: text embedding | numeric value | categorical block.
X_train = np.concatenate((X_train_text, X_train_num, X_train_cat), axis=1)
X_test = np.concatenate((X_test_text, X_test_num, X_test_cat), axis=1)

In [None]:
# The model is trained on log1p(price); predictions are mapped back with expm1.
train_prices = train['price'].to_numpy()
y_train = np.log1p(train_prices)

In [None]:
# Pretrained CNN (ResNet50) used as a fixed feature extractor for image embeddings.
# eval() returns the module itself, so it is chained into the assignment; this keeps
# the model in inference mode without echoing the whole module repr as cell output.
img_model = (
    timm.create_model('resnet50', pretrained=True, num_classes=0, global_pool='avg')
    .to(device)
    .eval()
)

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (act1): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (act1): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (drop_block): Identity()
      (act2): ReLU(inplace=True)
      (aa): Identity()
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
     

In [None]:
# Image preprocessing: fixed 224x224 resize, tensor conversion, then
# per-channel normalisation with the constants below.
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

preprocess_steps = [
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=channel_mean, std=channel_std),
]
transform = T.Compose(preprocess_steps)

In [None]:
def batch_image_embeddings(image_names, batch_size=64):
    """Embed images with ``img_model``, reusing per-image ``.npy`` caches.

    Parameters
    ----------
    image_names : list of str
        Image identifiers. Each one is joined onto ``image_folder`` to locate the
        file; its base name without extension is the cache key in ``cache_folder``.
    batch_size : int, default 64
        Number of names handled per model forward pass.

    Returns
    -------
    numpy.ndarray
        One row per entry of ``image_names``, in input order. An image that cannot
        be opened or transformed gets an all-zero row, which is not cached. Empty
        input returns an array of shape ``(0, img_model.num_features)``.

    Notes
    -----
    Relies on the module-level ``img_model``, ``transform``, ``device``,
    ``image_folder`` and ``cache_folder``.
    NOTE(review): the cache key uses only the base name while the file lookup uses
    the full ``img_name``; if ``image_link`` holds URLs the lookup will probably
    miss every file. Check the failure count printed at the end.
    """
    emb_dim = img_model.num_features
    all_embeddings = []
    n_failed = 0

    for start in tqdm(range(0, len(image_names), batch_size), desc="Embedding images"):
        batch_files = image_names[start:start + batch_size]
        imgs = []
        # (row slot in all_embeddings, cache path) for every image queued in `imgs`.
        # Keeping the cache path with the slot guarantees each embedding is saved
        # under its own file name, even when the batch also holds cached or failed images.
        pending = []

        for img_name in batch_files:
            safe_name = os.path.splitext(os.path.basename(img_name))[0]
            cache_path = os.path.join(cache_folder, f"{safe_name}.npy")

            if os.path.exists(cache_path):
                all_embeddings.append(np.load(cache_path))
                continue

            img_path = os.path.join(image_folder, img_name)
            try:
                # The context manager closes the file handle once the tensor is built.
                with Image.open(img_path) as raw_img:
                    imgs.append(transform(raw_img.convert("RGB")))
            except Exception:
                # Best-effort fallback for a missing or unreadable image; float32 so
                # that stacking does not upcast the real embeddings.
                n_failed += 1
                all_embeddings.append(np.zeros(emb_dim, dtype=np.float32))
                continue

            pending.append((len(all_embeddings), cache_path))
            all_embeddings.append(None)  # placeholder, filled after the forward pass

        # Compute and cache embeddings for the images that were not cached yet.
        if imgs:
            batch_tensor = torch.stack(imgs).to(device)
            with torch.no_grad():
                batch_emb = img_model(batch_tensor).cpu().numpy()

            for emb, (slot, cache_path) in zip(batch_emb, pending):
                all_embeddings[slot] = emb
                np.save(cache_path, emb)

    if n_failed:
        print(f"Warning: {n_failed} of {len(image_names)} images could not be loaded; zero vectors used.")

    if not all_embeddings:
        return np.empty((0, emb_dim), dtype=np.float32)
    return np.vstack(all_embeddings)

In [None]:
# Embed the images referenced by each split.
train_image_names = train['image_link'].tolist()
test_image_names = test['image_link'].tolist()
X_train_img = batch_image_embeddings(train_image_names, batch_size=64)
X_test_img = batch_image_embeddings(test_image_names, batch_size=64)

Embedding images: 100%|██████████| 1172/1172 [00:17<00:00, 66.67it/s]
Embedding images: 100%|██████████| 1172/1172 [00:17<00:00, 67.47it/s]


In [None]:
# Final design matrices: the tabular/text features followed by the image embeddings.
X_train_full = np.hstack([X_train, X_train_img])
X_test_full  = np.hstack([X_test,  X_test_img])

# tree_method='gpu_hist' is deprecated (see the warning XGBoost prints on fit);
# the replacement is tree_method='hist' plus a device string. Reusing `device`
# ('cuda' or 'cpu') also keeps the cell runnable on a CPU-only runtime.
model_img = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    device=device
)

In [None]:
# Fit on the full training set; y_train holds log1p(price).
model_img.fit(X_train_full, y_train)


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Predict on the test split and map the log-scale output back to prices.
log_price_pred = model_img.predict(X_test_full)
y_pred_img = np.expm1(log_price_pred)
output = test[['sample_id']].assign(price=y_pred_img)
print(output.head())


    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


   sample_id      price
0     100179  22.212185
1     245611  22.182791
2     146263  23.295153
3      95658  23.074257
4      36806  23.464199


In [None]:
# Evaluate on train
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``|pred - true| / ((|true| + |pred|) / 2)``. A pair in which both
    values are zero is a perfect prediction and contributes 0, instead of the
    ``0/0 = NaN`` that would otherwise poison the mean.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values; they must broadcast against each other.

    Returns
    -------
    float
        Mean of the terms multiplied by 100.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_err = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2
    terms = np.zeros_like(abs_err)
    # Divide only where the denominator is non-zero; the other terms stay 0.
    np.divide(abs_err, scale, out=terms, where=scale != 0)
    return 100 * np.mean(terms)

# In-sample score: the model is evaluated on the same rows it was fitted on.
train_pred_img = np.expm1(model_img.predict(X_train_full))
train_smape_img = smape(train['price'].values, train_pred_img)
print(f"Full Train SMAPE with Images: {train_smape_img:.2f}%")


Full Train SMAPE with Images: 10.23%


In [None]:
# Persist the fitted estimator to Drive as a joblib pickle.
# NOTE(review): a pickle should only be loaded from a trusted source and is
# presumably tied to the library versions that wrote it — confirm before reuse.
joblib.dump(model_img, '/content/drive/MyDrive/DataSets/xgb_model_gpu_img.pkl')
print("Image-augmented model saved to Drive!")

Image-augmented model saved to Drive!


In [None]:
# Hold-out check: a fresh model with the same hyperparameters is trained on 80% of
# the rows and scored on the remaining 20%.
# NOTE(review): the target-mean columns inside X_train_full were computed before
# this split, so the validation rows are not fully independent of the training
# target; the validation SMAPE may be optimistic.
X_tr, X_val, y_tr, y_val = train_test_split(X_train_full, y_train, test_size=0.2, random_state=42)
val_model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    # 'gpu_hist' is deprecated; 'hist' plus a device string replaces it, and
    # reusing `device` ('cuda' or 'cpu') gives a CPU fallback.
    tree_method='hist',
    device=device,
    eval_metric='rmse'
)
val_model.fit(X_tr, y_tr, eval_set=[(X_tr,y_tr),(X_val,y_val)], verbose=False)

# Both targets and predictions are on the log1p scale, so map them back to prices.
y_tr_pred = np.expm1(val_model.predict(X_tr))
y_val_pred = np.expm1(val_model.predict(X_val))

train_smape = smape(np.expm1(y_tr), y_tr_pred)
val_smape = smape(np.expm1(y_val), y_val_pred)

print("\n--- Overfitting Check ---")
print(f"Train SMAPE: {train_smape:.2f}%")
print(f"Validation SMAPE: {val_smape:.2f}%")

# Largest tolerated train/validation gap, in SMAPE percentage points.
OVERFIT_GAP_THRESHOLD = 10
if val_smape - train_smape > OVERFIT_GAP_THRESHOLD:
    print("Model is likely overfitting.")
else:
    print("Model generalizes well.")


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:



--- Overfitting Check ---
Train SMAPE: 10.09%
Validation SMAPE: 12.37%
Model generalizes well.


In [None]:
# Save test predictions for submission: writes the `output` frame (sample_id, price)
# to Drive without the index column.
output.to_csv('/content/drive/MyDrive/DataSets/test_out.csv', index=False)
print("Test predictions saved to Drive as test_out.csv!")

Test predictions saved to Drive as test_out.csv!
