* This model was trained on a T4 GPU runtime using XGBoost with GPU acceleration. Earlier versions relied on Top-K and Frequency encoding for the categorical features (brand and unit). In this iteration the textual product descriptions are represented through dense semantic embeddings generated using SentenceTransformers (`all-MiniLM-L6-v2`), while brand and unit are encoded with a hybrid of Top-K one-hot columns (top 50 brands, top 20 units) and a target (mean price) encoding. Numeric attributes such as value were also included. These richer, continuous feature representations allow the model to capture deeper contextual and relational patterns between products while keeping the sparse one-hot part of the feature matrix small.

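* Performance:
The model achieved a Full Train SMAPE of 10.40% (measured on the same data it was trained on), a significant improvement from the previous 64% result. On a 20% hold-out split, the overfitting check at the end of the notebook reports a Validation SMAPE of 12.60%.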


In [None]:
# Install dependencies
!pip install -q sentence-transformers xgboost

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import re
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer
import torch
import joblib

# -------------------------------
# Paths
# -------------------------------
train_path = '/content/drive/MyDrive/DataSets/train.csv'
test_path = '/content/drive/MyDrive/DataSets/test.csv'

# -------------------------------
# Load data
# -------------------------------
def _read_catalog_csv(csv_path):
    """Read one CSV with the parser settings shared by the train and test files.

    Uses the Python parsing engine with a comma separator and double-quote
    quoting. Rows the parser cannot handle are skipped instead of raising.
    """
    return pd.read_csv(csv_path, sep=',', quotechar='"', engine='python', on_bad_lines='skip')

train = _read_catalog_csv(train_path)
test = _read_catalog_csv(test_path)

print("Train shape:", train.shape)
print("Test shape:", test.shape)


Mounted at /content/drive
Train shape: (75000, 4)
Test shape: (75000, 3)


In [None]:
# -------------------------------
# Text cleaning
# -------------------------------
def clean_text(text):
    """Normalize a raw catalog string before it is embedded.

    Parameters
    ----------
    text : str or scalar
        Raw cell value. Missing values (None / NaN) are accepted.

    Returns
    -------
    str
        Lower-cased text with every run of whitespace (spaces, tabs,
        newlines) collapsed to a single space and the ends trimmed.
        Missing input yields the empty string.
    """
    if pd.isna(text):
        return ""
    # Coerce stray non-string cells (e.g. a value parsed as a number) instead
    # of failing on .lower().
    lowered = str(text).lower()
    # \s+ already matches newlines, so no separate '\n' replacement is needed.
    return re.sub(r'\s+', ' ', lowered).strip()

# Add the normalized text column to both frames with the same cleaner.
for _frame in (train, test):
    _frame['clean_text'] = _frame['catalog_content'].apply(clean_text)

# -------------------------------
# Extract numeric value & unit
# -------------------------------
def extract_value_unit(text):
    """Pull the numeric "Value" and the alphabetic "Unit" out of a catalog string.

    The search is case-insensitive. "Value" must be followed by at least one
    colon or space and then a run of digits/dots; "Unit" must be followed by
    at least one colon or space and then a run of ASCII letters.

    Parameters
    ----------
    text : str or None/NaN
        Raw catalog content. Non-string input (missing cells) is accepted.

    Returns
    -------
    tuple
        ``(value, unit)`` where ``value`` is a float or None and ``unit`` is a
        lower-cased string or None. A field that is absent or unparsable is
        returned as None.
    """
    # Missing cells arrive as NaN/None; re.search would raise TypeError on them.
    if not isinstance(text, str):
        return None, None

    value, unit = None, None
    value_match = re.search(r'Value[: ]+\s*([\d\.]+)', text, re.IGNORECASE)
    if value_match:
        try:
            value = float(value_match.group(1))
        except ValueError:
            # The captured token is digits/dots but not a valid number
            # (e.g. "1.2.3" or "..."); treat it as missing.
            value = None
    unit_match = re.search(r'Unit[: ]+\s*([a-zA-Z]+)', text, re.IGNORECASE)
    if unit_match:
        unit = unit_match.group(1).lower()
    return value, unit

# Expand each (value, unit) tuple into two new columns on both frames.
for _frame in (train, test):
    _frame[['value_num', 'unit']] = _frame['catalog_content'].apply(
        lambda raw: pd.Series(extract_value_unit(raw))
    )

# -------------------------------
# Extract brand
# -------------------------------
def extract_brand(text):
    """Return the text before the first comma as the brand.

    The leading segment is stripped of surrounding whitespace and lower-cased.
    Returns None for missing input, and also when the string is empty or
    starts with a comma (there is no leading segment to take).
    """
    if pd.isna(text):
        return None
    leading = re.match(r'([^,]+)', text)
    return leading.group(1).strip().lower() if leading else None

# Derive the brand column on both frames from the raw catalog text.
for _frame in (train, test):
    _frame['brand'] = _frame['catalog_content'].apply(extract_brand)

# -------------------------------
# Hybrid Encoding for Brands and Units
# -------------------------------
TOP_K_BRANDS = 50
TOP_K_UNITS = 20
TARGET_ENC_FOLDS = 5   # folds used for out-of-fold target encoding of train rows
TARGET_ENC_SEED = 42   # fixes the fold assignment so reruns give identical features


def _top_k_onehot(train_col, test_col, k, prefix):
    """One-hot encode the k most frequent training categories.

    Every value outside the top k (including missing values, which are never
    in the top-k list) is bucketed as 'other'. Because missing values are
    already absorbed by 'other', no separate NaN indicator column is created.

    Returns
    -------
    tuple
        ``(top_values, train_bucketed, test_bucketed, train_onehot, test_onehot)``.
        ``test_onehot`` is reindexed to the training columns so both matrices
        share the same layout; categories absent from test become all-zero.
    """
    top_values = train_col.value_counts().nlargest(k).index.tolist()
    train_bucketed = train_col.where(train_col.isin(top_values), 'other')
    test_bucketed = test_col.where(test_col.isin(top_values), 'other')
    train_onehot = pd.get_dummies(train_bucketed, prefix=prefix)
    test_onehot = pd.get_dummies(test_bucketed, prefix=prefix).reindex(
        columns=train_onehot.columns, fill_value=0
    )
    return top_values, train_bucketed, test_bucketed, train_onehot, test_onehot


def _target_encode(train_keys, train_target, test_keys,
                   n_folds=TARGET_ENC_FOLDS, seed=TARGET_ENC_SEED):
    """Mean-target encode a categorical column without leaking a row's own target.

    Train rows are encoded out-of-fold: each row gets the category mean
    computed from the other folds only, so its own price never feeds its own
    feature. Test rows use category means over the full training set.
    Categories with no mean available (unseen or missing) fall back to the
    global training mean.

    Returns
    -------
    tuple
        ``(train_encoded, test_encoded, full_means)`` where the first two are
        Series aligned to the input indexes and ``full_means`` is a
        ``{category: mean target}`` dict over the whole training set.
    """
    global_mean = train_target.mean()
    fold_ids = np.random.default_rng(seed).integers(0, n_folds, size=len(train_keys))

    train_encoded = np.full(len(train_keys), global_mean, dtype=np.float64)
    for fold in range(n_folds):
        held_out = fold_ids == fold
        # Means come only from rows outside the held-out fold.
        fold_means = train_target[~held_out].groupby(train_keys[~held_out]).mean()
        train_encoded[held_out] = (
            train_keys[held_out].map(fold_means).fillna(global_mean).to_numpy()
        )

    full_means = train_target.groupby(train_keys).mean()
    test_encoded = test_keys.map(full_means).fillna(global_mean)
    return (
        pd.Series(train_encoded, index=train_keys.index),
        test_encoded,
        full_means.to_dict(),
    )


# --- Brands ---
(top_brands, train_top_brands, test_top_brands,
 train_brands_onehot, test_brands_onehot) = _top_k_onehot(
    train['brand'], test['brand'], TOP_K_BRANDS, 'brand'
)

# Target encoding for brands
train['brand_target'], test['brand_target'], brand_target_mean = _target_encode(
    train['brand'], train['price'], test['brand']
)

# --- Units ---
(top_units, train_top_units, test_top_units,
 train_units_onehot, test_units_onehot) = _top_k_onehot(
    train['unit'], test['unit'], TOP_K_UNITS, 'unit'
)

# Target encoding for units
train['unit_target'], test['unit_target'], unit_target_mean = _target_encode(
    train['unit'], train['price'], test['unit']
)


In [None]:
# -------------------------------
# Combine categorical features
# -------------------------------
def _stack_categorical(frame, brands_onehot, units_onehot):
    """Lay out [brand one-hot | brand_target | unit one-hot | unit_target] as float32."""
    blocks = [
        brands_onehot.values,
        frame['brand_target'].values.reshape(-1, 1),
        units_onehot.values,
        frame['unit_target'].values.reshape(-1, 1),
    ]
    return np.hstack(blocks).astype(np.float32)

X_train_cat = _stack_categorical(train, train_brands_onehot, train_units_onehot)
X_test_cat = _stack_categorical(test, test_brands_onehot, test_units_onehot)

# -------------------------------
# Numeric features
# -------------------------------
def _numeric_block(frame):
    """Return the value_num column as a one-column float32 array, missing values set to 0."""
    return frame[['value_num']].fillna(0).astype(np.float32).values

X_train_num = _numeric_block(train)
X_test_num = _numeric_block(test)

# -------------------------------
# Text embeddings on GPU
# -------------------------------
# Fall back to CPU when no CUDA device is visible to torch.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print("Using device:", device)

embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

def _embed_clean_text(frame):
    """Encode a frame's clean_text column with the shared embedder and batch settings."""
    sentences = frame['clean_text'].tolist()
    return embedder.encode(sentences, batch_size=64, show_progress_bar=True, device=device)

X_train_text = _embed_clean_text(train)
X_test_text = _embed_clean_text(test)

# -------------------------------
# Combine all features
# -------------------------------
# Column order is: text embedding, numeric value, categorical encodings.
_train_parts = [X_train_text, X_train_num, X_train_cat]
_test_parts = [X_test_text, X_test_num, X_test_cat]
X_train = np.hstack(_train_parts)
X_test = np.hstack(_test_parts)

# -------------------------------
# Target transformation (log)
# -------------------------------
# The model is fit on log1p(price); predictions are mapped back with expm1.
_train_prices = train['price'].values
y_train = np.log1p(_train_prices)

# -------------------------------
# XGBoost on GPU
# -------------------------------
# `tree_method='gpu_hist'` is deprecated in XGBoost 2.x and triggers a warning
# on every fit/predict. The supported spelling is tree_method='hist' together
# with a `device` argument. `device` is the 'cuda'/'cpu' string chosen earlier
# in this cell, so the same code also runs on a CPU-only runtime.
model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    device=device,
)

# y_train is log1p(price), so the model learns in log space.
model.fit(X_train, y_train)

# Predict
y_pred = np.expm1(model.predict(X_test))  # inverse log1p

# Pair each test sample_id with its predicted price.
output = test[['sample_id']].copy()
output['price'] = y_pred
print(output.head())

# -------------------------------
# Evaluate on train
# -------------------------------
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is ``|pred - true| / ((|true| + |pred|) / 2)``; the result is
    100 times the mean of those terms.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Ground-truth and predicted values of the same shape.

    Returns
    -------
    float
        SMAPE in percent. Terms where both the true and predicted value are 0
        count as zero error rather than producing NaN from 0/0.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    abs_err = np.abs(y_pred - y_true)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Only divide where the denominator is non-zero; other terms stay 0.
    ratio = np.divide(abs_err, denom, out=np.zeros_like(abs_err), where=denom != 0)
    return 100 * np.mean(ratio)

# Score the model on the rows it was fit on; predictions are mapped back from log space.
_train_price_pred = np.expm1(model.predict(X_train))
smape_score = smape(train['price'].values, _train_price_pred)
print(f"Full Train SMAPE: {smape_score:.2f}%")


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


   sample_id      price
0     100179  21.685692
1     245611  22.107433
2     146263  23.106655
3      95658  23.081575
4      36806  23.960258
Full Train SMAPE: 10.40%


In [None]:

# -------------------------------
# Save model
# -------------------------------
# Serialize the fitted regressor to the mounted Drive folder with joblib.
_model_out_path = '/content/drive/MyDrive/DataSets/xgb_model_gpu_10.pkl'
joblib.dump(model, _model_out_path)
print("Model saved to Drive!")

Model saved to Drive!


In [None]:
# ---------------------------------
# Overfitting Check (post-training)
# ---------------------------------
# numpy, XGBRegressor and smape() are already available from earlier cells;
# only the splitter is new here.
from sklearn.model_selection import train_test_split

# Gap (in SMAPE percentage points) between validation and train above which
# the model is reported as overfitting.
OVERFIT_GAP_THRESHOLD = 10

# Hold out 20% of the training rows for validation.
# NOTE: the features in X_train were built before this split, so any
# target-derived columns have already seen the validation rows' prices.
# Treat the validation score as optimistic.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Same hyperparameters as the full model, fit on the 80% split only.
# tree_method='hist' + device replaces the deprecated tree_method='gpu_hist'.
val_model = XGBRegressor(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    device=device,
    eval_metric='rmse'
)

val_model.fit(
    X_tr, y_tr,
    eval_set=[(X_tr, y_tr), (X_val, y_val)],
    verbose=False
)

# Predict and evaluate SMAPE on both sets (targets are in log1p space, so
# both predictions and labels are mapped back with expm1).
y_tr_pred = np.expm1(val_model.predict(X_tr))
y_val_pred = np.expm1(val_model.predict(X_val))

train_smape = smape(np.expm1(y_tr), y_tr_pred)
val_smape = smape(np.expm1(y_val), y_val_pred)

print("\n--- Overfitting Check ---")
print(f"Train SMAPE: {train_smape:.2f}%")
print(f"Validation SMAPE: {val_smape:.2f}%")

if val_smape - train_smape > OVERFIT_GAP_THRESHOLD:
    print("Model is likely overfitting.")
else:
    print("Model generalizes well.")



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:



--- Overfitting Check ---
Train SMAPE: 10.20%
Validation SMAPE: 12.60%
Model generalizes well.
