* This notebook runs on a T4 GPU runtime. Instead of a full one-hot encoding, brand and unit information is represented with Top-K one-hot encoding plus frequency encoding, which keeps the feature matrix small enough that the runtime does not crash; these are combined with numeric features. Textual product descriptions are embedded with SentenceTransformers. The model achieves a SMAPE of roughly 61–64% on the training set (61.40% in the run recorded at the end of this notebook).
* The regressor is an `XGBRegressor`.
**Next step (not implemented in this notebook):** to decrease the SMAPE, we will replace the Top-K and frequency encoding of brands and units with embeddings generated via SentenceTransformers, so that the categorical features are represented in a richer, continuous space, potentially improving model accuracy.

In [None]:
!pip install -q sentence-transformers xgboost

In [None]:
# Mount Google Drive; the dataset paths and the saved model used later in this
# notebook all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re
from scipy.sparse import hstack
from xgboost import XGBRegressor
from sentence_transformers import SentenceTransformer
import torch

In [None]:
# Location of the train / test CSV files on the mounted Drive.
DATA_DIR = '/content/drive/MyDrive/DataSets'
train_path = DATA_DIR + '/train.csv'
test_path = DATA_DIR + '/test.csv'

In [None]:
# Both files are parsed with the same reader options.
# NOTE(review): on_bad_lines='skip' silently drops malformed rows; the printed
# shapes are the only check that nothing was lost.
csv_options = dict(sep=',', quotechar='"', engine='python', on_bad_lines='skip')

train = pd.read_csv(train_path, **csv_options)
test = pd.read_csv(test_path, **csv_options)

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (75000, 4)
Test shape: (75000, 3)


In [None]:
def clean_text(text):
    """Normalise a raw catalog string.

    Missing values (anything ``pd.isna`` flags) become the empty string.
    Otherwise the text is lower-cased, every run of whitespace (newlines
    included) is collapsed to a single space, and leading/trailing whitespace
    is removed.
    """
    if pd.isna(text):
        return ""
    # `\s+` also matches newlines, so a single substitution both flattens line
    # breaks and squeezes repeated spaces.
    return re.sub(r'\s+', ' ', text.lower()).strip()

# Add the normalised catalog text as a new column on both splits.
for frame in (train, test):
    frame['clean_text'] = frame['catalog_content'].apply(clean_text)

In [None]:
def extract_value_unit(text):
    """Extract the numeric ``Value`` and the ``Unit`` label from a catalog string.

    Parameters
    ----------
    text : str or missing
        Raw catalog content. Anything that is not a string (``None``, ``NaN``)
        is treated as having nothing to extract.

    Returns
    -------
    tuple
        ``(value, unit)``: ``value`` is a ``float`` or ``None``; ``unit`` is a
        lower-cased string or ``None``.
    """
    if not isinstance(text, str):
        # pandas represents missing cells as NaN/None; re.search would raise
        # TypeError on them.
        return None, None

    value, unit = None, None
    value_match = re.search(r'Value[: ]+\s*([\d\.]+)', text, re.IGNORECASE)
    if value_match:
        try:
            value = float(value_match.group(1))
        except ValueError:
            # The character class also accepts non-numbers such as "." or "1.2.3".
            value = None
    unit_match = re.search(r'Unit[: ]+\s*([a-zA-Z]+)', text, re.IGNORECASE)
    if unit_match:
        unit = unit_match.group(1).lower()
    return value, unit

# Parse every catalog entry once and assign the two result columns directly.
# Wrapping each row's result in a pd.Series (apply(lambda x: pd.Series(...)))
# builds one Series object per row, which is needlessly slow on frames of this size.
for frame in (train, test):
    parsed = [extract_value_unit(content) for content in frame['catalog_content']]
    # Explicit dtypes: missing values become NaN, missing units stay None.
    frame['value_num'] = pd.Series(
        [value for value, _ in parsed], index=frame.index, dtype='float64'
    )
    frame['unit'] = pd.Series(
        [unit for _, unit in parsed], index=frame.index, dtype='object'
    )

In [None]:
def extract_brand(text):
    """Return the leading comma-delimited segment of ``text`` as the brand.

    The segment is stripped and lower-cased. ``None`` is returned for missing
    input and when the text is empty or begins with a comma.
    """
    if pd.isna(text):
        return None
    leading_segment = re.match(r'([^,]+)', text)
    return leading_segment.group(1).strip().lower() if leading_segment else None

# Derive the brand column on both splits from the raw catalog text.
for frame in (train, test):
    frame['brand'] = frame['catalog_content'].apply(extract_brand)

In [None]:
# -------------------------------
# Hybrid Encoding for Brands and Units
# -------------------------------
# Each categorical column gets two representations, both fitted on the
# training split only:
#   * one-hot indicators for its top-K most frequent values (all others -> 'other')
#   * one frequency column holding the value's training-set count (0 if unseen)

TOP_K_BRANDS = 50
TOP_K_UNITS = 20


def top_k_one_hot(train_col, test_col, top_k, prefix):
    """One-hot encode the ``top_k`` most frequent training values of a column.

    Values outside the top-K set (including missing ones) are bucketed as
    ``'other'``. The test indicators are re-indexed to the training columns so
    both frames share one layout.

    Parameters
    ----------
    train_col, test_col : pd.Series
        The categorical column from the training and test frames.
    top_k : int
        Number of most frequent training values that keep their own indicator.
    prefix : str
        Column-name prefix passed to ``pd.get_dummies``.

    Returns
    -------
    tuple
        ``(top_values, train_onehot, test_onehot)``.
    """
    top_values = train_col.value_counts().nlargest(top_k).index.tolist()
    top_lookup = set(top_values)  # constant-time membership instead of a list scan per row

    def bucket(value):
        return value if value in top_lookup else 'other'

    train_onehot = pd.get_dummies(train_col.apply(bucket), prefix=prefix, dummy_na=True)
    test_onehot = pd.get_dummies(test_col.apply(bucket), prefix=prefix, dummy_na=True)
    test_onehot = test_onehot.reindex(columns=train_onehot.columns, fill_value=0)
    return top_values, train_onehot, test_onehot


def frequency_encode(train_col, test_col):
    """Map each value to its count in the training column (0 when unseen or missing).

    Returns
    -------
    tuple
        ``(counts, train_freq, test_freq)`` where ``counts`` is the
        value -> training-count dictionary.
    """
    counts = train_col.value_counts().to_dict()
    return counts, train_col.map(counts).fillna(0), test_col.map(counts).fillna(0)


def stack_categorical(brand_onehot, brand_freq, unit_onehot, unit_freq):
    """Stack the categorical encodings column-wise into one float32 matrix.

    Column order: brand one-hot | brand frequency | unit one-hot | unit frequency.
    """
    return np.hstack([
        brand_onehot.values,
        brand_freq.values.reshape(-1, 1),
        unit_onehot.values,
        unit_freq.values.reshape(-1, 1),
    ]).astype(np.float32)


# --- Brands ---
top_brands, train_brands_onehot, test_brands_onehot = top_k_one_hot(
    train['brand'], test['brand'], TOP_K_BRANDS, 'brand'
)
brand_counts, train['brand_freq'], test['brand_freq'] = frequency_encode(
    train['brand'], test['brand']
)

# --- Units ---
top_units, train_units_onehot, test_units_onehot = top_k_one_hot(
    train['unit'], test['unit'], TOP_K_UNITS, 'unit'
)
unit_counts, train['unit_freq'], test['unit_freq'] = frequency_encode(
    train['unit'], test['unit']
)

# --- Combine categorical features ---
X_train_cat = stack_categorical(
    train_brands_onehot, train['brand_freq'], train_units_onehot, train['unit_freq']
)
X_test_cat = stack_categorical(
    test_brands_onehot, test['brand_freq'], test_units_onehot, test['unit_freq']
)

In [None]:
# Run the sentence encoder on the GPU when one is available, otherwise on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)

embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Identical encoding settings for both splits.
encode_options = dict(batch_size=64, show_progress_bar=True, device=device)
X_train_text = embedder.encode(train['clean_text'].tolist(), **encode_options)
X_test_text = embedder.encode(test['clean_text'].tolist(), **encode_options)

Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

Batches:   0%|          | 0/1172 [00:00<?, ?it/s]

In [None]:
# Numeric feature: the parsed `value_num` column, with missing entries set to 0.
X_train_num = train[['value_num']].fillna(0).astype(np.float32).to_numpy()
X_test_num = test[['value_num']].fillna(0).astype(np.float32).to_numpy()

# Design matrices, column-wise: text embedding | numeric value | categorical encodings.
X_train = np.concatenate([X_train_text, X_train_num, X_train_cat], axis=1)
X_test = np.concatenate([X_test_text, X_test_num, X_test_cat], axis=1)

# Regression target.
y_train = train['price'].to_numpy()

In [None]:
# Gradient-boosted regressor trained on the stacked feature matrix.
# `tree_method='gpu_hist'` is deprecated in XGBoost 2.x (the warning it triggers
# recommends `tree_method="hist", device="cuda"`), so select the histogram
# algorithm and pass the device explicitly. `device` is the 'cuda'/'cpu' string
# chosen in the embedding cell, so this cell also runs on a CPU-only runtime.
model = XGBRegressor(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    tree_method='hist',
    device=device,
)

model.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
# Score the test split and pair every prediction with its sample id.
y_pred = model.predict(X_test)

output = test[['sample_id']].assign(price=y_pred)
print(output.head())


    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


   sample_id      price
0     100179  21.521418
1     245611  20.200880
2     146263  23.787832
3      95658  20.582125
4      36806  29.091726


In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2))``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values of matching (or broadcastable) shape.
        Lists are accepted as well as NumPy arrays.

    Returns
    -------
    float
        The SMAPE score. Pairs where both the target and the prediction are 0
        contribute 0 error instead of the NaN a 0/0 division would produce.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    abs_error = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2
    # Divide only where the denominator is non-zero; the remaining entries keep
    # the 0 they were initialised with (a perfect prediction of a zero target).
    ratio = np.divide(abs_error, scale, out=np.zeros_like(abs_error), where=scale != 0)
    return 100 * np.mean(ratio)

# In-sample score: the model is evaluated on the same rows it was fitted on.
train_predictions = model.predict(X_train)
smape_score = smape(y_train, train_predictions)
print(f"Full Train SMAPE: {smape_score:.2f}%")

Full Train SMAPE: 61.40%


In [None]:
# Persist the fitted regressor to Drive.
import joblib

MODEL_PATH = '/content/drive/MyDrive/DataSets/xgb_model_gpu.pkl'
joblib.dump(model, MODEL_PATH)
print("Model saved to Drive!")

Model saved to Drive!
