In [9]:
# %%
import importlib
import subprocess
import sys
import os
import hashlib
import warnings
from typing import Tuple

# ========== STEP 1: Auto Install Required Libraries ==========
required_packages = [
    "pandas",
    "numpy",
    "lightgbm",
    "sentence-transformers",
    "tqdm",
    "torch",
    "scikit-learn"
]
for package in required_packages:
    try:
        importlib.import_module(package)
    except ImportError:
        print(f"⚙️ Installing missing package: {package}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package, "--quiet"])

⚙️ Installing missing package: sentence-transformers


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


⚙️ Installing missing package: scikit-learn


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
# ===== Logging setup (with easy de-logging) =====
import logging
LOGGER_NAME = "price_regression"
DELOG = False  # <-- set True to suppress info/debug logs (only warnings+)
LOG_LEVEL = logging.WARNING if DELOG else logging.INFO

logger = logging.getLogger(LOGGER_NAME)
logger.setLevel(LOG_LEVEL)
if not logger.handlers:
    _h = logging.StreamHandler(sys.stdout)
    _fmt = logging.Formatter("[%(levelname)s] %(message)s")
    _h.setFormatter(_fmt)
    logger.addHandler(_h)

# Quiet external libs if desired
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
try:
    import transformers
    logging.getLogger("transformers").setLevel(logging.ERROR if DELOG else logging.WARNING)
except Exception:
    pass
logging.getLogger("lightgbm").setLevel(logging.ERROR if DELOG else logging.WARNING)

In [11]:
# %%
import pandas as pd
import numpy as np
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm
tqdm.pandas()

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# ========= Utility: SMAPE =========
def smape(y_true, y_pred, eps=1e-8) -> float:
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return float(np.mean(np.abs(y_pred - y_true) / denom) * 100.0)

# ========= Utility: IQR outlier removal on log(price) =========
def iqr_filter_on_log_price(y: pd.Series, k: float = 1.5) -> Tuple[np.ndarray, float, float]:
    """
    Returns a boolean mask (keep=True) for entries within [Q1 - k*IQR, Q3 + k*IQR]
    computed on log2(price_clipped).
    """
    y_log = np.log2(y.clip(lower=1e-6))
    q1, q3 = np.percentile(y_log, [25, 75])
    iqr = q3 - q1
    lo, hi = q1 - k * iqr, q3 + k * iqr
    keep = (y_log >= lo) & (y_log <= hi)
    return keep, lo, hi

# ========= Utility: deterministic filename for embeddings =========
def stable_np_cache_name(prefix: str, texts: pd.Series) -> str:
    # Simple content-based hash on length + a few bytes to avoid re-embedding when split changes
    m = hashlib.md5()
    m.update(str(len(texts)).encode())
    if len(texts) > 0:
        m.update(str(texts.iloc[0])[:256].encode(errors="ignore"))
        m.update(str(texts.iloc[-1])[:256].encode(errors="ignore"))
    return f"{prefix}_{m.hexdigest()}.npy"

In [12]:
# ========= Load data =========
train = pd.read_csv("train_cleaned.csv")
test = pd.read_csv("test_cleaned.csv")

# Basic cleaning (optional but safe)
train = train.dropna(subset=["catalog_content", "price"])
train = train[train["price"] > 0] 

In [14]:
# ========= Train/Val split =========
X_train_text, X_test_text, y_train_raw = train["catalog_content"], test["catalog_content"], train["price"], 

logger.info(f"Initial train size: {len(X_train_text):,}, test size: {len(X_test_text):,}")

[INFO] Initial train size: 75,000, test size: 75,000


In [15]:

# ========= Outlier removal ONLY on training set (IQR on log2(price)) =========
keep_mask, lo, hi = iqr_filter_on_log_price(pd.Series(y_train_raw))
removed = int((~keep_mask).sum())
logger.info(
    f"Outlier filter on log2(price): keep in [{lo:.3f}, {hi:.3f}] — removed {removed:,} "
    f"({removed/len(keep_mask)*100:.2f}%) from training."
)

X_train_text = X_train_text.reset_index(drop=True)[keep_mask.values]
y_train = pd.Series(y_train_raw).reset_index(drop=True)[keep_mask.values]

logger.info(f"Post-filter train size: {len(X_train_text):,}")

[INFO] Outlier filter on log2(price): keep in [-0.348, 7.951] — removed 273 (0.36%) from training.
[INFO] Post-filter train size: 74,727


In [16]:
test

Unnamed: 0,sample_id,catalog_content
0,100179,Rani 14Spice Eshamayas Mango Chutney Indian Pr...
1,245611,Natural MILK TEA Flavoring extract by HALO PAN...
2,146263,Honey Filled Hard Candy Bulk Pack 2 Pounds Ind...
3,95658,Vlasic Snackmms Kosher Dill 16 Oz Pack of 2 2....
4,36806,McCormick Culinary Vanilla Extract 32.0 Fl Oz
...,...,...
74995,93616,Good Seasons Zezty Italian Salad Dressing Mix ...
74996,249434,Colombina Swirled Love Tiger Pops 7.0 Ounce
74997,162217,Kerns 11.5 Fl Oz
74998,230487,NY SPICE SHOP Licorice Candy 1 Pound Red Licor...


In [17]:

# ========= Embeddings (with caching) =========
model_name = "Qwen/Qwen3-Embedding-0.6B"  # replace if you use a different model
logger.info(f"Loading embedder: {model_name}")
embedder = SentenceTransformer(model_name)

def get_or_create_embeddings(texts: pd.Series, prefix: str) -> np.ndarray:
    fname = stable_np_cache_name(prefix, texts)
    if os.path.exists(fname):
        logger.info(f"📂 Loading cached embeddings: {fname}")
        arr = np.load(fname)
        # Sanity check
        if arr.shape[0] != len(texts):
            logger.warning("Cached embedding count mismatch; re-computing.")
        else:
            return arr
    logger.info(f"🔄 Creating embeddings for {prefix} ({len(texts):,} rows) ...")
    emb = embedder.encode(
        texts.tolist(),
        batch_size=32,
        show_progress_bar=not DELOG,
        convert_to_numpy=True,
        normalize_embeddings=False
    )
    np.save(fname, emb)
    logger.info(f"✅ Saved embeddings to {fname}")
    return emb

train_embeddings = get_or_create_embeddings(X_train_text, "train_embeddings")
test_embeddings  = get_or_create_embeddings(X_test_text, "test_embeddings")

[INFO] Loading embedder: Qwen/Qwen3-Embedding-0.6B
[INFO] 🔄 Creating embeddings for train_embeddings (74,727 rows) ...


Batches:   0%|          | 0/2336 [00:00<?, ?it/s]

[INFO] ✅ Saved embeddings to train_embeddings_8383ad239ec278b1b6b99f9e7c102c5f.npy
[INFO] 🔄 Creating embeddings for test_embeddings (75,000 rows) ...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

[INFO] ✅ Saved embeddings to test_embeddings_86088a3b720d340906849e270bc612da.npy


In [18]:
# ========= Targets (log2 for training) =========
y_train_log2 = np.log2(y_train.clip(lower=1e-6))

X_train = train_embeddings
X_test  = test_embeddings

# ========= Model =========
model = lgb.LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    subsample=0.9,
    objective="huber",
    colsample_bytree=0.9,
    random_state=RANDOM_SEED,
    verbose=-1  # keep LGBM quiet; use our logger
)

logger.info("🚀 Training LightGBM model on log2(price)...")
model.fit(X_train, y_train_log2)

[INFO] 🚀 Training LightGBM model on log2(price)...


0,1,2
,boosting_type,'gbdt'
,num_leaves,64
,max_depth,-1
,learning_rate,0.05
,n_estimators,3000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [19]:
# ========= Predict (back-transform from log2) =========
log_preds = model.predict(X_test)
preds = np.power(2.0, log_preds)

In [20]:
pd.DataFrame({
    "sample_id" : test["sample_id"], 
    "price" : preds
}).to_csv("qwen_06B_raw.csv", index = False)