# Amazon ML 2025 SOLUTION

In [1]:
# =========================
# Optimized pipeline (improve SMAPE)
# Adds richer features + OOF target encoding + tuned LightGBM
# Paste & run in Colab (A100 recommended). Reads the CSVs at TRAIN_PATH / TEST_PATH (defaults: /content/train.csv and /content/test.csv)
# =========================
!pip install -q sentence-transformers lightgbm scikit-learn pandas numpy tqdm transformers datasets

import os, time, gc, re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
from collections import Counter

from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, train_test_split
from sklearn.linear_model import Ridge
import lightgbm as lgb

# -------------------- USER SETTINGS --------------------
# NOTE(review): in the code visible in this cell, MODE == "strong" only prints a
# message and USE_IMAGES is never read, so neither flag changes the pipeline yet.
MODE = "fast"       # "fast" or "strong"
USE_IMAGES = False  # set True if you want to include CLIP (slow)
TRAIN_PATH = "/content/train.csv"   # training CSV (must contain a "price" column)
TEST_PATH  = "/content/test.csv"    # test CSV to predict
OUT_NAME = "submission_optimized.csv"  # file name of the written submission
SVD_DIM = 320      # number of TruncatedSVD components kept from the text embeddings
EMBED_BATCH = 64   # number of texts handed to the sentence encoder per chunk
RANDOM_STATE = 42  # seed shared by TruncatedSVD and the KFold splitters
NFOLDS = 5         # 5-fold OOF stacking
TOP_TOKENS = 50    # number of tokens to do OOF target encoding for
# LightGBM general control: upper bounds on boosting rounds (early stopping may end sooner)
TFIDF_ROUNDS = 1000
DENSE_ROUNDS = 1500
# --------------------

# Use the GPU when torch can see one; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device, "Mode:", MODE)

# -------------------- HELPERS --------------------
def compute_smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is |pred - true| divided by the mean of |true| and |pred|.
    Where both values are zero the denominator is replaced by 1e-8, so the
    term evaluates to 0 instead of dividing by zero.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    half_sum = (np.abs(actual) + np.abs(forecast)) / 2.0
    safe_denom = np.where(half_sum == 0, 1e-8, half_sum)
    abs_err = np.abs(forecast - actual)
    return np.mean(abs_err / safe_denom) * 100.0

# Pack-size patterns, compiled once instead of on every call.
# The look-arounds stop the 1-3 digit group from matching a slice of a longer
# number ("1000x" -> "000") or the fractional part of a decimal ("1.5x" -> "5").
_PACK_OF_RE = re.compile(r'pack(?:\s*of)?\s*(\d{1,3})(?!\d)')
_COUNT_X_RE = re.compile(r'(?<![\d.])(\d{1,3})\s*x\b')


def extract_pack_qty(x):
    """Return the pack quantity mentioned in ``x`` as a float, or 1.0 if none is found.

    ``x`` is converted with ``str()`` and lower-cased, so any value (including
    None/NaN) is accepted. Two patterns are tried in order:

    1. ``pack 12`` / ``pack of 12``
    2. ``12 x`` / ``12x`` (the ``x`` must end a word)

    Only whole numbers of one to three digits count; a longer number is
    treated as "no quantity" rather than being truncated to its first
    three digits.
    """
    s = str(x).lower()
    for pattern in (_PACK_OF_RE, _COUNT_X_RE):
        m = pattern.search(s)
        if m:
            return float(m.group(1))
    return 1.0

# Detect unit numbers and normalize to base ml/g
# Multiplier from each recognised unit spelling to the base unit
# (millilitres for volume, grams for weight).
_UNIT_FACTORS = {
    "ml": 1.0,
    "l": 1000.0, "liter": 1000.0, "liters": 1000.0,
    "litre": 1000.0, "litres": 1000.0,
    "g": 1.0, "gram": 1.0, "grams": 1.0,
    "kg": 1000.0, "kgs": 1000.0,
    "oz": 28.3495,
}
_UNIT_SIZE_RE = re.compile(
    r'(\d+(?:\.\d+)?)\s*(ml|l|liters?|litres?|g|kgs?|grams?|oz)\b'
)


def extract_unit_size(text):
    """Return the first "<number> <unit>" size found in ``text``, normalised.

    Volumes are returned in millilitres and weights in grams (litres and
    kilograms are multiplied by 1000, ounces by 28.3495). Both share one
    numeric scale, so the result does not say which kind of unit matched.
    Plural spellings ("liters", "litres", "kgs", "grams") are recognised in
    addition to the singular forms.

    ``text`` is converted with ``str()`` and lower-cased, so any value is
    accepted. Returns 0.0 when no size is found.
    """
    m = _UNIT_SIZE_RE.search(str(text).lower())
    if m is None:
        return 0.0
    return float(m.group(1)) * _UNIT_FACTORS[m.group(2)]

def has_offer_words(text):
    """Return 1 if the lower-cased text contains any promotional keyword, else 0.

    Matching is by plain substring, so a keyword embedded in a longer word
    (e.g. "deal" inside "ideal") also counts.
    """
    lowered = str(text).lower()
    keywords = ("offer", "discount", "save", "deal", "pack of", "combo", "bundle")
    return 1 if any(kw in lowered for kw in keywords) else 0

# -------------------- LOAD DATA --------------------
# Validate the input paths with explicit exceptions: `assert` statements are
# removed when Python runs with -O, which would silently skip these checks.
if not os.path.exists(TRAIN_PATH):
    raise FileNotFoundError(f"Train not found at {TRAIN_PATH}")
if not os.path.exists(TEST_PATH):
    raise FileNotFoundError(f"Test not found at {TEST_PATH}")
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
print("Loaded train", train.shape, "test", test.shape)

# Compose text
def _join_text_columns(df, columns):
    """Join ``columns`` of ``df`` row-wise with single spaces; missing cells become ""."""
    cells = df[columns]
    # Blank out cells that were missing in the ORIGINAL frame: casting NaN with
    # astype(str) yields the literal "nan", which a later fillna("") cannot undo.
    return cells.astype(str).where(cells.notna(), "").agg(" ".join, axis=1)


text_candidates = [c for c in ["catalog_content","product_name","title","description"] if c in train.columns]
if not text_candidates:
    # Fall back to the first object-dtype column that is not an id, image link or the target.
    for c in train.columns:
        if train[c].dtype == object and c not in ["sample_id","image_link","price"]:
            text_candidates.append(c); break
if not text_candidates:
    raise ValueError("No text column found in train data")
print("Text columns used:", text_candidates)
train["text_full"] = _join_text_columns(train, text_candidates)
test["text_full"] = _join_text_columns(test, text_candidates)

# Basic numeric features, added in place to both frames
for df in (train, test):
    full_text = df["text_full"]
    df["text_len"] = full_text.str.len().fillna(0).astype(int)
    # whitespace-separated word count; non-list results (missing text) count as 0
    df["word_count"] = full_text.str.split().apply(lambda parts: len(parts) if isinstance(parts, list) else 0)
    df["pack_qty"] = full_text.apply(extract_pack_qty)
    df["digit_count"] = full_text.str.count(r'\d').fillna(0).astype(int)
    df["unit_size"] = full_text.apply(extract_unit_size)
    df["has_offer"] = full_text.apply(has_offer_words)
    # punctuation ratio: characters that are neither word chars nor whitespace, per (word_count + 1)
    punct_chars = full_text.str.count(r'[^\w\s]').fillna(0)
    df["punct_ratio"] = punct_chars / (df["word_count"]+1)

num_cols = ["text_len","word_count","pack_qty","digit_count","unit_size","has_offer","punct_ratio"]
print("Numeric features:", num_cols)

# -------------------- TARGET PROCESSING --------------------
# Clip the price to its [0.1%, 99%] quantile range (lower bound never below 0),
# then model it on a log1p scale.
high_clip = train["price"].quantile(0.99)
low_clip = max(0.0, train["price"].quantile(0.001))
print("Clipping target to: ", low_clip, high_clip)
train["price_clipped"] = train["price"].clip(low_clip, high_clip)
train["y_log"] = np.log1p(train["price_clipped"].to_numpy())

# -------------------- TEXT TOKEN FREQUENCY & TOP TOKENS (for OOF target-encoding) ------------
# simple unigram tokenization: lower-cased runs of two or more letters/digits
all_tokens = [
    tok
    for doc in train["text_full"].astype(str).values
    for tok in re.findall(r'\b[a-z0-9]{2,}\b', doc.lower())
]
unigram_counts = Counter(all_tokens)
# keep the 200 most frequent unigrams, then the first TOP_TOKENS of those
top_unigrams = [tok for tok, _ in unigram_counts.most_common(200)]
top_tokens = top_unigrams[:TOP_TOKENS]
print("Top tokens (sample):", top_tokens[:10])

# Build presence matrix for top tokens (sparse boolean per row)
def token_presence(df, tokens):
    """Build a 0/1 matrix telling which of ``tokens`` occur in each row's text.

    Reads ``df["text_full"]``, lower-cases it and splits it into runs of two
    or more letters/digits. Returns a uint8 array of shape
    ``(len(df), len(tokens))`` where cell ``[i, j]`` is 1 when ``tokens[j]``
    is among the tokens of row ``i``.
    """
    word_re = re.compile(r'\b[a-z0-9]{2,}\b')
    texts = df["text_full"].astype(str).values
    presence = np.zeros((len(df), len(tokens)), dtype=np.uint8)
    for row, text in enumerate(texts):
        found = set(word_re.findall(text.lower()))
        presence[row] = [tok in found for tok in tokens]
    return presence

# 0/1 presence of each top token per row, for train and test (columns follow top_tokens order)
train_tok_pres = token_presence(train, top_tokens)
test_tok_pres  = token_presence(test, top_tokens)

# -------------------- Text Embeddings (mpnet) --------------------
print("Loading MPNet and computing embeddings...")
# Sentence encoder placed on `device`; encode_texts() below reads this module-level object.
st_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2', device=device)
def encode_texts(texts, batch_size=EMBED_BATCH):
    """Encode ``texts`` with the module-level ``st_model`` and stack the results.

    ``texts`` is walked in chunks of ``batch_size`` (with a tqdm progress bar)
    and the per-chunk arrays are stacked with ``np.vstack``, giving one
    embedding row per input text in the original order.

    ``batch_size`` is also forwarded to ``st_model.encode``. Without it the
    encoder falls back to its own default batch size for each chunk, so the
    value passed here would not control how many texts go through the model
    per forward pass.

    Raises ValueError (from ``np.vstack``) when ``texts`` is empty.
    """
    embs = []
    for i in tqdm(range(0,len(texts),batch_size), desc="encode"):
        batch = texts[i:i+batch_size]
        e = st_model.encode(batch, batch_size=batch_size, convert_to_numpy=True,
                            show_progress_bar=False, device=device)
        embs.append(e)
    return np.vstack(embs)

# Embed the full text of every train and test row, timing the whole step.
t0=time.time()
text_emb_train = encode_texts(train["text_full"].tolist(), batch_size=EMBED_BATCH)
text_emb_test  = encode_texts(test["text_full"].tolist(), batch_size=EMBED_BATCH)
# elapsed wall-clock time is reported in minutes
print("Emb shapes:", text_emb_train.shape, text_emb_test.shape, "time (min):", (time.time()-t0)/60)

# -------------------- SVD reduce --------------------
print("Applying TruncatedSVD ->", SVD_DIM)
svd = TruncatedSVD(n_components=SVD_DIM, random_state=RANDOM_STATE)
# Stack train + test embeddings once and reuse the stacked matrix for both fit
# and transform, instead of building the same matrix twice with np.vstack.
n_train_emb = len(text_emb_train)
all_emb = np.vstack([text_emb_train, text_emb_test])
svd.fit(all_emb)
all_red = svd.transform(all_emb)
# rows keep the stacking order: train first, then test
t_emb_train = all_red[:n_train_emb]
t_emb_test  = all_red[n_train_emb:]
del text_emb_train, text_emb_test, all_emb, all_red
gc.collect()

# -------------------- TF-IDF features --------------------
print("TF-IDF")
# word uni- and bi-grams seen in at least 3 documents, capped at 30000 features
tfidf = TfidfVectorizer(max_features=30000, ngram_range=(1,2), min_df=3)
# The vocabulary and IDF weights are fitted on train and test text together (no target involved).
all_texts = pd.concat([train["text_full"], test["text_full"]], axis=0)
X_tfidf_all = tfidf.fit_transform(all_texts)
# split back by position: the first len(train) rows are the training rows
X_tfidf_train = X_tfidf_all[:len(train)]
X_tfidf_test  = X_tfidf_all[len(train):]
del X_tfidf_all; gc.collect()

# -------------------- Numeric + engineered features assembly --------------------
# Column order: SVD-reduced text embedding first, then the hand-made numeric features.
# The OOF target-encoding columns for the top tokens are appended in a later step.
base_train_dense = np.concatenate([t_emb_train, train[num_cols].to_numpy()], axis=1)
base_test_dense  = np.concatenate([t_emb_test,  test[num_cols].to_numpy()], axis=1)
print("Base dense shapes:", base_train_dense.shape, base_test_dense.shape)

# -------------------- OOF target encoding for top tokens --------------------
# For every fold, the mean log-price of rows containing each top token is computed
# on the training part only, then written to the held-out rows (so no row sees its
# own target). Test rows receive the average of the per-fold means.
# A token that is absent from a row is encoded as 0.
print("Computing OOF target encoding for top tokens...")
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=RANDOM_STATE)
oof_token_means = np.zeros((len(train), len(top_tokens)))
test_token_means = np.zeros((len(test), len(top_tokens)))

y = train["y_log"].values
for fold, (tr_idx, val_idx) in enumerate(kf.split(train)):
    print("Token OOF fold", fold+1)
    # compute token means on tr_idx
    pres_tr = train_tok_pres[tr_idx]
    y_tr = y[tr_idx]  # gathered once per fold rather than once per token
    token_sums = np.zeros(len(top_tokens))
    token_counts = np.zeros(len(top_tokens))
    for j in range(len(top_tokens)):
        mask = pres_tr[:,j]==1
        if mask.any():
            token_sums[j] = y_tr[mask].sum()
            token_counts[j] = mask.sum()
    # compute means (tokens never seen in this training part get 0)
    token_means = np.where(token_counts>0, token_sums / (token_counts + 1e-12), 0.0)
    # fill all validation rows at once: token mean where present, 0 where absent
    pres_val = train_tok_pres[val_idx]
    oof_token_means[val_idx] = token_means * pres_val
    # accumulate test (average later)
    pres_test = test_tok_pres
    test_token_means += (token_means * pres_test)

# average test accumulations
test_token_means /= NFOLDS

# append token-encoding to dense matrices (as the last len(top_tokens) columns)
X_train_dense = np.hstack([base_train_dense, oof_token_means])
X_test_dense  = np.hstack([base_test_dense,  test_token_means])

# the pieces are no longer needed once stacked; drop the names to free memory
del base_train_dense, base_test_dense, oof_token_means, test_token_means
gc.collect()
print("Final dense shapes:", X_train_dense.shape, X_test_dense.shape)

# -------------------- Scale dense --------------------
# Standardise every dense column using statistics from the training rows only.
# NOTE(review): the only visible consumer is LightGBM, whose splits are
# threshold-based, so this scaling is presumably not required - confirm before removing.
scaler = StandardScaler()
X_train_dense = scaler.fit_transform(X_train_dense)
X_test_dense  = scaler.transform(X_test_dense)

# -------------------- LightGBM training (improved params) --------------------
# Target is log1p of the CLIPPED price (train["y_log"] is derived from "price_clipped").
y_log = train["y_log"].values
# Splitter seeded with RANDOM_STATE; it is reused by the model loops below.
kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=RANDOM_STATE)

# TF-IDF LGBM
print("Training TF-IDF LightGBM")
# oof_tfidf: held-out prediction per train row; preds_tfidf: fold-averaged test prediction
oof_tfidf = np.zeros(len(train)); preds_tfidf = np.zeros(len(test))
params_tfidf = {
    "objective":"regression","learning_rate":0.03,"num_leaves":128,
    "feature_fraction":0.7,"bagging_fraction":0.8,"bagging_freq":5,
    "lambda_l2":2.0,"metric":"rmse","verbose":-1,"num_threads":8
}
for fold,(tr, val) in enumerate(kf.split(X_tfidf_train)):
    print("TFIDF fold", fold+1)
    dtr = lgb.Dataset(X_tfidf_train[tr], label=y_log[tr])
    dval = lgb.Dataset(X_tfidf_train[val], label=y_log[val])
    # At most TFIDF_ROUNDS rounds; stop when validation RMSE has not improved for 80 rounds.
    m = lgb.train(params_tfidf, dtr, num_boost_round=TFIDF_ROUNDS, valid_sets=[dval],
                  callbacks=[lgb.early_stopping(stopping_rounds=80), lgb.log_evaluation(200)])
    # Predict with the best iteration found by early stopping.
    # NOTE(review): the same fold both picks the stopping point and is scored, so the
    # OOF score is presumably slightly optimistic.
    oof_tfidf[val] = m.predict(X_tfidf_train[val], num_iteration=m.best_iteration)
    # each fold contributes 1/NFOLDS of the final test prediction
    preds_tfidf += m.predict(X_tfidf_test, num_iteration=m.best_iteration) / NFOLDS
    del dtr,dval,m; gc.collect()

# SMAPE on the price scale; the reference is expm1(y_log), i.e. the clipped price, not the raw one.
print("TF-IDF OOF SMAPE:", compute_smape(np.expm1(y_log), np.expm1(oof_tfidf)))

# Dense LGBM
# Same out-of-fold scheme as the TF-IDF model, on the scaled dense matrix
# (SVD text embedding + numeric features + token target encoding).
print("Training Dense LightGBM")
# oof_dense: held-out prediction per train row; preds_dense: fold-averaged test prediction
oof_dense = np.zeros(len(train)); preds_dense = np.zeros(len(test))
params_dense = {
    "objective":"regression","learning_rate":0.025,"num_leaves":256,
    "feature_fraction":0.6,"bagging_fraction":0.7,"bagging_freq":5,
    "lambda_l2":3.0,"metric":"rmse","verbose":-1,"num_threads":8
}
for fold,(tr,val) in enumerate(kf.split(X_train_dense)):
    print("Dense fold", fold+1)
    dtr = lgb.Dataset(X_train_dense[tr], label=y_log[tr])
    dval = lgb.Dataset(X_train_dense[val], label=y_log[val])
    # At most DENSE_ROUNDS rounds; stop when validation RMSE has not improved for 100 rounds.
    m = lgb.train(params_dense, dtr, num_boost_round=DENSE_ROUNDS, valid_sets=[dval],
                  callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(200)])
    oof_dense[val] = m.predict(X_train_dense[val], num_iteration=m.best_iteration)
    # each fold contributes 1/NFOLDS of the final test prediction
    preds_dense += m.predict(X_test_dense, num_iteration=m.best_iteration) / NFOLDS
    del dtr,dval,m; gc.collect()

# SMAPE on the price scale, measured against expm1(y_log) (the clipped price).
print("Dense OOF SMAPE:", compute_smape(np.expm1(y_log), np.expm1(oof_dense)))

# Optional: DeBERTa stacking (strong mode) - disabled by default to save time
# Placeholder only: both arrays stay all-zero. Even with MODE == "strong" the code
# below just prints a message; no fine-tuning is implemented in this cell.
oof_deberta = np.zeros(len(train)); preds_deberta = np.zeros(len(test))
if MODE=="strong":
    print("DeBERTa fine-tune (optional) - not included in fast runs")

# -------------------- Meta stacking with Ridge --------------------
print("Stacking with Ridge")
# One column per base model: TF-IDF LGBM, dense LGBM, DeBERTa (the last is all zeros here).
stack_X = np.vstack([oof_tfidf, oof_dense, oof_deberta]).T
stack_test = np.vstack([preds_tfidf, preds_dense, preds_deberta]).T
meta = Ridge(alpha=1.0)
# The meta-model is fitted on the base models' out-of-fold predictions, in log1p space.
meta.fit(stack_X, y_log)
# NOTE: oof_meta is predicted on the same rows the Ridge was fitted on, so the
# "Stacked OOF" score below is in-sample with respect to the meta-model.
oof_meta = meta.predict(stack_X)
test_meta = meta.predict(stack_test)
final_smape = compute_smape(np.expm1(y_log), np.expm1(oof_meta))
print("Stacked OOF SMAPE (log->exp):", final_smape)

# -------------------- Save CSV --------------------
# Back-transform from log1p space to prices.
final_preds = np.expm1(test_meta)
# The Ridge meta-model is linear and unconstrained, so it can output a log-price
# <= 0, i.e. a price <= 0. Such a value is not a meaningful price and always
# scores the maximum SMAPE term (200%) against a positive true price, so floor
# the predictions at a small positive value.
final_preds = np.maximum(final_preds, 0.01)
# Use "sample_id" as the key when present, otherwise the first column of the test file.
id_col = "sample_id" if "sample_id" in test.columns else test.columns[0]
submission = pd.DataFrame({id_col: test[id_col], "price": final_preds})
submission.to_csv(OUT_NAME, index=False)
print("Saved submission:", OUT_NAME)

# Print component SMAPEs (price scale, measured against expm1(y_log))
for label, oof_log in (("TFIDF SMAPE:", oof_tfidf), ("Dense SMAPE:", oof_dense)):
    print(label, compute_smape(np.expm1(y_log), np.expm1(oof_log)))
print("Stacked SMAPE:", final_smape)


Device: cuda Mode: fast
Loaded train (75000, 4) test (75000, 3)
Text columns used: ['catalog_content']
Numeric features: ['text_len', 'word_count', 'pack_qty', 'digit_count', 'unit_size', 'has_offer', 'punct_ratio']
Clipping target to:  0.659995 145.25029999999984
Top tokens (sample): ['and', 'point', 'bullet', 'the', 'of', 'to', 'for', 'with', 'in', 'is']
Loading MPNet and computing embeddings...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

encode:   0%|          | 0/1172 [00:00<?, ?it/s]

encode:   0%|          | 0/1172 [00:00<?, ?it/s]

Emb shapes: (75000, 768) (75000, 768) time (min): 9.920968655745188
Applying TruncatedSVD -> 320
TF-IDF
Base dense shapes: (75000, 327) (75000, 327)
Computing OOF target encoding for top tokens...
Token OOF fold 1
Token OOF fold 2
Token OOF fold 3
Token OOF fold 4
Token OOF fold 5
Final dense shapes: (75000, 377) (75000, 377)
Training TF-IDF LightGBM
TFIDF fold 1
Training until validation scores don't improve for 80 rounds
[200]	valid_0's rmse: 0.69352
[400]	valid_0's rmse: 0.679659
[600]	valid_0's rmse: 0.675995
[800]	valid_0's rmse: 0.674869
Early stopping, best iteration is:
[850]	valid_0's rmse: 0.674534
TFIDF fold 2
Training until validation scores don't improve for 80 rounds
[200]	valid_0's rmse: 0.679321
[400]	valid_0's rmse: 0.666644
[600]	valid_0's rmse: 0.664087
[800]	valid_0's rmse: 0.663949
Early stopping, best iteration is:
[740]	valid_0's rmse: 0.663625
TFIDF fold 3
Training until validation scores don't improve for 80 rounds
[200]	valid_0's rmse: 0.680395
[400]	valid_0's