In [1]:
import os
import math
import random
import json
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup,
)

In [2]:
# --------------------------- Config ---------------------------
CSV_PATH        = os.environ.get("TRAIN_CSV", "jl_fs/train.csv")   # must contain text + price
TEXT_COL        = os.environ.get("TEXT_COL", "catalog_content")         # e.g., "text" or "catalog_content"
PRICE_COL       = os.environ.get("PRICE_COL", "price")

MODEL_ID        = os.environ.get("MODEL_ID", "distilbert/distilbert-base-uncased-finetuned-sst-2-english")
OUTPUT_DIR      = os.environ.get("OUTPUT_DIR", "price_distilbert_contrastive")

SEED            = 42
VAL_FRAC        = 0.1                   # 50/50 split as requested
MAX_LEN         = 192                   # adjust if texts are long
BATCH_SIZE      = 16
LR              = 3e-5
WEIGHT_DECAY    = 0.01
EPOCHS          = 5
WARMUP_RATIO    = 0.06
GRAD_ACCUM      = 1
MAX_GRAD_NORM   = 1.0
FP16            = True                  # use AMP for speed


In [3]:
ALPHA_CONTRAST  = 0.25                  # weight for contrastive loss (0..1). 0.25 is a good start.
TAU             = 0.05                  # contrastive temperature

# Augmentations
WORD_MASK_P     = 0.08                  # probability to randomly mask an input token (2nd view only)
DROPOUT_PROB    = 0.1                   # Dropout already inside the transformer; can adjust here in heads.

EARLY_STOP_ROUNDS = 3                   # stop if val SMAPE hasn't improved for these epochs

# Price clipping for log transform
MIN_PRICE       = 1e-6

In [4]:
def set_seed(seed: int = SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

def smape_np(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return 100.0 * np.mean(np.abs(y_pred - y_true) / denom)

def log2_price(p: np.ndarray) -> np.ndarray:
    return np.log2(np.clip(p, MIN_PRICE, None))

def delog2(x: np.ndarray) -> np.ndarray:
    return np.power(2.0, x)

def split_train_val(df: pd.DataFrame, frac_val: float = VAL_FRAC, seed: int = SEED):
    df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    n_val = int(len(df) * frac_val)
    df_val = df.iloc[:n_val].reset_index(drop=True)
    df_tr  = df.iloc[n_val:].reset_index(drop=True)
    return df_tr, df_val

class PriceTextDataset(Dataset):
    def __init__(self, texts: List[str], prices_log2: Optional[np.ndarray], tokenizer, max_len: int, training: bool):
        self.texts = texts
        self.prices_log2 = prices_log2  # None in pure inference; here we train/val so not None
        self.tok = tokenizer
        self.max_len = max_len
        self.training = training

    def _tokenize(self, text: str):
        return self.tok(
            text,
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors="pt"
        )

    def _random_word_mask(self, input_ids: torch.Tensor, mask_token_id: int, prob: float) -> torch.Tensor:
        if prob <= 0.0:
            return input_ids
        ids = input_ids.clone()
        # don't mask special tokens
        special_tokens = set(self.tok.all_special_ids)
        for i in range(ids.size(0)):
            for j in range(ids.size(1)):
                if ids[i, j].item() in special_tokens:
                    continue
                if random.random() < prob:
                    ids[i, j] = mask_token_id
        return ids

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        t = self.texts[idx] if isinstance(self.texts[idx], str) else ""
        enc1 = self._tokenize(t)

        # second "view": token masking as augmentation (SimCSE-style uses dropout only; we add light word masking)
        enc2 = {k: v.clone() for k, v in enc1.items()}
        enc2["input_ids"] = self._random_word_mask(
            enc2["input_ids"], mask_token_id=self.tok.mask_token_id, prob=WORD_MASK_P
        )

        item = {
            "input_ids_1": enc1["input_ids"].squeeze(0),
            "attention_mask_1": enc1["attention_mask"].squeeze(0),
            "input_ids_2": enc2["input_ids"].squeeze(0),
            "attention_mask_2": enc2["attention_mask"].squeeze(0),
        }
        if self.prices_log2 is not None:
            item["target"] = torch.tensor(self.prices_log2[idx], dtype=torch.float32)
        return item


In [5]:
@dataclass
class Collate:
    pad_id: int
    def __call__(self, batch):
        # pad each view independently
        keys1 = ["input_ids_1", "attention_mask_1"]
        keys2 = ["input_ids_2", "attention_mask_2"]

        def pad_stack(keylist):
            maxlen = max(x[keylist[0]].size(0) for x in batch)
            out = {}
            for k in keylist:
                if "input_ids" in k:
                    pad_val = self.pad_id
                else:
                    pad_val = 0
                tensors = []
                for x in batch:
                    v = x[k]
                    if v.size(0) < maxlen:
                        pad = torch.full((maxlen - v.size(0),), pad_val, dtype=v.dtype)
                        v = torch.cat([v, pad], dim=0)
                    tensors.append(v.unsqueeze(0))
                out[k] = torch.cat(tensors, dim=0)
            return out

        out1 = pad_stack(keys1)
        out2 = pad_stack(keys2)

        res = {**out1, **out2}
        if "target" in batch[0]:
            res["target"] = torch.stack([x["target"] for x in batch], dim=0)
        return res

class DistilBertPriceModel(nn.Module):
    def __init__(self, model_id: str, proj_dim: int = 256, dropout: float = DROPOUT_PROB):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_id)  # DistilBERT
        hidden = self.backbone.config.dim

        # Regression head
        self.regressor = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1)
        )

        # Projection head for contrastive learning
        self.proj = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.Linear(hidden, proj_dim)
        )

        # Optional: layerwise LR decay could be added by parameter groups

    def forward_once(self, input_ids, attention_mask):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # DistilBERT: last_hidden_state; use first token as pooled (CLS analog at position 0)
        cls = out.last_hidden_state[:, 0, :]  # (B, hidden)
        yhat = self.regressor(cls).squeeze(-1)  # (B,)
        z = self.proj(cls)  # (B, proj_dim)
        return yhat, z

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2):
        y1, z1 = self.forward_once(input_ids_1, attention_mask_1)
        y2, z2 = self.forward_once(input_ids_2, attention_mask_2)
        return (y1 + y2) / 2.0, z1, z2  # average predictions from two views for stability

def info_nce(z1: torch.Tensor, z2: torch.Tensor, tau: float = TAU) -> torch.Tensor:
    # Normalize
    z1 = F.normalize(z1, dim=-1)
    z2 = F.normalize(z2, dim=-1)

    logits = torch.matmul(z1, z2.t()) / tau  # (B, B)
    labels = torch.arange(z1.size(0), device=z1.device)
    loss1 = F.cross_entropy(logits, labels)
    loss2 = F.cross_entropy(logits.t(), labels)
    return (loss1 + loss2) * 0.5

def huber_loss(pred, target, delta=1.0):
    return F.huber_loss(pred, target, delta=delta)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [6]:
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"🔧 Loading CSV: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)

if TEXT_COL not in df.columns:
    raise ValueError(f"TEXT_COL '{TEXT_COL}' not in CSV columns={df.columns.tolist()}")
if PRICE_COL not in df.columns:
    raise ValueError(f"PRICE_COL '{PRICE_COL}' not in CSV columns={df.columns.tolist()}")

# Clean/guard text
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str).str.strip()
# Filter rows with non-positive or missing price
df = df.loc[pd.to_numeric(df[PRICE_COL], errors="coerce").notnull()].copy()
df[PRICE_COL] = df[PRICE_COL].astype(float)
df = df.loc[df[PRICE_COL] >= 0.0].reset_index(drop=True)

# 50/50 split
df_tr, df_va = split_train_val(df, frac_val=VAL_FRAC, seed=SEED)
print(f"📊 Split: train={len(df_tr)} | valid={len(df_va)}")

y_tr_log = log2_price(df_tr[PRICE_COL].values)
y_va_log = log2_price(df_va[PRICE_COL].values)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.mask_token is None:
    # For uncased DistilBERT, mask token should exist; fallback guard
    tokenizer.add_special_tokens({"mask_token": "[MASK]"})

train_ds = PriceTextDataset(
    texts=df_tr[TEXT_COL].tolist(),
    prices_log2=y_tr_log,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    training=True
)
val_ds = PriceTextDataset(
    texts=df_va[TEXT_COL].tolist(),
    prices_log2=y_va_log,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    training=False
)

collate = Collate(pad_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🖥️ Device: {device}")

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, collate_fn=collate)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True, collate_fn=collate)

model = DistilBertPriceModel(MODEL_ID).to(device)
# If we added a new mask token or changed tokenizer vocab size, resize embeddings
if tokenizer.vocab_size != model.backbone.get_input_embeddings().weight.size(0):
    model.backbone.resize_token_embeddings(len(tokenizer))

print(f"🧮 Trainable params: {count_parameters(model):,}")

# Optimizer & scheduler
no_decay = ["bias", "LayerNorm.weight"]
grouped = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     "weight_decay": WEIGHT_DECAY},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=LR)

num_training_steps = EPOCHS * math.ceil(len(train_loader) / GRAD_ACCUM)
num_warmup = int(num_training_steps * WARMUP_RATIO)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup, num_training_steps=num_training_steps
)

scaler = torch.cuda.amp.GradScaler(enabled=FP16)

best_smape = float("inf")
best_path = os.path.join(OUTPUT_DIR, "best_90-10.pt")
patience = 0

for epoch in range(1, EPOCHS + 1):
    model.train()
    train_loss_running = 0.0
    reg_loss_running = 0.0
    con_loss_running = 0.0

    for step, batch in enumerate(train_loader, 1):
        batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}

        with torch.cuda.amp.autocast(enabled=FP16):
            yhat, z1, z2 = model(
                input_ids_1=batch["input_ids_1"],
                attention_mask_1=batch["attention_mask_1"],
                input_ids_2=batch["input_ids_2"],
                attention_mask_2=batch["attention_mask_2"],
            )
            loss_reg = huber_loss(yhat, batch["target"])
            loss_con = info_nce(z1, z2, tau=TAU)
            loss = (1.0 - ALPHA_CONTRAST) * loss_reg + ALPHA_CONTRAST * loss_con

        scaler.scale(loss).backward()

        if step % GRAD_ACCUM == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        train_loss_running += loss.item()
        reg_loss_running += loss_reg.item()
        con_loss_running += loss_con.item()

        if step % 300 == 0:
            print(f"epoch {epoch} step {step}/{len(train_loader)} "
                  f"loss={train_loss_running/step:.4f} reg={reg_loss_running/step:.4f} con={con_loss_running/step:.4f}")

    # ------------------ Validation ------------------
    model.eval()
    preds_log = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
            yhat, _, _ = model(
                input_ids_1=batch["input_ids_1"],
                attention_mask_1=batch["attention_mask_1"],
                input_ids_2=batch["input_ids_2"],
                attention_mask_2=batch["attention_mask_2"],
            )
            preds_log.append(yhat.detach().float().cpu().numpy())

    preds_log = np.concatenate(preds_log, axis=0)
    # de-log for SMAPE
    va_preds = delog2(preds_log)
    va_true  = delog2(y_va_log)
    smape = smape_np(va_true, va_preds)
    print(f"✅ Epoch {epoch}: VAL SMAPE = {smape:.3f}%")

    # Save best
    if smape < best_smape - 1e-6:
        best_smape = smape
        patience = 0
        torch.save(
            {
                "model_state": model.state_dict(),
                "tokenizer": MODEL_ID,
                "config": {
                    "ALPHA_CONTRAST": ALPHA_CONTRAST,
                    "TAU": TAU,
                    "MAX_LEN": MAX_LEN,
                },
            },
            best_path
        )
        print(f"💾 Saved new best to {best_path}")
    else:
        patience += 1
        print(f"⏸️ No improvement. Patience {patience}/{EARLY_STOP_ROUNDS}")
        if patience >= EARLY_STOP_ROUNDS:
            print("🛑 Early stopping triggered.")
            break

print(f"🏁 Best VAL SMAPE: {best_smape:.3f}% | Checkpoint: {best_path}")

# Save final artifacts
with open(os.path.join(OUTPUT_DIR, "metrics1.json"), "w") as f:
    json.dump({"best_val_smape": best_smape}, f, indent=2)

🔧 Loading CSV: jl_fs/train.csv
📊 Split: train=67500 | valid=7500


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

🖥️ Device: cuda


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

🧮 Trainable params: 67,741,697


  scaler = torch.cuda.amp.GradScaler(enabled=FP16)
  with torch.cuda.amp.autocast(enabled=FP16):


epoch 1 step 300/4219 loss=1.5448 reg=1.9364 con=0.3700
epoch 1 step 600/4219 loss=1.0299 reg=1.3091 con=0.1921
epoch 1 step 900/4219 loss=0.8408 reg=1.0770 con=0.1324
epoch 1 step 1200/4219 loss=0.7376 reg=0.9495 con=0.1018
epoch 1 step 1500/4219 loss=0.6721 reg=0.8684 con=0.0832
epoch 1 step 1800/4219 loss=0.6262 reg=0.8114 con=0.0708
epoch 1 step 2100/4219 loss=0.5933 reg=0.7705 con=0.0617
epoch 1 step 2400/4219 loss=0.5657 reg=0.7361 con=0.0547
epoch 1 step 2700/4219 loss=0.5437 reg=0.7086 con=0.0490
epoch 1 step 3000/4219 loss=0.5252 reg=0.6854 con=0.0446
epoch 1 step 3300/4219 loss=0.5102 reg=0.6665 con=0.0412
epoch 1 step 3600/4219 loss=0.4976 reg=0.6508 con=0.0381
epoch 1 step 3900/4219 loss=0.4862 reg=0.6365 con=0.0355
epoch 1 step 4200/4219 loss=0.4755 reg=0.6230 con=0.0332
✅ Epoch 1: VAL SMAPE = 51.493%
💾 Saved new best to price_distilbert_contrastive/best_90-10.pt
epoch 2 step 300/4219 loss=0.2990 reg=0.3977 con=0.0031
epoch 2 step 600/4219 loss=0.2992 reg=0.3976 con=0.0037