In [12]:
# full_train_then_test.py
# DistilBERT + contrastive + Huber regression
# Trains on 100% of TRAIN_CSV (no validation), then predicts on TEST_CSV and writes predictions.csv

import os, math, random, json
from dataclasses import dataclass
from typing import List, Optional

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, get_linear_schedule_with_warmup

# --------------------------- Config ---------------------------
# paths/columns (override via env if needed)
TRAIN_CSV       = os.environ.get("TRAIN_CSV", "jl_fs/train.csv")
TEST_CSV        = os.environ.get("TEST_CSV", "jl_fs/test.csv")  # file with sample_id, catalog_content
ID_COL          = os.environ.get("ID_COL", "sample_id")
TEXT_COL        = os.environ.get("TEXT_COL", "catalog_content")
PRICE_COL       = os.environ.get("PRICE_COL", "price")

MODEL_ID        = os.environ.get("MODEL_ID", "distilbert-base-uncased")
OUTPUT_DIR      = os.environ.get("OUTPUT_DIR", "price_distilbert_contrastive_full")

EPOCHS          = int(os.environ.get("EPOCHS", "3"))
MAX_LEN         = int(os.environ.get("MAX_LEN", "192"))
BATCH_SIZE      = int(os.environ.get("BATCH_SIZE", "16"))
LR              = float(os.environ.get("LR", "3e-5"))
WEIGHT_DECAY    = float(os.environ.get("WEIGHT_DECAY", "0.01"))
WARMUP_RATIO    = float(os.environ.get("WARMUP_RATIO", "0.06"))
GRAD_ACCUM      = int(os.environ.get("GRAD_ACCUM", "1"))
MAX_GRAD_NORM   = float(os.environ.get("MAX_GRAD_NORM", "1.0"))
FP16            = os.environ.get("FP16", "true").lower() != "false"

# hybrid loss mixing
ALPHA_CONTRAST  = float(os.environ.get("ALPHA_CONTRAST", "0.25"))  # 0..1
TAU             = float(os.environ.get("TAU", "0.05"))

# light augmentation for 2nd view
WORD_MASK_P     = float(os.environ.get("WORD_MASK_P", "0.08"))
DROPOUT_PROB    = float(os.environ.get("DROPOUT_PROB", "0.1"))

SEED            = int(os.environ.get("SEED", "42"))
MIN_PRICE       = float(os.environ.get("MIN_PRICE", "1e-6"))

FINAL_CKPT_NAME = "final_full.pt"
PRED_CSV_NAME   = "predictions.csv"
# --------------------------------------------------------------


In [7]:

def set_seed(seed: int):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(SEED)

def smape_np(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return 100.0 * np.mean(np.abs(y_pred - y_true) / denom)

def log2_price(p: np.ndarray) -> np.ndarray:
    return np.log2(np.clip(p, MIN_PRICE, None))

def delog2(x: np.ndarray) -> np.ndarray:
    return np.power(2.0, x)

class PriceTextDataset(Dataset):
    def __init__(self, texts: List[str], prices_log2: Optional[np.ndarray], tokenizer, max_len: int):
        self.texts = texts
        self.prices_log2 = prices_log2
        self.tok = tokenizer
        self.max_len = max_len

    def _tokenize(self, text: str):
        return self.tok(
            text if isinstance(text, str) else "",
            truncation=True,
            max_length=self.max_len,
            padding=False,
            return_tensors="pt"
        )

    def _random_word_mask(self, input_ids: torch.Tensor, mask_token_id: int, prob: float) -> torch.Tensor:
        if prob <= 0.0:
            return input_ids
        ids = input_ids.clone()
        special = set(self.tok.all_special_ids)
        for i in range(ids.size(0)):
            for j in range(ids.size(1)):
                if ids[i, j].item() in special:
                    continue
                if random.random() < prob:
                    ids[i, j] = mask_token_id
        return ids

    def __len__(self): return len(self.texts)

    def __getitem__(self, idx):
        enc1 = self._tokenize(self.texts[idx])
        enc2 = {k: v.clone() for k, v in enc1.items()}
        enc2["input_ids"] = self._random_word_mask(enc2["input_ids"], self.tok.mask_token_id, WORD_MASK_P)
        item = {
            "input_ids_1": enc1["input_ids"].squeeze(0),
            "attention_mask_1": enc1["attention_mask"].squeeze(0),
            "input_ids_2": enc2["input_ids"].squeeze(0),
            "attention_mask_2": enc2["attention_mask"].squeeze(0),
        }
        if self.prices_log2 is not None:
            item["target"] = torch.tensor(self.prices_log2[idx], dtype=torch.float32)
        return item

@dataclass
class Collate:
    pad_id: int
    def __call__(self, batch):
        keys1 = ["input_ids_1", "attention_mask_1"]
        keys2 = ["input_ids_2", "attention_mask_2"]

        def pad_stack(keylist):
            maxlen = max(x[keylist[0]].size(0) for x in batch)
            out = {}
            for k in keylist:
                pad_val = self.pad_id if "input_ids" in k else 0
                tensors = []
                for x in batch:
                    v = x[k]
                    if v.size(0) < maxlen:
                        pad = torch.full((maxlen - v.size(0),), pad_val, dtype=v.dtype)
                        v = torch.cat([v, pad], dim=0)
                    tensors.append(v.unsqueeze(0))
                out[k] = torch.cat(tensors, dim=0)
            return out

        out1 = pad_stack(keys1)
        out2 = pad_stack(keys2)
        res = {**out1, **out2}
        if "target" in batch[0]:
            res["target"] = torch.stack([x["target"] for x in batch], dim=0)
        return res

class DistilBertPriceModel(nn.Module):
    def __init__(self, model_id: str, proj_dim: int = 256, dropout: float = DROPOUT_PROB):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_id)
        hidden = self.backbone.config.dim
        self.regressor = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1)
        )
        self.proj = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.Linear(hidden, proj_dim)
        )

    def forward_once(self, input_ids, attention_mask):
        out = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        cls = out.last_hidden_state[:, 0, :]
        yhat = self.regressor(cls).squeeze(-1)
        z = self.proj(cls)
        return yhat, z

    def forward(self, input_ids_1, attention_mask_1, input_ids_2, attention_mask_2):
        y1, z1 = self.forward_once(input_ids_1, attention_mask_1)
        y2, z2 = self.forward_once(input_ids_2, attention_mask_2)
        return (y1 + y2) / 2.0, z1, z2

def info_nce(z1: torch.Tensor, z2: torch.Tensor, tau: float) -> torch.Tensor:
    z1 = F.normalize(z1, dim=-1)
    z2 = F.normalize(z2, dim=-1)
    logits = torch.matmul(z1, z2.t()) / tau
    labels = torch.arange(z1.size(0), device=z1.device)
    loss1 = F.cross_entropy(logits, labels)
    loss2 = F.cross_entropy(logits.t(), labels)
    return 0.5 * (loss1 + loss2)

def huber_loss(pred, target, delta=1.0):
    return F.huber_loss(pred, target, delta=delta)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

def make_loader(texts, y_log2, tokenizer, batch_size, shuffle):
    ds = PriceTextDataset(texts=texts, prices_log2=y_log2, tokenizer=tokenizer, max_len=MAX_LEN)
    collate = Collate(pad_id=tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id)
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=2, pin_memory=True, collate_fn=collate)


In [8]:
os.makedirs(OUTPUT_DIR, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🖥️ Device: {device}")
print(f"🔧 Loading train CSV: {TRAIN_CSV}")
df = pd.read_csv(TRAIN_CSV)

# checks & cleaning
for col in [ID_COL, TEXT_COL, PRICE_COL]:
    if col not in df.columns:
        raise ValueError(f"Column '{col}' missing in {TRAIN_CSV}. Found: {df.columns.tolist()}")
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str).str.strip()
df = df.loc[pd.to_numeric(df[PRICE_COL], errors="coerce").notnull()].copy()
df[PRICE_COL] = df[PRICE_COL].astype(float)
df = df.loc[df[PRICE_COL] >= 0.0].reset_index(drop=True)

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
if tokenizer.mask_token is None:
    tokenizer.add_special_tokens({"mask_token": "[MASK]"})

model = DistilBertPriceModel(MODEL_ID).to(device)
if tokenizer.vocab_size != model.backbone.get_input_embeddings().weight.size(0):
    model.backbone.resize_token_embeddings(len(tokenizer))
print(f"🧮 Trainable params: {count_parameters(model):,}")

# train loader on 100% data
y_all_log = log2_price(df[PRICE_COL].values)
loader_all = make_loader(df[TEXT_COL].tolist(), y_all_log, tokenizer, BATCH_SIZE, shuffle=True)

# optimizer/scheduler
no_decay = ["bias", "LayerNorm.weight"]
grouped = [
    {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": WEIGHT_DECAY},
    {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=LR)
steps_per_epoch = max(1, math.ceil(len(loader_all) / GRAD_ACCUM))
num_training_steps = EPOCHS * steps_per_epoch
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(num_training_steps * WARMUP_RATIO),
    num_training_steps=num_training_steps
)
scaler = torch.cuda.amp.GradScaler(enabled=FP16)

history = []

# -------------------- TRAIN (full data) --------------------
for epoch in range(1, EPOCHS + 1):
    model.train()
    run_loss = run_reg = run_con = 0.0
    for step, batch in enumerate(loader_all, 1):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.cuda.amp.autocast(enabled=FP16):
            yhat, z1, z2 = model(
                input_ids_1=batch["input_ids_1"],
                attention_mask_1=batch["attention_mask_1"],
                input_ids_2=batch["input_ids_2"],
                attention_mask_2=batch["attention_mask_2"],
            )
            loss_reg = huber_loss(yhat, batch["target"])
            loss_con = info_nce(z1, z2, tau=TAU)
            loss = (1.0 - ALPHA_CONTRAST) * loss_reg + ALPHA_CONTRAST * loss_con
        scaler.scale(loss).backward()

        if step % GRAD_ACCUM == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        run_loss += loss.item(); run_reg += loss_reg.item(); run_con += loss_con.item()
        if step % 300 == 0:
            print(f"epoch {epoch} step {step}/{len(loader_all)} "
                  f"loss={run_loss/step:.4f} reg={run_reg/step:.4f} con={run_con/step:.4f}")

    # TRAIN SMAPE (on full data)
    model.eval()
    preds_log, tgts_log = [], []
    with torch.no_grad():
        for batch in loader_all:
            batch = {k: v.to(device) for k, v in batch.items()}
            yhat, _, _ = model(
                input_ids_1=batch["input_ids_1"],
                attention_mask_1=batch["attention_mask_1"],
                input_ids_2=batch["input_ids_2"],
                attention_mask_2=batch["attention_mask_2"],
            )
            preds_log.append(yhat.detach().float().cpu().numpy())
            tgts_log.append(batch["target"].detach().float().cpu().numpy())
    preds_log = np.concatenate(preds_log, axis=0)
    tgts_log  = np.concatenate(tgts_log, axis=0)
    train_smape = smape_np(delog2(tgts_log), delog2(preds_log))
    print(f"✅ Epoch {epoch}: TRAIN SMAPE = {train_smape:.3f}%")

    history.append({"epoch": int(epoch), "train_smape": float(train_smape)})
    with open(os.path.join(OUTPUT_DIR, "metrics_history.json"), "w") as f:
        json.dump(history, f, indent=2)

# save final checkpoint + summary
torch.save({"model_state": model.state_dict(), "tokenizer": MODEL_ID},
           os.path.join(OUTPUT_DIR, FINAL_CKPT_NAME))
with open(os.path.join(OUTPUT_DIR, "metrics.json"), "w") as f:
    json.dump({"history": history, "last_train_smape": float(history[-1]["train_smape"])}, f, indent=2)
print(f"💾 Saved final model to {os.path.join(OUTPUT_DIR, FINAL_CKPT_NAME)}")


🖥️ Device: cuda
🔧 Loading train CSV: jl_fs/train.csv
🧮 Trainable params: 67,741,697


  scaler = torch.cuda.amp.GradScaler(enabled=FP16)
  with torch.cuda.amp.autocast(enabled=FP16):


epoch 1 step 300/4688 loss=1.2891 reg=1.5871 con=0.3949
epoch 1 step 600/4688 loss=0.9002 reg=1.1328 con=0.2026
epoch 1 step 900/4688 loss=0.7557 reg=0.9616 con=0.1382
epoch 1 step 1200/4688 loss=0.6735 reg=0.8626 con=0.1061
epoch 1 step 1500/4688 loss=0.6203 reg=0.7981 con=0.0867
epoch 1 step 1800/4688 loss=0.5816 reg=0.7510 con=0.0733
epoch 1 step 2100/4688 loss=0.5533 reg=0.7165 con=0.0635
epoch 1 step 2400/4688 loss=0.5307 reg=0.6889 con=0.0561
epoch 1 step 2700/4688 loss=0.5133 reg=0.6675 con=0.0506
epoch 1 step 3000/4688 loss=0.4974 reg=0.6479 con=0.0459
epoch 1 step 3300/4688 loss=0.4842 reg=0.6315 con=0.0423
epoch 1 step 3600/4688 loss=0.4732 reg=0.6179 con=0.0390
epoch 1 step 3900/4688 loss=0.4638 reg=0.6062 con=0.0364
epoch 1 step 4200/4688 loss=0.4549 reg=0.5952 con=0.0340
epoch 1 step 4500/4688 loss=0.4474 reg=0.5858 con=0.0321
✅ Epoch 1: TRAIN SMAPE = 45.780%
epoch 2 step 300/4688 loss=0.2885 reg=0.3834 con=0.0038
epoch 2 step 600/4688 loss=0.2934 reg=0.3900 con=0.0037
epo

In [13]:


# -------------------- INFERENCE on TEST_CSV --------------------
if TEST_CSV and os.path.exists(TEST_CSV):
    print(f"🔮 Loading test CSV: {TEST_CSV}")
    dft = pd.read_csv(TEST_CSV)
    if ID_COL not in dft.columns or TEXT_COL not in dft.columns:
        raise ValueError(f"Test file must contain '{ID_COL}' and '{TEXT_COL}'. Found: {dft.columns.tolist()}")
    dft[TEXT_COL] = dft[TEXT_COL].fillna("").astype(str).str.strip()

    class InferDataset(Dataset):
        def __init__(self, texts, tokenizer, max_len):
            self.texts = texts; self.tok = tokenizer; self.max_len = max_len
        def __len__(self): return len(self.texts)
        def __getitem__(self, idx):
            enc = self.tok(self.texts[idx], truncation=True, max_length=self.max_len, padding=False, return_tensors="pt")
            return {"input_ids": enc["input_ids"].squeeze(0), "attention_mask": enc["attention_mask"].squeeze(0)}

    @dataclass
    class InferCollate:
        pad_id: int
        def __call__(self, batch):
            maxlen = max(x["input_ids"].size(0) for x in batch)
            def pad(key, pad_val):
                arr = []
                for x in batch:
                    v = x[key]
                    if v.size(0) < maxlen:
                        pad = torch.full((maxlen - v.size(0),), pad_val, dtype=v.dtype)
                        v = torch.cat([v, pad], dim=0)
                    arr.append(v.unsqueeze(0))
                return torch.cat(arr, dim=0)
            return {
                "input_ids": pad("input_ids", self.pad_id),
                "attention_mask": pad("attention_mask", 0),
            }

    infer_ds = InferDataset(dft[TEXT_COL].tolist(), tokenizer, MAX_LEN)
    infer_loader = DataLoader(
        infer_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2,
        pin_memory=True, collate_fn=InferCollate(pad_id=tokenizer.pad_token_id or tokenizer.eos_token_id)
    )

    model.eval()
    preds = []
    with torch.no_grad():
        for batch in infer_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            # two forward_once passes (dropout stochasticity) averaged for stability
            y1, _ = model.forward_once(input_ids, attention_mask)
            y2, _ = model.forward_once(input_ids, attention_mask)
            yhat_log = ((y1 + y2) / 2.0).detach().float().cpu().numpy()
            preds.append(yhat_log)
    preds = np.concatenate(preds, axis=0)
    price_pred = delog2(preds)

    out_df = pd.DataFrame({ID_COL: dft[ID_COL].values, "price_pred": price_pred})
    out_path = os.path.join(OUTPUT_DIR, PRED_CSV_NAME)
    out_df.to_csv(out_path, index=False)
    print(f"📤 Wrote predictions to: {out_path}")

    # optional: compute test SMAPE if ground-truth price exists
    if PRICE_COL in dft.columns:
        gt = pd.to_numeric(dft[PRICE_COL], errors="coerce")
        mask = gt.notnull()
        if mask.any():
            smape_test = smape_np(gt[mask].values.astype(float), out_df.loc[mask, "price_pred"].values.astype(float))
            with open(os.path.join(OUTPUT_DIR, "metrics_test.json"), "w") as f:
                json.dump({"test_smape": float(smape_test)}, f, indent=2)
            print(f"🧪 Test SMAPE = {smape_test:.3f}% (computed only for rows with ground truth)")
else:
        print("ℹ️ TEST_CSV not set or file not found — skipping prediction.")



🔮 Loading test CSV: jl_fs/test.csv


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 349, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/root/miniconda3/envs/py3.10/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/tmp/ipykernel_1127/3910470799.py", line 32, in __call__
    "input_ids": pad("input_ids", self.pad_id),
  File "/tmp/ipykernel_1127/3910470799.py", line 27, in pad
    pad = torch.full((maxlen - v.size(0),), pad_val, dtype=v.dtype)
TypeError: full() received an invalid combination of arguments - got (tuple, NoneType, dtype=torch.dtype), but expected one of:
 * (tuple of ints size, Number fill_value, *, tuple of names names, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, Number fill_value, *, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)



In [15]:
# ---------- TESTING / INFERENCE ONLY (fixed PAD) ----------
import os, json
from dataclasses import dataclass
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# required from your training script:
# - DistilBertPriceModel
# - delog2()
# - smape_np()
# - constants: MODEL_ID, OUTPUT_DIR, TEST_CSV, ID_COL, TEXT_COL, PRICE_COL, MAX_LEN, BATCH_SIZE
# If not defined above, set sensible defaults here:
MODEL_ID   = globals().get("MODEL_ID", "distilbert-base-uncased")
OUTPUT_DIR = globals().get("OUTPUT_DIR", "price_distilbert_contrastive_full")
TEST_CSV   = globals().get("TEST_CSV", "jl_fstest.csv")
ID_COL     = globals().get("ID_COL", "sample_id")
TEXT_COL   = globals().get("TEXT_COL", "catalog_content")
PRICE_COL  = globals().get("PRICE_COL", "price")
MAX_LEN    = globals().get("MAX_LEN", 192)
BATCH_SIZE = globals().get("BATCH_SIZE", 16)

ckpt_path = os.path.join(OUTPUT_DIR, "final_full.pt")
device = "cuda" if torch.cuda.is_available() else "cpu"

# 1) Load tokenizer (PAD fallback -> 0)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0  # <- fix

# 2) Recreate model and load weights
model = DistilBertPriceModel(MODEL_ID).to(device)
state = torch.load(ckpt_path, map_location=device)
model.load_state_dict(state["model_state"], strict=True)
model.eval()

# 3) Load test CSV
dft = pd.read_csv(TEST_CSV)
if ID_COL not in dft.columns or TEXT_COL not in dft.columns:
    raise ValueError(f"Test file must contain '{ID_COL}' and '{TEXT_COL}'. Got: {dft.columns.tolist()}")
dft[TEXT_COL] = dft[TEXT_COL].fillna("").astype(str).str.strip()

# 4) Inference dataset + collate (uses pad_id fallback)
class InferDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts; self.tok = tokenizer; self.max_len = max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        enc = self.tok(self.texts[idx], truncation=True, max_length=self.max_len,
                       padding=False, return_tensors="pt")
        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
        }

@dataclass
class InferCollate:
    pad_id: int
    def __call__(self, batch):
        maxlen = max(x["input_ids"].size(0) for x in batch)
        def pad(key, pad_val):
            arr = []
            for x in batch:
                v = x[key]
                if v.size(0) < maxlen:
                    pad = torch.full((maxlen - v.size(0),), pad_val, dtype=v.dtype)
                    v = torch.cat([v, pad], dim=0)
                arr.append(v.unsqueeze(0))
            return torch.cat(arr, dim=0)
        return {
            "input_ids": pad("input_ids", self.pad_id),
            "attention_mask": pad("attention_mask", 0),
        }

infer_ds = InferDataset(dft[TEXT_COL].tolist(), tokenizer, MAX_LEN)
infer_loader = DataLoader(
    infer_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2,
    pin_memory=True, collate_fn=InferCollate(pad_id=pad_id)
)

# 5) Run inference (two passes averaged for stability)
preds_log = []
with torch.no_grad():
    for batch in infer_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        y1, _ = model.forward_once(input_ids, attention_mask)
        y2, _ = model.forward_once(input_ids, attention_mask)
        yhat_log = ((y1 + y2) / 2.0).detach().float().cpu().numpy()
        preds_log.append(yhat_log)

preds_log = np.concatenate(preds_log, axis=0)
price_pred = delog2(preds_log)

# 6) Save CSV
out_df = pd.DataFrame({ID_COL: dft[ID_COL].values, "price_pred": price_pred})
pred_path = os.path.join(OUTPUT_DIR, "predictions.csv")
out_df.to_csv(pred_path, index=False)
print(f"📤 Wrote predictions to: {pred_path}")



📤 Wrote predictions to: price_distilbert_contrastive_full/predictions.csv
