In [1]:
# %%
import os, math, random, json
from dataclasses import dataclass
from typing import Optional, List, Dict

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    CLIPModel,
    AutoProcessor,
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup,
)

In [9]:

# =========================== Config ===========================
CSV_PATH         = os.environ.get("TRAIN_CSV", "train_updated.csv")   # must contain TEXT + PRICE + IMG path
TEXT_COL         = os.environ.get("TEXT_COL", "catalog_content")
PRICE_COL        = os.environ.get("PRICE_COL", "price")
IMG_COL          = os.environ.get("IMG_COL",  "image_path")

# Backbones
CLIP_ID          = os.environ.get("CLIP_ID",  "openai/clip-vit-large-patch14")
DISTIL_ID_RAW    = os.environ.get("DISTIL_ID", "distilbert-base-uncased-finetuned-sst-2-english")

OUTPUT_DIR       = os.environ.get("OUTPUT_DIR", "price_clip+distil_fusionv2")

SEED             = int(os.environ.get("SEED", "42"))
VAL_FRAC         = float(os.environ.get("VAL_FRAC", "0.01"))           # set 0.5 for 50/50
MAX_LEN_CLIP     = int(os.environ.get("MAX_LEN_CLIP", "64"))          # CLIP tokenizer is shorter
MAX_LEN_DISTIL   = int(os.environ.get("MAX_LEN_DISTIL", "192"))

BATCH_SIZE       = int(os.environ.get("BATCH_SIZE", "16"))
LR               = float(os.environ.get("LR", "2e-5"))
WEIGHT_DECAY     = float(os.environ.get("WEIGHT_DECAY", "0.01"))
EPOCHS           = int(os.environ.get("EPOCHS", "15"))
WARMUP_RATIO     = float(os.environ.get("WARMUP_RATIO", "0.06"))
GRAD_ACCUM       = int(os.environ.get("GRAD_ACCUM", "1"))
MAX_GRAD_NORM    = float(os.environ.get("MAX_GRAD_NORM", "1.0"))
FP16             = os.environ.get("FP16", "true").lower() == "true"

# Loss mixing
ALPHA_CLIP_NCE   = float(os.environ.get("ALPHA_CLIP_NCE", "0.20"))   # weight for CLIP img↔text InfoNCE
ALPHA_TXT_NCE    = float(os.environ.get("ALPHA_TXT_NCE", "0.10"))    # weight for Distil SimCSE NCE
HUBER_DELTA      = float(os.environ.get("HUBER_DELTA", "1.0"))
TAU              = float(os.environ.get("TAU", "0.07"))

# DistilBERT augmentation (for contrastive)
WORD_MASK_P      = float(os.environ.get("WORD_MASK_P", "0.06"))

EARLY_STOP_ROUNDS = int(os.environ.get("EARLY_STOP_ROUNDS", "5"))
MIN_PRICE        = float(os.environ.get("MIN_PRICE", "1e-6"))

# Missing image policy: zero => dummy pixels (vision forward), text_only => skip vision, drop => drop rows without image
IMG_MISSING_POLICY = os.environ.get("IMG_MISSING_POLICY", "zero").lower()  # zero | text_only | drop
assert IMG_MISSING_POLICY in {"zero", "text_only", "drop"}

# Freeze toggles (if VRAM tight)
FREEZE_CLIP      = os.environ.get("FREEZE_CLIP", "false").lower() == "true"
FREEZE_DISTIL    = os.environ.get("FREEZE_DISTIL", "false").lower() == "true"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# =========================== Utils ===========================
def set_seed(seed: int = SEED):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)
set_seed(SEED)

def smape_np(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return 100.0 * np.mean(np.abs(y_pred - y_true) / denom)

def log2_price(p: np.ndarray) -> np.ndarray:
    return np.log2(np.clip(p, MIN_PRICE, None))

def delog2(x: np.ndarray) -> np.ndarray:
    return np.power(2.0, x)

def split_train_val(df: pd.DataFrame, frac_val: float = VAL_FRAC, seed: int = SEED):
    df = df.sample(frac=1.0, random_state=seed).reset_index(drop=True)
    n_val = int(len(df) * frac_val)
    return df.iloc[n_val:].reset_index(drop=True), df.iloc[:n_val].reset_index(drop=True)

# =========================== Dataset ===========================
class DualBackboneDataset(Dataset):
    """
    Returns:
      - CLIP: input_ids_clip, attention_mask_clip, pixel_values? (depends on policy)
      - Distil (two augmented views): (ids1,mask1), (ids2,mask2)
      - img_missing flag
      - target (log2 price)
    """
    def __init__(self, df: pd.DataFrame, text_col: str, img_col: str,
                 y_log2: Optional[np.ndarray],
                 clip_processor: AutoProcessor,
                 distil_tok: AutoTokenizer,
                 max_len_clip: int,
                 max_len_distil: int,
                 policy: str,
                 training: bool):
        self.df = df.reset_index(drop=True).copy()
        self.text_col = text_col
        self.img_col  = img_col
        self.y_log2   = y_log2
        self.proc = clip_processor
        self.tok  = distil_tok
        self.max_len_clip   = max_len_clip
        self.max_len_distil = max_len_distil
        self.policy = policy
        self.training = training

        self.df[text_col] = self.df[text_col].fillna("").astype(str)

        if policy == "drop":
            before = len(self.df)
            self.df = self.df[self.df[img_col].apply(lambda p: isinstance(p, str) and len(p)>0 and os.path.exists(p))]
            self.dropped_missing = before - len(self.df)
        else:
            self.dropped_missing = 0

        # Pre-make a dummy pixel for 'zero' policy
        dummy = self.proc(images=Image.new("RGB", (224, 224)), return_tensors="pt")
        self._dummy_pixel = dummy["pixel_values"].squeeze(0)  # (3,H,W)
        self.missing_img_count = 0

    def __len__(self): return len(self.df)

    def _load_image(self, pth: str):
        if isinstance(pth, str) and pth and os.path.exists(pth):
            try:
                return Image.open(pth).convert("RGB")
            except Exception:
                pass
        self.missing_img_count += 1
        return None

    def _tok_clip_text(self, text: str):
        enc = self.proc(text=[text], truncation=True, padding=False, max_length=self.max_len_clip, return_tensors="pt")
        return { "input_ids": enc["input_ids"].squeeze(0),
                 "attention_mask": enc["attention_mask"].squeeze(0) }

    def _tok_distil(self, text: str):
        # two lightly different "views" via word masking
        base = self.tok(text, truncation=True, padding=False, max_length=self.max_len_distil, return_tensors="pt")
        ids1 = base["input_ids"].clone()
        att1 = base["attention_mask"].clone()

        ids2 = ids1.clone()
        att2 = att1.clone()

        # random mask tokens in view2
        if WORD_MASK_P > 0 and self.tok.mask_token_id is not None:
            special = set(self.tok.all_special_ids)
            for i in range(ids2.size(1)):
                if ids2[0, i].item() in special: 
                    continue
                if random.random() < WORD_MASK_P:
                    ids2[0, i] = self.tok.mask_token_id

        return {
            "ids1": ids1.squeeze(0), "att1": att1.squeeze(0),
            "ids2": ids2.squeeze(0), "att2": att2.squeeze(0),
        }

    def __getitem__(self, idx):
        row  = self.df.iloc[idx]
        text = row[self.text_col]
        imgp = str(row[self.img_col]) if row[self.img_col] is not None else ""

        # CLIP text
        clip_txt = self._tok_clip_text(text)

        # CLIP image (may be missing)
        img_missing = 0
        im = self._load_image(imgp)
        pixel_values = None
        if im is None:
            img_missing = 1
            if self.policy == "zero":
                pixel_values = self._dummy_pixel.clone()
        else:
            enc_img = self.proc(images=im, return_tensors="pt")
            pixel_values = enc_img["pixel_values"].squeeze(0)

        # Distil two views
        distil = self._tok_distil(text)

        item = {
            # CLIP text
            "clip_input_ids": clip_txt["input_ids"],
            "clip_attention_mask": clip_txt["attention_mask"],
            # Distil 2 views
            "distil_ids1": distil["ids1"],
            "distil_att1": distil["att1"],
            "distil_ids2": distil["ids2"],
            "distil_att2": distil["att2"],
            # Image
            "img_missing": torch.tensor(img_missing, dtype=torch.uint8),
        }
        if pixel_values is not None:
            item["pixel_values"] = pixel_values

        if self.y_log2 is not None:
            item["target"] = torch.tensor(self.y_log2[idx], dtype=torch.float32)
        return item

@dataclass
class DualCollate:
    clip_tokenizer: any
    distil_pad_id: int
    def __call__(self, batch):
        # Pad CLIP text with its tokenizer pad util
        clip_ids  = [b["clip_input_ids"] for b in batch]
        clip_attn = [b["clip_attention_mask"] for b in batch]
        clip_padded = self.clip_tokenizer.pad(
            {"input_ids": clip_ids, "attention_mask": clip_attn},
            padding=True, return_tensors="pt"
        )

        # Pad Distil views manually
        def pad_stack(tensors: List[torch.Tensor], pad_val: int):
            maxlen = max(t.size(0) for t in tensors)
            out = []
            for t in tensors:
                if t.size(0) < maxlen:
                    pad = torch.full((maxlen - t.size(0),), pad_val, dtype=t.dtype)
                    t = torch.cat([t, pad], dim=0)
                out.append(t.unsqueeze(0))
            return torch.cat(out, dim=0)

        ids1 = pad_stack([b["distil_ids1"] for b in batch], self.distil_pad_id)
        att1 = pad_stack([b["distil_att1"] for b in batch], 0)
        ids2 = pad_stack([b["distil_ids2"] for b in batch], self.distil_pad_id)
        att2 = pad_stack([b["distil_att2"] for b in batch], 0)

        # Images (optional per item)
        pixel_values = None
        if any("pixel_values" in b for b in batch):
            C,H,W = next(b["pixel_values"].shape for b in batch if "pixel_values" in b)
            pv = []
            for b in batch:
                pv.append(b["pixel_values"] if "pixel_values" in b else torch.zeros((C,H,W), dtype=torch.float32))
            pixel_values = torch.stack(pv, dim=0)

        res = {
            "clip_input_ids": clip_padded["input_ids"],
            "clip_attention_mask": clip_padded["attention_mask"],
            "distil_ids1": ids1,
            "distil_att1": att1,
            "distil_ids2": ids2,
            "distil_att2": att2,
            "img_missing": torch.stack([b["img_missing"] for b in batch], dim=0),
        }
        if pixel_values is not None: res["pixel_values"] = pixel_values
        if "target" in batch[0]:
            res["target"] = torch.stack([b["target"] for b in batch], dim=0)
        return res

# =========================== Model ===========================
class FusionRegressor(nn.Module):
    """
    - CLIP image →  proj_dim_c (uses CLIP projection_dim)
    - CLIP text  →  proj_dim_c
    - Distil pooled → linear projection to proj_dim_d
    Fuse: [norm(img_c), norm(txt_c), norm(distil_proj)] → MLP → price (log2)
    Also returns z_img, z_txt, z_distil1/2 for contrastive losses.
    """
    def __init__(self, clip_id: str, distil_id: str, distil_proj_dim: int = 256, head_hidden_mult: float = 2.0):
        super().__init__()
        # CLIP
        self.clip = CLIPModel.from_pretrained(clip_id)
        clip_dim = self.clip.config.projection_dim  # e.g., 768 or 1024

        # Distil
        try:
            self.distil = AutoModel.from_pretrained(distil_id)
            self.distil_id_used = distil_id
        except Exception:
            # fallback if finetuned SST-2 ID not available in your env/cache
            self.distil = AutoModel.from_pretrained("distilbert-base-uncased")
            self.distil_id_used = "distilbert-base-uncased"

        distil_hidden = self.distil.config.dim  # 768
        self.distil_proj = nn.Sequential(
            nn.Linear(distil_hidden, distil_hidden),
            nn.GELU(),
            nn.Linear(distil_hidden, distil_proj_dim),
        )

        fused_dim = clip_dim + clip_dim + distil_proj_dim
        hidden = int(fused_dim * head_hidden_mult)
        self.head = nn.Sequential(
            nn.Dropout(0.1),
            nn.Linear(fused_dim, hidden),
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(hidden, 1),
        )

    def forward(
        self,
        clip_input_ids, clip_attention_mask,
        pixel_values: Optional[torch.Tensor],
        img_missing: torch.Tensor,
        distil_ids1, distil_att1,
        distil_ids2, distil_att2,
    ):
        # CLIP text
        txt_feat = self.clip.get_text_features(
            input_ids=clip_input_ids,
            attention_mask=clip_attention_mask
        )  # (B, clip_dim)

        # CLIP image
        if pixel_values is not None:
            vision_dtype = next(self.clip.vision_model.parameters()).dtype
            img_feat = self.clip.get_image_features(pixel_values=pixel_values.to(dtype=vision_dtype))
            # zero-out missing images to prevent leakage
            if img_missing.any():
                img_feat = img_feat * (1.0 - img_missing.unsqueeze(1).float())
        else:
            img_feat = torch.zeros_like(txt_feat)

        # Distil pooled embedding (view1 & view2 for SimCSE-style)
        out1 = self.distil(input_ids=distil_ids1, attention_mask=distil_att1)
        out2 = self.distil(input_ids=distil_ids2, attention_mask=distil_att2)
        # use token[0] as pooled representation (DistilBERT has no pooler)
        cls1 = out1.last_hidden_state[:, 0, :]
        cls2 = out2.last_hidden_state[:, 0, :]
        z_d1 = self.distil_proj(cls1)     # (B, dproj)
        z_d2 = self.distil_proj(cls2)

        # Normalize
        img_n = F.normalize(img_feat, dim=-1)
        txt_n = F.normalize(txt_feat, dim=-1)
        d1_n  = F.normalize(z_d1, dim=-1)
        d2_n  = F.normalize(z_d2, dim=-1)

        # Fuse (use d1 for regression; d2 only for contrastive)
        fused = torch.cat([img_n, txt_n, d1_n], dim=-1)
        pred_log2 = self.head(fused).squeeze(-1)
        return pred_log2, img_n, txt_n, d1_n, d2_n

# =========================== Losses ===========================
def info_nce(z_a: torch.Tensor, z_b: torch.Tensor, tau: float = TAU) -> torch.Tensor:
    z_a = F.normalize(z_a, dim=-1)
    z_b = F.normalize(z_b, dim=-1)
    logits = torch.matmul(z_a, z_b.t()) / tau
    labels = torch.arange(z_a.size(0), device=z_a.device)
    return 0.5 * (F.cross_entropy(logits, labels) + F.cross_entropy(logits.t(), labels))

def huber_loss(pred, target, delta=HUBER_DELTA):
    return F.huber_loss(pred, target, delta=delta)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [10]:
# =========================== Load data ===========================
print(f"🔧 Loading CSV: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)

for col, name in [(TEXT_COL, "TEXT_COL"), (PRICE_COL, "PRICE_COL"), (IMG_COL, "IMG_COL")]:
    if col not in df.columns:
        raise ValueError(f"{name} '{col}' not in CSV columns={df.columns.tolist()}")

# clean / guard
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str).str.strip()
df = df.loc[pd.to_numeric(df[PRICE_COL], errors="coerce").notnull()].copy()
df[PRICE_COL] = df[PRICE_COL].astype(float)
df = df.loc[df[PRICE_COL] >= 0.0].reset_index(drop=True)

df_tr, df_va = split_train_val(df, frac_val=VAL_FRAC, seed=SEED)
print(f"📊 Split: train={len(df_tr)} | valid={len(df_va)}")
y_tr_log = log2_price(df_tr[PRICE_COL].values)
y_va_log = log2_price(df_va[PRICE_COL].values)
df

🔧 Loading CSV: train_updated.csv
📊 Split: train=74250 | valid=750


Unnamed: 0,sample_id,catalog_content,image_link,price,image_path
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.890,jl_fs/images/train/33127.jpg
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.120,jl_fs/images/train/198967.jpg
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.970,jl_fs/images/train/261251.jpg
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.340,jl_fs/images/train/55858.jpg
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.490,jl_fs/images/train/292686.jpg
...,...,...,...,...,...
74995,41424,Item Name: ICE BREAKERS Spearmint Sugar Free M...,https://m.media-amazon.com/images/I/81p9PcPsff...,10.395,jl_fs/images/train/41424.jpg
74996,35537,"Item Name: Davidson's Organics, Vanilla Essenc...",https://m.media-amazon.com/images/I/51DDKoa+mb...,35.920,jl_fs/images/train/35537.jpg
74997,249971,Item Name: Jolly Rancher Hard Candy - Blue Ras...,https://m.media-amazon.com/images/I/91R2XCcpUf...,50.330,jl_fs/images/train/249971.jpg
74998,188322,Item Name: Nescafe Dolce Gusto Capsules - CARA...,https://m.media-amazon.com/images/I/51W40YU98+...,15.275,jl_fs/images/train/188322.jpg


In [11]:
clip_proc = AutoProcessor.from_pretrained(CLIP_ID)
# Distil tokenizer (with safe fallback)
try:
    distil_tok = AutoTokenizer.from_pretrained(DISTIL_ID_RAW, use_fast=True)
    used_distil_tok_id = DISTIL_ID_RAW
except Exception:
    distil_tok = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
    used_distil_tok_id = "distilbert-base-uncased"

if distil_tok.mask_token is None:
    distil_tok.add_special_tokens({"mask_token": "[MASK]"})

train_ds = DualBackboneDataset(
    df=df_tr, text_col=TEXT_COL, img_col=IMG_COL, y_log2=y_tr_log,
    clip_processor=clip_proc, distil_tok=distil_tok,
    max_len_clip=MAX_LEN_CLIP, max_len_distil=MAX_LEN_DISTIL,
    policy=IMG_MISSING_POLICY, training=True
)
val_ds = DualBackboneDataset(
    df=df_va, text_col=TEXT_COL, img_col=IMG_COL, y_log2=y_va_log,
    clip_processor=clip_proc, distil_tok=distil_tok,
    max_len_clip=MAX_LEN_CLIP, max_len_distil=MAX_LEN_DISTIL,
    policy=IMG_MISSING_POLICY, training=False
)
collate = DualCollate(clip_tokenizer=clip_proc.tokenizer,
                      distil_pad_id=distil_tok.pad_token_id if distil_tok.pad_token_id is not None else 0)

device = "cuda" if torch.cuda.is_available() else "cpu"

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True, collate_fn=collate)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True, collate_fn=collate)

In [12]:
# =========================== Build model ===========================
model = FusionRegressor(CLIP_ID, used_distil_tok_id).to(device)

# Optionally freeze
if FREEZE_CLIP:
    for p in model.clip.parameters(): p.requires_grad = False
if FREEZE_DISTIL:
    for p in model.distil.parameters(): p.requires_grad = False

print(f"🖥️ Device: {device}")
print(f"🧮 Trainable params: {count_parameters(model):,}")

🖥️ Device: cuda
🧮 Trainable params: 501,196,546


In [13]:
# Optimizer & scheduler
no_decay = ["bias", "LayerNorm.weight"]
named_params = list(model.named_parameters())
grouped = [
    {"params": [p for n,p in named_params if not any(nd in n for nd in no_decay)], "weight_decay": WEIGHT_DECAY},
    {"params": [p for n,p in named_params if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=LR)
num_training_steps = EPOCHS * max(1, math.ceil(len(train_loader) / max(1, GRAD_ACCUM)))
num_warmup = int(num_training_steps * WARMUP_RATIO)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup, num_training_steps)
scaler = torch.cuda.amp.GradScaler(enabled=FP16)

best_smape = float("inf")
best_path = os.path.join(OUTPUT_DIR, "best_clip+distil.pt")
patience = 0

# Warmup to count drops/missing
if len(train_loader) > 0: _ = next(iter(train_loader))
if len(val_loader) > 0: _ = next(iter(val_loader))
print(f"⚠️ Missing images counted (train/val): {train_ds.missing_img_count}/{val_ds.missing_img_count}")
print(f"🗑️ Dropped (policy=drop) train/val: {getattr(train_ds,'dropped_missing',0)}/{getattr(val_ds,'dropped_missing',0)}")

  scaler = torch.cuda.amp.GradScaler(enabled=FP16)
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


⚠️ Missing images counted (train/val): 0/0
🗑️ Dropped (policy=drop) train/val: 0/0


In [None]:
# =========================== Train Loop ===========================
for epoch in range(1, EPOCHS+1):
    model.train()
    optimizer.zero_grad(set_to_none=True)
    run_loss = run_reg = run_nce_clip = run_nce_txt = 0.0

    for step, batch in enumerate(train_loader, 1):
        # Move to device
        batch = {k: (v.to(device, non_blocking=True) if torch.is_tensor(v) else v) for k,v in batch.items()}

        with torch.cuda.amp.autocast(enabled=FP16):
            pred_log2, z_img, z_txt, z_d1, z_d2 = model(
                clip_input_ids=batch["clip_input_ids"],
                clip_attention_mask=batch["clip_attention_mask"],
                pixel_values=batch.get("pixel_values", None),
                img_missing=batch["img_missing"],
                distil_ids1=batch["distil_ids1"],
                distil_att1=batch["distil_att1"],
                distil_ids2=batch["distil_ids2"],
                distil_att2=batch["distil_att2"],
            )

            loss_reg  = huber_loss(pred_log2, batch["target"], delta=HUBER_DELTA)
            # Only compute CLIP NCE if we actually used images and have at least 2 valid
            loss_nce_clip = torch.tensor(0.0, device=device)
            valid_idx = (batch["img_missing"] == 0).nonzero(as_tuple=False).squeeze(-1)
            if ("pixel_values" in batch) and (IMG_MISSING_POLICY != "text_only") and valid_idx.numel() > 1:
                loss_nce_clip = info_nce(z_img[valid_idx], z_txt[valid_idx], tau=TAU)

            # Distil SimCSE-style (two views)
            loss_nce_txt = info_nce(z_d1, z_d2, tau=TAU)

            loss = loss_reg + ALPHA_CLIP_NCE*loss_nce_clip + ALPHA_TXT_NCE*loss_nce_txt

        scaler.scale(loss).backward()

        if step % GRAD_ACCUM == 0:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        run_loss     += float(loss.item())
        run_reg      += float(loss_reg.item())
        run_nce_clip += float(loss_nce_clip.item())
        run_nce_txt  += float(loss_nce_txt.item())

        if step % 200 == 0:
            denom = step if step > 0 else 1
            print(f"epoch {epoch} step {step}/{len(train_loader)} "
                  f"loss={run_loss/denom:.4f} reg={run_reg/denom:.4f} "
                  f"nce_clip={run_nce_clip/denom:.4f} nce_txt={run_nce_txt/denom:.4f}")

    # ------------------ Validation ------------------
    model.eval()
    preds_log = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: (v.to(device, non_blocking=True) if torch.is_tensor(v) else v) for k,v in batch.items()}
            pred_log2, *_ = model(
                clip_input_ids=batch["clip_input_ids"],
                clip_attention_mask=batch["clip_attention_mask"],
                pixel_values=batch.get("pixel_values", None),
                img_missing=batch["img_missing"],
                distil_ids1=batch["distil_ids1"],
                distil_att1=batch["distil_att1"],
                distil_ids2=batch["distil_ids2"],
                distil_att2=batch["distil_att2"],
            )
            preds_log.append(pred_log2.detach().float().cpu().numpy())

    preds_log = np.concatenate(preds_log, axis=0) if len(preds_log) else np.array([])
    if len(preds_log):
        va_preds = delog2(preds_log)
        va_true  = delog2(y_va_log)
        smape = smape_np(va_true, va_preds)
    else:
        smape = float("inf")

    print(f"✅ Epoch {epoch}: VAL SMAPE = {smape:.3f}%")

    # Save best
    if smape < best_smape - 1e-6:
        best_smape = smape
        patience = 0
        torch.save(
            {
                "fusion_state": model.state_dict(),
                "clip_id": CLIP_ID,
                "distil_id_used": model.distil_id_used,
                "config": {
                    "ALPHA_CLIP_NCE": ALPHA_CLIP_NCE,
                    "ALPHA_TXT_NCE": ALPHA_TXT_NCE,
                    "TAU": TAU,
                    "HUBER_DELTA": HUBER_DELTA,
                    "MAX_LEN_CLIP": MAX_LEN_CLIP,
                    "MAX_LEN_DISTIL": MAX_LEN_DISTIL,
                    "IMG_MISSING_POLICY": IMG_MISSING_POLICY,
                    "clip_projection_dim": model.clip.config.projection_dim,
                },
                "columns": {"text": TEXT_COL, "image": IMG_COL, "price": PRICE_COL},
                "val_frac": VAL_FRAC,
            },
            best_path
        )
        print(f"💾 Saved new best to {best_path}")
    else:
        patience += 1
        print(f"⏸️ No improvement. Patience {patience}/{EARLY_STOP_ROUNDS}")
        if patience >= EARLY_STOP_ROUNDS:
            print("🛑 Early stopping triggered.")
            break

print(f"🏁 Best VAL SMAPE: {best_smape:.3f}% | Checkpoint: {best_path}")

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.amp.autocast(enabled=FP16):


epoch 1 step 200/4641 loss=3.4633 reg=3.2916 nce_clip=0.3388 nce_txt=1.0401
epoch 1 step 400/4641 loss=3.3093 reg=3.1858 nce_clip=0.2981 nce_txt=0.6388
epoch 1 step 600/4641 loss=3.0932 reg=2.9787 nce_clip=0.3325 nce_txt=0.4800
epoch 1 step 800/4641 loss=2.8328 reg=2.7117 nce_clip=0.3967 nce_txt=0.4176
epoch 1 step 1000/4641 loss=2.5367 reg=2.4093 nce_clip=0.4465 nce_txt=0.3815
epoch 1 step 1200/4641 loss=2.2648 reg=2.1379 nce_clip=0.4608 nce_txt=0.3477
epoch 1 step 1400/4641 loss=2.0480 reg=1.9257 nce_clip=0.4533 nce_txt=0.3165
epoch 1 step 1600/4641 loss=1.8710 reg=1.7548 nce_clip=0.4364 nce_txt=0.2888
epoch 1 step 1800/4641 loss=1.7338 reg=1.6232 nce_clip=0.4197 nce_txt=0.2663
epoch 1 step 2000/4641 loss=1.6187 reg=1.5138 nce_clip=0.4011 nce_txt=0.2469
epoch 1 step 2200/4641 loss=1.5234 reg=1.4235 nce_clip=0.3848 nce_txt=0.2296
epoch 1 step 2400/4641 loss=1.4439 reg=1.3484 nce_clip=0.3700 nce_txt=0.2151
epoch 1 step 2600/4641 loss=1.3743 reg=1.2827 nce_clip=0.3565 nce_txt=0.2025
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 1: VAL SMAPE = 52.559%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 2 step 200/4641 loss=0.4287 reg=0.3972 nce_clip=0.1425 nce_txt=0.0299
epoch 2 step 400/4641 loss=0.4225 reg=0.3914 nce_clip=0.1415 nce_txt=0.0281
epoch 2 step 600/4641 loss=0.4198 reg=0.3893 nce_clip=0.1385 nce_txt=0.0277
epoch 2 step 800/4641 loss=0.4196 reg=0.3892 nce_clip=0.1382 nce_txt=0.0277
epoch 2 step 1000/4641 loss=0.4231 reg=0.3920 nce_clip=0.1418 nce_txt=0.0278
epoch 2 step 1200/4641 loss=0.4211 reg=0.3898 nce_clip=0.1427 nce_txt=0.0279
epoch 2 step 1400/4641 loss=0.4223 reg=0.3912 nce_clip=0.1422 nce_txt=0.0272
epoch 2 step 1600/4641 loss=0.4216 reg=0.3906 nce_clip=0.1415 nce_txt=0.0269
epoch 2 step 1800/4641 loss=0.4206 reg=0.3894 nce_clip=0.1424 nce_txt=0.0266
epoch 2 step 2000/4641 loss=0.4184 reg=0.3872 nce_clip=0.1429 nce_txt=0.0259
epoch 2 step 2200/4641 loss=0.4176 reg=0.3866 nce_clip=0.1422 nce_txt=0.0255
epoch 2 step 2400/4641 loss=0.4172 reg=0.3862 nce_clip=0.1424 nce_txt=0.0253
epoch 2 step 2600/4641 loss=0.4173 reg=0.3863 nce_clip=0.1424 nce_txt=0.0250
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 2: VAL SMAPE = 47.802%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 3 step 200/4641 loss=0.3016 reg=0.2794 nce_clip=0.1026 nce_txt=0.0165
epoch 3 step 400/4641 loss=0.3050 reg=0.2831 nce_clip=0.1012 nce_txt=0.0166
epoch 3 step 600/4641 loss=0.3023 reg=0.2804 nce_clip=0.1007 nce_txt=0.0172
epoch 3 step 800/4641 loss=0.3033 reg=0.2814 nce_clip=0.1013 nce_txt=0.0170
epoch 3 step 1000/4641 loss=0.3028 reg=0.2808 nce_clip=0.1017 nce_txt=0.0166
epoch 3 step 1200/4641 loss=0.3029 reg=0.2810 nce_clip=0.1014 nce_txt=0.0162
epoch 3 step 1400/4641 loss=0.3058 reg=0.2840 nce_clip=0.1012 nce_txt=0.0163
epoch 3 step 1600/4641 loss=0.3057 reg=0.2838 nce_clip=0.1016 nce_txt=0.0163
epoch 3 step 1800/4641 loss=0.3065 reg=0.2845 nce_clip=0.1016 nce_txt=0.0163
epoch 3 step 2000/4641 loss=0.3069 reg=0.2850 nce_clip=0.1015 nce_txt=0.0162
epoch 3 step 2200/4641 loss=0.3078 reg=0.2859 nce_clip=0.1012 nce_txt=0.0162
epoch 3 step 2400/4641 loss=0.3071 reg=0.2854 nce_clip=0.1004 nce_txt=0.0164
epoch 3 step 2600/4641 loss=0.3079 reg=0.2861 nce_clip=0.1005 nce_txt=0.0164
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 3: VAL SMAPE = 46.846%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 4 step 200/4641 loss=0.2222 reg=0.2050 nce_clip=0.0791 nce_txt=0.0143
epoch 4 step 400/4641 loss=0.2240 reg=0.2065 nce_clip=0.0806 nce_txt=0.0139
epoch 4 step 600/4641 loss=0.2246 reg=0.2072 nce_clip=0.0803 nce_txt=0.0135
epoch 4 step 800/4641 loss=0.2229 reg=0.2058 nce_clip=0.0788 nce_txt=0.0134
epoch 4 step 1000/4641 loss=0.2211 reg=0.2043 nce_clip=0.0774 nce_txt=0.0130
epoch 4 step 1200/4641 loss=0.2222 reg=0.2057 nce_clip=0.0761 nce_txt=0.0129
epoch 4 step 1400/4641 loss=0.2218 reg=0.2054 nce_clip=0.0757 nce_txt=0.0128
epoch 4 step 1600/4641 loss=0.2220 reg=0.2056 nce_clip=0.0756 nce_txt=0.0128
epoch 4 step 1800/4641 loss=0.2224 reg=0.2060 nce_clip=0.0758 nce_txt=0.0127
epoch 4 step 2000/4641 loss=0.2219 reg=0.2053 nce_clip=0.0769 nce_txt=0.0127
epoch 4 step 2200/4641 loss=0.2225 reg=0.2057 nce_clip=0.0778 nce_txt=0.0127
epoch 4 step 2400/4641 loss=0.2239 reg=0.2069 nce_clip=0.0783 nce_txt=0.0128
epoch 4 step 2600/4641 loss=0.2252 reg=0.2083 nce_clip=0.0782 nce_txt=0.0129
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 4: VAL SMAPE = 45.268%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 5 step 200/4641 loss=0.1559 reg=0.1418 nce_clip=0.0635 nce_txt=0.0135
epoch 5 step 400/4641 loss=0.1607 reg=0.1467 nce_clip=0.0635 nce_txt=0.0132
epoch 5 step 600/4641 loss=0.1589 reg=0.1451 nce_clip=0.0629 nce_txt=0.0129
epoch 5 step 800/4641 loss=0.1614 reg=0.1475 nce_clip=0.0634 nce_txt=0.0128
epoch 5 step 1000/4641 loss=0.1612 reg=0.1475 nce_clip=0.0626 nce_txt=0.0125
epoch 5 step 1200/4641 loss=0.1603 reg=0.1465 nce_clip=0.0626 nce_txt=0.0127
epoch 5 step 1400/4641 loss=0.1607 reg=0.1470 nce_clip=0.0622 nce_txt=0.0127
epoch 5 step 1600/4641 loss=0.1611 reg=0.1476 nce_clip=0.0616 nce_txt=0.0125
epoch 5 step 1800/4641 loss=0.1619 reg=0.1485 nce_clip=0.0611 nce_txt=0.0125
epoch 5 step 2000/4641 loss=0.1626 reg=0.1492 nce_clip=0.0609 nce_txt=0.0124
epoch 5 step 2200/4641 loss=0.1629 reg=0.1494 nce_clip=0.0612 nce_txt=0.0123
epoch 5 step 2400/4641 loss=0.1633 reg=0.1498 nce_clip=0.0614 nce_txt=0.0122
epoch 5 step 2600/4641 loss=0.1639 reg=0.1503 nce_clip=0.0619 nce_txt=0.0122
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 5: VAL SMAPE = 46.064%
⏸️ No improvement. Patience 1/5


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 6 step 200/4641 loss=0.1210 reg=0.1086 nce_clip=0.0560 nce_txt=0.0125
epoch 6 step 400/4641 loss=0.1203 reg=0.1085 nce_clip=0.0530 nce_txt=0.0114
epoch 6 step 600/4641 loss=0.1208 reg=0.1093 nce_clip=0.0516 nce_txt=0.0116
epoch 6 step 800/4641 loss=0.1224 reg=0.1112 nce_clip=0.0503 nce_txt=0.0116
epoch 6 step 1000/4641 loss=0.1214 reg=0.1103 nce_clip=0.0498 nce_txt=0.0114
epoch 6 step 1200/4641 loss=0.1228 reg=0.1117 nce_clip=0.0498 nce_txt=0.0113
epoch 6 step 1400/4641 loss=0.1231 reg=0.1121 nce_clip=0.0495 nce_txt=0.0112
epoch 6 step 1600/4641 loss=0.1232 reg=0.1122 nce_clip=0.0494 nce_txt=0.0111
epoch 6 step 1800/4641 loss=0.1229 reg=0.1118 nce_clip=0.0498 nce_txt=0.0110
epoch 6 step 2000/4641 loss=0.1232 reg=0.1121 nce_clip=0.0501 nce_txt=0.0110
epoch 6 step 2200/4641 loss=0.1231 reg=0.1121 nce_clip=0.0497 nce_txt=0.0108
epoch 6 step 2400/4641 loss=0.1235 reg=0.1125 nce_clip=0.0494 nce_txt=0.0108
epoch 6 step 2600/4641 loss=0.1238 reg=0.1129 nce_clip=0.0494 nce_txt=0.0108
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 6: VAL SMAPE = 45.094%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 7 step 200/4641 loss=0.0968 reg=0.0878 nce_clip=0.0402 nce_txt=0.0100
epoch 7 step 400/4641 loss=0.0981 reg=0.0889 nce_clip=0.0406 nce_txt=0.0103
epoch 7 step 600/4641 loss=0.0974 reg=0.0883 nce_clip=0.0400 nce_txt=0.0104
epoch 7 step 800/4641 loss=0.0975 reg=0.0885 nce_clip=0.0397 nce_txt=0.0104
epoch 7 step 1000/4641 loss=0.0980 reg=0.0889 nce_clip=0.0402 nce_txt=0.0103
epoch 7 step 1200/4641 loss=0.0979 reg=0.0887 nce_clip=0.0405 nce_txt=0.0103
epoch 7 step 1400/4641 loss=0.0971 reg=0.0881 nce_clip=0.0402 nce_txt=0.0101
epoch 7 step 1600/4641 loss=0.0975 reg=0.0884 nce_clip=0.0403 nce_txt=0.0100
epoch 7 step 1800/4641 loss=0.0977 reg=0.0887 nce_clip=0.0404 nce_txt=0.0099
epoch 7 step 2000/4641 loss=0.0969 reg=0.0878 nce_clip=0.0406 nce_txt=0.0099
epoch 7 step 2200/4641 loss=0.0966 reg=0.0876 nce_clip=0.0403 nce_txt=0.0098
epoch 7 step 2400/4641 loss=0.0966 reg=0.0876 nce_clip=0.0404 nce_txt=0.0098
epoch 7 step 2600/4641 loss=0.0966 reg=0.0876 nce_clip=0.0402 nce_txt=0.0098
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 7: VAL SMAPE = 44.585%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 8 step 200/4641 loss=0.0800 reg=0.0727 nce_clip=0.0323 nce_txt=0.0084
epoch 8 step 400/4641 loss=0.0798 reg=0.0724 nce_clip=0.0324 nce_txt=0.0085
epoch 8 step 600/4641 loss=0.0802 reg=0.0728 nce_clip=0.0330 nce_txt=0.0082
epoch 8 step 800/4641 loss=0.0787 reg=0.0712 nce_clip=0.0331 nce_txt=0.0081
epoch 8 step 1000/4641 loss=0.0775 reg=0.0702 nce_clip=0.0325 nce_txt=0.0081
epoch 8 step 1200/4641 loss=0.0775 reg=0.0701 nce_clip=0.0330 nce_txt=0.0083
epoch 8 step 1400/4641 loss=0.0771 reg=0.0696 nce_clip=0.0333 nce_txt=0.0084
epoch 8 step 1600/4641 loss=0.0768 reg=0.0693 nce_clip=0.0332 nce_txt=0.0083
epoch 8 step 1800/4641 loss=0.0762 reg=0.0688 nce_clip=0.0330 nce_txt=0.0083
epoch 8 step 2000/4641 loss=0.0758 reg=0.0684 nce_clip=0.0328 nce_txt=0.0082
epoch 8 step 2200/4641 loss=0.0752 reg=0.0679 nce_clip=0.0328 nce_txt=0.0081
epoch 8 step 2400/4641 loss=0.0750 reg=0.0677 nce_clip=0.0325 nce_txt=0.0080
epoch 8 step 2600/4641 loss=0.0749 reg=0.0676 nce_clip=0.0323 nce_txt=0.0079
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 8: VAL SMAPE = 44.108%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 9 step 200/4641 loss=0.0638 reg=0.0576 nce_clip=0.0273 nce_txt=0.0078
epoch 9 step 400/4641 loss=0.0595 reg=0.0535 nce_clip=0.0263 nce_txt=0.0074
epoch 9 step 600/4641 loss=0.0598 reg=0.0537 nce_clip=0.0267 nce_txt=0.0074
epoch 9 step 800/4641 loss=0.0596 reg=0.0536 nce_clip=0.0262 nce_txt=0.0073
epoch 9 step 1000/4641 loss=0.0593 reg=0.0534 nce_clip=0.0260 nce_txt=0.0070
epoch 9 step 1200/4641 loss=0.0589 reg=0.0529 nce_clip=0.0267 nce_txt=0.0069
epoch 9 step 1400/4641 loss=0.0592 reg=0.0532 nce_clip=0.0266 nce_txt=0.0071
epoch 9 step 1600/4641 loss=0.0588 reg=0.0528 nce_clip=0.0263 nce_txt=0.0071
epoch 9 step 1800/4641 loss=0.0588 reg=0.0529 nce_clip=0.0262 nce_txt=0.0071
epoch 9 step 2000/4641 loss=0.0588 reg=0.0528 nce_clip=0.0265 nce_txt=0.0071
epoch 9 step 2200/4641 loss=0.0588 reg=0.0528 nce_clip=0.0264 nce_txt=0.0071
epoch 9 step 2400/4641 loss=0.0587 reg=0.0527 nce_clip=0.0263 nce_txt=0.0070
epoch 9 step 2600/4641 loss=0.0585 reg=0.0526 nce_clip=0.0261 nce_txt=0.0070
epo

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 9: VAL SMAPE = 43.070%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 10 step 200/4641 loss=0.0469 reg=0.0419 nce_clip=0.0221 nce_txt=0.0060
epoch 10 step 400/4641 loss=0.0475 reg=0.0425 nce_clip=0.0223 nce_txt=0.0062
epoch 10 step 600/4641 loss=0.0470 reg=0.0420 nce_clip=0.0220 nce_txt=0.0060
epoch 10 step 800/4641 loss=0.0471 reg=0.0420 nce_clip=0.0221 nce_txt=0.0064
epoch 10 step 1000/4641 loss=0.0472 reg=0.0421 nce_clip=0.0221 nce_txt=0.0064
epoch 10 step 1200/4641 loss=0.0470 reg=0.0421 nce_clip=0.0216 nce_txt=0.0062
epoch 10 step 1400/4641 loss=0.0469 reg=0.0420 nce_clip=0.0215 nce_txt=0.0062
epoch 10 step 1600/4641 loss=0.0465 reg=0.0416 nce_clip=0.0213 nce_txt=0.0062
epoch 10 step 1800/4641 loss=0.0464 reg=0.0415 nce_clip=0.0212 nce_txt=0.0062
epoch 10 step 2000/4641 loss=0.0460 reg=0.0412 nce_clip=0.0212 nce_txt=0.0061
epoch 10 step 2200/4641 loss=0.0459 reg=0.0411 nce_clip=0.0210 nce_txt=0.0061
epoch 10 step 2400/4641 loss=0.0458 reg=0.0409 nce_clip=0.0211 nce_txt=0.0061
epoch 10 step 2600/4641 loss=0.0456 reg=0.0407 nce_clip=0.0213 nce_t

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 10: VAL SMAPE = 43.538%
⏸️ No improvement. Patience 1/5


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 11 step 200/4641 loss=0.0374 reg=0.0335 nce_clip=0.0168 nce_txt=0.0050
epoch 11 step 400/4641 loss=0.0382 reg=0.0343 nce_clip=0.0171 nce_txt=0.0053
epoch 11 step 600/4641 loss=0.0376 reg=0.0337 nce_clip=0.0171 nce_txt=0.0051
epoch 11 step 800/4641 loss=0.0374 reg=0.0334 nce_clip=0.0176 nce_txt=0.0051
epoch 11 step 1000/4641 loss=0.0368 reg=0.0328 nce_clip=0.0176 nce_txt=0.0052
epoch 11 step 1200/4641 loss=0.0370 reg=0.0329 nce_clip=0.0178 nce_txt=0.0052
epoch 11 step 1400/4641 loss=0.0369 reg=0.0329 nce_clip=0.0177 nce_txt=0.0052
epoch 11 step 1600/4641 loss=0.0368 reg=0.0328 nce_clip=0.0177 nce_txt=0.0053
epoch 11 step 1800/4641 loss=0.0364 reg=0.0324 nce_clip=0.0174 nce_txt=0.0052
epoch 11 step 2000/4641 loss=0.0362 reg=0.0322 nce_clip=0.0175 nce_txt=0.0053
epoch 11 step 2200/4641 loss=0.0359 reg=0.0319 nce_clip=0.0175 nce_txt=0.0053
epoch 11 step 2400/4641 loss=0.0357 reg=0.0317 nce_clip=0.0174 nce_txt=0.0052
epoch 11 step 2600/4641 loss=0.0356 reg=0.0316 nce_clip=0.0173 nce_t

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 11: VAL SMAPE = 43.387%
⏸️ No improvement. Patience 2/5


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 12 step 200/4641 loss=0.0274 reg=0.0232 nce_clip=0.0180 nce_txt=0.0053
epoch 12 step 400/4641 loss=0.0277 reg=0.0239 nce_clip=0.0166 nce_txt=0.0052
epoch 12 step 600/4641 loss=0.0275 reg=0.0238 nce_clip=0.0157 nce_txt=0.0052
epoch 12 step 800/4641 loss=0.0271 reg=0.0234 nce_clip=0.0156 nce_txt=0.0052
epoch 12 step 1000/4641 loss=0.0270 reg=0.0234 nce_clip=0.0154 nce_txt=0.0051
epoch 12 step 1200/4641 loss=0.0268 reg=0.0232 nce_clip=0.0153 nce_txt=0.0051
epoch 12 step 1400/4641 loss=0.0266 reg=0.0231 nce_clip=0.0151 nce_txt=0.0051
epoch 12 step 1600/4641 loss=0.0267 reg=0.0232 nce_clip=0.0149 nce_txt=0.0050
epoch 12 step 1800/4641 loss=0.0267 reg=0.0232 nce_clip=0.0149 nce_txt=0.0049
epoch 12 step 2000/4641 loss=0.0266 reg=0.0231 nce_clip=0.0150 nce_txt=0.0049
epoch 12 step 2200/4641 loss=0.0267 reg=0.0232 nce_clip=0.0148 nce_txt=0.0048
epoch 12 step 2400/4641 loss=0.0265 reg=0.0231 nce_clip=0.0147 nce_txt=0.0049
epoch 12 step 2600/4641 loss=0.0264 reg=0.0230 nce_clip=0.0146 nce_t

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 12: VAL SMAPE = 42.732%
💾 Saved new best to price_clip+distil_fusionv2/best_clip+distil.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 13 step 200/4641 loss=0.0199 reg=0.0170 nce_clip=0.0125 nce_txt=0.0043


In [None]:
# Save run metadata
with open(os.path.join(OUTPUT_DIR, "metrics_fusion.json"), "w") as f:
    json.dump({
        "best_val_smape": float(best_smape),
        "train_missing_images": int(train_ds.missing_img_count),
        "valid_missing_images": int(val_ds.missing_img_count),
        "dropped_train": int(getattr(train_ds, "dropped_missing", 0)),
        "dropped_valid": int(getattr(val_ds, "dropped_missing", 0)),
        "missing_policy": IMG_MISSING_POLICY,
        "val_frac": VAL_FRAC,
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LR,
        "weight_decay": WEIGHT_DECAY,
        "clip_id": CLIP_ID,
        "distil_id_requested": DISTIL_ID_RAW,
        "distil_id_used": model.distil_id_used,
    }, f, indent=2)

In [17]:
best_smape

np.float64(42.077175012917564)

In [20]:
all_preds = []
with torch.no_grad():
    for batch in tqdm(dl, total=len(dl)):
        batch = {k: (v.to(device, non_blocking=True) if torch.is_tensor(v) else v) for k, v in batch.items()}
        out = model(
            clip_input_ids=batch["clip_input_ids"],
            clip_attention_mask=batch["clip_attention_mask"],
            pixel_values=batch.get("pixel_values", None),
            img_missing=batch["img_missing"],
            distil_ids1=batch["distil_ids1"], distil_att1=batch["distil_att1"],
            distil_ids2=batch["distil_ids2"], distil_att2=batch["distil_att2"],
        )

        # ✅ Handle tuple/list outputs defensively
        if isinstance(out, (tuple, list)):
            # pick the first tensor-like item
            pred_log2 = next((t for t in out if torch.is_tensor(t)), out[0])
        else:
            pred_log2 = out

        pred_log2 = pred_log2.squeeze(-1)  # (B,) if it was (B,1)
        all_preds.append(pred_log2.detach().float().cpu().numpy())

# stack + de-log2
preds_log = np.concatenate(all_preds, axis=0) if len(all_preds) else np.zeros((len(dft),), dtype=np.float32)
price_pred = delog2(preds_log).astype(np.float64)

# optional: guard against infs/NAs
price_pred = np.nan_to_num(price_pred, nan=0.0, posinf=0.0, neginf=0.0)

os.makedirs(OUTPUT_DIR, exist_ok=True)
pd.DataFrame({ID_COL: dft[ID_COL].values, "price_pred": price_pred}).to_csv(PRED_OUT, index=False)
print(f"✅ Saved predictions: {PRED_OUT}")
print(f"🖼️ Missing/failed image loads: {infer_ds.missing_img_count}")

  0%|          | 0/4688 [00:00<?, ?it/s]

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Saved predictions: price_clip+distil_fusionv2/predictions_amritha.csv
🖼️ Missing/failed image loads: 0
