In [1]:
import pandas as pd
pd.set_option("display.max_columns", None) 

df = pd.read_csv("jl_fs/train.csv")
df["image_path"] = df["sample_id"].apply(lambda x : f"jl_fs/images/train/{x}.jpg")
df.to_csv("train_updated.csv", index = False)

In [2]:
import os
import math
import random
import json
from dataclasses import dataclass
from typing import Optional, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image

from transformers import (
    CLIPModel,
    AutoProcessor,
    get_linear_schedule_with_warmup,
)

In [10]:

# --------------------------- Config ---------------------------
CSV_PATH        = os.environ.get("TRAIN_CSV", "train_updated.csv")     # must contain text + price + image path
TEXT_COL        = os.environ.get("TEXT_COL", "catalog_content")
PRICE_COL       = os.environ.get("PRICE_COL", "price")
IMG_COL         = os.environ.get("IMG_COL",  "image_path")             # local jpg path column
ID_COL          = os.environ.get("ID_COL",  "sample_id")

MODEL_ID        = os.environ.get("MODEL_ID", "openai/clip-vit-large-patch14")
OUTPUT_DIR      = os.environ.get("OUTPUT_DIR", "price_clip_fulltrain")

SEED            = int(os.environ.get("SEED", "42"))
MAX_LEN         = int(os.environ.get("MAX_LEN", "64"))                 # CLIP text context is shorter
BATCH_SIZE      = int(os.environ.get("BATCH_SIZE", "32"))
LR              = float(os.environ.get("LR", "2e-5"))
WEIGHT_DECAY    = float(os.environ.get("WEIGHT_DECAY", "0.01"))
EPOCHS          = int(os.environ.get("EPOCHS", "10"))
WARMUP_RATIO    = float(os.environ.get("WARMUP_RATIO", "0.06"))
GRAD_ACCUM      = int(os.environ.get("GRAD_ACCUM", "1"))
MAX_GRAD_NORM   = float(os.environ.get("MAX_GRAD_NORM", "1.0"))
FP16            = os.environ.get("FP16", "true").lower() == "true"

# Loss & regularization
ALPHA_CONTRAST  = float(os.environ.get("ALPHA_CONTRAST", "0.25"))      # weight for contrastive loss (0..1)
TAU             = float(os.environ.get("TAU", "0.07"))                 # InfoNCE temperature
HUBER_DELTA     = float(os.environ.get("HUBER_DELTA", "1.0"))

# Price/log transform
MIN_PRICE       = float(os.environ.get("MIN_PRICE", "1e-6"))

# Missing image policy for TRAIN: zero | text_only | drop
IMG_MISSING_POLICY = os.environ.get("IMG_MISSING_POLICY", "zero").lower()
assert IMG_MISSING_POLICY in {"zero", "text_only", "drop"}

# Inference/Test config (optional)
TEST_CSV        = os.environ.get("TEST_CSV", "jl_fs/test.csv").strip()               # if "", inference is skipped
TEST_IMG_DIR    = os.environ.get("TEST_IMG_DIR", "jl_fs/images/test")  # used if test CSV lacks image_path

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [4]:

# --------------------------- Utils ---------------------------
def set_seed(seed: int = SEED):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)

set_seed(SEED)

def smape_np(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return 100.0 * np.mean(np.abs(y_pred - y_true) / denom)

def log2_price(p: np.ndarray) -> np.ndarray:
    return np.log2(np.clip(p, MIN_PRICE, None))

def delog2(x: np.ndarray) -> np.ndarray:
    return np.power(2.0, x)

# --------------------------- Dataset & Collate ---------------------------
class ClipPriceDataset(Dataset):
    """
    TRAIN:
      policy 'zero':      returns dummy pixel for missing images; later masked to zeros.
      policy 'text_only': returns no pixel_values for missing images; vision forward skipped.
      policy 'drop':      drops rows with missing images at dataset build time.

    TEST:
      We will ALWAYS behave like 'zero' (never drop predictions).

    Items may contain: input_ids, attention_mask, (pixel_values), img_missing, (target)
    """
    def __init__(self, df: pd.DataFrame, text_col: str, img_col: str,
                 prices_log2: Optional[np.ndarray],
                 processor: AutoProcessor, max_len: int, policy: str, is_test: bool = False):
        self.processor = processor
        self.max_len = max_len
        self.policy = policy
        self.is_test = is_test

        df = df.reset_index(drop=True).copy()
        df[text_col] = df[text_col].fillna("").astype(str)

        if (policy == "drop") and (not is_test):
            before = len(df)
            df = df[df[img_col].apply(lambda p: isinstance(p, str) and len(p) > 0 and os.path.exists(p))]
            self.dropped_missing = before - len(df)
        else:
            self.dropped_missing = 0

        self.texts = df[text_col].tolist()
        self.img_paths = df[img_col].fillna("").astype(str).tolist()
        self.prices_log2 = prices_log2
        self.missing_img_count = 0

        # Dummy pixel to get correct shape
        dummy = self.processor(images=Image.new("RGB", (224, 224)), return_tensors="pt")
        self._dummy_pixel = dummy["pixel_values"].squeeze(0)  # (C,H,W)

        # For exporting IDs in test predictions
        self.ids = df[ID_COL].tolist() if (ID_COL in df.columns) else list(range(len(df)))

    def __len__(self): return len(self.texts)

    def _load_image(self, path: str):
        if isinstance(path, str) and path and os.path.exists(path):
            try:
                return Image.open(path).convert("RGB")
            except Exception:
                pass
        self.missing_img_count += 1
        return None

    def __getitem__(self, idx):
        text = self.texts[idx]
        img  = self._load_image(self.img_paths[idx])

        enc_text = self.processor(text=[text], padding=False, truncation=True,
                                  max_length=self.max_len, return_tensors="pt")

        img_missing = 0
        pixel_values = None

        if img is None:
            img_missing = 1
            if self.is_test:
                # For test we NEVER drop; force zero-like behavior
                pixel_values = self._dummy_pixel.clone()
            else:
                if self.policy == "zero":
                    pixel_values = self._dummy_pixel.clone()
                elif self.policy == "text_only":
                    pixel_values = None
                elif self.policy == "drop":
                    # should not occur because drop was handled in __init__
                    pixel_values = self._dummy_pixel.clone()
        else:
            enc_img = self.processor(images=img, return_tensors="pt")
            pixel_values = enc_img["pixel_values"].squeeze(0)

        item = {
            "input_ids": enc_text["input_ids"].squeeze(0),
            "attention_mask": enc_text["attention_mask"].squeeze(0),
            "img_missing": torch.tensor(img_missing, dtype=torch.uint8),
            "row_id": torch.tensor(self.ids[idx], dtype=torch.long)
        }
        if pixel_values is not None:
            item["pixel_values"] = pixel_values
        if self.prices_log2 is not None:
            item["target"] = torch.tensor(self.prices_log2[idx], dtype=torch.float32)
        return item

In [5]:

@dataclass
class CollateClip:
    processor: AutoProcessor
    def __call__(self, batch):
        # pad text
        input_ids = [b["input_ids"] for b in batch]
        attention = [b["attention_mask"] for b in batch]
        text_padded = self.processor.tokenizer.pad(
            {"input_ids": input_ids, "attention_mask": attention},
            padding=True, return_tensors="pt"
        )
        # images: some may be absent (text_only policy)
        has_pix = [("pixel_values" in b) for b in batch]
        pixel_values = None
        if any(has_pix):
            shapes = [b["pixel_values"].shape for b in batch if "pixel_values" in b]
            C,H,W = shapes[0]
            stacked = []
            for b in batch:
                if "pixel_values" in b:
                    stacked.append(b["pixel_values"])
                else:
                    stacked.append(torch.zeros((C,H,W), dtype=torch.float32))
            pixel_values = torch.stack(stacked, dim=0)

        res = {
            "input_ids": text_padded["input_ids"],
            "attention_mask": text_padded["attention_mask"],
            "img_missing": torch.stack([b["img_missing"] for b in batch], dim=0),
            "row_id": torch.stack([b["row_id"] for b in batch], dim=0),
        }
        if pixel_values is not None:
            res["pixel_values"] = pixel_values
        if "target" in batch[0]:
            res["target"] = torch.stack([b["target"] for b in batch], dim=0)
        return res

# --------------------------- Model & Loss ---------------------------
class ClipRegressionHead(nn.Module):
    def __init__(self, embed_dim: int, dropout: float = 0.1):
        super().__init__()
        in_dim = 2 * embed_dim  # concat image+text
        self.net = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(in_dim, in_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(in_dim, 1),
        )
    def forward(self, x): return self.net(x).squeeze(-1)

def info_nce(z_img: torch.Tensor, z_txt: torch.Tensor, tau: float = 0.07) -> torch.Tensor:
    z_img = F.normalize(z_img, dim=-1)
    z_txt = F.normalize(z_txt, dim=-1)
    logits = torch.matmul(z_img, z_txt.t()) / tau  # (B,B)
    labels = torch.arange(z_img.size(0), device=z_img.device)
    loss_i = F.cross_entropy(logits, labels)
    loss_t = F.cross_entropy(logits.t(), labels)
    return 0.5 * (loss_i + loss_t)

def huber_loss(pred, target, delta=1.0):
    return F.huber_loss(pred, target, delta=delta)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [7]:
# --------------------------- Load TRAIN data ---------------------------
print(f"🔧 Loading TRAIN CSV: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)

# checks
for col, name in [(TEXT_COL, "TEXT_COL"), (PRICE_COL, "PRICE_COL"), (IMG_COL, "IMG_COL")]:
    if col not in df.columns:
        raise ValueError(f"{name} '{col}' not in CSV columns={df.columns.tolist()}")

# clean
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str).str.strip()
df = df.loc[pd.to_numeric(df[PRICE_COL], errors="coerce").notnull()].copy()
df[PRICE_COL] = df[PRICE_COL].astype(float)
df = df.loc[df[PRICE_COL] >= 0.0].reset_index(drop=True)

print(f"📦 Train rows: {len(df)}")

# targets (log2)
y_log = log2_price(df[PRICE_COL].values)

# --------------------------- CLIP backbone ---------------------------
processor = AutoProcessor.from_pretrained(MODEL_ID)
clip_model = CLIPModel.from_pretrained(MODEL_ID)

device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = clip_model.to(device)
price_head = ClipRegressionHead(embed_dim=clip_model.config.projection_dim, dropout=0.1).to(device)

print(f"🖥️ Device: {device}")
# Train end-to-end (set to False to freeze CLIP)
for p in clip_model.parameters():
    p.requires_grad = True

print(f"🧮 Trainable params CLIP={count_parameters(clip_model):,} | head={count_parameters(price_head):,}")

train_ds = ClipPriceDataset(df, TEXT_COL, IMG_COL, y_log, processor, MAX_LEN, IMG_MISSING_POLICY, is_test=False)
collate  = CollateClip(processor)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=2, pin_memory=True, collate_fn=collate)


🔧 Loading TRAIN CSV: train_updated.csv
📦 Train rows: 75000
🖥️ Device: cuda
🧮 Trainable params CLIP=427,616,513 | head=2,362,369


In [8]:
no_decay = ["bias", "LayerNorm.weight"]
params = list(clip_model.named_parameters()) + [(f"head.{n}", p) for n, p in price_head.named_parameters()]
grouped = [
    {"params": [p for n, p in params if not any(nd in n for nd in no_decay)], "weight_decay": WEIGHT_DECAY},
    {"params": [p for n, p in params if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=LR)
num_training_steps = EPOCHS * max(1, math.ceil(len(train_loader) / max(1, GRAD_ACCUM)))
num_warmup = int(num_training_steps * WARMUP_RATIO)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup, num_training_steps=num_training_steps
)
scaler = torch.cuda.amp.GradScaler(enabled=FP16)

# Warmup batch to tally missing images
print("🔎 Warmup batch to tally missing images…")
if len(train_loader) > 0:
    _ = next(iter(train_loader))
print(f"⚠️ TRAIN missing images: {train_ds.missing_img_count}")
print(f"🗑️ TRAIN dropped (policy={IMG_MISSING_POLICY}): {getattr(train_ds,'dropped_missing',0)}")

🔎 Warmup batch to tally missing images…


  scaler = torch.cuda.amp.GradScaler(enabled=FP16)
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


⚠️ TRAIN missing images: 0
🗑️ TRAIN dropped (policy=zero): 0


In [11]:
# --------------------------- TRAIN (Full data) ---------------------------
for epoch in range(1, EPOCHS + 1):
    clip_model.train(); price_head.train()
    loss_run = reg_run = con_run = 0.0

    optimizer.zero_grad(set_to_none=True)

    for step, batch in enumerate(train_loader, 1):
        input_ids      = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        targets        = batch["target"].to(device, non_blocking=True).float()
        img_missing    = batch["img_missing"].to(device)

        with torch.cuda.amp.autocast(enabled=FP16):
            # Text features
            txt_feat = clip_model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)

            # Image features
            do_vision = ("pixel_values" in batch) and (IMG_MISSING_POLICY != "text_only")
            if do_vision:
                pixel_values = batch["pixel_values"].to(
                    device,
                    dtype=next(clip_model.vision_model.parameters()).dtype,
                    non_blocking=True
                )
                img_feat = clip_model.get_image_features(pixel_values=pixel_values)
                if img_missing.any():
                    img_feat = img_feat * (1.0 - img_missing.unsqueeze(1).float())
            else:
                img_feat = torch.zeros_like(txt_feat)

            # Normalize + fuse
            txt_n = F.normalize(txt_feat, dim=-1)
            img_n = F.normalize(img_feat, dim=-1)
            fused = torch.cat([img_n, txt_n], dim=-1)

            # Losses
            pred_log = price_head(fused)
            reg_loss = huber_loss(pred_log, targets, delta=HUBER_DELTA)

            con_loss = torch.tensor(0.0, device=device, dtype=txt_n.dtype)
            valid_idx = (img_missing == 0).nonzero(as_tuple=False).squeeze(-1)
            if do_vision and valid_idx.numel() > 1 and ALPHA_CONTRAST > 0:
                con_loss = info_nce(img_n[valid_idx], txt_n[valid_idx], tau=TAU)

            loss = (1.0 - ALPHA_CONTRAST) * reg_loss + ALPHA_CONTRAST * con_loss

        scaler.scale(loss).backward()

        if step % GRAD_ACCUM == 0:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(list(clip_model.parameters()) + list(price_head.parameters()), MAX_GRAD_NORM)
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        loss_run += float(loss.item())
        reg_run  += float(reg_loss.item())
        con_run  += float(con_loss.item()) if isinstance(con_loss, torch.Tensor) else float(con_loss)

        if step % 200 == 0:
            print(f"epoch {epoch} step {step}/{len(train_loader)} "
                  f"loss={loss_run/step:.4f} reg={reg_run/step:.4f} con={con_run/step:.4f}")

    print(f"✅ Epoch {epoch} done. avg_loss={loss_run/max(1,len(train_loader)):.4f}")

# Save final full-data checkpoint
full_ckpt = os.path.join(OUTPUT_DIR, "full_clip.pt")
torch.save(
    {
        "clip_state": clip_model.state_dict(),
        "head_state": price_head.state_dict(),
        "model_id": MODEL_ID,
        "config": {
            "ALPHA_CONTRAST": ALPHA_CONTRAST,
            "TAU": TAU,
            "MAX_LEN": MAX_LEN,
            "projection_dim": clip_model.config.projection_dim,
            "IMG_MISSING_POLICY": IMG_MISSING_POLICY,
            "HUBER_DELTA": HUBER_DELTA
        },
        "columns": {"id": ID_COL, "text": TEXT_COL, "image": IMG_COL, "price": PRICE_COL},
    },
    full_ckpt
)
print(f"💾 Saved full-data checkpoint to: {full_ckpt}")

with open(os.path.join(OUTPUT_DIR, "metrics_fulltrain.json"), "w") as f:
    json.dump({
        "train_rows": int(len(df)),
        "train_missing_images": int(train_ds.missing_img_count),
        "dropped_train": int(getattr(train_ds, "dropped_missing", 0)),
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "lr": LR,
        "weight_decay": WEIGHT_DECAY,
        "grad_accum": GRAD_ACCUM,
        "fp16": FP16,
    }, f, indent=2)

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.amp.autocast(enabled=FP16):


epoch 1 step 200/2344 loss=1.9227 reg=2.4379 con=0.3771
epoch 1 step 400/2344 loss=1.5964 reg=1.9708 con=0.4733
epoch 1 step 600/2344 loss=1.3037 reg=1.5771 con=0.4835
epoch 1 step 800/2344 loss=1.1081 reg=1.3277 con=0.4492
epoch 1 step 1000/2344 loss=0.9790 reg=1.1677 con=0.4127
epoch 1 step 1200/2344 loss=0.8868 reg=1.0545 con=0.3837
epoch 1 step 1400/2344 loss=0.8195 reg=0.9732 con=0.3584
epoch 1 step 1600/2344 loss=0.7667 reg=0.9086 con=0.3410
epoch 1 step 1800/2344 loss=0.7253 reg=0.8588 con=0.3248
epoch 1 step 2000/2344 loss=0.6892 reg=0.8156 con=0.3099
epoch 1 step 2200/2344 loss=0.6608 reg=0.7819 con=0.2973
✅ Epoch 1 done. avg_loss=0.6422


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 2 step 200/2344 loss=0.3039 reg=0.3685 con=0.1100
epoch 2 step 400/2344 loss=0.2982 reg=0.3612 con=0.1094
epoch 2 step 600/2344 loss=0.2981 reg=0.3611 con=0.1091
epoch 2 step 800/2344 loss=0.2969 reg=0.3600 con=0.1075
epoch 2 step 1000/2344 loss=0.2945 reg=0.3574 con=0.1058
epoch 2 step 1200/2344 loss=0.2921 reg=0.3548 con=0.1040
epoch 2 step 1400/2344 loss=0.2914 reg=0.3538 con=0.1044
epoch 2 step 1600/2344 loss=0.2908 reg=0.3531 con=0.1039
epoch 2 step 1800/2344 loss=0.2907 reg=0.3531 con=0.1036
epoch 2 step 2000/2344 loss=0.2913 reg=0.3541 con=0.1031
epoch 2 step 2200/2344 loss=0.2903 reg=0.3530 con=0.1022
✅ Epoch 2 done. avg_loss=0.2900


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 3 step 200/2344 loss=0.1991 reg=0.2433 con=0.0666
epoch 3 step 400/2344 loss=0.1932 reg=0.2347 con=0.0684
epoch 3 step 600/2344 loss=0.1982 reg=0.2414 con=0.0688
epoch 3 step 800/2344 loss=0.1977 reg=0.2411 con=0.0673
epoch 3 step 1000/2344 loss=0.1969 reg=0.2403 con=0.0668
epoch 3 step 1200/2344 loss=0.1978 reg=0.2414 con=0.0670
epoch 3 step 1400/2344 loss=0.1987 reg=0.2425 con=0.0674
epoch 3 step 1600/2344 loss=0.1995 reg=0.2435 con=0.0674
epoch 3 step 1800/2344 loss=0.2003 reg=0.2447 con=0.0672
epoch 3 step 2000/2344 loss=0.2009 reg=0.2454 con=0.0672
epoch 3 step 2200/2344 loss=0.2014 reg=0.2462 con=0.0669
✅ Epoch 3 done. avg_loss=0.2018


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 4 step 200/2344 loss=0.1359 reg=0.1642 con=0.0509
epoch 4 step 400/2344 loss=0.1341 reg=0.1612 con=0.0528
epoch 4 step 600/2344 loss=0.1329 reg=0.1595 con=0.0531
epoch 4 step 800/2344 loss=0.1335 reg=0.1606 con=0.0522
epoch 4 step 1000/2344 loss=0.1333 reg=0.1604 con=0.0521
epoch 4 step 1200/2344 loss=0.1336 reg=0.1610 con=0.0513
epoch 4 step 1400/2344 loss=0.1338 reg=0.1614 con=0.0512
epoch 4 step 1600/2344 loss=0.1347 reg=0.1626 con=0.0511
epoch 4 step 1800/2344 loss=0.1350 reg=0.1631 con=0.0505
epoch 4 step 2000/2344 loss=0.1354 reg=0.1637 con=0.0506
epoch 4 step 2200/2344 loss=0.1357 reg=0.1642 con=0.0505
✅ Epoch 4 done. avg_loss=0.1357


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 5 step 200/2344 loss=0.0910 reg=0.1072 con=0.0423
epoch 5 step 400/2344 loss=0.0920 reg=0.1089 con=0.0412
epoch 5 step 600/2344 loss=0.0916 reg=0.1087 con=0.0404
epoch 5 step 800/2344 loss=0.0917 reg=0.1089 con=0.0401
epoch 5 step 1000/2344 loss=0.0914 reg=0.1086 con=0.0398
epoch 5 step 1200/2344 loss=0.0923 reg=0.1098 con=0.0399
epoch 5 step 1400/2344 loss=0.0926 reg=0.1102 con=0.0396
epoch 5 step 1600/2344 loss=0.0922 reg=0.1098 con=0.0393
epoch 5 step 1800/2344 loss=0.0925 reg=0.1102 con=0.0393
epoch 5 step 2000/2344 loss=0.0928 reg=0.1106 con=0.0393
epoch 5 step 2200/2344 loss=0.0931 reg=0.1111 con=0.0393
✅ Epoch 5 done. avg_loss=0.0930


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 6 step 200/2344 loss=0.0668 reg=0.0784 con=0.0319
epoch 6 step 400/2344 loss=0.0664 reg=0.0778 con=0.0321
epoch 6 step 600/2344 loss=0.0660 reg=0.0774 con=0.0317
epoch 6 step 800/2344 loss=0.0663 reg=0.0778 con=0.0318
epoch 6 step 1000/2344 loss=0.0660 reg=0.0774 con=0.0316
epoch 6 step 1200/2344 loss=0.0658 reg=0.0773 con=0.0314
epoch 6 step 1400/2344 loss=0.0660 reg=0.0776 con=0.0310
epoch 6 step 1600/2344 loss=0.0657 reg=0.0774 con=0.0309
epoch 6 step 1800/2344 loss=0.0657 reg=0.0774 con=0.0308
epoch 6 step 2000/2344 loss=0.0658 reg=0.0775 con=0.0306
epoch 6 step 2200/2344 loss=0.0656 reg=0.0774 con=0.0304
✅ Epoch 6 done. avg_loss=0.0657


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 7 step 200/2344 loss=0.0481 reg=0.0559 con=0.0246
epoch 7 step 400/2344 loss=0.0476 reg=0.0550 con=0.0252
epoch 7 step 600/2344 loss=0.0482 reg=0.0560 con=0.0249
epoch 7 step 800/2344 loss=0.0475 reg=0.0550 con=0.0248
epoch 7 step 1000/2344 loss=0.0469 reg=0.0544 con=0.0245
epoch 7 step 1200/2344 loss=0.0466 reg=0.0539 con=0.0244
epoch 7 step 1400/2344 loss=0.0463 reg=0.0536 con=0.0243
epoch 7 step 1600/2344 loss=0.0462 reg=0.0534 con=0.0246
epoch 7 step 1800/2344 loss=0.0460 reg=0.0533 con=0.0243
epoch 7 step 2000/2344 loss=0.0460 reg=0.0533 con=0.0241
epoch 7 step 2200/2344 loss=0.0456 reg=0.0528 con=0.0240
✅ Epoch 7 done. avg_loss=0.0457


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 8 step 200/2344 loss=0.0326 reg=0.0366 con=0.0206
epoch 8 step 400/2344 loss=0.0332 reg=0.0375 con=0.0202
epoch 8 step 600/2344 loss=0.0333 reg=0.0379 con=0.0195
epoch 8 step 800/2344 loss=0.0332 reg=0.0377 con=0.0194
epoch 8 step 1000/2344 loss=0.0328 reg=0.0372 con=0.0196
epoch 8 step 1200/2344 loss=0.0328 reg=0.0373 con=0.0195
epoch 8 step 1400/2344 loss=0.0328 reg=0.0372 con=0.0193
epoch 8 step 1600/2344 loss=0.0325 reg=0.0370 con=0.0190
epoch 8 step 1800/2344 loss=0.0324 reg=0.0369 con=0.0190
epoch 8 step 2000/2344 loss=0.0323 reg=0.0367 con=0.0189
epoch 8 step 2200/2344 loss=0.0321 reg=0.0366 con=0.0189
✅ Epoch 8 done. avg_loss=0.0322


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 9 step 200/2344 loss=0.0235 reg=0.0257 con=0.0168
epoch 9 step 400/2344 loss=0.0224 reg=0.0245 con=0.0160
epoch 9 step 600/2344 loss=0.0220 reg=0.0241 con=0.0158
epoch 9 step 800/2344 loss=0.0220 reg=0.0240 con=0.0158
epoch 9 step 1000/2344 loss=0.0219 reg=0.0239 con=0.0158
epoch 9 step 1200/2344 loss=0.0219 reg=0.0240 con=0.0158
epoch 9 step 1400/2344 loss=0.0217 reg=0.0237 con=0.0156
epoch 9 step 1600/2344 loss=0.0217 reg=0.0237 con=0.0156
epoch 9 step 1800/2344 loss=0.0216 reg=0.0236 con=0.0156
epoch 9 step 2000/2344 loss=0.0216 reg=0.0236 con=0.0155
epoch 9 step 2200/2344 loss=0.0214 reg=0.0234 con=0.0155
✅ Epoch 9 done. avg_loss=0.0215


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 10 step 200/2344 loss=0.0146 reg=0.0146 con=0.0145
epoch 10 step 400/2344 loss=0.0150 reg=0.0152 con=0.0145
epoch 10 step 600/2344 loss=0.0148 reg=0.0148 con=0.0147
epoch 10 step 800/2344 loss=0.0147 reg=0.0149 con=0.0142
epoch 10 step 1000/2344 loss=0.0147 reg=0.0148 con=0.0143
epoch 10 step 1200/2344 loss=0.0146 reg=0.0147 con=0.0143
epoch 10 step 1400/2344 loss=0.0146 reg=0.0147 con=0.0141
epoch 10 step 1600/2344 loss=0.0146 reg=0.0147 con=0.0141
epoch 10 step 1800/2344 loss=0.0145 reg=0.0147 con=0.0140
epoch 10 step 2000/2344 loss=0.0144 reg=0.0146 con=0.0139
epoch 10 step 2200/2344 loss=0.0144 reg=0.0146 con=0.0138
✅ Epoch 10 done. avg_loss=0.0144
💾 Saved full-data checkpoint to: price_clip_fulltrain/full_clip.pt


In [13]:
# %% [markdown]
# --- Inference: load best checkpoint and predict on TEST_CSV ---

# %%
import os, json, math
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import CLIPModel, AutoProcessor

# ---- Config / paths ----
TEST_CSV        = os.environ.get("TEST_CSV", "jl_fs/test.csv")   # must contain ID + text + image path
ID_COL          = os.environ.get("ID_COL", "sample_id")
TEXT_COL        = os.environ.get("TEXT_COL", "catalog_content")
IMG_COL         = os.environ.get("IMG_COL",  "image_path")

OUTPUT_DIR      = os.environ.get("OUTPUT_DIR", "price_clip_fulltrain")
CKPT_PATH       = os.environ.get("CKPT_PATH", os.path.join(OUTPUT_DIR, "full_clip.pt"))

BATCH_SIZE      = int(os.environ.get("INF_BATCH_SIZE", "64"))
MAX_LEN_ENV     = os.environ.get("MAX_LEN", None)  # if you want to override tokenizer max len
FP16            = os.environ.get("FP16", "true").lower() == "true"

assert os.path.exists(CKPT_PATH), f"Checkpoint not found at {CKPT_PATH}"
assert os.path.exists(TEST_CSV),  f"Test CSV not found at {TEST_CSV}"

device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Load checkpoint ----
ckpt = torch.load(CKPT_PATH, map_location="cpu")
model_id = ckpt.get("model_id", "openai/clip-vit-large-patch14")
cfg = ckpt.get("config", {})
projection_dim = cfg.get("projection_dim")
img_missing_policy = cfg.get("IMG_MISSING_POLICY", "zero")
max_len = int(cfg.get("MAX_LEN", 64)) if MAX_LEN_ENV is None else int(MAX_LEN_ENV)

print(f"📦 Loaded checkpoint from: {CKPT_PATH}")
print(f"🔤 MODEL_ID={model_id} | projection_dim={projection_dim} | IMG_MISSING_POLICY={img_missing_policy} | MAX_LEN={max_len}")

# ---- Recreate processor & models ----
processor = AutoProcessor.from_pretrained(model_id)
clip_model = CLIPModel.from_pretrained(model_id)
clip_model.load_state_dict(ckpt["clip_state"], strict=True)
clip_model.to(device).eval()

# Recreate and load regression head (same class as training cell)
price_head = ClipRegressionHead(embed_dim=projection_dim, dropout=0.0)
price_head.load_state_dict(ckpt["head_state"], strict=True)
price_head.to(device).eval()

# ---- Load test data ----
dft = pd.read_csv(TEST_CSV)
dft["image_path"] = dft["sample_id"].apply(lambda x : f"jl_fs/images/test/{x}.jpg")
for col, name in [(ID_COL, "ID_COL"), (TEXT_COL, "TEXT_COL"), (IMG_COL, "IMG_COL")]:
    if col not in dft.columns:
        raise ValueError(f"{name} '{col}' missing from test CSV. Columns={dft.columns.tolist()}")

# Basic clean
dft[TEXT_COL] = dft[TEXT_COL].fillna("").astype(str).str.strip()
dft[IMG_COL]  = dft[IMG_COL].fillna("").astype(str)

# Build dataset/dataloader with no targets
test_ds = ClipPriceDataset(
    df=dft[[ID_COL, TEXT_COL, IMG_COL]].copy(),
    text_col=TEXT_COL,
    img_col=IMG_COL,
    prices_log2=None,
    processor=processor,
    max_len=max_len,
    policy=img_missing_policy
)
collate = CollateClip(processor)

dl_te = DataLoader(
    test_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=2, pin_memory=True, collate_fn=collate
)

print(f"🖥 Device: {device}")
print(f"🧪 Test rows: {len(test_ds)} | Missing images encountered (during getitem): {test_ds.missing_img_count}")
print(f"🗑 Dropped due to policy=drop: {getattr(test_ds, 'dropped_missing', 0)}")

# ---- Inference loop ----
clip_model_dtype = next(clip_model.vision_model.parameters()).dtype
preds_log2 = []

with torch.no_grad():
    for batch in tqdm(dl_te, total = len(dl_te)):
        input_ids      = batch["input_ids"].to(device, non_blocking=True)
        attention_mask = batch["attention_mask"].to(device, non_blocking=True)
        img_missing    = batch["img_missing"].to(device)

        # Text features
        with torch.cuda.amp.autocast(enabled=FP16):
            txt_feat = clip_model.get_text_features(input_ids=input_ids, attention_mask=attention_mask)

            # Image features depending on policy
            do_vision = ("pixel_values" in batch) and (img_missing_policy != "text_only")
            if do_vision:
                pixel_values = batch["pixel_values"].to(device, dtype=clip_model_dtype, non_blocking=True)
                img_feat = clip_model.get_image_features(pixel_values=pixel_values)
                if img_missing.any():
                    img_feat = img_feat * (1.0 - img_missing.unsqueeze(1).float())
            else:
                img_feat = torch.zeros_like(txt_feat)

            # Normalize + fuse
            txt_n = F.normalize(txt_feat, dim=-1)
            img_n = F.normalize(img_feat, dim=-1)
            fused = torch.cat([img_n, txt_n], dim=-1)

            # Predict log2(price)
            pred_log = price_head(fused)
            preds_log2.append(pred_log.detach().float().cpu().numpy())

📦 Loaded checkpoint from: price_clip_fulltrain/full_clip.pt
🔤 MODEL_ID=openai/clip-vit-large-patch14 | projection_dim=768 | IMG_MISSING_POLICY=zero | MAX_LEN=64
🖥 Device: cuda
🧪 Test rows: 75000 | Missing images encountered (during getitem): 0
🗑 Dropped due to policy=drop: 0


  0%|          | 0/1172 [00:00<?, ?it/s]

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.amp.autocast(enabled=FP16):


In [14]:
# ---- Convert back to price (delog2) and save ----
if len(preds_log2):
    preds_log2 = np.concatenate(preds_log2, axis=0)
    preds_log2_delog = np.pow(2,preds_log2)
    preds_price = np.clip(preds_log2_delog, 0,2000)  # safe de-log clamp
else:
    preds_price = np.array([])

out = pd.DataFrame({
    ID_COL: dft[ID_COL].values[: len(preds_price)],
    "price": preds_price
})
pred_path = os.path.join(OUTPUT_DIR, "test_predictions_clip_big_epoch-10_full_v2.csv")
os.makedirs(OUTPUT_DIR, exist_ok=True)
out.to_csv(pred_path, index=False)

print(f"✅ Done. Wrote {len(out)} predictions to: {pred_path}")
print(f"   Missing images counted during dataset load: {test_ds.missing_img_count}")
print(f"   Dropped rows (policy=drop): {getattr(test_ds,'dropped_missing',0)}")

✅ Done. Wrote 75000 predictions to: price_clip_fulltrain/test_predictions_clip_big_epoch-10_full_v2.csv
   Missing images counted during dataset load: 0
   Dropped rows (policy=drop): 0
