In [1]:
import pandas as pd
pd.set_option("display.max_columns", None) 

df = pd.read_csv("jl_fs/train.csv")
df["image_path"] = df["sample_id"].apply(lambda x : f"jl_fs/images/train/{x}.jpg")
df.to_csv("train_updated.csv", index = False)

In [11]:
import pandas as pd
pd.set_option("display.max_columns", None) 

df = pd.read_csv("jl_fs/test.csv")
df["image_path"] = df["sample_id"].apply(lambda x : f"jl_fs/images/test/{x}.jpg")
df.to_csv("test_updated.csv", index = False)

In [5]:
import os
import math
import random
import json
from dataclasses import dataclass
from typing import Optional, List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from PIL import Image

from transformers import (
    CLIPModel,
    AutoProcessor,
    get_linear_schedule_with_warmup,
    AutoTokenizer,
    AutoModel,
)

In [6]:

# --------------------------- Config ---------------------------
CSV_PATH        = os.environ.get("TRAIN_CSV", "train_updated.csv")     # must contain text + price + image path
TEXT_COL        = os.environ.get("TEXT_COL", "catalog_content")
PRICE_COL       = os.environ.get("PRICE_COL", "price")
IMG_COL         = os.environ.get("IMG_COL",  "image_path")             # local jpg path column
ID_COL          = os.environ.get("ID_COL",  "sample_id")

# Model configurations
CLIP_MODEL_ID   = os.environ.get("CLIP_MODEL_ID", "openai/clip-vit-large-patch14")  # Using base CLIP
BERT_MODEL_ID   = os.environ.get("BERT_MODEL_ID", "intfloat/e5-base-v2")             # BERT for text
OUTPUT_DIR      = os.environ.get("OUTPUT_DIR", "price_large_clip_infloat_bert_hybrid_10")

SEED            = int(os.environ.get("SEED", "42"))
MAX_LEN         = int(os.environ.get("MAX_LEN", "77"))                 # CLIP text context is shorter
BATCH_SIZE      = int(os.environ.get("BATCH_SIZE", "24"))
LR              = float(os.environ.get("LR", "2e-5"))
WEIGHT_DECAY    = float(os.environ.get("WEIGHT_DECAY", "0.01"))
EPOCHS          = int(os.environ.get("EPOCHS", "10"))
WARMUP_RATIO    = float(os.environ.get("WARMUP_RATIO", "0.06"))
GRAD_ACCUM      = int(os.environ.get("GRAD_ACCUM", "1"))
MAX_GRAD_NORM   = float(os.environ.get("MAX_GRAD_NORM", "1.0"))
FP16            = os.environ.get("FP16", "true").lower() == "true"

# Loss & regularization
ALPHA_CONTRAST  = float(os.environ.get("ALPHA_CONTRAST", "0.25"))      # weight for contrastive loss (0..1)
TAU             = float(os.environ.get("TAU", "0.07"))                 # InfoNCE temperature
HUBER_DELTA     = float(os.environ.get("HUBER_DELTA", "1.0"))

# Cross-attention configuration
NUM_ATTENTION_HEADS = int(os.environ.get("NUM_ATTENTION_HEADS", "8"))  # Number of cross-attention heads
ATTENTION_DROPOUT   = float(os.environ.get("ATTENTION_DROPOUT", "0.1")) # Dropout for attention layers

# Price/log transform
MIN_PRICE       = float(os.environ.get("MIN_PRICE", "1e-6"))

# Missing image policy for TRAIN: zero | text_only | drop
IMG_MISSING_POLICY = os.environ.get("IMG_MISSING_POLICY", "zero").lower()
assert IMG_MISSING_POLICY in {"zero", "text_only", "drop"}

# Inference/Test config (optional)
TEST_CSV        = os.environ.get("TEST_CSV", "test_updated.csv").strip()               # if "", inference is skipped
TEST_IMG_DIR    = os.environ.get("TEST_IMG_DIR", "jl_fs/images/test")  # used if test CSV lacks image_path

os.makedirs(OUTPUT_DIR, exist_ok=True)

In [7]:

# --------------------------- Utils ---------------------------

# --------------------------- New Loss: SMAPE on de-logged values ---------------------------

import math

# ---------- Helper losses ----------
def smape_on_prices(pred_price: torch.Tensor, target_price: torch.Tensor, eps: float = 1e-8):
    """
    SMAPE fraction in [0, 2]. Multiply by 100 for percent.
    pred_price, target_price: same shape, >=0
    """
    num = torch.abs(pred_price - target_price)
    denom = (torch.abs(pred_price) + torch.abs(target_price) + eps) / 2.0
    return (num / denom).mean()  # fraction

def pseudo_huber_loss(pred: torch.Tensor, target: torch.Tensor, delta: float = 1.0):
    """
    Pseudo-Huber on inputs (works similar to Huber but smoother).
    Inputs are assumed to be log-space (differences additive).
    """
    x = pred - target
    scale = delta
    return torch.mean(scale**2 * (torch.sqrt(1.0 + (x/scale)**2) - 1.0))

# ---------- Composite regression loss (with learned uncertainties + curriculum) ----------
class CompositeRegLoss(nn.Module):
    def __init__(self, base: float = 2.0, init_log_var_huber: float = 0.0, init_log_var_smape: float = 0.0,
                 use_pseudo_huber: bool = False):
        """
        base: base for log transform (e.g., 2.0 if using log2).
        init_log_var_*: initial log-variance scalars (learnable) to scale task losses.
        """
        super().__init__()
        self.base = base
        # learnable log-variances (s1, s2). Initialize near 0.
        self.log_var_huber = nn.Parameter(torch.tensor(init_log_var_huber, dtype=torch.float32))
        self.log_var_smape = nn.Parameter(torch.tensor(init_log_var_smape, dtype=torch.float32))
        self.use_pseudo_huber = use_pseudo_huber

    def forward(self, pred_log: torch.Tensor, target_log: torch.Tensor,
                epoch: int = 0, total_epochs: int = 1,
                huber_delta: float = 1.0):
        """
        Returns: reg_loss (scalar), dict of components {huber, smape, reg_loss_unweighted, s1, s2}
        pred_log, target_log: tensors in log_{base} space (shape [B])
        epoch, total_epochs: used for curriculum annealing (0..1)
        """
        # 1) Huber-like on log-space
        if self.use_pseudo_huber:
            L_H = pseudo_huber_loss(pred_log, target_log, delta=huber_delta)
        else:
            L_H = F.huber_loss(pred_log, target_log, delta=huber_delta, reduction="mean")

        # 2) SMAPE computed on de-logged prices
        # Convert back to price-space differentiably:
        pred_price = torch.pow(self.base, pred_log)          # 2**log2
        target_price = torch.pow(self.base, target_log)
        L_S = smape_on_prices(pred_price, target_price)      # fraction 0..2

        # Optionally scale SMAPE to similar magnitude as huber early on:
        # We normalize L_S by a running scalar if you want; but learned log_var takes care of scale.

        # 3) Curriculum scheduling: gradually shift weight from Huber -> SMAPE
        # schedule parameter tau in [0,1]: tau=0 => only huber; tau=1 => only smape
        if total_epochs <= 1:
            tau = 0.5
        else:
            tau = float(epoch) / float(max(1, total_epochs - 1))  # linearly 0..1 over epochs

        # combine with curriculum: weighted linear blend (can be tuned)
        # compute weighted components:
        # You can change blend formula; this is simple and works well in practice.
        L_H_eff = (1.0 - tau) * L_H
        L_S_eff = (tau) * L_S

        # 4) Kendall-style learned uncertainty weighting
        s1 = self.log_var_huber   # scalar
        s2 = self.log_var_smape

        # NLL style combination:
        loss_term_h = 0.5 * torch.exp(-s1) * L_H_eff + 0.5 * s1
        loss_term_s = 0.5 * torch.exp(-s2) * L_S_eff + 0.5 * s2

        reg_loss = loss_term_h + loss_term_s

        return reg_loss, {
            "L_H": L_H.detach(),
            "L_S": L_S.detach(),
            "L_H_eff": L_H_eff.detach(),
            "L_S_eff": L_S_eff.detach(),
            "s1": s1.detach(),
            "s2": s2.detach(),
            "reg_loss_unweighted": (L_H_eff + L_S_eff).detach()
        }



def set_seed(seed: int = SEED):
    random.seed(seed); np.random.seed(seed)
    torch.manual_seed(seed); torch.cuda.manual_seed_all(seed)

set_seed(SEED)

def smape_np(y_true, y_pred, eps=1e-8):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    return 100.0 * np.mean(np.abs(y_pred - y_true) / denom)

def log2_price(p: np.ndarray) -> np.ndarray:
    return np.log2(np.clip(p, MIN_PRICE, None))

def delog2(x: np.ndarray) -> np.ndarray:
    return np.power(2.0, x)

# --------------------------- Dataset & Collate ---------------------------
class ClipBertPriceDataset(Dataset):
    """
    TRAIN:
      policy 'zero':      returns dummy pixel for missing images; later masked to zeros.
      policy 'text_only': returns no pixel_values for missing images; vision forward skipped.
      policy 'drop':      drops rows with missing images at dataset build time.

    TEST:
      We will ALWAYS behave like 'zero' (never drop predictions).

    Items may contain: clip_input_ids, clip_attention_mask, bert_input_ids, bert_attention_mask, (pixel_values), img_missing, (target)
    """
    def __init__(self, df: pd.DataFrame, text_col: str, img_col: str,
                 prices_log2: Optional[np.ndarray],
                 clip_processor: AutoProcessor, bert_tokenizer: AutoTokenizer, 
                 max_len: int, policy: str, is_test: bool = False):
        self.clip_processor = clip_processor
        self.bert_tokenizer = bert_tokenizer
        self.max_len = max_len
        self.policy = policy
        self.is_test = is_test

        df = df.reset_index(drop=True).copy()
        df[text_col] = df[text_col].fillna("").astype(str)

        if (policy == "drop") and (not is_test):
            before = len(df)
            df = df[df[img_col].apply(lambda p: isinstance(p, str) and len(p) > 0 and os.path.exists(p))]
            self.dropped_missing = before - len(df)
        else:
            self.dropped_missing = 0

        self.texts = df[text_col].tolist()
        self.img_paths = df[img_col].fillna("").astype(str).tolist()
        self.prices_log2 = prices_log2
        self.missing_img_count = 0

        # Dummy pixel to get correct shape
        dummy = self.clip_processor(images=Image.new("RGB", (224, 224)), return_tensors="pt")
        self._dummy_pixel = dummy["pixel_values"].squeeze(0)  # (C,H,W)

        # For exporting IDs in test predictions
        self.ids = df[ID_COL].tolist() if (ID_COL in df.columns) else list(range(len(df)))

    def __len__(self): return len(self.texts)

    def _load_image(self, path: str):
        if isinstance(path, str) and path and os.path.exists(path):
            try:
                return Image.open(path).convert("RGB")
            except Exception:
                pass
        self.missing_img_count += 1
        return None

    def __getitem__(self, idx):
        text = self.texts[idx]
        img  = self._load_image(self.img_paths[idx])

        # CLIP text encoding
        clip_enc_text = self.clip_processor(text=[text], padding=False, truncation=True,
                                           max_length=self.max_len, return_tensors="pt")
        
        # BERT text encoding
        bert_enc_text = self.bert_tokenizer(text, padding=False, truncation=True,
                                           max_length=192, return_tensors="pt")

        img_missing = 0
        pixel_values = None

        if img is None:
            img_missing = 1
            if self.is_test:
                # For test we NEVER drop; force zero-like behavior
                pixel_values = self._dummy_pixel.clone()
            else:
                if self.policy == "zero":
                    pixel_values = self._dummy_pixel.clone()
                elif self.policy == "text_only":
                    pixel_values = None
                elif self.policy == "drop":
                    # should not occur because drop was handled in __init__
                    pixel_values = self._dummy_pixel.clone()
        else:
            enc_img = self.clip_processor(images=img, return_tensors="pt")
            pixel_values = enc_img["pixel_values"].squeeze(0)

        item = {
            "clip_input_ids": clip_enc_text["input_ids"].squeeze(0),
            "clip_attention_mask": clip_enc_text["attention_mask"].squeeze(0),
            "bert_input_ids": bert_enc_text["input_ids"].squeeze(0),
            "bert_attention_mask": bert_enc_text["attention_mask"].squeeze(0),
            "img_missing": torch.tensor(img_missing, dtype=torch.uint8),
            "row_id": torch.tensor(self.ids[idx], dtype=torch.long)
        }
        if pixel_values is not None:
            item["pixel_values"] = pixel_values
        if self.prices_log2 is not None:
            item["target"] = torch.tensor(self.prices_log2[idx], dtype=torch.float32)
        return item

In [8]:

@dataclass
class CollateClipBert:
    clip_processor: AutoProcessor
    bert_tokenizer: AutoTokenizer
    def __call__(self, batch):
        # pad CLIP text
        clip_input_ids = [b["clip_input_ids"] for b in batch]
        clip_attention = [b["clip_attention_mask"] for b in batch]
        clip_text_padded = self.clip_processor.tokenizer.pad(
            {"input_ids": clip_input_ids, "attention_mask": clip_attention},
            padding=True, return_tensors="pt"
        )
        
        # pad BERT text
        bert_input_ids = [b["bert_input_ids"] for b in batch]
        bert_attention = [b["bert_attention_mask"] for b in batch]
        bert_text_padded = self.bert_tokenizer.pad(
            {"input_ids": bert_input_ids, "attention_mask": bert_attention},
            padding=True, return_tensors="pt"
        )
        
        # images: some may be absent (text_only policy)
        has_pix = [("pixel_values" in b) for b in batch]
        pixel_values = None
        if any(has_pix):
            shapes = [b["pixel_values"].shape for b in batch if "pixel_values" in b]
            C,H,W = shapes[0]
            stacked = []
            for b in batch:
                if "pixel_values" in b:
                    stacked.append(b["pixel_values"])
                else:
                    stacked.append(torch.zeros((C,H,W), dtype=torch.float32))
            pixel_values = torch.stack(stacked, dim=0)

        res = {
            "clip_input_ids": clip_text_padded["input_ids"],
            "clip_attention_mask": clip_text_padded["attention_mask"],
            "bert_input_ids": bert_text_padded["input_ids"],
            "bert_attention_mask": bert_text_padded["attention_mask"],
            "img_missing": torch.stack([b["img_missing"] for b in batch], dim=0),
            "row_id": torch.stack([b["row_id"] for b in batch], dim=0),
        }
        if pixel_values is not None:
            res["pixel_values"] = pixel_values
        if "target" in batch[0]:
            res["target"] = torch.stack([b["target"] for b in batch], dim=0)
        return res

# --------------------------- Model & Loss ---------------------------
class MultiHeadCrossAttention(nn.Module):
    def __init__(self, embed_dim: int, num_heads: int = 8, dropout: float = 0.1):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        
        assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
        
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.out_proj = nn.Linear(embed_dim, embed_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, query, key, value, attention_mask=None):
        B, N, D = query.shape
        
        # Linear projections
        Q = self.q_proj(query).view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        K = self.k_proj(key).view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        V = self.v_proj(value).view(B, N, self.num_heads, self.head_dim).transpose(1, 2)
        
        # Scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        
        if attention_mask is not None:
            scores = scores.masked_fill(attention_mask == 0, -1e9)
            
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        attn_output = torch.matmul(attn_weights, V)
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, N, D)
        
        return self.out_proj(attn_output)

class ClipBertHybridModel(nn.Module):
    def __init__(self, clip_model, bert_model, clip_dim: int, bert_dim: int, 
                 num_attention_heads: int = 8, dropout: float = 0.1):
        super().__init__()
        self.clip_model = clip_model
        self.bert_model = bert_model
        
        # Project BERT features to CLIP dimension for cross-attention
        self.bert_proj = nn.Linear(bert_dim, clip_dim)
        
        # Cross-attention layers
        self.cross_attn_1 = MultiHeadCrossAttention(clip_dim, num_attention_heads, dropout)
        self.cross_attn_2 = MultiHeadCrossAttention(clip_dim, num_attention_heads, dropout)
        
        # Final regression head
        self.regression_head = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(3 * clip_dim, clip_dim),  # clip_img + clip_txt + bert_txt
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(clip_dim, clip_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(clip_dim // 2, 1),
        )
        
    def forward(self, clip_input_ids, clip_attention_mask, bert_input_ids, bert_attention_mask, 
                pixel_values, img_missing):
        # Get CLIP features
        clip_txt_feat = self.clip_model.get_text_features(
            input_ids=clip_input_ids, attention_mask=clip_attention_mask
        )
        clip_img_feat = self.clip_model.get_image_features(pixel_values=pixel_values)
        
        # Mask image features for missing images
        if img_missing.any():
            clip_img_feat = clip_img_feat * (1.0 - img_missing.unsqueeze(1).float())
        
        # Get BERT/DistilBERT features
        bert_outputs = self.bert_model(
            input_ids=bert_input_ids, 
            attention_mask=bert_attention_mask,
            return_dict=True
        )
        
        # DistilBERT doesn't have pooler_output, use last_hidden_state[:, 0, :] instead
        if hasattr(bert_outputs, 'pooler_output') and bert_outputs.pooler_output is not None:
            # For BERT models with pooler
            bert_feat = bert_outputs.pooler_output
        else:
            # For DistilBERT and models without pooler - use [CLS] token
            bert_feat = bert_outputs.last_hidden_state[:, 0, :]
        
        bert_feat = self.bert_proj(bert_feat)  # Project to CLIP dimension
        
        # Normalize features
        clip_txt_norm = F.normalize(clip_txt_feat, dim=-1)
        clip_img_norm = F.normalize(clip_img_feat, dim=-1)
        bert_norm = F.normalize(bert_feat, dim=-1)
        
        # Cross-attention between CLIP text and BERT features
        # BERT attends to CLIP text
        bert_attended = self.cross_attn_1(
            query=bert_norm.unsqueeze(1),  # Add sequence dimension
            key=clip_txt_norm.unsqueeze(1),
            value=clip_txt_norm.unsqueeze(1)
        ).squeeze(1)
        
        # CLIP text attends to BERT features
        clip_txt_attended = self.cross_attn_2(
            query=clip_txt_norm.unsqueeze(1),
            key=bert_norm.unsqueeze(1),
            value=bert_norm.unsqueeze(1)
        ).squeeze(1)
        
        # Concatenate all features
        fused_features = torch.cat([clip_img_norm, clip_txt_attended, bert_attended], dim=-1)
        
        # Final prediction
        price_pred = self.regression_head(fused_features).squeeze(-1)
        
        return price_pred, clip_txt_norm, clip_img_norm, bert_attended

def info_nce(z_img: torch.Tensor, z_txt: torch.Tensor, tau: float = 0.07) -> torch.Tensor:
    z_img = F.normalize(z_img, dim=-1)
    z_txt = F.normalize(z_txt, dim=-1)
    logits = torch.matmul(z_img, z_txt.t()) / tau  # (B,B)
    labels = torch.arange(z_img.size(0), device=z_img.device)
    loss_i = F.cross_entropy(logits, labels)
    loss_t = F.cross_entropy(logits.t(), labels)
    return 0.5 * (loss_i + loss_t)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


In [9]:
# --------------------------- Load TRAIN data ---------------------------
print(f"🔧 Loading TRAIN CSV: {CSV_PATH}")
df = pd.read_csv(CSV_PATH)
# checks
for col, name in [(TEXT_COL, "TEXT_COL"), (PRICE_COL, "PRICE_COL"), (IMG_COL, "IMG_COL")]:
    if col not in df.columns:
        raise ValueError(f"{name} '{col}' not in CSV columns={df.columns.tolist()}")
# clean
df[TEXT_COL] = df[TEXT_COL].fillna("").astype(str).str.strip()
df = df.loc[pd.to_numeric(df[PRICE_COL], errors="coerce").notnull()].copy()
df[PRICE_COL] = df[PRICE_COL].astype(float)
df = df.loc[df[PRICE_COL] >= 0.0].reset_index(drop=True)
print(f"📦 Train rows: {len(df)}")

# --------------------------- Train/Val Split (90/10) ---------------------------
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    df, 
    test_size=0.1, 
    random_state=42, 
    shuffle=True
)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print(f"📊 Train split: {len(train_df)} | Val split: {len(val_df)}")

# targets (log2)
y_train_log = log2_price(train_df[PRICE_COL].values)
y_val_log = log2_price(val_df[PRICE_COL].values)

# --------------------------- Load Models ---------------------------
print(f"🔄 Loading CLIP model: {CLIP_MODEL_ID}")
clip_processor = AutoProcessor.from_pretrained(CLIP_MODEL_ID)
clip_model = CLIPModel.from_pretrained(CLIP_MODEL_ID)
print(f"🔄 Loading BERT model: {BERT_MODEL_ID}")
bert_tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_ID)
bert_model = AutoModel.from_pretrained(BERT_MODEL_ID)
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model = clip_model.to(device)
bert_model = bert_model.to(device)

# Create hybrid model
hybrid_model = ClipBertHybridModel(
    clip_model=clip_model,
    bert_model=bert_model,
    clip_dim=clip_model.config.projection_dim,
    bert_dim=bert_model.config.hidden_size,
    num_attention_heads=NUM_ATTENTION_HEADS,
    dropout=ATTENTION_DROPOUT
).to(device)
print(f"🖥️ Device: {device}")

# Train end-to-end (set to False to freeze models)
for p in clip_model.parameters():
    p.requires_grad = True
for p in bert_model.parameters():
    p.requires_grad = True
    
print(f"🧮 Trainable params CLIP={count_parameters(clip_model):,} | BERT={count_parameters(bert_model):,} | Hybrid={count_parameters(hybrid_model):,}")

# --------------------------- Create Datasets & Loaders ---------------------------
train_ds = ClipBertPriceDataset(
    train_df, TEXT_COL, IMG_COL, y_train_log, 
    clip_processor, bert_tokenizer, MAX_LEN, 
    IMG_MISSING_POLICY, is_test=False
)
val_ds = ClipBertPriceDataset(
    val_df, TEXT_COL, IMG_COL, y_val_log, 
    clip_processor, bert_tokenizer, MAX_LEN, 
    IMG_MISSING_POLICY, is_test=False
)

composite_loss_fn = CompositeRegLoss(base=2.0, init_log_var_huber=0.0, init_log_var_smape=0.0,
                                     use_pseudo_huber=False).to(device)

collate = CollateClipBert(clip_processor, bert_tokenizer)

train_loader = DataLoader(
    train_ds, 
    batch_size=BATCH_SIZE, 
    shuffle=True,
    num_workers=2, 
    pin_memory=True, 
    collate_fn=collate
)
val_loader = DataLoader(
    val_ds, 
    batch_size=BATCH_SIZE, 
    shuffle=False,  # No shuffle for validation
    num_workers=2, 
    pin_memory=True, 
    collate_fn=collate
)

print(f"🔄 Train batches: {len(train_loader)} | Val batches: {len(val_loader)}")

🔧 Loading TRAIN CSV: train_updated.csv
📦 Train rows: 75000
📊 Train split: 67500 | Val split: 7500
🔄 Loading CLIP model: openai/clip-vit-large-patch14
🔄 Loading BERT model: intfloat/e5-base-v2
🖥️ Device: cuda
🧮 Trainable params CLIP=427,616,513 | BERT=109,482,240 | Hybrid=544,480,002
🔄 Train batches: 2813 | Val batches: 313


In [10]:
no_decay = ["bias", "LayerNorm.weight"]
params = (list(clip_model.named_parameters()) + 
          list(bert_model.named_parameters()) + 
          [(f"hybrid.{n}", p) for n, p in hybrid_model.named_parameters()])
grouped = [
    {"params": [p for n, p in params if not any(nd in n for nd in no_decay)], "weight_decay": WEIGHT_DECAY},
    {"params": [p for n, p in params if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=LR)
num_training_steps = EPOCHS * max(1, math.ceil(len(train_loader) / max(1, GRAD_ACCUM)))
num_warmup = int(num_training_steps * WARMUP_RATIO)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=num_warmup, num_training_steps=num_training_steps
)
scaler = torch.cuda.amp.GradScaler(enabled=FP16)

# Warmup batch to tally missing images
print("🔎 Warmup batch to tally missing images…")
if len(train_loader) > 0:
    _ = next(iter(train_loader))
print(f"⚠️ TRAIN missing images: {train_ds.missing_img_count}")
print(f"🗑️ TRAIN dropped (policy={IMG_MISSING_POLICY}): {getattr(train_ds,'dropped_missing',0)}")

🔎 Warmup batch to tally missing images…


  return disable_fn(*args, **kwargs)
  scaler = torch.cuda.amp.GradScaler(enabled=FP16)
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


⚠️ TRAIN missing images: 0
🗑️ TRAIN dropped (policy=zero): 0


In [8]:
# # --------------------------- TRAIN (Full data) ---------------------------
# def evaluate_smape(model, data_loader, device, epoch):
#     """Evaluate SMAPE on a subset of training data"""
#     model.eval()
#     predictions = []
#     targets = []
    
#     with torch.no_grad():
#         for i, batch in enumerate(data_loader):
                
#             clip_input_ids = batch["clip_input_ids"].to(device, non_blocking=True)
#             clip_attention_mask = batch["clip_attention_mask"].to(device, non_blocking=True)
#             bert_input_ids = batch["bert_input_ids"].to(device, non_blocking=True)
#             bert_attention_mask = batch["bert_attention_mask"].to(device, non_blocking=True)
#             targets_batch = batch["target"].to(device, non_blocking=True).float()
#             img_missing = batch["img_missing"].to(device)
            
#             pixel_values = batch["pixel_values"].to(
#                 device,
#                 dtype=next(clip_model.vision_model.parameters()).dtype,
#                 non_blocking=True
#             )
            
#             pred_log, _, _, _ = model(
#                 clip_input_ids, clip_attention_mask, 
#                 bert_input_ids, bert_attention_mask,
#                 pixel_values, img_missing
#             )
            
#             # Convert back to original price scale
#             pred_price = delog2(pred_log.cpu().numpy())
#             target_price = delog2(targets_batch.cpu().numpy())
            
#             predictions.extend(pred_price)
#             targets.extend(target_price)
    
#     model.train()
#     return smape_np(targets, predictions)

# for epoch in range(1, EPOCHS + 1):
#     hybrid_model.train()
#     loss_run = reg_run = con_run = 0.0

#     optimizer.zero_grad(set_to_none=True)

#     for step, batch in enumerate(train_loader, 1):
#         clip_input_ids = batch["clip_input_ids"].to(device, non_blocking=True)
#         clip_attention_mask = batch["clip_attention_mask"].to(device, non_blocking=True)
#         bert_input_ids = batch["bert_input_ids"].to(device, non_blocking=True)
#         bert_attention_mask = batch["bert_attention_mask"].to(device, non_blocking=True)
#         targets = batch["target"].to(device, non_blocking=True).float()
#         img_missing = batch["img_missing"].to(device)

#         with torch.cuda.amp.autocast(enabled=FP16):
#             # Get predictions from hybrid model
#             pixel_values = batch["pixel_values"].to(
#                 device,
#                 dtype=next(clip_model.vision_model.parameters()).dtype,
#                 non_blocking=True
#             )
            
#             pred_log, clip_txt_norm, clip_img_norm, bert_attended = hybrid_model(
#                 clip_input_ids, clip_attention_mask,
#                 bert_input_ids, bert_attention_mask,
#                 pixel_values, img_missing
#             )
            
#             # Regression loss
#             reg_loss = huber_loss(pred_log, targets, delta=HUBER_DELTA)

#             # Contrastive loss between CLIP image and text features
#             con_loss = torch.tensor(0.0, device=device, dtype=clip_txt_norm.dtype)
#             valid_idx = (img_missing == 0).nonzero(as_tuple=False).squeeze(-1)
#             if valid_idx.numel() > 1 and ALPHA_CONTRAST > 0:
#                 con_loss = info_nce(clip_img_norm[valid_idx], clip_txt_norm[valid_idx], tau=TAU)

#             loss = (1.0 - ALPHA_CONTRAST) * reg_loss + ALPHA_CONTRAST * con_loss

#         scaler.scale(loss).backward()

#         if step % GRAD_ACCUM == 0:
#             scaler.unscale_(optimizer)
#             nn.utils.clip_grad_norm_(
#                 list(clip_model.parameters()) + 
#                 list(bert_model.parameters()) + 
#                 list(hybrid_model.parameters()), 
#                 MAX_GRAD_NORM
#             )
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad(set_to_none=True)
#             scheduler.step()

#         loss_run += float(loss.item())
#         reg_run  += float(reg_loss.item())
#         con_run  += float(con_loss.item()) if isinstance(con_loss, torch.Tensor) else float(con_loss)

#         if step % 200 == 0:
#             print(f"epoch {epoch} step {step}/{len(train_loader)} "
#                   f"loss={loss_run/step:.4f} reg={reg_run/step:.4f} con={con_run/step:.4f}")

#     # Evaluate SMAPE at end of epoch
#     smape_score = evaluate_smape(hybrid_model, val_loader, device, epoch)
#     print(f"✅ Epoch {epoch} done. avg_loss={loss_run/max(1,len(train_loader)):.4f} | SMAPE={smape_score:.4f}")

# # Save final full-data checkpoint
# full_ckpt = os.path.join(OUTPUT_DIR, "clip_bert_hybrid.pt")
# torch.save(
#     {
#         "clip_state": clip_model.state_dict(),
#         "bert_state": bert_model.state_dict(),
#         "hybrid_state": hybrid_model.state_dict(),
#         "clip_model_id": CLIP_MODEL_ID,
#         "bert_model_id": BERT_MODEL_ID,
#         "config": {
#             "ALPHA_CONTRAST": ALPHA_CONTRAST,
#             "TAU": TAU,
#             "MAX_LEN": MAX_LEN,
#             "clip_projection_dim": clip_model.config.projection_dim,
#             "bert_hidden_size": bert_model.config.hidden_size,
#             "num_attention_heads": NUM_ATTENTION_HEADS,
#             "attention_dropout": ATTENTION_DROPOUT,
#             "IMG_MISSING_POLICY": IMG_MISSING_POLICY,
#             "HUBER_DELTA": HUBER_DELTA
#         },
#         "columns": {"id": ID_COL, "text": TEXT_COL, "image": IMG_COL, "price": PRICE_COL},
#     },
#     full_ckpt
# )
# print(f"💾 Saved hybrid model checkpoint to: {full_ckpt}")

# with open(os.path.join(OUTPUT_DIR, "metrics_hybrid_train.json"), "w") as f:
#     json.dump({
#         "train_rows": int(len(df)),
#         "train_missing_images": int(train_ds.missing_img_count),
#         "dropped_train": int(getattr(train_ds, "dropped_missing", 0)),
#         "epochs": EPOCHS,
#         "batch_size": BATCH_SIZE,
#         "lr": LR,
#         "weight_decay": WEIGHT_DECAY,
#         "grad_accum": GRAD_ACCUM,
#         "fp16": FP16,
#         "clip_model": CLIP_MODEL_ID,
#         "bert_model": BERT_MODEL_ID,
#         "num_attention_heads": NUM_ATTENTION_HEADS,
#     }, f, indent=2)

In [None]:
# --------------------------- TRAIN (Full data) ---------------------------
def evaluate_metrics(model, data_loader, device, split_name="val"):
    """Evaluate comprehensive metrics on a data loader"""
    model.eval()
    predictions = []
    targets = []
    
    with torch.no_grad():
        for i, batch in enumerate(data_loader):
            clip_input_ids = batch["clip_input_ids"].to(device, non_blocking=True)
            clip_attention_mask = batch["clip_attention_mask"].to(device, non_blocking=True)
            bert_input_ids = batch["bert_input_ids"].to(device, non_blocking=True)
            bert_attention_mask = batch["bert_attention_mask"].to(device, non_blocking=True)
            targets_batch = batch["target"].to(device, non_blocking=True).float()
            img_missing = batch["img_missing"].to(device)
            
            pixel_values = batch["pixel_values"].to(
                device,
                dtype=next(clip_model.vision_model.parameters()).dtype,
                non_blocking=True
            )
            
            pred_log, _, _, _ = model(
                clip_input_ids, clip_attention_mask, 
                bert_input_ids, bert_attention_mask,
                pixel_values, img_missing
            )
            
            # Convert back to original price scale
            pred_price = delog2(pred_log.cpu().numpy())
            target_price = delog2(targets_batch.cpu().numpy())
            
            predictions.extend(pred_price)
            targets.extend(target_price)
    
    model.train()
    
    # Calculate metrics
    predictions = np.array(predictions)
    targets = np.array(targets)
    
    smape = smape_np(targets, predictions)
    mae = np.mean(np.abs(targets - predictions))
    rmse = np.sqrt(np.mean((targets - predictions) ** 2))
    
    # R² score
    ss_res = np.sum((targets - predictions) ** 2)
    ss_tot = np.sum((targets - np.mean(targets)) ** 2)
    r2 = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0.0
    
    return {
        f"{split_name}_smape": float(smape),
        f"{split_name}_mae": float(mae),
        f"{split_name}_rmse": float(rmse),
        f"{split_name}_r2": float(r2)
    }


# Initialize metrics history
metrics_history = []
best_val_smape = float('inf')
best_epoch = 0

for epoch in range(1, EPOCHS + 1):
    hybrid_model.train()
    loss_run = reg_run = con_run = 0.0

    optimizer.zero_grad(set_to_none=True)

    for step, batch in enumerate(train_loader, 1):
        clip_input_ids = batch["clip_input_ids"].to(device, non_blocking=True)
        clip_attention_mask = batch["clip_attention_mask"].to(device, non_blocking=True)
        bert_input_ids = batch["bert_input_ids"].to(device, non_blocking=True)
        bert_attention_mask = batch["bert_attention_mask"].to(device, non_blocking=True)
        targets = batch["target"].to(device, non_blocking=True).float()
        img_missing = batch["img_missing"].to(device)

        with torch.cuda.amp.autocast(enabled=FP16):
            pixel_values = batch["pixel_values"].to(
                device,
                dtype=next(clip_model.vision_model.parameters()).dtype,
                non_blocking=True
            )
            
            pred_log, clip_txt_norm, clip_img_norm, bert_attended = hybrid_model(
                clip_input_ids, clip_attention_mask,
                bert_input_ids, bert_attention_mask,
                pixel_values, img_missing
            )
            
            # Regression loss
            reg_loss = reg_loss, reg_info = composite_loss_fn(pred_log.squeeze(), targets.squeeze(),
                                       epoch=epoch-1, total_epochs=EPOCHS,
                                       huber_delta=HUBER_DELTA)
            # huber_loss(pred_log, targets, delta=HUBER_DELTA)

            # Contrastive loss between CLIP image and text features
            con_loss = torch.tensor(0.0, device=device, dtype=clip_txt_norm.dtype)
            valid_idx = (img_missing == 0).nonzero(as_tuple=False).squeeze(-1)
            if valid_idx.numel() > 1 and ALPHA_CONTRAST > 0:
                con_loss = info_nce(clip_img_norm[valid_idx], clip_txt_norm[valid_idx], tau=TAU)

            loss = (1.0 - ALPHA_CONTRAST) * reg_loss + ALPHA_CONTRAST * con_loss

        scaler.scale(loss).backward()

        if step % GRAD_ACCUM == 0:
            scaler.unscale_(optimizer)
            nn.utils.clip_grad_norm_(
                list(clip_model.parameters()) + 
                list(bert_model.parameters()) + 
                list(hybrid_model.parameters()), 
                MAX_GRAD_NORM
            )
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)
            scheduler.step()

        loss_run += float(loss.item())
        reg_run  += float(reg_loss.item())
        con_run  += float(con_loss.item()) if isinstance(con_loss, torch.Tensor) else float(con_loss)

        if step % 200 == 0:
            print(f"epoch {epoch} step {step}/{len(train_loader)} "
                  f"loss={loss_run/step:.4f} reg={reg_run/step:.4f} con={con_run/step:.4f}")

    # Evaluate metrics at end of epoch
    print(f"\n📊 Evaluating epoch {epoch}...")
    # train_metrics = evaluate_metrics(hybrid_model, train_loader, device, split_name="train")
    val_metrics = evaluate_metrics(hybrid_model, val_loader, device, split_name="val")
    
    # Combine all metrics for this epoch
    epoch_metrics = {
        "epoch": epoch,
        "avg_loss": loss_run / max(1, len(train_loader)),
        "avg_reg_loss": reg_run / max(1, len(train_loader)),
        "avg_con_loss": con_run / max(1, len(train_loader)),
        **val_metrics
    }
    metrics_history.append(epoch_metrics)
    
    print(f"✅ Epoch {epoch} completed:")
    # print(f"   Train -> SMAPE: {train_metrics['train_smape']:.4f} | MAE: {train_metrics['train_mae']:.2f} | "
    #       f"RMSE: {train_metrics['train_rmse']:.2f} | R²: {train_metrics['train_r2']:.4f}")
    print(f"   Val   -> SMAPE: {val_metrics['val_smape']:.4f} | MAE: {val_metrics['val_mae']:.2f} | "
          f"RMSE: {val_metrics['val_rmse']:.2f} | R²: {val_metrics['val_r2']:.4f}\n")
    
    # Save checkpoint for this epoch
    epoch_ckpt = os.path.join(OUTPUT_DIR, f"clip_bert_hybrid_epoch_{epoch}.pt")
    torch.save(
        {
            "epoch": epoch,
            "clip_state": clip_model.state_dict(),
            "bert_state": bert_model.state_dict(),
            "hybrid_state": hybrid_model.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scheduler_state": scheduler.state_dict(),
            "scaler_state": scaler.state_dict(),
            "clip_model_id": CLIP_MODEL_ID,
            "bert_model_id": BERT_MODEL_ID,
            "config": {
                "ALPHA_CONTRAST": ALPHA_CONTRAST,
                "TAU": TAU,
                "MAX_LEN": MAX_LEN,
                "clip_projection_dim": clip_model.config.projection_dim,
                "bert_hidden_size": bert_model.config.hidden_size,
                "num_attention_heads": NUM_ATTENTION_HEADS,
                "attention_dropout": ATTENTION_DROPOUT,
                "IMG_MISSING_POLICY": IMG_MISSING_POLICY,
                "HUBER_DELTA": HUBER_DELTA
            },
            "metrics": epoch_metrics,
            "columns": {"id": ID_COL, "text": TEXT_COL, "image": IMG_COL, "price": PRICE_COL},
        },
        epoch_ckpt
    )
    print(f"💾 Saved epoch {epoch} checkpoint to: {epoch_ckpt}")
    
    # Track best model
    if val_metrics['val_smape'] < best_val_smape:
        best_val_smape = val_metrics['val_smape']
        best_epoch = epoch
        best_ckpt = os.path.join(OUTPUT_DIR, "clip_bert_hybrid_best.pt")
        torch.save(
            {
                "epoch": epoch,
                "clip_state": clip_model.state_dict(),
                "bert_state": bert_model.state_dict(),
                "hybrid_state": hybrid_model.state_dict(),
                "clip_model_id": CLIP_MODEL_ID,
                "bert_model_id": BERT_MODEL_ID,
                "config": {
                    "ALPHA_CONTRAST": ALPHA_CONTRAST,
                    "TAU": TAU,
                    "MAX_LEN": MAX_LEN,
                    "clip_projection_dim": clip_model.config.projection_dim,
                    "bert_hidden_size": bert_model.config.hidden_size,
                    "num_attention_heads": NUM_ATTENTION_HEADS,
                    "attention_dropout": ATTENTION_DROPOUT,
                    "IMG_MISSING_POLICY": IMG_MISSING_POLICY,
                    "HUBER_DELTA": HUBER_DELTA
                },
                "metrics": epoch_metrics,
                "columns": {"id": ID_COL, "text": TEXT_COL, "image": IMG_COL, "price": PRICE_COL},
            },
            best_ckpt
        )
        print(f"⭐ New best model! Val SMAPE: {best_val_smape:.4f}")

# Save final checkpoint
final_ckpt = os.path.join(OUTPUT_DIR, "clip_bert_hybrid_final.pt")
torch.save(
    {
        "epoch": EPOCHS,
        "clip_state": clip_model.state_dict(),
        "bert_state": bert_model.state_dict(),
        "hybrid_state": hybrid_model.state_dict(),
        "clip_model_id": CLIP_MODEL_ID,
        "bert_model_id": BERT_MODEL_ID,
        "config": {
            "ALPHA_CONTRAST": ALPHA_CONTRAST,
            "TAU": TAU,
            "MAX_LEN": MAX_LEN,
            "clip_projection_dim": clip_model.config.projection_dim,
            "bert_hidden_size": bert_model.config.hidden_size,
            "num_attention_heads": NUM_ATTENTION_HEADS,
            "attention_dropout": ATTENTION_DROPOUT,
            "IMG_MISSING_POLICY": IMG_MISSING_POLICY,
            "HUBER_DELTA": HUBER_DELTA
        },
        "metrics": metrics_history[-1],
        "columns": {"id": ID_COL, "text": TEXT_COL, "image": IMG_COL, "price": PRICE_COL},
    },
    final_ckpt
)
print(f"💾 Saved final model checkpoint to: {final_ckpt}")

# Save comprehensive metrics history
metrics_file = os.path.join(OUTPUT_DIR, "training_metrics_history.json")
with open(metrics_file, "w") as f:
    json.dump({
        "training_info": {
            "train_rows": int(len(train_df)),
            "val_rows": int(len(val_df)),
            "train_missing_images": int(train_ds.missing_img_count),
            "val_missing_images": int(val_ds.missing_img_count),
            "dropped_train": int(getattr(train_ds, "dropped_missing", 0)),
            "dropped_val": int(getattr(val_ds, "dropped_missing", 0)),
            "epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
            "lr": LR,
            "weight_decay": WEIGHT_DECAY,
            "grad_accum": GRAD_ACCUM,
            "fp16": FP16,
            "clip_model": CLIP_MODEL_ID,
            "bert_model": BERT_MODEL_ID,
            "num_attention_heads": NUM_ATTENTION_HEADS,
            "best_epoch": best_epoch,
            "best_val_smape": float(best_val_smape)
        },
        "metrics_per_epoch": metrics_history
    }, f, indent=2)
print(f"📈 Saved training metrics history to: {metrics_file}")

print(f"\n🎯 Training completed! Best model at epoch {best_epoch} with Val SMAPE: {best_val_smape:.4f}")

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.amp.autocast(enabled=FP16):


epoch 1 step 200/2813 loss=1.3338 reg=1.6646 con=0.3416
epoch 1 step 400/2813 loss=1.1023 reg=1.3921 con=0.2330
epoch 1 step 600/2813 loss=0.8322 reg=1.0447 con=0.1946
epoch 1 step 800/2813 loss=0.6872 reg=0.8590 con=0.1717
epoch 1 step 1000/2813 loss=0.5967 reg=0.7430 con=0.1576
epoch 1 step 1200/2813 loss=0.5361 reg=0.6652 con=0.1485
epoch 1 step 1400/2813 loss=0.4930 reg=0.6082 con=0.1471
epoch 1 step 1600/2813 loss=0.4605 reg=0.5654 con=0.1459
epoch 1 step 1800/2813 loss=0.4347 reg=0.5305 con=0.1471
epoch 1 step 2000/2813 loss=0.4131 reg=0.5022 con=0.1459
epoch 1 step 2200/2813 loss=0.3953 reg=0.4788 con=0.1447
epoch 1 step 2400/2813 loss=0.3801 reg=0.4587 con=0.1442
epoch 1 step 2600/2813 loss=0.3662 reg=0.4412 con=0.1412
epoch 1 step 2800/2813 loss=0.3532 reg=0.4251 con=0.1376

📊 Evaluating epoch 1...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 1 completed:
   Val   -> SMAPE: 48.9081 | MAE: 11.22 | RMSE: 25.67 | R²: 0.3636

💾 Saved epoch 1 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_1.pt
⭐ New best model! Val SMAPE: 48.9081


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 2 step 200/2813 loss=0.1531 reg=0.1902 con=0.0421
epoch 2 step 400/2813 loss=0.1555 reg=0.1930 con=0.0429
epoch 2 step 600/2813 loss=0.1599 reg=0.1958 con=0.0522
epoch 2 step 800/2813 loss=0.1603 reg=0.1960 con=0.0531
epoch 2 step 1000/2813 loss=0.1642 reg=0.1989 con=0.0602
epoch 2 step 1200/2813 loss=0.1670 reg=0.2015 con=0.0633
epoch 2 step 1400/2813 loss=0.1682 reg=0.2020 con=0.0666
epoch 2 step 1600/2813 loss=0.1695 reg=0.2031 con=0.0684
epoch 2 step 1800/2813 loss=0.1702 reg=0.2037 con=0.0696
epoch 2 step 2000/2813 loss=0.1701 reg=0.2033 con=0.0705
epoch 2 step 2200/2813 loss=0.1703 reg=0.2034 con=0.0710
epoch 2 step 2400/2813 loss=0.1706 reg=0.2032 con=0.0726
epoch 2 step 2600/2813 loss=0.1705 reg=0.2032 con=0.0725
epoch 2 step 2800/2813 loss=0.1710 reg=0.2037 con=0.0728

📊 Evaluating epoch 2...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 2 completed:
   Val   -> SMAPE: 47.7622 | MAE: 10.98 | RMSE: 25.19 | R²: 0.3872

💾 Saved epoch 2 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_2.pt
⭐ New best model! Val SMAPE: 47.7622


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 3 step 200/2813 loss=0.1316 reg=0.1632 con=0.0369
epoch 3 step 400/2813 loss=0.1266 reg=0.1582 con=0.0319
epoch 3 step 600/2813 loss=0.1261 reg=0.1579 con=0.0306
epoch 3 step 800/2813 loss=0.1253 reg=0.1572 con=0.0297
epoch 3 step 1000/2813 loss=0.1249 reg=0.1569 con=0.0287
epoch 3 step 1200/2813 loss=0.1250 reg=0.1571 con=0.0289
epoch 3 step 1400/2813 loss=0.1258 reg=0.1582 con=0.0289
epoch 3 step 1600/2813 loss=0.1266 reg=0.1591 con=0.0292
epoch 3 step 1800/2813 loss=0.1270 reg=0.1596 con=0.0294
epoch 3 step 2000/2813 loss=0.1270 reg=0.1596 con=0.0294
epoch 3 step 2200/2813 loss=0.1267 reg=0.1592 con=0.0292
epoch 3 step 2400/2813 loss=0.1263 reg=0.1587 con=0.0289
epoch 3 step 2600/2813 loss=0.1260 reg=0.1585 con=0.0285
epoch 3 step 2800/2813 loss=0.1253 reg=0.1576 con=0.0284

📊 Evaluating epoch 3...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 3 completed:
   Val   -> SMAPE: 44.2533 | MAE: 9.92 | RMSE: 23.38 | R²: 0.4720

💾 Saved epoch 3 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_3.pt
⭐ New best model! Val SMAPE: 44.2533


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 4 step 200/2813 loss=0.0901 reg=0.1139 con=0.0188
epoch 4 step 400/2813 loss=0.0896 reg=0.1134 con=0.0184
epoch 4 step 600/2813 loss=0.0898 reg=0.1136 con=0.0183
epoch 4 step 800/2813 loss=0.0898 reg=0.1136 con=0.0185
epoch 4 step 1000/2813 loss=0.0913 reg=0.1153 con=0.0193
epoch 4 step 1200/2813 loss=0.0927 reg=0.1168 con=0.0205
epoch 4 step 1400/2813 loss=0.0939 reg=0.1184 con=0.0203
epoch 4 step 1600/2813 loss=0.0951 reg=0.1200 con=0.0205
epoch 4 step 1800/2813 loss=0.0964 reg=0.1216 con=0.0207
epoch 4 step 2000/2813 loss=0.0968 reg=0.1221 con=0.0208
epoch 4 step 2200/2813 loss=0.0977 reg=0.1232 con=0.0211
epoch 4 step 2400/2813 loss=0.0985 reg=0.1241 con=0.0215
epoch 4 step 2600/2813 loss=0.0991 reg=0.1249 con=0.0217
epoch 4 step 2800/2813 loss=0.0994 reg=0.1253 con=0.0218

📊 Evaluating epoch 4...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 4 completed:
   Val   -> SMAPE: 44.8528 | MAE: 10.12 | RMSE: 23.56 | R²: 0.4641

💾 Saved epoch 4 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_4.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 5 step 200/2813 loss=0.0831 reg=0.1044 con=0.0191
epoch 5 step 400/2813 loss=0.0817 reg=0.1028 con=0.0183
epoch 5 step 600/2813 loss=0.0797 reg=0.1002 con=0.0182
epoch 5 step 800/2813 loss=0.0789 reg=0.0993 con=0.0178
epoch 5 step 1000/2813 loss=0.0782 reg=0.0985 con=0.0174
epoch 5 step 1200/2813 loss=0.0782 reg=0.0986 con=0.0172
epoch 5 step 1400/2813 loss=0.0783 reg=0.0987 con=0.0172
epoch 5 step 1600/2813 loss=0.0784 reg=0.0988 con=0.0170
epoch 5 step 1800/2813 loss=0.0782 reg=0.0986 con=0.0169
epoch 5 step 2000/2813 loss=0.0779 reg=0.0982 con=0.0169
epoch 5 step 2200/2813 loss=0.0776 reg=0.0978 con=0.0169
epoch 5 step 2400/2813 loss=0.0775 reg=0.0977 con=0.0169
epoch 5 step 2600/2813 loss=0.0776 reg=0.0978 con=0.0167
epoch 5 step 2800/2813 loss=0.0773 reg=0.0976 con=0.0167

📊 Evaluating epoch 5...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 5 completed:
   Val   -> SMAPE: 43.8980 | MAE: 9.85 | RMSE: 22.70 | R²: 0.5024

💾 Saved epoch 5 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_5.pt
⭐ New best model! Val SMAPE: 43.8980


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 6 step 200/2813 loss=0.0628 reg=0.0792 con=0.0136
epoch 6 step 400/2813 loss=0.0631 reg=0.0794 con=0.0143
epoch 6 step 600/2813 loss=0.0623 reg=0.0784 con=0.0142
epoch 6 step 800/2813 loss=0.0623 reg=0.0784 con=0.0140
epoch 6 step 1000/2813 loss=0.0628 reg=0.0791 con=0.0140
epoch 6 step 1200/2813 loss=0.0631 reg=0.0794 con=0.0142
epoch 6 step 1400/2813 loss=0.0632 reg=0.0795 con=0.0142
epoch 6 step 1600/2813 loss=0.0629 reg=0.0791 con=0.0142
epoch 6 step 1800/2813 loss=0.0627 reg=0.0789 con=0.0141
epoch 6 step 2000/2813 loss=0.0623 reg=0.0784 con=0.0139
epoch 6 step 2200/2813 loss=0.0620 reg=0.0781 con=0.0139
epoch 6 step 2400/2813 loss=0.0619 reg=0.0779 con=0.0138
epoch 6 step 2600/2813 loss=0.0621 reg=0.0782 con=0.0138
epoch 6 step 2800/2813 loss=0.0625 reg=0.0787 con=0.0138

📊 Evaluating epoch 6...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 6 completed:
   Val   -> SMAPE: 44.2246 | MAE: 10.10 | RMSE: 23.40 | R²: 0.4714

💾 Saved epoch 6 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_6.pt


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 7 step 200/2813 loss=0.0673 reg=0.0846 con=0.0155
epoch 7 step 400/2813 loss=0.0654 reg=0.0822 con=0.0149
epoch 7 step 600/2813 loss=0.0659 reg=0.0831 con=0.0143
epoch 7 step 800/2813 loss=0.0659 reg=0.0832 con=0.0142
epoch 7 step 1000/2813 loss=0.0655 reg=0.0827 con=0.0140
epoch 7 step 1200/2813 loss=0.0655 reg=0.0827 con=0.0139
epoch 7 step 1400/2813 loss=0.0656 reg=0.0828 con=0.0140
epoch 7 step 1600/2813 loss=0.0657 reg=0.0829 con=0.0142
epoch 7 step 1800/2813 loss=0.0656 reg=0.0827 con=0.0142
epoch 7 step 2000/2813 loss=0.0653 reg=0.0823 con=0.0143
epoch 7 step 2200/2813 loss=0.0651 reg=0.0821 con=0.0141
epoch 7 step 2400/2813 loss=0.0647 reg=0.0816 con=0.0139
epoch 7 step 2600/2813 loss=0.0644 reg=0.0813 con=0.0138
epoch 7 step 2800/2813 loss=0.0640 reg=0.0808 con=0.0137

📊 Evaluating epoch 7...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 7 completed:
   Val   -> SMAPE: 43.3284 | MAE: 9.68 | RMSE: 22.28 | R²: 0.5206

💾 Saved epoch 7 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_7.pt
⭐ New best model! Val SMAPE: 43.3284


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 8 step 200/2813 loss=0.0637 reg=0.0807 con=0.0125
epoch 8 step 400/2813 loss=0.0638 reg=0.0810 con=0.0121
epoch 8 step 600/2813 loss=0.0638 reg=0.0811 con=0.0121
epoch 8 step 800/2813 loss=0.0642 reg=0.0812 con=0.0131
epoch 8 step 1000/2813 loss=0.0644 reg=0.0814 con=0.0132
epoch 8 step 1200/2813 loss=0.0644 reg=0.0815 con=0.0132
epoch 8 step 1400/2813 loss=0.0643 reg=0.0814 con=0.0130
epoch 8 step 1600/2813 loss=0.0642 reg=0.0813 con=0.0130
epoch 8 step 1800/2813 loss=0.0641 reg=0.0811 con=0.0131
epoch 8 step 2000/2813 loss=0.0640 reg=0.0811 con=0.0129
epoch 8 step 2200/2813 loss=0.0640 reg=0.0810 con=0.0129
epoch 8 step 2400/2813 loss=0.0639 reg=0.0809 con=0.0128
epoch 8 step 2600/2813 loss=0.0637 reg=0.0807 con=0.0127
epoch 8 step 2800/2813 loss=0.0634 reg=0.0803 con=0.0127

📊 Evaluating epoch 8...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 8 completed:
   Val   -> SMAPE: 42.7806 | MAE: 9.59 | RMSE: 22.54 | R²: 0.5095

💾 Saved epoch 8 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_8.pt
⭐ New best model! Val SMAPE: 42.7806


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 9 step 200/2813 loss=0.0556 reg=0.0707 con=0.0104
epoch 9 step 400/2813 loss=0.0566 reg=0.0719 con=0.0109
epoch 9 step 600/2813 loss=0.0564 reg=0.0717 con=0.0106
epoch 9 step 800/2813 loss=0.0561 reg=0.0714 con=0.0105
epoch 9 step 1000/2813 loss=0.0562 reg=0.0714 con=0.0106
epoch 9 step 1200/2813 loss=0.0563 reg=0.0715 con=0.0106
epoch 9 step 1400/2813 loss=0.0560 reg=0.0711 con=0.0105
epoch 9 step 1600/2813 loss=0.0558 reg=0.0709 con=0.0105
epoch 9 step 1800/2813 loss=0.0557 reg=0.0708 con=0.0103
epoch 9 step 2000/2813 loss=0.0555 reg=0.0706 con=0.0104
epoch 9 step 2200/2813 loss=0.0554 reg=0.0703 con=0.0104
epoch 9 step 2400/2813 loss=0.0552 reg=0.0701 con=0.0104
epoch 9 step 2600/2813 loss=0.0551 reg=0.0700 con=0.0103
epoch 9 step 2800/2813 loss=0.0550 reg=0.0699 con=0.0104

📊 Evaluating epoch 9...


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


✅ Epoch 9 completed:
   Val   -> SMAPE: 42.3347 | MAE: 9.47 | RMSE: 22.35 | R²: 0.5175

💾 Saved epoch 9 checkpoint to: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_9.pt
⭐ New best model! Val SMAPE: 42.3347


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


epoch 10 step 200/2813 loss=0.0502 reg=0.0637 con=0.0098


In [None]:
# train_loader = DataLoader(
#     train_ds, 
#     batch_size=48, 
#     shuffle=True,
#     num_workers=2, 
#     pin_memory=True, 
#     collate_fn=collate
# )
# val_loader = DataLoader(
#     val_ds, 
#     batch_size=48, 
#     shuffle=False,  # No shuffle for validation
#     num_workers=2, 
#     pin_memory=True, 
#     collate_fn=collate
# )

# for epoch in range(1):
#     hybrid_model.train()
#     loss_run = reg_run = con_run = 0.0

#     optimizer.zero_grad(set_to_none=True)

#     for step, batch in enumerate(train_loader, 1):
#         clip_input_ids = batch["clip_input_ids"].to(device, non_blocking=True)
#         clip_attention_mask = batch["clip_attention_mask"].to(device, non_blocking=True)
#         bert_input_ids = batch["bert_input_ids"].to(device, non_blocking=True)
#         bert_attention_mask = batch["bert_attention_mask"].to(device, non_blocking=True)
#         targets = batch["target"].to(device, non_blocking=True).float()
#         img_missing = batch["img_missing"].to(device)

#         with torch.cuda.amp.autocast(enabled=FP16):
#             # Get predictions from hybrid model
#             pixel_values = batch["pixel_values"].to(
#                 device,
#                 dtype=next(clip_model.vision_model.parameters()).dtype,
#                 non_blocking=True
#             )
            
#             pred_log, clip_txt_norm, clip_img_norm, bert_attended = hybrid_model(
#                 clip_input_ids, clip_attention_mask,
#                 bert_input_ids, bert_attention_mask,
#                 pixel_values, img_missing
#             )
            
#             # Regression loss
#             reg_loss = huber_loss(pred_log, targets, delta=HUBER_DELTA)

#             # Contrastive loss between CLIP image and text features
#             con_loss = torch.tensor(0.0, device=device, dtype=clip_txt_norm.dtype)
#             valid_idx = (img_missing == 0).nonzero(as_tuple=False).squeeze(-1)
#             if valid_idx.numel() > 1 and ALPHA_CONTRAST > 0:
#                 con_loss = info_nce(clip_img_norm[valid_idx], clip_txt_norm[valid_idx], tau=TAU)

#             loss = (1.0 - ALPHA_CONTRAST) * reg_loss + ALPHA_CONTRAST * con_loss

#         scaler.scale(loss).backward()

#         if step % GRAD_ACCUM == 0:
#             scaler.unscale_(optimizer)
#             nn.utils.clip_grad_norm_(
#                 list(clip_model.parameters()) + 
#                 list(bert_model.parameters()) + 
#                 list(hybrid_model.parameters()), 
#                 MAX_GRAD_NORM
#             )
#             scaler.step(optimizer)
#             scaler.update()
#             optimizer.zero_grad(set_to_none=True)
#             scheduler.step()

#         loss_run += float(loss.item())
#         reg_run  += float(reg_loss.item())
#         con_run  += float(con_loss.item()) if isinstance(con_loss, torch.Tensor) else float(con_loss)

#         if step % 200 == 0:
#             print(f"epoch {epoch} step {step}/{len(train_loader)} "
#                   f"loss={loss_run/step:.4f} reg={reg_run/step:.4f} con={con_run/step:.4f}")

#     # Evaluate SMAPE at end of epoch
#     smape_score = evaluate_smape(hybrid_model, val_loader, device, epoch)
#     print(f"✅ Epoch {epoch} done. avg_loss={loss_run/max(1,len(train_loader)):.4f} | SMAPE={smape_score:.4f}")

# # Save final full-data checkpoint
# full_ckpt = os.path.join(OUTPUT_DIR, "clip_bert_hybrid.pt")
# torch.save(
#     {
#         "clip_state": clip_model.state_dict(),
#         "bert_state": bert_model.state_dict(),
#         "hybrid_state": hybrid_model.state_dict(),
#         "clip_model_id": CLIP_MODEL_ID,
#         "bert_model_id": BERT_MODEL_ID,
#         "config": {
#             "ALPHA_CONTRAST": ALPHA_CONTRAST,
#             "TAU": TAU,
#             "MAX_LEN": MAX_LEN,
#             "clip_projection_dim": clip_model.config.projection_dim,
#             "bert_hidden_size": bert_model.config.hidden_size,
#             "num_attention_heads": NUM_ATTENTION_HEADS,
#             "attention_dropout": ATTENTION_DROPOUT,
#             "IMG_MISSING_POLICY": IMG_MISSING_POLICY,
#             "HUBER_DELTA": HUBER_DELTA
#         },
#         "columns": {"id": ID_COL, "text": TEXT_COL, "image": IMG_COL, "price": PRICE_COL},
#     },
#     full_ckpt
# )
# print(f"💾 Saved hybrid model checkpoint to: {full_ckpt}")

# with open(os.path.join(OUTPUT_DIR, "metrics_hybrid_train.json"), "w") as f:
#     json.dump({
#         "train_rows": int(len(df)),
#         "train_missing_images": int(train_ds.missing_img_count),
#         "dropped_train": int(getattr(train_ds, "dropped_missing", 0)),
#         "epochs": EPOCHS,
#         "batch_size": BATCH_SIZE,
#         "lr": LR,
#         "weight_decay": WEIGHT_DECAY,
#         "grad_accum": GRAD_ACCUM,
#         "fp16": FP16,
#         "clip_model": CLIP_MODEL_ID,
#         "bert_model": BERT_MODEL_ID,
#         "num_attention_heads": NUM_ATTENTION_HEADS,
#     }, f, indent=2)

In [13]:

# --------------------------- INFERENCE (TEST) ---------------------------
def build_test_df(path: str, id_col: str, text_col: str, img_col: str, fallback_img_dir: str) -> pd.DataFrame:
    dft = pd.read_csv(path)
    # ensure id col exists
    if id_col not in dft.columns:
        raise ValueError(f"ID_COL '{id_col}' not in TEST_CSV columns={dft.columns.tolist()}")

    # text column can be absent; if so, create empty
    if text_col not in dft.columns:
        dft[text_col] = ""

    # image path column: create if absent
    if img_col not in dft.columns:
        dft[img_col] = dft[id_col].astype(str).apply(lambda x: os.path.join(fallback_img_dir, f"{x}.jpg"))

    dft[text_col] = dft[text_col].fillna("").astype(str).str.strip()
    dft[img_col]  = dft[img_col].fillna("").astype(str)
    return dft

@torch.no_grad()
def infer_predictions(hybrid_model: ClipBertHybridModel, df_test: pd.DataFrame,
                      text_col: str, img_col: str, batch_size: int = 64) -> np.ndarray:
    hybrid_model.eval()
    ds = ClipBertPriceDataset(df_test, text_col, img_col, prices_log2=None,
                             clip_processor=clip_processor, bert_tokenizer=bert_tokenizer,
                             max_len=MAX_LEN, policy="zero", is_test=True)
    collate = CollateClipBert(clip_processor, bert_tokenizer)
    dl = DataLoader(ds, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True, collate_fn=collate)

    preds_log = []
    for batch in dl:
        clip_input_ids = batch["clip_input_ids"].to(device, non_blocking=True)
        clip_attention_mask = batch["clip_attention_mask"].to(device, non_blocking=True)
        bert_input_ids = batch["bert_input_ids"].to(device, non_blocking=True)
        bert_attention_mask = batch["bert_attention_mask"].to(device, non_blocking=True)
        img_missing = batch["img_missing"].to(device)

        pixel_values = batch["pixel_values"].to(
            device,
            dtype=next(clip_model.vision_model.parameters()).dtype,
            non_blocking=True
        )
        
        pred_log, _, _, _ = hybrid_model(
            clip_input_ids, clip_attention_mask,
            bert_input_ids, bert_attention_mask,
            pixel_values, img_missing
        )
        preds_log.append(pred_log.detach().float().cpu().numpy())

    preds_log = np.concatenate(preds_log, axis=0) if len(preds_log) else np.array([])
    return preds_log, ds.ids, ds.missing_img_count

if TEST_CSV:
    print(f"🔮 Loading TEST_CSV: {TEST_CSV}")
    dft = build_test_df(TEST_CSV, ID_COL, TEXT_COL, IMG_COL, TEST_IMG_DIR)
    print(f"📝 Test rows: {len(dft)}")

    preds_log, test_ids, miss_count = infer_predictions(hybrid_model, dft, TEXT_COL, IMG_COL, batch_size=max(32, BATCH_SIZE))
    if preds_log.size == 0:
        print("⚠️ No predictions generated for test.")
    else:
        pred_price = delog2(preds_log.reshape(-1))
        out_df = pd.DataFrame({ID_COL: test_ids, "price": pred_price})
        out_path = os.path.join(OUTPUT_DIR, "test_predictions.csv")
        out_df.to_csv(out_path, index=False)
        print(f"📤 Saved predictions to: {out_path}")
        print(f"⚠️ Test missing images encountered: {miss_count}")
else:
    print("ℹ️ TEST_CSV not set — skipping inference.")
# %%

🔮 Loading TEST_CSV: test_updated.csv
📝 Test rows: 75000


You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.

KeyboardInterrupt



In [11]:
# %% [markdown]
# --- Inference: load best checkpoint and predict on TEST_CSV ---

# %%
import os, json, math
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from transformers import CLIPModel, AutoProcessor

# ---- Config / paths ----
TEST_CSV        = os.environ.get("TEST_CSV", "test_updated.csv")   # must contain ID + text + image path
ID_COL          = os.environ.get("ID_COL", "sample_id")
TEXT_COL        = os.environ.get("TEXT_COL", "catalog_content")
IMG_COL         = os.environ.get("IMG_COL",  "image_path")

OUTPUT_DIR      = os.environ.get("OUTPUT_DIR", "price_large_clip_infloat_bert_hybrid_10")
CKPT_PATH       = os.environ.get("CKPT_PATH", os.path.join(OUTPUT_DIR, "clip_bert_hybrid_epoch_10.pt"))

BATCH_SIZE      = int(os.environ.get("INF_BATCH_SIZE", "64"))
MAX_LEN_ENV     = os.environ.get("MAX_LEN", None)  # if you want to override tokenizer max len
FP16            = os.environ.get("FP16", "true").lower() == "true"

assert os.path.exists(CKPT_PATH), f"Checkpoint not found at {CKPT_PATH}"
assert os.path.exists(TEST_CSV),  f"Test CSV not found at {TEST_CSV}"

device = "cuda" if torch.cuda.is_available() else "cpu"

# ---- Load checkpoint ----
ckpt = torch.load(CKPT_PATH, map_location="cpu")
clip_model_id = ckpt.get("clip_model_id", "openai/clip-vit-large-patch14")
bert_model_id = ckpt.get("bert_model_id", "distilbert/distilbert-base-cased")
cfg = ckpt.get("config", {})
clip_projection_dim = cfg.get("clip_projection_dim")
bert_hidden_size = cfg.get("bert_hidden_size")
num_attention_heads = cfg.get("num_attention_heads", 8)
attention_dropout = cfg.get("attention_dropout", 0.1)
img_missing_policy = cfg.get("IMG_MISSING_POLICY", "zero")
max_len = int(cfg.get("MAX_LEN", 64)) if MAX_LEN_ENV is None else int(MAX_LEN_ENV)

print(f"📦 Loaded checkpoint from: {CKPT_PATH}")
print(f"🔤 CLIP_MODEL={clip_model_id} | BERT_MODEL={bert_model_id}")
print(f"🔤 clip_dim={clip_projection_dim} | bert_dim={bert_hidden_size} | heads={num_attention_heads} | MAX_LEN={max_len}")

# ---- Recreate processors & models ----
clip_processor = AutoProcessor.from_pretrained(clip_model_id)
clip_model = CLIPModel.from_pretrained(clip_model_id)
clip_model.load_state_dict(ckpt["clip_state"], strict=True)
clip_model.to(device).eval()

bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_id)
bert_model = AutoModel.from_pretrained(bert_model_id)
bert_model.load_state_dict(ckpt["bert_state"], strict=True)
bert_model.to(device).eval()

# Recreate and load hybrid model
hybrid_model = ClipBertHybridModel(
    clip_model=clip_model,
    bert_model=bert_model,
    clip_dim=clip_projection_dim,
    bert_dim=bert_hidden_size,
    num_attention_heads=num_attention_heads,
    dropout=0.0  # No dropout during inference
)
hybrid_model.load_state_dict(ckpt["hybrid_state"], strict=True)
hybrid_model.to(device).eval()

# ---- Load test data ----
dft = pd.read_csv(TEST_CSV)
dft["image_path"] = dft["sample_id"].apply(lambda x : f"jl_fs/images/test/{x}.jpg")
for col, name in [(ID_COL, "ID_COL"), (TEXT_COL, "TEXT_COL"), (IMG_COL, "IMG_COL")]:
    if col not in dft.columns:
        raise ValueError(f"{name} '{col}' missing from test CSV. Columns={dft.columns.tolist()}")

# Basic clean
dft[TEXT_COL] = dft[TEXT_COL].fillna("").astype(str).str.strip()
dft[IMG_COL]  = dft[IMG_COL].fillna("").astype(str)

# Build dataset/dataloader with no targets
test_ds = ClipBertPriceDataset(
    df=dft[[ID_COL, TEXT_COL, IMG_COL]].copy(),
    text_col=TEXT_COL,
    img_col=IMG_COL,
    prices_log2=None,
    clip_processor=clip_processor,
    bert_tokenizer=bert_tokenizer,
    max_len=max_len,
    policy=img_missing_policy
)
collate = CollateClipBert(clip_processor, bert_tokenizer)

dl_te = DataLoader(
    test_ds, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=2, pin_memory=True, collate_fn=collate
)

print(f"🖥 Device: {device}")
print(f"🧪 Test rows: {len(test_ds)} | Missing images encountered (during getitem): {test_ds.missing_img_count}")
print(f"🗑 Dropped due to policy=drop: {getattr(test_ds, 'dropped_missing', 0)}")

# ---- Inference loop ----
clip_model_dtype = next(clip_model.vision_model.parameters()).dtype
preds_log2 = []

with torch.no_grad():
    for batch in tqdm(dl_te, total = len(dl_te)):
        clip_input_ids = batch["clip_input_ids"].to(device, non_blocking=True)
        clip_attention_mask = batch["clip_attention_mask"].to(device, non_blocking=True)
        bert_input_ids = batch["bert_input_ids"].to(device, non_blocking=True)
        bert_attention_mask = batch["bert_attention_mask"].to(device, non_blocking=True)
        img_missing = batch["img_missing"].to(device)

        # Get predictions from hybrid model
        with torch.cuda.amp.autocast(enabled=FP16):
            pixel_values = batch["pixel_values"].to(device, dtype=clip_model_dtype, non_blocking=True)
            
            pred_log, _, _, _ = hybrid_model(
                clip_input_ids, clip_attention_mask,
                bert_input_ids, bert_attention_mask,
                pixel_values, img_missing
            )
            preds_log2.append(pred_log.detach().float().cpu().numpy())

📦 Loaded checkpoint from: price_large_clip_infloat_bert_hybrid_10/clip_bert_hybrid_epoch_10.pt
🔤 CLIP_MODEL=openai/clip-vit-large-patch14 | BERT_MODEL=intfloat/e5-base-v2
🔤 clip_dim=768 | bert_dim=768 | heads=8 | MAX_LEN=77
🖥 Device: cuda
🧪 Test rows: 75000 | Missing images encountered (during getitem): 0
🗑 Dropped due to policy=drop: 0


  0%|          | 0/1172 [00:00<?, ?it/s]

You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a CLIPTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  with torch.cuda.amp.autocast(enabled=FP16):


In [12]:
# ---- Convert back to price (delog2) and save ----
if len(preds_log2):
    preds_log2 = np.concatenate(preds_log2, axis=0)
    preds_log2_delog = np.pow(2,preds_log2)
    preds_price = np.clip(preds_log2_delog, 0,10000)  # safe de-log clamp
else:
    preds_price = np.array([])

out = pd.DataFrame({
    ID_COL: dft[ID_COL].values[: len(preds_price)],
    "price": preds_price
})
pred_path = os.path.join(OUTPUT_DIR, "test_predictions_large_clip_infloat_bert_hybrid_smapeloss.csv")
os.makedirs(OUTPUT_DIR, exist_ok=True)
out.to_csv(pred_path, index=False)

print(f"✅ Done. Wrote {len(out)} predictions to: {pred_path}")
print(f"   Missing images counted during dataset load: {test_ds.missing_img_count}")
print(f"   Dropped rows (policy=drop): {getattr(test_ds,'dropped_missing',0)}")

✅ Done. Wrote 75000 predictions to: price_large_clip_infloat_bert_hybrid_10/test_predictions_large_clip_infloat_bert_hybrid_smapeloss.csv
   Missing images counted during dataset load: 0
   Dropped rows (policy=drop): 0


# CLIP-BERT Hybrid Model Architecture Summary

## Key Changes Made:

### 1. **Dual Model Architecture**
- **CLIP Base Model**: `openai/clip-vit-base-patch32` (smaller, faster than large)
- **BERT Model**: `bert-base-uncased` for enhanced text understanding
- Both models are trained end-to-end

### 2. **Cross-Attention Mechanism**
- **Multi-Head Cross-Attention**: 8 attention heads by default
- **Bidirectional Attention**: 
  - BERT features attend to CLIP text features
  - CLIP text features attend to BERT features
- **Feature Fusion**: Concatenates CLIP image + attended CLIP text + attended BERT features

### 3. **Enhanced Dataset & Collation**
- **Dual Tokenization**: Both CLIP and BERT tokenizers process the same text
- **Separate Input Streams**: `clip_input_ids`, `bert_input_ids`, etc.
- **Unified Collation**: Handles both tokenization schemes in batch processing

### 4. **SMAPE Tracking**
- **Per-Epoch Evaluation**: SMAPE calculated on training subset every epoch
- **Real-time Monitoring**: Track model performance during training
- **Early Stopping Potential**: Can be used to prevent overfitting

### 5. **Model Architecture Flow**
```
Text Input → [CLIP Tokenizer + BERT Tokenizer]
Image Input → [CLIP Vision Encoder]
                    ↓
            [CLIP Text Features] ←→ [BERT Features] (Cross-Attention)
                    ↓
            [CLIP Image Features] + [Attended Features] → [Regression Head] → Price Prediction
```

### 6. **Training Improvements**
- **Contrastive Loss**: Between CLIP image and text features
- **Huber Loss**: Robust regression loss for price prediction
- **Gradient Clipping**: Prevents exploding gradients
- **Mixed Precision**: FP16 training for efficiency

### 7. **Configuration Options**
- `NUM_ATTENTION_HEADS`: Number of cross-attention heads (default: 8)
- `ATTENTION_DROPOUT`: Dropout rate for attention layers (default: 0.1)
- `ALPHA_CONTRAST`: Weight for contrastive loss (default: 0.25)
- `TAU`: Temperature for InfoNCE loss (default: 0.07)

This hybrid approach leverages the strengths of both CLIP (vision-language alignment) and BERT (deep text understanding) while using cross-attention to create rich, contextually-aware feature representations for price prediction.
