In [None]:
import os
from pathlib import Path
from typing import List


import numpy as np
import pandas as pd
from PIL import Image


import torch
from transformers import CLIPProcessor, CLIPModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
def ensure_columns(df: pd.DataFrame):
  """Check that `df` contains the three required columns.

  The columns 'catalog_content_clean', 'image_name' and 'price_log' are
  checked in that order.

  Raises:
    ValueError: naming the first required column that is absent.
  """
  expected = ('catalog_content_clean', 'image_name', 'price_log')
  # Pick the first absent column (if any) so the error names the same one a
  # sequential scan would report.
  absent = next((name for name in expected if name not in df.columns), None)
  if absent is not None:
    raise ValueError(f"Required column '{absent}' not found in CSV")




def make_placeholder_image(size=(224, 224)) -> Image.Image:
  """Build a solid black RGB image of the given size.

  Used as the stand-in when a product image is missing or unreadable.
  """
  black = (0, 0, 0)
  placeholder = Image.new('RGB', size, black)
  return placeholder




def load_image(image_path: str, resize_to=(224, 224)) -> Image.Image:
  """Load an image as RGB and resize it, falling back to a black placeholder.

  Args:
    image_path: Path of the image file to open.
    resize_to: Target size handed to PIL's ``resize``; defaults to (224, 224).

  Returns:
    The RGB image resized to ``resize_to``. If the file cannot be opened or
    decoded for any reason, a black placeholder of size ``resize_to`` is
    returned instead (best-effort: no exception escapes).
  """
  try:
    # The context manager releases the underlying file handle as soon as the
    # pixel data has been copied out by convert().
    with Image.open(image_path) as source:
      rgb = source.convert('RGB')
    # Deterministic bicubic resize to the requested target size.
    return rgb.resize(resize_to, resample=Image.BICUBIC)
  except Exception:
    # Deliberate best-effort: missing or corrupt files become black images.
    return make_placeholder_image(size=resize_to)



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import pandas as pd
import os
import numpy as np
from tqdm import tqdm

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import pandas as pd
import os
import numpy as np
from tqdm import tqdm


class ProductDataset(Dataset):
    """Dataset for image-text pairs from e-commerce products.

    Each item is a dict with the processor outputs 'input_ids',
    'attention_mask' and 'pixel_values', with the leading batch dimension
    squeezed away.

    Args:
        df: DataFrame holding an 'image_name' column and the text column.
            The index is reset so items are addressed positionally.
        images_dir: Directory that is joined with each 'image_name' value.
        processor: Callable invoked as ``processor(text=[...], images=...,
            return_tensors='pt', ...)`` (a CLIPProcessor in this notebook).
        resize_to: Size every image is resized to before processing.
        text_column: Name of the column that supplies the text. Defaults to
            'item_name' (the previous hard-coded behaviour); pass e.g.
            'catalog_content_clean' to train on the same text used for the
            embedding cells.

    Raises:
        KeyError: if ``text_column`` or 'image_name' is missing from ``df``.
            Raised at construction so the problem surfaces before any
            DataLoader worker is started.
    """

    def __init__(self, df, images_dir, processor, resize_to=(224, 224),
                 text_column='item_name'):
        missing = [col for col in (text_column, 'image_name') if col not in df.columns]
        if missing:
            raise KeyError(f"ProductDataset requires column(s) {missing} in df")
        self.df = df.reset_index(drop=True)
        self.images_dir = images_dir
        self.processor = processor
        self.resize_to = resize_to
        self.text_column = text_column

    def __len__(self):
        return len(self.df)

    def _row_text(self, row):
        """Return the row's text as a string, or "" when the value is missing."""
        value = row[self.text_column]
        return str(value) if pd.notna(value) else ""

    def _load_image(self, img_path):
        """Open, RGB-convert and resize an image; black placeholder on any failure."""
        try:
            # Context manager closes the file handle once the pixels are copied.
            with Image.open(img_path) as source:
                rgb = source.convert('RGB')
            return rgb.resize(self.resize_to, resample=Image.BICUBIC)
        except Exception:
            # Best-effort: missing/corrupt files must not abort an epoch.
            return Image.new('RGB', self.resize_to, (0, 0, 0))

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text = self._row_text(row)
        image = self._load_image(os.path.join(self.images_dir, str(row['image_name'])))

        # Pad/truncate every text to exactly 77 tokens so all items have the
        # same tensor shapes.
        inputs = self.processor(
            text=[text],
            images=image,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=77
        )

        # Remove the batch dimension added by the processor.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'pixel_values': inputs['pixel_values'].squeeze(0)
        }

In [None]:
def finetune_clip(df: pd.DataFrame,
                  images_dir: str,
                  model_name: str = 'openai/clip-vit-base-patch32',
                  output_dir: str = 'finetuned_clip',
                  batch_size: int = 32,
                  num_epochs: int = 5,
                  learning_rate: float = 5e-6,
                  train_split: float = 0.9,
                  temperature: float = 0.07,
                  device: str = None):
    """
    Fine-tune CLIP model on e-commerce product images and descriptions.

    The first ``train_split`` fraction of ``df`` (in row order, no shuffling
    of the split) is used for training and the remainder for validation.
    After every epoch the model and processor are saved to ``output_dir``
    whenever the validation loss improves.

    Relies on ``ProductDataset``, ``train_epoch`` and ``validate`` being
    defined elsewhere in the notebook.

    Args:
        df: DataFrame with an 'image_name' column plus the text column read
            by ProductDataset ('item_name' by default in this notebook)
        images_dir: Directory containing product images
        model_name: Pretrained CLIP model name
        output_dir: Where to save the fine-tuned model
        batch_size: Training batch size
        num_epochs: Number of training epochs
        learning_rate: Learning rate for optimizer
        train_split: Fraction of data to use for training (rest for validation);
            must lie in (0, 1]
        temperature: Temperature parameter for contrastive loss (forwarded to
            train_epoch / validate)
        device: Device to use ('cuda' or 'cpu'); auto-detected when None

    Returns:
        The in-memory model as it stands after the final epoch. Note this is
        not necessarily the best checkpoint: the best-validation weights are
        the ones saved in ``output_dir``.

    Raises:
        ValueError: if ``train_split`` is outside (0, 1] or the resulting
            training split is empty.
    """
    # Validate the split before the (slow) model download/load so bad input
    # fails immediately with a clear message instead of inside the sampler.
    if not 0.0 < train_split <= 1.0:
        raise ValueError(f"train_split must be in (0, 1], got {train_split}")
    n = len(df)
    n_train = int(n * train_split)
    if n_train == 0:
        raise ValueError(
            f"Training split is empty (rows={n}, train_split={train_split})"
        )

    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load model and processor
    print("Loading pretrained CLIP model...")
    model = CLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)
    model.to(device)

    # Split data positionally: leading rows train, trailing rows validate
    train_df = df.iloc[:n_train]
    val_df = df.iloc[n_train:]

    print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

    # Create datasets and dataloaders
    train_dataset = ProductDataset(train_df, images_dir, processor)
    val_dataset = ProductDataset(val_df, images_dir, processor)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # AdamW over every model parameter: nothing is frozen here, so all
    # parameters the model exposes are updated.
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        print(f"\nEpoch {epoch + 1}/{num_epochs}")

        train_loss = train_epoch(model, train_loader, optimizer, device, temperature)
        val_loss = validate(model, val_loader, device, temperature)

        print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        # Checkpoint only when validation improves, so output_dir always
        # holds the best model seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            os.makedirs(output_dir, exist_ok=True)
            model.save_pretrained(output_dir)
            processor.save_pretrained(output_dir)
            print(f"Saved best model to {output_dir}")

    print("\nTraining complete!")
    return model


In [None]:

def compute_embeddings_with_finetuned_model(df: pd.DataFrame,
                                            images_dir: str,
                                            model_path: str,
                                            batch_size: int = 32,
                                            device: str = None,
                                            out_dir: str = 'outputs_finetuned'):
    """
    Compute embeddings using the fine-tuned model.

    Rows are processed in order in batches of ``batch_size``. Text comes from
    the 'catalog_content_clean' column (missing values become ''), images from
    ``images_dir`` joined with the 'image_name' column. Images that cannot be
    loaded are replaced by a black 224x224 placeholder; the number of such
    replacements is reported at the end. Both embedding matrices are
    L2-normalized row-wise and saved as ``text_embeddings_finetuned.npy`` and
    ``image_embeddings_finetuned.npy`` in ``out_dir``.

    Args:
        df: DataFrame with product data
        images_dir: Directory containing images
        model_path: Path to fine-tuned model
        batch_size: Batch size for processing
        device: Device to use; auto-detected ('cuda' if available) when None
        out_dir: Output directory for embeddings

    Returns:
        text_embeddings, image_embeddings (one row per row of ``df``; for an
        empty ``df`` both have shape (0, projection_dim))
    """
    device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    # Load fine-tuned model
    print(f"Loading fine-tuned model from {model_path}...")
    model = CLIPModel.from_pretrained(model_path)
    processor = CLIPProcessor.from_pretrained(model_path)
    model.to(device)
    model.eval()

    # Count of images replaced by placeholders, so silent data problems
    # (wrong directory, unmounted drive, corrupt files) become visible.
    placeholder_count = 0

    def load_image_safe(path: str, resize_to=(224, 224)):
        nonlocal placeholder_count
        try:
            # Context manager closes the file handle once pixels are copied.
            with Image.open(path) as source:
                rgb = source.convert('RGB')
            return rgb.resize(resize_to, resample=Image.BICUBIC)
        except Exception:
            placeholder_count += 1
            return Image.new('RGB', resize_to, (0, 0, 0))

    text_embs = []
    image_embs = []

    for idx in tqdm(range(0, len(df), batch_size), desc="Computing embeddings"):
        end = min(len(df), idx + batch_size)
        batch = df.iloc[idx:end]

        texts = batch['catalog_content_clean'].fillna('').tolist()
        images = [load_image_safe(os.path.join(images_dir, str(fn)))
                 for fn in batch['image_name'].tolist()]

        inputs = processor(text=texts, images=images, return_tensors='pt',
                          padding=True, truncation=True)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        t_emb = outputs.text_embeds.detach().cpu().numpy()
        i_emb = outputs.image_embeds.detach().cpu().numpy()

        # L2 normalize each row; the epsilon guards against division by zero
        t_emb = t_emb / (np.linalg.norm(t_emb, axis=1, keepdims=True) + 1e-12)
        i_emb = i_emb / (np.linalg.norm(i_emb, axis=1, keepdims=True) + 1e-12)

        text_embs.append(t_emb)
        image_embs.append(i_emb)

    if text_embs:
        text_embeddings = np.vstack(text_embs)
        image_embeddings = np.vstack(image_embs)
    else:
        # np.vstack([]) raises, so build correctly-shaped empty outputs for an
        # empty DataFrame instead of crashing after the model was loaded.
        dim = model.config.projection_dim
        text_embeddings = np.empty((0, dim), dtype=np.float32)
        image_embeddings = np.empty((0, dim), dtype=np.float32)

    if placeholder_count:
        print(f"Warning: {placeholder_count} of {len(df)} images could not be loaded "
              f"and were replaced by black placeholders")

    # Save embeddings
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, 'text_embeddings_finetuned.npy'), text_embeddings)
    np.save(os.path.join(out_dir, 'image_embeddings_finetuned.npy'), image_embeddings)

    print(f"Saved embeddings. Shapes: {text_embeddings.shape}, {image_embeddings.shape}")
    return text_embeddings, image_embeddings

In [None]:
# Load the product catalog; later cells read the 'catalog_content_clean' and
# 'image_name' columns from this DataFrame.
df = pd.read_csv('catalog_25k.csv')

In [None]:
# Directory that is joined with each 'image_name' value to locate image files
# (presumably a mounted Google Drive folder — verify it is mounted before running).
images_dir = "/content/drive/MyDrive/images_train"

In [None]:
# Fine-tune CLIP on the catalog. finetune_clip saves the best-validation model
# and processor to output_dir and returns the in-memory model.
# NOTE(review): the embedding cell that follows loads its model from
# '/content/drive/MyDrive/finetuned_clip', not from this output_dir — confirm
# the checkpoint is copied there (or align the two paths).
finetuned_model = finetune_clip(
    df=df,
    images_dir=images_dir,
    num_epochs=5,
    batch_size=32,
    learning_rate=5e-6,
    output_dir='/content/fine_tune_model'
)

In [None]:
# Embed every catalog row (text + image) with the fine-tuned checkpoint.
# The arrays are also written to the function's default out_dir
# ('outputs_finetuned').
# NOTE(review): model_path differs from the output_dir used by the
# fine-tuning cell ('/content/fine_tune_model') — confirm this is intended.
text_embs, img_embs = compute_embeddings_with_finetuned_model(
    df=df,
    images_dir=images_dir,
    model_path='/content/drive/MyDrive/finetuned_clip'
)

Using device: cuda
Loading fine-tuned model from /content/drive/MyDrive/finetuned_clip...



Computing embeddings:   0%|          | 0/782 [00:00<?, ?it/s][A
Computing embeddings:   0%|          | 1/782 [00:01<15:59,  1.23s/it][A
Computing embeddings:   0%|          | 2/782 [00:02<17:38,  1.36s/it][A
Computing embeddings:   0%|          | 3/782 [00:05<23:26,  1.81s/it][A
Computing embeddings:   1%|          | 4/782 [00:06<23:12,  1.79s/it][A
Computing embeddings:   1%|          | 5/782 [00:08<24:13,  1.87s/it][A
Computing embeddings:   1%|          | 6/782 [00:10<22:35,  1.75s/it][A
Computing embeddings:   1%|          | 7/782 [00:12<22:50,  1.77s/it][A
Computing embeddings:   1%|          | 8/782 [00:14<23:32,  1.83s/it][A
Computing embeddings:   1%|          | 9/782 [00:15<21:51,  1.70s/it][A
Computing embeddings:   1%|▏         | 10/782 [00:17<23:11,  1.80s/it][A
Computing embeddings:   1%|▏         | 11/782 [00:19<25:31,  1.99s/it][A
Computing embeddings:   2%|▏         | 12/782 [00:21<23:39,  1.84s/it][A
Computing embeddings:   2%|▏         | 13/782 [00:22<21

In [None]:
def gated_fuse_embeddings(
    text_embeddings: np.ndarray,
    image_embeddings: np.ndarray,
    out_dir: str = 'outputs',
    gate_type: str = 'scalar',  # 'scalar' or 'vector'
    alpha=0.5
) -> np.ndarray:
    """
    Fuse text and image embeddings using gated fusion.

    The result is ``alpha * text_embeddings + (1 - alpha) * image_embeddings``.
    The gate is a fixed value supplied by the caller (nothing is learned
    here). The fused matrix is also saved as ``fused_x.npy`` in ``out_dir``.

    Parameters
    ----------
    text_embeddings : np.ndarray of shape (N, D_text)
    image_embeddings : np.ndarray of shape (N, D_img)
    out_dir : str
        Directory to save fused embeddings.
    gate_type : str
        'scalar': same gate for all dimensions (``alpha`` must be a scalar;
        the two matrices are combined element-wise, so their shapes must be
        broadcast-compatible)
        'vector': different gate per dimension (requires D_text == D_img;
        ``alpha`` may be a scalar or an array of shape (D,))
    alpha : float or array-like, default 0.5
        Weight given to the text embedding, in [0, 1]. The default reproduces
        the previous hard-coded equal weighting.

    Returns
    -------
    fused : np.ndarray
        Fused embeddings

    Raises
    ------
    ValueError
        If row counts differ, ``gate_type`` is unknown, ``alpha`` is outside
        [0, 1], or ``alpha`` has a shape the chosen ``gate_type`` does not
        accept.
    """
    if text_embeddings.shape[0] != image_embeddings.shape[0]:
        raise ValueError('Text and Image embeddings must have same number of rows')

    gate = np.asarray(alpha, dtype=float)
    if np.any(gate < 0.0) or np.any(gate > 1.0):
        raise ValueError('alpha must lie in [0, 1]')

    if gate_type == 'scalar':
        if gate.ndim != 0:
            raise ValueError("For scalar gating, alpha must be a single number")
        # A plain Python float keeps the embeddings' dtype (e.g. float32).
        weight = float(gate)
    elif gate_type == 'vector':
        if text_embeddings.shape[1] != image_embeddings.shape[1]:
            raise ValueError('For vector gating, text and image dims must match')
        if gate.ndim == 0:
            weight = float(gate)
        elif gate.ndim == 1 and gate.shape[0] == text_embeddings.shape[1]:
            # Match the embeddings' floating dtype so the per-dimension gate
            # does not silently upcast the result.
            target = np.result_type(text_embeddings.dtype, image_embeddings.dtype)
            weight = gate.astype(target) if np.issubdtype(target, np.floating) else gate
        else:
            raise ValueError('For vector gating, alpha must be a scalar or have shape (D,)')
    else:
        raise ValueError("gate_type must be 'scalar' or 'vector'")

    fused = weight * text_embeddings + (1 - weight) * image_embeddings

    # Save
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, 'fused_x.npy'), fused)
    print(f"Saved fused_x.npy with shape {fused.shape}")
    return fused

In [None]:
# Fuse the fine-tuned text and image embeddings with the equal-weight scalar
# gate; gated_fuse_embeddings also saves the result as outputs/fused_x.npy.
fused = gated_fuse_embeddings(text_embeddings=text_embs, image_embeddings=img_embs, out_dir='outputs', gate_type='scalar')

In [None]:
def robust_compute_embeddings(df: pd.DataFrame,
                              images_dir: str,
                              model_name: str = 'openai/clip-vit-base-patch32',
                              batch_size: int = 32,
                              device: str = None,
                              out_dir: str = 'outputs'):
    """Compute L2-normalized CLIP text and image embeddings for every row of `df`.

    Rows are embedded in batches of `batch_size`; when a batch raises, its rows
    are re-processed one at a time. Text is read from 'catalog_content_clean'
    (missing values become ''), images from `images_dir` joined with
    'image_name' (unreadable images become black 224x224 placeholders).
    Both matrices are saved to `out_dir` as 'text_embeddings.npy' and
    'image_embeddings.npy' and returned as (text_embeddings, image_embeddings).
    """
    if not device:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    model = CLIPModel.from_pretrained(model_name)
    processor = CLIPProcessor.from_pretrained(model_name)
    model.to(device)
    model.eval()

    def load_image_safe(path: str, resize_to=(224,224)):
        # Any failure to open/decode yields a black placeholder of the same size.
        try:
            opened = Image.open(path).convert('RGB')
            return opened.resize(resize_to, resample=Image.BICUBIC)
        except Exception:
            return Image.new('RGB', resize_to, (0,0,0))

    def embed(captions, pictures):
        # One forward pass; returns raw (text, image) embeddings as numpy arrays.
        encoded = processor(text=captions, images=pictures, return_tensors='pt', padding=True, truncation=True)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        with torch.no_grad():
            result = model(**encoded)
        return (result.text_embeds.detach().cpu().numpy(),
                result.image_embeds.detach().cpu().numpy())

    def unit_rows(matrix):
        # Row-wise L2 normalization; the epsilon avoids division by zero.
        return matrix / (np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-12)

    total_rows = len(df)
    text_chunks = []
    image_chunks = []
    start = 0

    while start < total_rows:
        stop = min(total_rows, start + batch_size)
        chunk = df.iloc[start:stop]

        captions = chunk['catalog_content_clean'].fillna('').tolist()
        pictures = [load_image_safe(os.path.join(images_dir, str(name)))
                    for name in chunk['image_name'].tolist()]

        try:
            raw_text, raw_image = embed(captions, pictures)

            # The model must return exactly one embedding per row of the batch.
            expected = stop - start
            if not (raw_text.shape[0] == expected and raw_image.shape[0] == expected):
                raise RuntimeError("batch output length mismatch")

            # Normalize both before storing either, so a failure here never
            # leaves the two lists out of step.
            unit_text = unit_rows(raw_text)
            unit_image = unit_rows(raw_image)

            text_chunks.append(unit_text)
            image_chunks.append(unit_image)
            print(f"Processed rows {start}..{stop-1}")

        except Exception as e:
            # Fallback: redo this batch row by row. An error on a single row
            # is not caught and propagates to the caller.
            print(f"Batch failed at rows {start}..{stop-1} with error: {e}. Falling back to single-item processing.")
            for position in range(start, stop):
                single = df.iloc[[position]]
                caption = single['catalog_content_clean'].fillna('').tolist()
                picture = load_image_safe(os.path.join(images_dir, str(single['image_name'].values[0])))
                raw_text, raw_image = embed(caption, [picture])
                unit_text = unit_rows(raw_text)
                unit_image = unit_rows(raw_image)
                text_chunks.append(unit_text)
                image_chunks.append(unit_image)
                print(f"Processed single row {position}")

        start = stop

    text_embeddings = np.vstack(text_chunks)
    image_embeddings = np.vstack(image_chunks)

    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, 'text_embeddings.npy'), text_embeddings)
    np.save(os.path.join(out_dir, 'image_embeddings.npy'), image_embeddings)
    print(f"Saved embeddings. shapes: {text_embeddings.shape}, {image_embeddings.shape}")
    return text_embeddings, image_embeddings

In [None]:
# Fuse the embeddings held in text_emb / img_emb with the equal-weight scalar
# gate and overwrite outputs/fused_x.npy.
# NOTE(review): text_emb and img_emb are not assigned by any earlier cell in
# this notebook; the only visible assignment is in the final cell, which loads
# them from outputs/text_embeddings.npy and outputs/image_embeddings.npy. On a
# fresh kernel this cell raises NameError unless those are loaded first.
fused = gated_fuse_embeddings(text_embeddings=text_emb, image_embeddings=img_emb, out_dir='outputs', gate_type='scalar')

Saved fused_x.npy with shape (50000, 512)


In [None]:
 # sanity-check shapes before training
 # The fused feature matrix and the target vector must have one row per sample.
 # NOTE(review): y is not defined in any visible cell — confirm where the
 # target vector is built (presumably from the 'price_log' column).
print('Shapes -> fused:', fused.shape, 'y:', y.shape)
assert fused.shape[0] == y.shape[0], 'Row count mismatch between fused features and target y. Aborting.'

Shapes -> fused: (50000, 512) y: (50000,)


In [None]:
# train and evaluate
# NOTE(review): train_and_evaluate, y, out_dir and seed are not defined in any
# visible cell of this notebook; on a fresh kernel this cell fails with
# NameError unless they are defined elsewhere first.
metrics = train_and_evaluate(fused, y, out_dir=out_dir, seed=seed)


Iteration 1, loss = 0.35368328
Validation score: 0.308386
Iteration 2, loss = 0.26331708
Validation score: 0.337818
Iteration 3, loss = 0.21889687
Validation score: 0.333717
Iteration 4, loss = 0.18110540
Validation score: 0.317339
Iteration 5, loss = 0.14854704
Validation score: 0.306425
Iteration 6, loss = 0.11550208
Validation score: 0.297766
Iteration 7, loss = 0.09315878
Validation score: 0.298476
Iteration 8, loss = 0.07429024
Validation score: 0.279432
Iteration 9, loss = 0.06067729
Validation score: 0.278972
Iteration 10, loss = 0.05301992
Validation score: 0.279647
Iteration 11, loss = 0.04624707
Validation score: 0.274169
Iteration 12, loss = 0.04275590
Validation score: 0.279820
Iteration 13, loss = 0.03814903
Validation score: 0.286585
Iteration 14, loss = 0.03432167
Validation score: 0.290033
Iteration 15, loss = 0.03224676
Validation score: 0.280038
Iteration 16, loss = 0.03174972
Validation score: 0.291243
Iteration 17, loss = 0.03151827
Validation score: 0.280407
Iterat

In [None]:
# Diagnostic cell: print the shapes of the catalog, the saved fused features,
# the saved raw embeddings and the target so row counts can be compared by eye.
# Side effect: rebinds the globals fused, text_emb and img_emb from disk.
print("df shape:", df.shape)                     # row count should equal the embedding row counts printed below
import numpy as np
try:
    fused = np.load('outputs/fused_x.npy')       # if you previously saved fused
    print("fused shape (loaded):", fused.shape)
except Exception as e:
    # Best-effort: report the problem and keep going so the other shapes still print.
    print("No fused_x.npy found or failed to load:", e)

try:
    # Files written by robust_compute_embeddings with its default out_dir='outputs'.
    text_emb = np.load('outputs/text_embeddings.npy')
    img_emb = np.load('outputs/image_embeddings.npy')
    print("text_emb shape:", text_emb.shape)
    print("image_emb shape:", img_emb.shape)
except Exception as e:
    print("Failed to load embeddings:", e)

# NOTE(review): y is not defined in any visible cell of this notebook.
print("y shape:", y.shape)

df shape: (50000, 5)
fused shape (loaded): (50000, 512)
text_emb shape: (50000, 512)
image_emb shape: (50000, 512)
y shape: (50000,)
