In [None]:
import os
from pathlib import Path
from typing import List


import numpy as np
import pandas as pd
from PIL import Image


import torch
from transformers import CLIPProcessor, CLIPModel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

In [None]:
def ensure_columns(df: pd.DataFrame):
  """Validate that the catalog frame carries every column the pipeline reads.

  The columns 'catalog_content_clean', 'image_name' and 'price_log' are
  checked in that order; a ValueError naming the first absent one is raised.
  Returns None when all three are present.
  """
  present = df.columns
  first_missing = next(
    (name for name in ('catalog_content_clean', 'image_name', 'price_log')
     if name not in present),
    None,
  )
  if first_missing is not None:
    raise ValueError(f"Required column '{first_missing}' not found in CSV")




def make_placeholder_image(size=(224, 224)) -> Image.Image:
  """Build a solid-black RGB image of the requested size.

  Serves as the stand-in for catalog images that are missing.
  """
  black_rgb = (0, 0, 0)
  placeholder = Image.new('RGB', size, black_rgb)
  return placeholder




def load_image(image_path: str, resize_to=(224, 224)) -> Image.Image:
  """Load an image as RGB and resize it, falling back to a black placeholder.

  Parameters
  ----------
  image_path : str
      Path of the image file to open.
  resize_to : tuple
      Target size handed to PIL's ``resize``; defaults to (224, 224).

  Returns
  -------
  PIL.Image.Image
      The resized RGB image, or a black placeholder of size ``resize_to``
      when the file cannot be opened, decoded or resized.
  """
  try:
    # The context manager releases the file handle as soon as convert() has
    # copied the pixel data; otherwise handles linger until garbage
    # collection, which can exhaust descriptors on large catalogs.
    with Image.open(image_path) as src:
      rgb = src.convert('RGB')
    # deterministic bicubic resize to `resize_to`
    return rgb.resize(resize_to, resample=Image.BICUBIC)
  except Exception:
    # Deliberate best-effort: a missing or corrupt image must not abort the
    # whole embedding run, so it is replaced by a placeholder.
    return make_placeholder_image(size=resize_to)



In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import CLIPProcessor, CLIPModel, get_scheduler
from tqdm import tqdm

def finetune_clip(df, images_dir, model_name='openai/clip-vit-base-patch32',
                  out_dir='finetuned_clip', batch_size=16, epochs=3, lr=5e-6):
    """Fine-tune the projection layers of a CLIP model and save the result.

    Parameters
    ----------
    df : catalog frame handed to ``CatalogDataset``.
    images_dir : image folder handed to ``CatalogDataset``.
    model_name : Hugging Face id or path of the CLIP checkpoint to start from.
    out_dir : directory that receives the fine-tuned model and processor.
    batch_size : batch size of the shuffled training DataLoader.
    epochs : number of passes over the dataset.
    lr : AdamW learning rate (linearly decayed to zero, no warm-up).

    Returns
    -------
    None. The model and processor are written to ``out_dir``.

    Raises
    ------
    ValueError
        If the dataset yields no batches.

    Notes
    -----
    ``CatalogDataset`` is not defined in this cell; it is presumably declared
    elsewhere in the notebook and must yield dicts of tensors that can be
    passed as keyword arguments to the model — TODO confirm.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Using device: {device}")

    processor = CLIPProcessor.from_pretrained(model_name)

    # Build the data pipeline before loading the model weights so an
    # empty dataset fails fast instead of surfacing later as a
    # ZeroDivisionError when the average loss is computed.
    dataset = CatalogDataset(df, images_dir, processor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    if len(dataloader) == 0:
        raise ValueError('finetune_clip received an empty dataset: no batches to train on')

    model = CLIPModel.from_pretrained(model_name).to(device)

    # Freeze everything except the projection layers and the logit scale for
    # faster domain adaptation.
    for name, param in model.named_parameters():
        if not any(layer in name for layer in ["projection", "logit_scale"]):
            param.requires_grad = False

    # Hand only the trainable parameters to the optimizer; frozen ones never
    # receive gradients, so tracking them is pure overhead.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.AdamW(trainable_params, lr=lr)
    num_training_steps = len(dataloader) * epochs
    lr_scheduler = get_scheduler("linear", optimizer=optimizer,
                                 num_warmup_steps=0, num_training_steps=num_training_steps)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            batch = {k: v.to(device) for k, v in batch.items()}
            # return_loss=True makes the model expose its training loss.
            outputs = model(**batch, return_loss=True)
            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1} completed — avg loss: {avg_loss:.4f}")

    os.makedirs(out_dir, exist_ok=True)
    model.save_pretrained(out_dir)
    processor.save_pretrained(out_dir)
    print(f"✅ Model fine-tuned and saved to: {out_dir}")


In [None]:
def gated_fuse_embeddings(
    text_embeddings: np.ndarray,
    image_embeddings: np.ndarray,
    out_dir: str = 'outputs',
    gate_type: str = 'scalar',  # 'scalar' or 'vector'
    alpha=0.5,
) -> np.ndarray:
    """
    Fuse text and image embeddings with a fixed (non-learned) gate.

    The result is ``alpha * text + (1 - alpha) * image``, a weighted sum, so
    both inputs must have the same shape for either gate type.

    Parameters
    ----------
    text_embeddings : np.ndarray of shape (N, D)
    image_embeddings : np.ndarray of shape (N, D)
    out_dir : str
        Directory to save fused embeddings (created if missing).
    gate_type : str
        'scalar': one gate value shared by all dimensions
        'vector': one gate value per dimension
    alpha : float or array-like of shape (D,)
        Weight given to the text embedding, in [0, 1]. A scalar is accepted
        for both gate types; an array of shape (D,) only for 'vector'.
        Defaults to 0.5 (plain average).

    Returns
    -------
    fused : np.ndarray
        Fused embeddings with the same shape as the inputs; also written to
        ``<out_dir>/fused_x.npy``.

    Raises
    ------
    ValueError
        On mismatched shapes, an unknown ``gate_type``, or an ``alpha`` of
        the wrong shape or outside [0, 1].
    """
    if text_embeddings.shape[0] != image_embeddings.shape[0]:
        raise ValueError('Text and Image embeddings must have same number of rows')
    if gate_type not in ('scalar', 'vector'):
        raise ValueError("gate_type must be 'scalar' or 'vector'")
    # A weighted sum needs identical shapes; without this check NumPy would
    # silently broadcast e.g. (N, D) against (N, 1) in the scalar branch.
    if text_embeddings.shape != image_embeddings.shape:
        if gate_type == 'vector':
            raise ValueError('For vector gating, text and image dims must match')
        raise ValueError('For scalar gating, text and image dims must match')

    if np.ndim(alpha) == 0:
        # A Python float keeps the embeddings' own dtype (e.g. float32).
        gate = float(alpha)
    elif gate_type == 'vector':
        gate = np.asarray(alpha, dtype=float)
        if gate.shape != text_embeddings.shape[1:]:
            raise ValueError('For vector gating, alpha must be a scalar or have shape (D,)')
        # Avoid silently upcasting float32 embeddings to float64.
        out_dtype = np.result_type(text_embeddings.dtype, image_embeddings.dtype)
        if np.issubdtype(out_dtype, np.floating):
            gate = gate.astype(out_dtype)
    else:
        raise ValueError("For scalar gating, alpha must be a single number")

    # Written as "not all(in range)" so that NaN gates are rejected too.
    if not np.all((gate >= 0) & (gate <= 1)):
        raise ValueError('alpha must lie in [0, 1]')

    fused = gate * text_embeddings + (1 - gate) * image_embeddings

    # Save
    os.makedirs(out_dir, exist_ok=True)
    np.save(os.path.join(out_dir, 'fused_x.npy'), fused)
    print(f"Saved fused_x.npy with shape {fused.shape}")
    return fused

In [None]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)) * 100``
    over the entries whose denominator is non-zero; entries where both values
    are zero are left out of the mean.

    Parameters
    ----------
    y_true, y_pred : array-like of equal shape.

    Returns
    -------
    float
        The SMAPE in percent. 0.0 when every pair is (0, 0), since a perfect
        prediction has no error. NaN for empty input.
    """
    # Cast to float so unsigned-integer inputs cannot wrap in the subtraction.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # avoid division by zero
    mask = denominator != 0
    if not mask.any():
        # np.mean of an empty selection would give NaN plus a RuntimeWarning.
        return 0.0 if denominator.size else float('nan')
    return np.mean(np.abs(y_true[mask] - y_pred[mask]) / denominator[mask]) * 100

In [None]:
def train_and_evaluate(X: np.ndarray, y: np.ndarray, out_dir: str = 'outputs', seed: int = 42):
    """Fit a random-forest regressor on an 80/20 split and report test metrics.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one row per sample.
    y : np.ndarray
        Regression target, one value per row of ``X``.
    out_dir : str
        Directory that receives the model, scaler and prediction arrays
        (created if missing).
    seed : int
        Random state for both the split and the forest.

    Returns
    -------
    dict
        ``{'mae', 'rmse', 'r2', 'smape'}`` measured on the held-out 20 %.

    Side effects
    ------------
    Writes ``regressor_rf.joblib``, ``scaler.joblib``, ``y_test.npy`` and
    ``y_pred.npy`` into ``out_dir`` and prints the metrics.
    """
    os.makedirs(out_dir, exist_ok=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

    # The scaler is fitted on the training rows only, so no test statistics leak.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # You can swap model here. RandomForest is a good baseline for regression tasks.
    model = RandomForestRegressor(n_estimators=200, random_state=seed, n_jobs=-1, verbose=1)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)

    mae = mean_absolute_error(y_test, y_pred)
    # Take the root explicitly rather than relying on a `squared` keyword.
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    smape_val = smape(y_test, y_pred)

    print("Evaluation on test set:")
    print(f"MAE: {mae:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"R2: {r2:.6f}")
    print(f"SMAPE: {smape_val:.2f}%")

    # save model + scaler
    joblib.dump(model, os.path.join(out_dir, 'regressor_rf.joblib'))
    joblib.dump(scaler, os.path.join(out_dir, 'scaler.joblib'))

    # dump predictions and true for later analysis
    np.save(os.path.join(out_dir, 'y_test.npy'), y_test)
    np.save(os.path.join(out_dir, 'y_pred.npy'), y_pred)

    print(f"Saved model and scaler to {out_dir}")
    # SMAPE is returned as well, so callers get every metric that was printed.
    return {'mae': mae, 'rmse': rmse, 'r2': r2, 'smape': smape_val}


In [None]:
# Pipeline configuration: user-specified fixed inputs (Drive-ready as requested).
# NOTE(review): images_dir is an absolute Google Drive path, so this cell
# presumably expects a Colab session with Drive mounted — confirm before reuse.
csv_file = 'catalog_1k.csv'  # script expects this CSV in working dir or provide full path
images_dir = '/content/drive/My Drive/images'  # Drive folder containing images
out_dir = 'outputs'  # passed as out_dir to the embedding and training steps
batch_size = 32  # batch size passed to robust_compute_embeddings
model_name = 'openai/clip-vit-base-patch32'  # model id passed to robust_compute_embeddings
seed = 42  # seeds NumPy and torch, and is passed to train_and_evaluate

In [None]:
# Seed NumPy's and PyTorch's global random number generators.
np.random.seed(seed)
# As the cell's last expression, manual_seed's return value (a torch
# Generator) is echoed as the cell output.
torch.manual_seed(seed)

<torch._C.Generator at 0x7d4902d60510>

In [None]:
# Load the catalog CSV and fail fast when a required column is missing.
print(f"Reading CSV: {csv_file}")
df = pd.read_csv(csv_file)
ensure_columns(df)  # raises ValueError naming the first missing column

Reading CSV: catalog_1k.csv


In [None]:
# Delete artefacts from earlier runs (the "stale" files) from out_dir before
# the embeddings are recomputed. Files that do not exist are skipped.
for f in ['text_embeddings.npy', 'image_embeddings.npy', 'fused_x.npy', 'y_price_log.npy']:
  p = os.path.join(out_dir, f)
  if os.path.exists(p):
    os.remove(p)
    print('Removed stale file:', p)

In [None]:
# Compute the text and image embeddings for the catalog with the model named
# by model_name, in batches of batch_size.
# NOTE(review): robust_compute_embeddings is not defined in the visible cells;
# it presumably also saves both arrays under out_dir — confirm.
text_emb, img_emb = robust_compute_embeddings(df, images_dir=images_dir, model_name=model_name,
                                                  batch_size=batch_size, out_dir=out_dir)


Using device: cpu


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Processed rows 0..31
Processed rows 32..63
Processed rows 64..95
Processed rows 96..127
Processed rows 128..159
Processed rows 160..191
Processed rows 192..223
Processed rows 224..255
Processed rows 256..287
Processed rows 288..319
Processed rows 320..351
Processed rows 352..383
Processed rows 384..415
Processed rows 416..447
Processed rows 448..479
Processed rows 480..511
Processed rows 512..543
Processed rows 544..575
Processed rows 576..607
Processed rows 608..639
Processed rows 640..671
Processed rows 672..703
Processed rows 704..735
Processed rows 736..767
Processed rows 768..799
Processed rows 800..831
Processed rows 832..863
Processed rows 864..895
Processed rows 896..927
Processed rows 928..959
Processed rows 960..991
Processed rows 992..999
Saved embeddings. shapes: (1000, 512), (1000, 512)


In [None]:
# Minimal state used by the cells below: the catalog frame, output directory
# and seed.
# NOTE(review): this repeats the configuration and CSV-loading cells earlier
# in the notebook (presumably so work can resume after a kernel restart —
# confirm) and does not re-run ensure_columns(df).
import pandas as pd
csv_file = 'catalog_1k.csv'
df = pd.read_csv(csv_file)
out_dir = 'outputs'
seed = 42

In [None]:
 # save target vector y (price_log)
y = df['price_log'].values.astype(float)
# Actually persist the target: the save had been commented out while the
# message below still announced it. The cleanup cell also treats
# y_price_log.npy as one of the artefacts kept under out_dir.
os.makedirs(out_dir, exist_ok=True)
np.save(os.path.join(out_dir, 'y_price_log.npy'), y)
print(f"Saved y_price_log.npy with shape {y.shape}")


Saved y_price_log.npy with shape (1000,)


In [None]:
import numpy as np
# Load from out_dir rather than the working directory: the cleanup and
# diagnostic cells of this notebook both look for text_embeddings.npy there.
text_embd = np.load(os.path.join(out_dir, 'text_embeddings.npy'))

In [None]:
 # fuse
# Combine the text and image embeddings with the scalar gate; the function also
# writes the result to outputs/fused_x.npy.
# NOTE(review): the recorded output reports shape (1000, 1024), which a weighted
# sum of two (1000, 512) arrays cannot produce — the output looks stale; re-run.
fused = gated_fuse_embeddings(text_embeddings=text_emb, image_embeddings=img_emb, out_dir='outputs', gate_type='scalar')

Saved fused_x.npy with shape (1000, 1024)


In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Load fused embeddings
fused = np.load('outputs/fused_x.npy')
print("Original shape:", fused.shape)

# Apply PCA for dimensionality reduction.
# PCA cannot keep more components than min(n_samples, n_features), so the
# target of 256 is capped to keep the cell working on small or narrow inputs.
# NOTE(review): the PCA is fitted on all rows, before any train/test split;
# fit it on the training rows only if the reduced features are used for
# evaluation. No visible later cell consumes fused_reduced.
pca = PCA(n_components=min(256, *fused.shape), random_state=42)
fused_reduced = pca.fit_transform(fused)

print("Reduced shape:", fused_reduced.shape)

# Save reduced embeddings
np.save('outputs/fused_x_reduced.npy', fused_reduced)
print("Saved reduced embeddings as 'fused_x_reduced.npy'")


In [None]:
 # sanity-check shapes before training
# Guard: the feature matrix and the target must have the same number of rows.
# NOTE(review): this validates `fused`, but the training cell further down
# passes `text_embd` to train_and_evaluate — confirm which features are meant.
print('Shapes -> fused:', fused.shape, 'y:', y.shape)
assert fused.shape[0] == y.shape[0], 'Row count mismatch between fused features and target y. Aborting.'

Shapes -> fused: (1000, 1024) y: (1000,)


In [None]:
# Load from out_dir rather than the working directory: the cleanup and
# diagnostic cells of this notebook both look for text_embeddings.npy there.
text_embd = np.load(os.path.join(out_dir, 'text_embeddings.npy'))

In [None]:
# Train and evaluate the regressor on the text embeddings only (the fused
# features are not used here).
# NOTE(review): the recorded output is a NameError — the cell defining
# train_and_evaluate had not been executed in that session; run it first.
metrics = train_and_evaluate(text_embd, y, out_dir=out_dir, seed=seed)


NameError: name 'train_and_evaluate' is not defined

In [None]:
# Diagnostic cell: print the shapes of the catalog frame, the saved arrays
# and the target so row counts can be compared by eye.
print("df shape:", df.shape)                     # should be (1000, ...)
import numpy as np
# A load failure is reported instead of raised, so the remaining checks still
# run. Note that this rebinds `fused` to whatever is on disk.
try:
    fused = np.load('outputs/fused_x.npy')       # if you previously saved fused
    print("fused shape (loaded):", fused.shape)
except Exception as e:
    print("No fused_x.npy found or failed to load:", e)

# Same best-effort reload for the per-modality embeddings; this rebinds
# text_emb and img_emb.
try:
    text_emb = np.load('outputs/text_embeddings.npy')
    img_emb = np.load('outputs/image_embeddings.npy')
    print("text_emb shape:", text_emb.shape)
    print("image_emb shape:", img_emb.shape)
except Exception as e:
    print("Failed to load embeddings:", e)

print("y shape:", y.shape)

df shape: (1000, 4)
fused shape (loaded): (1000, 1024)
text_emb shape: (1000, 512)
image_emb shape: (1000, 512)
y shape: (1000,)
