In [1]:
import pandas as pd
import random

# === CONFIG ===
SOURCE_FILE = "meta_Amazon_Fashion.jsonl"  # your large dataset (read as JSON Lines)
OUTPUT_FILE = "fashion_subset_10000.json"  # smaller subset for experiments (written as JSON Lines)
CATEGORY_FILTER = "dress"            # keep only items with this word in title
NUM_SAMPLES = 10000                   # number of products to keep
KEEP_COLS = ['title', 'features', 'description', 'images', 'bought_together']

# === LOAD DATA ===
print("Loading dataset...")
df = pd.read_json(SOURCE_FILE, lines=True)

# === FILTER BY CATEGORY ===
print(f"Filtering by '{CATEGORY_FILTER}' in title...")
# regex=False: CATEGORY_FILTER is a plain word, so characters such as "(" or "+"
# in a future filter value must be matched literally instead of parsed as a pattern.
title_matches = df['title'].str.contains(CATEGORY_FILTER, case=False, na=False, regex=False)

# === KEEP ONLY NEEDED COLUMNS ===
df = df.loc[title_matches, KEEP_COLS]

# === KEEP FIRST IMAGE ONLY ===
# Done BEFORE dropping/sampling: an empty `images` list is not NaN, so it would
# survive dropna, could be sampled, and would then be saved with images = null.
# Collapsing it to None here lets the dropna below remove image-less products.
# .assign builds a new frame rather than writing into a filtered selection.
df = df.assign(
    images=df['images'].apply(lambda x: x[0] if isinstance(x, list) and len(x) > 0 else None)
)

# === DROP EMPTY FIELDS ===
df = df.dropna(subset=['title', 'images', 'description'])

# === SAMPLE A SMALLER SET ===
# Fixed random_state keeps the subset reproducible across runs.
if NUM_SAMPLES < len(df):
    df = df.sample(n=NUM_SAMPLES, random_state=42)

# === SAVE SMALLER DATASET ===
df.to_json(OUTPUT_FILE, orient="records", lines=True)
print(f"Subset saved to {OUTPUT_FILE} ({len(df)} products)")


Loading dataset...
Filtering by 'dress' in title...
Subset saved to fashion_subset_10000.json (10000 products)


In [9]:
# preprocess_images.py
import os
import time
import pandas as pd
import torch
from PIL import Image, UnidentifiedImageError
import requests
from io import BytesIO
from sentence_transformers import SentenceTransformer
from concurrent.futures import ThreadPoolExecutor

# ===============================
# CONFIG
# ===============================
DATA_FILE = "amazon_fashion_trimmed_10000.csv"   # CSV with an "image_large" URL column
IMAGE_EMB_CACHE = "image_embeddings.pt"          # torch-serialized {url: embedding} dict
MODEL_NAME = "clip-ViT-B-32"
NUM_THREADS = 20                                  # worker threads for download + encode
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ===============================
# Load dataset
# ===============================
df = pd.read_csv(DATA_FILE)

# ===============================
# Load CLIP model
# ===============================
print("Loading CLIP model...")
model = SentenceTransformer(MODEL_NAME, device=DEVICE)

# Determine embedding dimension using a dummy image
dummy_img = Image.new("RGB", (224, 224), color="white")
embedding_dim = model.encode(dummy_img, convert_to_tensor=True).shape[-1]

# ===============================
# Load cache safely
# ===============================
cached_embeddings = {}
if os.path.exists(IMAGE_EMB_CACHE):
    print(f"Loading cached embeddings from {IMAGE_EMB_CACHE}...")
    try:
        # NOTE(review): torch.load unpickles the file, which can execute arbitrary
        # code. Only point IMAGE_EMB_CACHE at files produced by this notebook.
        loaded_cache = torch.load(IMAGE_EMB_CACHE, map_location=DEVICE)
        if isinstance(loaded_cache, dict):
            cached_embeddings = loaded_cache
        else:
            # The file will be overwritten when the updated cache is saved later.
            print("[WARN] Cache file is not a dictionary. Ignoring and starting fresh.")
    except Exception as e:
        print(f"[WARN] Failed to load cache: {e}. Starting fresh.")

# ===============================
# Find which images need processing
# ===============================
urls = df["image_large"].tolist()
# dict.fromkeys de-duplicates while keeping first-seen order, so a URL shared by
# several rows is downloaded and encoded once instead of once per row.
to_process = [url for url in dict.fromkeys(urls) if url not in cached_embeddings]
print(f"Found {len(to_process)} new images to process out of {len(urls)} total.")

# ===============================
# Image processing function
# ===============================
def process_image(url, timeout=5):
    """Download one image and encode it with the module-level CLIP `model`.

    Args:
        url: Image URL to fetch.
        timeout: Seconds passed to `requests.get` as its timeout (default 5,
            the value previously hard-coded).

    Returns:
        A `(url, embedding)` tuple. On a download or decode failure a warning
        is printed and the embedding is an all-zero vector of length
        `embedding_dim`, so callers always receive a tensor.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert("RGB")
        emb = model.encode(img, convert_to_tensor=True)
        return url, emb
    except (
        UnidentifiedImageError,
        Image.DecompressionBombError,  # oversized image: skip it instead of aborting the whole run
        requests.RequestException,
        OSError,
    ) as e:
        print(f"[WARN] Failed to process {url} ({e})")
        # Allocate the placeholder on DEVICE (the device the model was loaded on
        # and the cache is mapped to) so the cache never mixes CPU and GPU tensors.
        return url, torch.zeros(embedding_dim, device=DEVICE)

# ===============================
# Process new images in parallel with progress printing
# ===============================
# The save lives in `finally` so that embeddings computed before an interrupt or
# an unexpected error are still written to disk instead of being discarded.
try:
    if to_process:
        print("Downloading & encoding new images...")
        processed_count = 0
        last_print_time = time.time()

        with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
            # executor.map yields results in input order.
            for url, emb in executor.map(process_image, to_process):
                cached_embeddings[url] = emb
                processed_count += 1

                # Print progress every 5 seconds
                if time.time() - last_print_time >= 5:
                    print(f"Progress: {processed_count}/{len(to_process)} images processed")
                    last_print_time = time.time()
finally:
    # ===============================
    # Save updated cache
    # ===============================
    print("Saving updated image embeddings cache...")
    # Write to a temporary file and swap it in, so a crash in the middle of the
    # save cannot leave a truncated cache file behind.
    tmp_cache_path = IMAGE_EMB_CACHE + ".tmp"
    torch.save(cached_embeddings, tmp_cache_path)
    os.replace(tmp_cache_path, IMAGE_EMB_CACHE)

print("✅ Image preprocessing complete.")


Loading CLIP model...
Loading cached embeddings from image_embeddings.pt...
[WARN] Cache file is not a dictionary. Ignoring and starting fresh.
Found 10000 new images to process out of 10000 total.
Downloading & encoding new images...
Progress: 133/10000 images processed
Progress: 339/10000 images processed
Progress: 520/10000 images processed
Progress: 688/10000 images processed
Progress: 888/10000 images processed
Progress: 1096/10000 images processed
Progress: 1292/10000 images processed
Progress: 1492/10000 images processed
Progress: 1678/10000 images processed
Progress: 1853/10000 images processed
Progress: 2050/10000 images processed
Progress: 2186/10000 images processed
Progress: 2379/10000 images processed
Progress: 2571/10000 images processed
[WARN] Failed to process https://m.media-amazon.com/images/I/31DHbGX05ML._AC_.jpg (404 Client Error: Not Found for url: https://m.media-amazon.com/images/I/31DHbGX05ML._AC_.jpg)
Progress: 2767/10000 images processed
Progress: 2944/10000 i

In [None]:
import pandas as pd

# Read the product subset; the file holds one JSON object per line, hence lines=True.
subset_path = "fashion_subset_10000.json"
df = pd.read_json(subset_path, lines=True)

# Quick look at the first rows to confirm the load worked.
preview = df.head()
print(preview)
# Helper function to get large image URL
def get_large_image(images):
    """Return the first non-empty "large" image URL found in `images`, or None.

    Args:
        images: Either a single image dict or a list of image dicts; each dict
            may carry a "large" key. Any other value (None, NaN, str, ...)
            yields None.

    Returns:
        The value stored under "large" in the first dict that has a non-empty
        one. Entries whose "large" is missing, None, or empty are skipped, so a
        later image in the list can still supply the URL.
    """
    if isinstance(images, dict):  # Direct dict case
        candidates = [images]
    elif isinstance(images, list):  # List of dicts case
        candidates = images
    else:
        return None

    for img in candidates:
        if isinstance(img, dict):
            large = img.get("large")
            if large:  # skip None / "" so the next image gets a chance
                return large
    return None



# Build the trimmed frame: the title column plus one "large" image URL per product.
large_image_urls = df["images"].apply(get_large_image)
df_trimmed = df[["title"]].assign(image_large=large_image_urls)

# NOTE(review): dropping rows without an image or title is currently disabled,
# so such rows are written to the CSV as-is — confirm this is intended.
# df_trimmed = df_trimmed.dropna(subset=["image_large", "title"]).reset_index(drop=True)

# Persist the trimmed dataset (no index column) for later experiments.
df_trimmed.to_csv("amazon_fashion_trimmed_10000.csv", index=False)

# Report the size and show a short preview.
print(f"Trimmed dataset shape: {df_trimmed.shape}")
print(df_trimmed.head())