# Preparation

In [1]:
!pip install faiss-cpu open-clip-torch Pillow requests

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting open-clip-torch
  Downloading open_clip_torch-2.32.0-py3-none-any.whl.metadata (31 kB)
Collecting ftfy (from open-clip-torch)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.9.0->open-clip-torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.met

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import torch
import faiss
import requests
from PIL import Image
from io import BytesIO
import open_clip
import numpy as np

In [3]:
# URL of the JSONL file
url = "https://huggingface.co/datasets/McAuley-Lab/Amazon-Reviews-2023/resolve/main/raw/meta_categories/meta_Amazon_Fashion.jsonl"

# Local filename to save the file
output_file = "meta_Amazon_Fashion.jsonl"

# Send GET request
response = requests.get(url, stream=True)

# Check if request was successful
if response.status_code == 200:
    with open(output_file, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    print(f"File downloaded successfully and saved as '{output_file}'")
else:
    print(f"Failed to download file. Status code: {response.status_code}")

File downloaded successfully and saved as 'meta_Amazon_Fashion.jsonl'


## Data preparation

In [4]:
EXTRACT_SIZE = 100000

In [5]:
# Path to the JSONL file
file_path = "meta_Amazon_Fashion.jsonl"

# Load JSONL into a pandas DataFrame
#extract_df = pd.read_json(file_path, lines=True)

# Read the first lines manually
with open(file_path, 'r', encoding='utf-8') as f:
    lines = [next(f) for _ in range(EXTRACT_SIZE)]

# Convert the lines (as strings) into a DataFrame
extract_df = pd.read_json(''.join(lines), lines=True)


  extract_df = pd.read_json(''.join(lines), lines=True)


In [6]:
def contains_default_amazon_image(image_array):
  for dico in image_array:
    if '.gif' in dico['thumb']:
      return True
  return False
extract_df["default_image"] = extract_df["images"].apply(contains_default_amazon_image)

In [7]:
extract_df.head(10)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,default_image
0,AMAZON FASHION,YUEDGE 5 Pairs Men's Moisture Control Cushione...,4.6,16,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],GiveGift,[],{'Package Dimensions': '10.31 x 8.5 x 1.73 inc...,B08BHN9PK5,,False
1,AMAZON FASHION,DouBCQ Women's Palazzo Lounge Wide Leg Casual ...,4.1,7,"[Drawstring closure, Machine Wash]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],DouBCQ,[],{'Package Dimensions': '15 x 10.2 x 0.4 inches...,B08R39MRDW,,False
2,AMAZON FASHION,Pastel by Vivienne Honey Vanilla Girls' Trapez...,4.3,11,"[Zipper closure, Hand Wash Only]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Pastel by Vivienne,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B077KJHCJ4,,False
3,AMAZON FASHION,Mento Streamtail,2.0,1,"[Thermoplastic Rubber sole, High Density Premi...",[Slip on the Women's Mento and you're ready to...,29.81,[{'thumb': 'https://m.media-amazon.com/images/...,[],Guy Harvey,[],{'Package Dimensions': '11.22 x 4.72 x 4.33 in...,B0811M2JG9,,False
4,AMAZON FASHION,RONNOX Women's 3-Pairs Bright Colored Calf Com...,4.3,3032,"[Pull On closure, Size Guide: ""S"" fits calf 10...",[Ronnox Calf Sleeves - Allowing Your Body to P...,17.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'HONEST Review: RONNOX Women's 3-Pa...,RONNOX,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07SB2892S,,False
5,AMAZON FASHION,12pairs Egowz High Visibility Nylon Latex Foam...,5.0,3,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],KEMUKR,[],"{'Brand': 'KEMUKR', 'Reusability': 'Reusable',...",B07NZ7V22C,,False
6,AMAZON FASHION,Nemidor Women's Vintage 1950s Style Sleeved Pl...,4.0,6,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Nemidor,[],"{'Is Discontinued By Manufacturer': 'No', 'Pro...",B01ISW76HQ,,False
7,AMAZON FASHION,YUNXI 3D grape Drop Earrings Cute Fruit Gold D...,4.7,13,[],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Cubic Zirconia Pearl Drop Earrings...,YUNXI,[],{'Package Dimensions': '4.13 x 3.03 x 0.47 inc...,B08T6KT3N8,,False
8,AMAZON FASHION,LYCKYY Women's Tie Dye Sweatshirt Crewneck Lon...,3.7,52,[Pull On closure],[Tie dye shirts for Women long sleeve crewneck...,9.99,[{'thumb': 'https://m.media-amazon.com/images/...,[],LYCKYY,[],"{'Department': 'womens', 'Date First Available...",B08FMLXY1Z,,False
9,AMAZON FASHION,PattyBoutik Women Crewneck Eyelet Cold Shoulde...,4.6,5,"[97% Cotton, 3% Other Fibers]",[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],PattyBoutik,[],"{'Is Discontinued By Manufacturer': 'No', 'Pac...",B07DXHY5ZX,,False


In [8]:
# Filter null GIFS
extract_df = extract_df[extract_df["default_image"] == False]

In [9]:
# Create the dictionary of idx to URL
images_url_dict = {np.int64(idx): images[0]["large"] for idx, images in extract_df['images'].items()}

In [10]:
extracted_image_urls = list(images_url_dict.values())
extracted_indices = list(images_url_dict.keys())

# Proto

## Create vector DB

In [78]:
dim = 512
index_flat = faiss.IndexFlatIP(dim)  # cosine similarity (normalized vectors)
index = faiss.IndexIDMap(index_flat)

In [79]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from tqdm.auto import tqdm
from datetime import datetime

# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load CLIP model
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model = model.to(device).eval()

# Storage
embeddings_list = []
ids = []
url_to_index = {}  # optional mapping for later lookup
# Process each image URL
for idx, url in tqdm(enumerate(extracted_image_urls)):
    try:
        response = requests.get(url)
        image = Image.open(BytesIO(response.content)).convert("RGB")
        image_tensor = preprocess(image).unsqueeze(0).to(device)

        with torch.no_grad():
            embedding = model.encode_image(image_tensor).cpu().numpy().astype('float32')
            embedding /= np.linalg.norm(embedding)  # normalize for cosine similarity

        embeddings_list.append(embedding[0])  # squeeze batch dim
        ids.append(extracted_indices[idx])
        url_to_index[url] = idx

        print(f"Processed {idx+1}/{len(extracted_image_urls)}: {url}")
        print(f"Index in DB: {extracted_indices[idx]}")

        if idx % 1000 == 0:
          embedding_array = np.array(embeddings_list)
          ids = np.array(ids, dtype='int64')
          index.add_with_ids(embedding_array, ids)
          current_time = datetime.now().strftime("%Y%m%d%H%M%S")
          index_path = "/content/drive/My Drive/EXPORT_faiss_size_" + str(EXTRACT_SIZE) + "_CLIP_idx_" + str(idx) + "_" + current_time + ".index"
          faiss.write_index(index, index_path)
          print(f"FAISS index built with {index.ntotal} vectors.")
          embeddings_list = []
          ids = []

    except Exception as e:
        print(f"Failed to process {url}: {e}")

# Build FAISS index
if embeddings_list:
    # Create array of embeddings and corresponding IDs
    embedding_array = np.array(embeddings_list)
    ids = np.array(ids, dtype='int64')
    index.add_with_ids(embedding_array, ids)
    print(f"FAISS index built with {index.ntotal} vectors.")
else:
    print("No embeddings created.")
# Current time
current_time = datetime.now().strftime("%Y%m%d%H%M%S")
index_path = "/content/drive/My Drive/EXPORT_faiss_size_" + str(EXTRACT_SIZE) + "_CLIP_total_" + current_time + ".index"
faiss.write_index(index, index_path)

Using device: cuda


0it [00:00, ?it/s]

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
Processed 57145/98662: https://m.media-amazon.com/images/I/31meFjlj0mL._AC_.jpg
Index in DB: 57924
Processed 57146/98662: https://m.media-amazon.com/images/I/31u0BCrdiQL._AC_.jpg
Index in DB: 57925
Processed 57147/98662: https://m.media-amazon.com/images/I/415AVT8s8VL._AC_.jpg
Index in DB: 57926
Processed 57148/98662: https://m.media-amazon.com/images/I/317D5I4ao7L._AC_.jpg
Index in DB: 57927
Processed 57149/98662: https://m.media-amazon.com/images/I/41KseSZpYXL._AC_.jpg
Index in DB: 57928
Processed 57150/98662: https://m.media-amazon.com/images/I/31pchBwl2bL._AC_.jpg
Index in DB: 57929
Processed 57151/98662: https://m.media-amazon.com/images/I/411cOT0TiJL._AC_.jpg
Index in DB: 57930
Processed 57152/98662: https://m.media-amazon.com/images/I/51A9qDRFoxL._AC_.jpg
Index in DB: 57931
Processed 57153/98662: https://m.media-amazon.com/images/I/41xg8Y0uUcL._AC_.jpg
Index in DB: 57932
Processed 57154/9

## Retriever

In [11]:
# Load the index
index_path = "/content/drive/MyDrive/EXPORT_faiss_size_100000_CLIP_idx_50000_20250417024756.index"
index = faiss.read_index(index_path)

In [17]:
# Device setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

Using device: cpu


In [18]:
# Load CLIP model
model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
model = model.to(device).eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


open_clip_model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

In [19]:
def search_text(query: str, top_k: int = 5):
    # Encode text to CLIP embedding
    with torch.no_grad():
        text_tokens = open_clip.tokenize([query]).to(device)
        text_embedding = model.encode_text(text_tokens).cpu().numpy().astype('float32')
        text_embedding /= np.linalg.norm(text_embedding)  # normalize for cosine similarity

    # Search FAISS index
    D, I = index.search(text_embedding, top_k)
    print(I[0])
    results = [(images_url_dict[i], float(D[0][j])) for j, i in enumerate(I[0])]
    return results

In [21]:
from IPython.display import display, Image as IPImage

results = search_text("A pair of red pants", top_k=5)
for url, score in results:
    print(f"{score:.3f} - {url}")

for url, score in results:
    print(f"{score:.3f} - {url}")
    display(IPImage(url=url))

[39003 36479 36845 50381 13671]
0.307 - https://m.media-amazon.com/images/I/41gO3yeh2qL._AC_.jpg
0.304 - https://m.media-amazon.com/images/I/31soIbV-OFL._AC_.jpg
0.303 - https://m.media-amazon.com/images/I/31J2MO63i-L._AC_.jpg
0.302 - https://m.media-amazon.com/images/I/317AZDXmUVL._AC_.jpg
0.301 - https://m.media-amazon.com/images/I/31eUc7WQ+3L._AC_.jpg
0.307 - https://m.media-amazon.com/images/I/41gO3yeh2qL._AC_.jpg


0.304 - https://m.media-amazon.com/images/I/31soIbV-OFL._AC_.jpg


0.303 - https://m.media-amazon.com/images/I/31J2MO63i-L._AC_.jpg


0.302 - https://m.media-amazon.com/images/I/317AZDXmUVL._AC_.jpg


0.301 - https://m.media-amazon.com/images/I/31eUc7WQ+3L._AC_.jpg
