# Introduction

This notebook documents a workflow that uses CLIP (`openai/clip-vit-base-patch32`) to extract image embeddings and FAISS to index them, so that sketches similar to a query sketch can be retrieved. The embeddings in this notebook are computed from the augmented sketches.

# 1. Without Preprocessing

This block will include the workflow without any preprocessing involved. This workflow will include the following:
1. The sketches will be converted from pdf to png formats.
2. The embeddings will be extracted from these sketches.
3. These embeddings will be stored and indexed in a faiss database.
4. A report will be generated finding similar images for a query image.

### 1.1. PDF to PNG Conversion

In [None]:
import os
from pdf2image import convert_from_path

def convert_pdfs_to_pngs(input_folder, output_folder, dpi=600):
    """
    Converts the first page of every PDF in a folder to a PNG with a matching filename.

    Files are handled in sorted order so the conversion log is reproducible.
    Only page 1 is rendered (the sketches are expected to be single-page PDFs),
    so extra pages of a multi-page PDF are never rasterized. A PDF for which
    the converter returns no page is reported and skipped instead of raising
    an IndexError.

    ---
    Parameters:
        input_folder (str): Path to the folder containing pdf files.
        output_folder (str): Path to the folder where PNGs will be saved. Created if missing.
        dpi (int): Resolution for conversion. The default value is 600
    """
    # Ensure output directory exists
    os.makedirs(output_folder, exist_ok=True)

    # os.listdir order is arbitrary; sort for a stable, repeatable run
    for file_name in sorted(os.listdir(input_folder)):
        if not file_name.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(input_folder, file_name)
        base_name = os.path.splitext(file_name)[0]
        output_path = os.path.join(output_folder, f"{base_name}.png")

        # Render page 1 only: that is the only page that gets saved
        pages = convert_from_path(pdf_path, dpi=dpi, first_page=1, last_page=1)
        if not pages:
            print(f"Skipping {file_name}: no pages could be rendered")
            continue

        # Save as PNG
        pages[0].save(output_path, "PNG")
        print(f"Converted: {file_name} -> {base_name}.png")

    print("\n Conversion complete!")

# ** Folder that holds the source PDF sketches.
input_folder = "/home/ayushkum/archimera/inputs/input_pdf"
# ** Folder that receives the rendered PNG sketches.
output_folder = "/home/ayushkum/archimera/inputs/input_png"
# ** Render resolution. 600 is the default because it renders better; it can be
# ** raised or lowered, but that is not recommended.
dpi = 600

# ! FUNCTION CALL
convert_pdfs_to_pngs(input_folder, output_folder, dpi)


### 1.2. Extracting Embeddings and FAISS Storage

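The following code block extracts CLIP embeddings from the sketch images in a folder (`.png`, `.jpg` and `.jpeg` files are accepted) and builds a FAISS index over them. The index metric is selected with `distance_metric`; this notebook uses `cosine`, which is implemented as an inner-product index over L2-normalized embeddings. The block also writes a mapping from FAISS id to filename to a JSON file for later retrieval.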

In [1]:
import os
import json
import numpy as np
import torch
import faiss
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

def build_faiss_index(
        image_folder: str,
        index_path: str = "./sketch_index.faiss",
        mapping_path: str = "./id_mapping.json",
        model_name: str = "openai/clip-vit-base-patch32",
        distance_metric: str = "L2",
):
    """
    Build a FAISS index from a folder of images using CLIP embeddings.

    Images are embedded one at a time in sorted filename order, so the FAISS id
    given to each file is reproducible across runs. Every embedding is
    L2-normalized before it is added to the index. Files that cannot be opened
    as images are reported and skipped. Missing parent folders of `index_path`
    and `mapping_path` are created before anything is written.

    ---
    Parameters:
        image_folder (str): Path to the folder containing images (.png, .jpg, .jpeg).
        index_path (str): Path where FAISS index file will be saved.
        mapping_path (str): Path to save filename-to-index mapping JSON.
        model_name (str): Pretrained CLIP model to use.
        distance_metric (str): Distance metric to be used for computing similarity.
            "cosine" (case-insensitive) builds an inner-product index; any other
            value builds an L2 (Euclidean) index.

    ---
    Returns:
        tuple (faiss.Index, dict): the faiss Index and the {faiss id (int): filename}
        mapping. In the JSON file the ids are stored as strings, because JSON
        object keys are always strings.

    ---
    Raises:
        ValueError: If no readable image is found in `image_folder`.
    """
    # Load model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = CLIPModel.from_pretrained(model_name).to(device)
    processor = CLIPProcessor.from_pretrained(model_name)

    embeddings = []
    filenames = []

    # os.listdir order is arbitrary; sorting keeps FAISS ids stable between runs
    for fname in sorted(os.listdir(image_folder)):
        if not fname.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        path = os.path.join(image_folder, fname)
        try:
            # The context manager closes the file handle once the pixels are copied
            with Image.open(path) as raw_image:
                image = raw_image.convert("RGB")
        except Exception as e:
            print(f"Skipping {fname}: {e}")
            continue

        inputs = processor(images=image, return_tensors='pt').to(device)
        with torch.no_grad():
            image_embeds = model.get_image_features(**inputs)

        # Normalize for cosine or L2 comparison
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        emb = image_embeds[0].cpu().numpy().astype('float32')

        embeddings.append(emb)
        filenames.append(fname)

    if not embeddings:
        raise ValueError(f"No valid images found in {image_folder}")

    embeddings = np.stack(embeddings, axis=0)
    print(f"Total embeddings computed: {embeddings.shape}")

    # Select distance metric
    dim = embeddings.shape[1]
    if distance_metric.lower() == "cosine":
        index = faiss.IndexFlatIP(dim)
    else:
        index = faiss.IndexFlatL2(dim)

    index.add(embeddings)

    # Create missing output folders so the computed embeddings are not lost
    # to a "no such directory" error at write time
    for target in (index_path, mapping_path):
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)

    # Save index and mapping
    faiss.write_index(index, index_path)
    print(f"FAISS index saved at: {index_path}")

    id_mapping = dict(enumerate(filenames))
    with open(mapping_path, "w") as f:
        json.dump(id_mapping, f, indent=2)
    print(f"Mapping saved at: {mapping_path}")

    return index, id_mapping


# ** Folder containing the sketch images to index.
image_folder = "/home/ayushkum/archimera/augmented/input_png"
# ** Destination of the FAISS index file.
index_path = "/home/ayushkum/archimera/clip/augmented_sketch_index.faiss"
# ** Destination of the FAISS-id -> filename mapping used for later retrieval.
mapping_path = "/home/ayushkum/archimera/clip/augmented_id_mapping.json"
# ** Model used for embedding extraction. Can be changed based on need.
model_name = "openai/clip-vit-base-patch32"
# ** Metric used for indexing and similarity. Cosine is currently recommended.
distance_metric = "cosine"

# ! FUNCTION CALL
index, mapping = build_faiss_index(
    image_folder, index_path, mapping_path, model_name, distance_metric
)

2025-10-30 06:01:35.086773: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-30 06:01:35.448996: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-30 06:01:36.916708: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with thi

Total embeddings computed: (88, 512)
FAISS index saved at: /home/ayushkum/archimera/clip/augmented_sketch_index.faiss
Mapping saved at: /home/ayushkum/archimera/clip/augmented_id_mapping.json


### 1.3. Running a Query Image for Similarity Search

The following code block can work with both pdf sketches and image sketches.

> RUN THE FOLLOWING BLOCK ONLY IF YOU HAVE PDF SKETCHES, OTHERWISE SKIP TO NEXT BLOCK

In [None]:

# ! SKIP THIS CELL WHEN THE QUERY SKETCHES ALREADY EXIST AS PNG FILES
# ** Folder where the query sketch PDFs are stored.
query_pdf_path = "/home/ayushkum/archimera/query_pdf"
# ** Folder that receives the query sketches rendered as PNGs.
query_png_path = "/home/ayushkum/archimera/query_png"

# ! FUNCTION CALL (`dpi` is the value set in the conversion cell of section 1.1)
convert_pdfs_to_pngs(query_pdf_path, query_png_path, dpi)

In [3]:
import os
import faiss
import numpy as np
import json
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# (model, processor) pairs keyed by (model_name, device). Querying many
# sketches in a loop would otherwise reload CLIP from disk on every call.
_CLIP_CACHE = {}


def _load_clip(model_name: str, device: str):
    """Return the cached (CLIPModel, CLIPProcessor) pair for `model_name` on `device`."""
    key = (model_name, device)
    if key not in _CLIP_CACHE:
        model = CLIPModel.from_pretrained(model_name).to(device)
        processor = CLIPProcessor.from_pretrained(model_name)
        _CLIP_CACHE[key] = (model, processor)
    return _CLIP_CACHE[key]


def search_similar_sketches(
        query_path: str,
        index_path: str = "./sketch_index.faiss",
        mapping_path: str = "./id_mapping.json",
        model_name: str = "openai/clip-vit-base-patch32",
        top_k: int = 5,
        distance_metric: str = "L2",
):
    """
    Search for similar sketches using CLIP embeddings and a prebuilt FAISS index.

    The query image is embedded with CLIP, L2-normalized and searched against
    the index. At most `index.ntotal` neighbours are requested, and any
    placeholder id (-1) that FAISS returns when it has fewer results than
    asked for is dropped, so every returned item refers to a stored sketch.

    ---
    Parameters:
        query_path (str): Path to the query image (sketch).
        index_path (str): Path to FAISS index file.
        mapping_path (str): Path to JSON file containing ID -> filename mapping.
        model_name (str): Pretrained CLIP model to use. It must be the model the
            index was built with, otherwise the embedding sizes will not match.
        top_k (int): Maximum number of most similar images to retrieve. A value
            of 0 or less yields an empty list.
        distance_metric (str): 'L2' or 'cosine' for similarity computation.
            With 'cosine' (case-insensitive) the raw FAISS value is the score;
            otherwise the score is 1 / (1 + distance).

    ---
    Returns:
        list[dict]: Each item contains:
        {
            "rank": int,
            "filename": str,
            "score": float
        }
        "filename" is None when the mapping file has no entry for a returned id.
    """

    # Load FAISS index and mapping
    index = faiss.read_index(index_path)
    with open(mapping_path, "r") as f:
        id_mapping = json.load(f)

    # Never ask for more neighbours than the index holds
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    # Load CLIP Model (cached across calls)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, processor = _load_clip(model_name, device)

    # Compute query embedding; the context manager closes the file handle
    with Image.open(query_path) as raw_image:
        image = raw_image.convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        query_emb = model.get_image_features(**inputs)
    query_emb = query_emb / query_emb.norm(p=2, dim=-1, keepdim=True)
    query_emb = query_emb.cpu().numpy().astype('float32')

    # Search top-k similar images
    D, I = index.search(query_emb, k)

    # Prepare results
    use_cosine = distance_metric.lower() == "cosine"
    results = []
    for idx, dist in zip(I[0], D[0]):
        if idx < 0:
            # FAISS placeholder for "no neighbour found"
            continue
        # JSON object keys are strings, so look the id up in string form
        fname = id_mapping.get(str(int(idx)))
        # Convert distance to similarity unless the cosine metric is used
        score = dist if use_cosine else (1 / (1 + dist))
        results.append(
            {
                "rank": len(results) + 1,
                "filename": fname,
                "score": float(score),
            }
        )

    return results

# ** ADD THE PATH TO FOLDER CONTAINING QUERY SKETCHES PNG
query_sketch_path = "/home/ayushkum/archimera/query_png"
index_path = "/home/ayushkum/archimera/clip/augmented_sketch_index.faiss" # ** ADD PATH TO THE FAISS INDEX FILE.
mapping_path = "/home/ayushkum/archimera/clip/augmented_id_mapping.json" # ** ADD PATH TO ID MAPPING JSON FILE TO GET SIMILAR FILENAMES.
model_name = "openai/clip-vit-base-patch32" # ** CLIP MODEL USED TO EMBED THE QUERY. MUST BE THE SAME MODEL THAT WAS USED TO CREATE THE INDEX FILE.
top_k = 10 # ** NUMBER OF SIMILAR SKETCHES REQUIRED
distance_metric = "cosine" # ** THE DISTANCE METRIC TO BE USED FOR COMPUTING SIMILARITY. DO USE SIMILAR METRIC WHICH WAS USED TO CREATE INDEX FILE.

# Sorted so the report order is reproducible (os.listdir order is arbitrary)
for filename in sorted(os.listdir(query_sketch_path)):
    if filename.lower().endswith(".png"):
        query_image_path = os.path.join(query_sketch_path, filename)
        base_name = os.path.splitext(filename)[0]
        # ! FUNCTION CALL
        # model_name is passed explicitly so the query is embedded with the
        # model configured above rather than the function's default
        results = search_similar_sketches(
            query_path=query_image_path,
            index_path=index_path,
            mapping_path=mapping_path,
            model_name=model_name,
            top_k=top_k,
            distance_metric=distance_metric
        )
        print(f"\n Top similar sketches for {base_name}: ")
        for r in results:
            print(f"{r['rank']}. {r['filename']} - score: {round(r['score'] * 100, 2)}%")



 Top similar sketches for pdf3_SIM: 
1. pdf7_aug7.png - score: 93.41%
2. pdf5_aug6.png - score: 93.34%
3. pdf5_aug3.png - score: 92.84%
4. pdf8_aug8.png - score: 92.69%
5. pdf5_aug1.png - score: 92.6%
6. pdf5_aug8.png - score: 92.37%
7. pdf5_aug5.png - score: 92.22%
8. pdf7.png - score: 92.14%
9. pdf8_aug10.png - score: 91.98%
10. pdf7_aug3.png - score: 91.71%


# 2. With Preprocessing

This block will include the workflow with preprocessing. This workflow will include the following:
1. The sketches will be converted from pdf to png formats. (CURRENTLY SKIPPING AS DONE IN `1.1`).
2. The sketches will be preprocessed.
3. The embeddings will be extracted from these sketches.
4. These embeddings will be stored and indexed in a faiss database.
5. A report will be generated finding similar images for a query image.

### 2.1. Preprocessing Sketches

The following code block will preprocess sketches.

In [9]:
import os
import cv2
import numpy as np

def preprocess_sketch(img_path):
    """
    Load a sketch from disk and turn it into a 3-channel thresholded edge map.

    Steps, in order: read the image, add a 5 px reflected border, convert to
    grayscale, apply a bilateral filter only when the grayscale standard
    deviation exceeds 20, min-max normalize to 0..255, run Canny (30/100),
    dilate with a 2x2 kernel, apply a Gaussian adaptive threshold (block size
    11, C=2), close with a 3x3 kernel, and expand the result to three channels.

    Args:
        img_path (str): Path of the image to read with `cv2.imread`.

    Returns:
        The closed threshold map converted with `cv2.COLOR_GRAY2RGB`.

    Raises:
        ValueError: If `cv2.imread` cannot read `img_path`.
    """
    source = cv2.imread(img_path)
    if source is None:
        raise ValueError(f"Could not read {img_path}")

    # Small padding so strokes touching the border survive the later filters
    padded = cv2.copyMakeBorder(source, 5, 5, 5, 5, cv2.BORDER_REFLECT)
    gray = cv2.cvtColor(padded, cv2.COLOR_BGR2GRAY)

    # Smooth only images with enough contrast, to avoid oversmoothing the rest
    if np.std(gray) > 20:
        smoothed = cv2.bilateralFilter(gray, 9, 75, 75)
    else:
        smoothed = gray

    stretched = cv2.normalize(smoothed, None, 0, 255, cv2.NORM_MINMAX)

    edge_map = cv2.Canny(stretched, 30, 100)
    dilate_kernel = np.ones((2, 2), np.uint8)
    edge_map = cv2.dilate(edge_map, dilate_kernel, iterations=1)

    thresholded = cv2.adaptiveThreshold(
        edge_map, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )

    close_kernel = np.ones((3, 3), np.uint8)
    closed = cv2.morphologyEx(thresholded, cv2.MORPH_CLOSE, close_kernel)

    # The map is not inverted before the channel expansion
    return cv2.cvtColor(closed, cv2.COLOR_GRAY2RGB)


def batch_preprocess_sketches(input_folder, output_folder):
    """
    Applies `preprocess_sketch()` to all PNG images in a given folder
    and saves them into an output folder with identical filenames.

    Files are processed in sorted order. A failure on one file (unreadable
    input, or an image that could not be written) is reported and does not
    stop the remaining files from being processed.

    Args:
        input_folder (str): Path to the directory containing input PNGs.
        output_folder (str): Path to save the processed PNGs. Created if missing.

    Example:
        >>> batch_preprocess_sketches("input_png", "output_png")
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_folder, exist_ok=True)

    image_files = sorted(
        f for f in os.listdir(input_folder)
        if f.lower().endswith(".png")
    )

    if not image_files:
        print(f"No PNG images found in '{input_folder}'.")
        return

    print(f"Processing {len(image_files)} image(s) from '{input_folder}' → '{output_folder}'")

    for filename in image_files:
        input_path = os.path.join(input_folder, filename)
        output_path = os.path.join(output_folder, filename)

        try:
            processed_img = preprocess_sketch(input_path)
            # cv2.imwrite reports failure by returning False instead of
            # raising, so check it before claiming the file was saved
            if not cv2.imwrite(output_path, processed_img):
                raise IOError(f"cv2.imwrite could not write {output_path}")
            print(f"✅ Saved: {output_path}")
        except Exception as e:
            print(f"❌ Error processing {filename}: {e}")


# ** Folder containing the input PNG sketches.
input_path = "/home/ayushkum/archimera/augmented/input_png"
# ** Folder that receives the preprocessed sketches.
output_path = "/home/ayushkum/archimera/augmented/preprocessed_png"

# ! FUNCTION CALL
batch_preprocess_sketches(input_path, output_path)

Processing 88 image(s) from '/home/ayushkum/archimera/augmented/input_png' → '/home/ayushkum/archimera/augmented/preprocessed_png'
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf7_aug3.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf8_aug6.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf5_aug2.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf2_aug5.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf6_aug6.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf6_aug9.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf4_aug1.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf3_aug8.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf6_aug8.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf1_aug5.png
✅ Saved: /home/ayushkum/archimera/augmented/preprocessed_png/pdf3_aug9.png
✅ Saved: /home/ayushkum/archimera/augmented/

### 2.2. Extracting Embeddings and FAISS storage

In [10]:
# ** Folder containing the preprocessed sketch images to index.
image_folder = "/home/ayushkum/archimera/augmented/preprocessed_png"
# ** Destination of the FAISS index file.
index_path = "/home/ayushkum/archimera/clip/augmented_preprocessed_sketch_index.faiss"
# ** Destination of the FAISS-id -> filename mapping used for later retrieval.
mapping_path = "/home/ayushkum/archimera/clip/augmented_preprocessed_id_mapping.json"
# ** Model used for embedding extraction. Can be changed based on need.
model_name = "openai/clip-vit-base-patch32"
# ** Metric used for indexing and similarity. Cosine is currently recommended.
distance_metric = "cosine"

# ! FUNCTION CALL
index, mapping = build_faiss_index(
    image_folder, index_path, mapping_path, model_name, distance_metric
)

Total embeddings computed: (88, 512)
FAISS index saved at: /home/ayushkum/archimera/clip/augmented_preprocessed_sketch_index.faiss
Mapping saved at: /home/ayushkum/archimera/clip/augmented_preprocessed_id_mapping.json


### 2.3 Running a Query Sketch for Similarity Search

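The query sketch must first be preprocessed with the same pipeline that was applied to the indexed sketches, so that its embedding is comparable with the embeddings stored in the index.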

In [11]:
# ** Folder containing the query PNG sketches.
input_path = "/home/ayushkum/archimera/query_png"
# ** Folder that receives the preprocessed query sketches.
output_path = "/home/ayushkum/archimera/preprocessed_query_png"

# ! FUNCTION CALL
batch_preprocess_sketches(input_path, output_path)

Processing 1 image(s) from '/home/ayushkum/archimera/query_png' → '/home/ayushkum/archimera/preprocessed_query_png'
✅ Saved: /home/ayushkum/archimera/preprocessed_query_png/pdf3_SIM.png


In [12]:
# ** ADD THE PATH TO FOLDER CONTAINING QUERY PREPROCESSED SKETCHES PNG
query_sketch_path = "/home/ayushkum/archimera/preprocessed_query_png"
index_path = "/home/ayushkum/archimera/clip/augmented_preprocessed_sketch_index.faiss" # ** ADD PATH TO THE FAISS INDEX FILE.
mapping_path = "/home/ayushkum/archimera/clip/augmented_preprocessed_id_mapping.json" # ** ADD PATH TO ID MAPPING JSON FILE TO GET SIMILAR FILENAMES.
model_name = "openai/clip-vit-base-patch32" # ** CLIP MODEL USED TO EMBED THE QUERY. MUST BE THE SAME MODEL THAT WAS USED TO CREATE THE INDEX FILE.
top_k = 10 # ** NUMBER OF SIMILAR SKETCHES REQUIRED
distance_metric = "cosine" # ** THE DISTANCE METRIC TO BE USED FOR COMPUTING SIMILARITY. DO USE SIMILAR METRIC WHICH WAS USED TO CREATE INDEX FILE.

# Sorted so the report order is reproducible (os.listdir order is arbitrary)
for filename in sorted(os.listdir(query_sketch_path)):
    if filename.lower().endswith(".png"):
        query_image_path = os.path.join(query_sketch_path, filename)
        base_name = os.path.splitext(filename)[0]
        # ! FUNCTION CALL
        # model_name is passed explicitly so the query is embedded with the
        # model configured above rather than the function's default
        results = search_similar_sketches(
            query_path=query_image_path,
            index_path=index_path,
            mapping_path=mapping_path,
            model_name=model_name,
            top_k=top_k,
            distance_metric=distance_metric
        )
        print(f"\n Top similar sketches for {base_name}: ")
        for r in results:
            print(f"{r['rank']}. {r['filename']} - score: {round(r['score'] * 100, 2)}%")



 Top similar sketches for pdf3_SIM: 
1. pdf4.png - score: 94.31%
2. pdf4_aug3.png - score: 94.16%
3. pdf4_aug7.png - score: 93.06%
4. pdf4_aug10.png - score: 92.75%
5. pdf4_aug2.png - score: 92.4%
6. pdf4_aug9.png - score: 91.56%
7. pdf3_aug5.png - score: 91.43%
8. pdf5_aug10.png - score: 91.4%
9. pdf3.png - score: 91.36%
10. pdf3_aug4.png - score: 91.27%
