# Build CLIP Embeddings with OpenVINO

This notebook converts the CLIP model to OpenVINO format and builds an embeddings database for the dataset.

In [4]:
! pip install git+https://github.com/openai/CLIP.git
! pip install openvino faiss-cpu pillow tqdm

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to c:\users\sanjay nithin\appdata\local\temp\pip-req-build-8ci2kg1c
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting ftfy (from clip==1.0)
  Obtaining dependency information for ftfy from https://files.pythonhosted.org/packages/ab/6e/81d47999aebc1b155f81eca4477a616a70f238a2549848c38983f3c22a82/ftfy-6.3.1-py3-none-any.whl.metadata
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting torchvision (from clip==1.0)
  Obtaining dependency information for torchvision from https://files.pythonhosted.

  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git 'C:\Users\Sanjay Nithin\AppData\Local\Temp\pip-req-build-8ci2kg1c'

[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
# Standard library
import os
import pickle
from pathlib import Path

# Third-party (relative order of the native-library packages kept as in the original cell)
import numpy as np
from PIL import Image
import torch
import clip
import openvino as ov
from openvino.runtime import Core
import faiss
from tqdm import tqdm

## Configuration

In [6]:
# --- Input / output locations ------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; anyone else running
# the notebook has to edit them first.
DATASET_PATH = Path(r"e:\Projects\AI Based\RecTrio\V1\datasets\animals\raw-img")
MODEL_DIR = Path(r"e:\Projects\AI Based\RecTrio\V2\models")
VECTOR_DB_DIR = Path(r"e:\Projects\AI Based\RecTrio\V2\vector_db")

# --- Model selection and OpenVINO IR file locations ---------------------------
CLIP_MODEL_NAME = "ViT-B/32"  # name passed to clip.load()
VISION_MODEL_PATH = MODEL_DIR.joinpath("clip_vision_model.xml")
TEXT_MODEL_PATH = MODEL_DIR.joinpath("clip_text_model.xml")

# Make sure both output folders exist before anything is written into them.
for _output_dir in (MODEL_DIR, VECTOR_DB_DIR):
    _output_dir.mkdir(parents=True, exist_ok=True)

## Load and Convert CLIP Model to OpenVINO

In [7]:
# Load the pretrained CLIP model on CPU and put it in inference mode.
# `preprocess` is the image transform returned by clip.load for this model and
# is reused later when embedding the dataset images.
print("Loading CLIP model from GitHub (fast loading)...")
model, preprocess = clip.load(CLIP_MODEL_NAME, device="cpu")
model.eval()

# Report the geometry of the model that was actually loaded instead of
# hard-coding the ViT-B/32 values, so the log stays correct if
# CLIP_MODEL_NAME is changed.
clip_image_size = model.visual.input_resolution
clip_embedding_dim = model.visual.output_dim

print(f"✓ CLIP {CLIP_MODEL_NAME} loaded successfully")
print(f"Image input size: {clip_image_size}x{clip_image_size}")
print(f"Embedding dimension: {clip_embedding_dim}")

Loading CLIP model from GitHub (fast loading)...
✓ CLIP ViT-B/32 loaded successfully
Image input size: 224x224
Embedding dimension: 512
✓ CLIP ViT-B/32 loaded successfully
Image input size: 224x224
Embedding dimension: 512


In [8]:
# Convert the CLIP image encoder to OpenVINO IR once; later runs reuse the file.
# NOTE(review): the cache check is file-existence only, so after changing
# CLIP_MODEL_NAME the old .xml must be deleted by hand to force a re-conversion.
if not VISION_MODEL_PATH.exists():
    print("Converting vision model to OpenVINO...")

    # Example input used for tracing the PyTorch module. The spatial size is
    # taken from the loaded model rather than hard-coded to 224.
    input_size = model.visual.input_resolution
    dummy_input = torch.randn(1, 3, input_size, input_size)

    vision_ov_model = ov.convert_model(
        model.visual,
        example_input=dummy_input
    )

    ov.save_model(vision_ov_model, VISION_MODEL_PATH)
    print(f"✓ Vision model saved to {VISION_MODEL_PATH}")
else:
    print("✓ Vision model already exists")

Converting vision model to OpenVINO...
✓ Vision model saved to e:\Projects\AI Based\RecTrio\V2\models\clip_vision_model.xml
✓ Vision model saved to e:\Projects\AI Based\RecTrio\V2\models\clip_vision_model.xml


In [10]:
# Convert the CLIP text encoder to OpenVINO IR once; later runs reuse the file.
# NOTE(review): the cache check is file-existence only, so after changing
# CLIP_MODEL_NAME the old .xml must be deleted by hand to force a re-conversion.
if not TEXT_MODEL_PATH.exists():
    print("Converting text encoder to OpenVINO...")

    # Example input for tracing: one tokenized prompt produced by clip.tokenize.
    dummy_text = clip.tokenize(["a photo of a cat"])

    class CLIPTextEncoder(torch.nn.Module):
        """Expose `clip_model.encode_text` as a module's `forward`.

        The text path is a method on the full CLIP model rather than a
        standalone sub-module, so it is wrapped here to give the converter a
        single `torch.nn.Module` to trace.
        """

        def __init__(self, clip_model):
            super().__init__()
            self.clip_model = clip_model

        def forward(self, text):
            """Return the text embeddings for a batch of token ids."""
            return self.clip_model.encode_text(text)

    text_encoder = CLIPTextEncoder(model)
    text_encoder.eval()

    text_ov_model = ov.convert_model(
        text_encoder,
        example_input=dummy_text
    )

    ov.save_model(text_ov_model, TEXT_MODEL_PATH)
    print(f"✓ Text encoder saved to {TEXT_MODEL_PATH}")
else:
    print("✓ Text encoder already exists")

Converting text encoder to OpenVINO...
✓ Text encoder saved to e:\Projects\AI Based\RecTrio\V2\models\clip_text_model.xml
✓ Text encoder saved to e:\Projects\AI Based\RecTrio\V2\models\clip_text_model.xml


## Initialize OpenVINO Runtime

In [12]:
# Create the OpenVINO runtime through the top-level `openvino` package (the same
# `ov.*` API the conversion cells use) instead of the deprecated
# `openvino.runtime` namespace.
core = ov.Core()

print("Loading vision model for inference...")
# Compile the saved IR for CPU execution and keep handles to its first
# input/output ports; the output handle is used to pick the embedding out of
# the inference result.
vision_compiled_model = core.compile_model(str(VISION_MODEL_PATH), "CPU")
vision_input_layer = vision_compiled_model.input(0)
vision_output_layer = vision_compiled_model.output(0)

print("✓ Vision model loaded on CPU")
print(f"Input shape: {vision_input_layer.partial_shape}")
print(f"Output shape: {vision_output_layer.partial_shape}")

Loading vision model for inference...
✓ Vision model loaded on CPU
Input shape: [?,3,?,?]
Output shape: [?,512]
✓ Vision model loaded on CPU
Input shape: [?,3,?,?]
Output shape: [?,512]


## Collect Dataset Images

In [13]:
# File extensions (lower-case, with leading dot) that count as dataset images.
supported_formats = {'.jpg', '.jpeg', '.png', '.bmp', '.webp'}

# Walk DATASET_PATH/<category>/<image>. Both directory levels are sorted because
# iterdir() yields entries in arbitrary order; sorting keeps the row order of
# the embeddings/index reproducible between runs and machines.
# is_file() skips any sub-directory whose name happens to end in an image suffix.
image_paths = [
    str(img_path)
    for category_dir in sorted(DATASET_PATH.iterdir())
    if category_dir.is_dir()
    for img_path in sorted(category_dir.iterdir())
    if img_path.is_file() and img_path.suffix.lower() in supported_formats
]

print(f"Found {len(image_paths)} images in dataset")

Found 26179 images in dataset


## Generate Image Embeddings

In [14]:
def get_image_embedding(image_path):
    """Generate an L2-normalized embedding for one image with the OpenVINO CLIP vision model.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The first row of the vision model output divided by its L2 norm, or
        ``None`` if the image could not be read or processed, or if its
        embedding has a zero or non-finite norm. Failures are reported with
        ``print`` rather than raised, so one bad file does not abort a long
        batch run.
    """
    try:
        # Open inside a context manager so the file handle is released right
        # away; convert() returns an independent in-memory RGB image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')

        # CLIP's own preprocessing, plus a leading batch axis of size 1.
        image_tensor = preprocess(image).unsqueeze(0)

        # Run inference with OpenVINO
        pixel_values = image_tensor.numpy()
        result = vision_compiled_model([pixel_values])[vision_output_layer]

        # Single-image batch: take the first (only) row.
        embedding = result[0]

        # Refuse to normalize by a zero or non-finite norm; that would put
        # NaN/inf vectors into the index.
        norm = np.linalg.norm(embedding)
        if not np.isfinite(norm) or norm == 0:
            print(f"Error processing {image_path}: invalid embedding norm ({norm})")
            return None

        return embedding / norm
    except Exception as e:
        # Deliberately broad: this is a best-effort batch job and any per-image
        # failure is logged and skipped.
        print(f"Error processing {image_path}: {e}")
        return None

In [None]:
# Per-image vectors are collected in a list first; `embeddings` is only ever
# bound to the final 2-D float32 array, so the name never changes type.
embedding_list = []
valid_image_paths = []

print("Generating embeddings...")
for img_path in tqdm(image_paths, desc="Processing images"):
    embedding = get_image_embedding(img_path)
    # Images that failed return None and are skipped; vectors and paths are
    # appended together so row i of the matrix belongs to valid_image_paths[i].
    if embedding is not None:
        embedding_list.append(embedding)
        valid_image_paths.append(img_path)

# Fail here with a clear message instead of letting an empty 1-D array reach
# the index-building cell and fail on `embeddings.shape[1]`.
if not embedding_list:
    raise RuntimeError(
        "No embeddings were generated; check DATASET_PATH and the errors printed above."
    )

skipped_count = len(image_paths) - len(valid_image_paths)
if skipped_count:
    print(f"Skipped {skipped_count} images that could not be processed")

embeddings = np.asarray(embedding_list, dtype=np.float32)
print(f"Generated {len(embeddings)} embeddings with shape {embeddings.shape}")

## Build FAISS Index

In [None]:
# The index-building code below needs a non-empty (n_images, dim) matrix.
assert embeddings.ndim == 2 and embeddings.shape[0] > 0, (
    f"expected a non-empty 2-D embedding matrix, got shape {embeddings.shape}"
)

# Inner-product index. The embeddings were L2-normalized when they were
# generated, so inner product ranks results by cosine similarity.
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)

index.add(embeddings)

# Search results are mapped back to files by row position, so the index and the
# path list must have the same length.
assert index.ntotal == len(valid_image_paths), (
    f"index has {index.ntotal} vectors but {len(valid_image_paths)} image paths"
)
print(f"FAISS index built with {index.ntotal} vectors")

## Save Index and Metadata

In [None]:
# 1) Write the FAISS index into VECTOR_DB_DIR.
faiss_index_path = VECTOR_DB_DIR / "faiss_index.bin"
faiss.write_index(index, str(faiss_index_path))
print(f"FAISS index saved to {faiss_index_path}")

# 2) Write the companion metadata. `image_paths` is the list that was filled in
#    step with the embeddings, so position i is the file behind index row i.
metadata = dict(
    image_paths=valid_image_paths,
    total_images=len(valid_image_paths),
    embedding_dim=dimension,
)

metadata_path = VECTOR_DB_DIR / "metadata.pkl"
with metadata_path.open('wb') as metadata_file:
    pickle.dump(metadata, metadata_file)
print(f"Metadata saved to {metadata_path}")

print("\nEmbedding database built successfully!")