In [1]:
#faiss install on colab issue: https://discuss.huggingface.co/t/cannot-install-faiss-in-google-collab/39451/3 | https://stackoverflow.com/questions/58957169/faiss-error-could-not-find-a-version-that-satisfies-the-requirement-faiss-from
!sudo apt-get install libomp-dev
!pip install faiss-gpu-cu12
#do this in Collab since transformers lib have mismatch numpy lib with faiss-gpu
!pip uninstall transformers
!pip install transformers

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libomp-dev is already the newest version (1:14.0-55~exp2).
0 upgraded, 0 newly installed, 0 to remove and 34 not upgraded.
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Would remove:
    /usr/local/bin/transformers-cli
    /usr/local/lib/python3.11/dist-packages/transformers-4.51.3.dist-info/*
    /usr/local/lib/python3.11/dist-packages/transformers/*
Proceed (Y/n)? y
  Successfully uninstalled transformers-4.51.3
Collecting transformers
  Using cached transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Using cached transformers-4.51.3-py3-none-any.whl (10.4 MB)
Installing collected packages: transformers
Successfully installed transformers-4.51.3


In [2]:
import torch
from transformers import pipeline, CLIPProcessor, CLIPModel
import faiss
from PIL import Image
import os
#google drive auth
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device: {device}")

device: cuda


In [6]:
class clip_image_search():
  def __init__(self):
    #clip model init
    self.clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    self.clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    self.clip_model.eval()

    #faiss for indexing img embeddings
    self.faiss_index = None

  def init_and_index_faiss(self, embedding_dimension, embeddings):
    self.faiss_index = faiss.IndexFlatL2(embedding_dimension)
    self.faiss_index.add(embeddings)

  def encode_imgs(self, img_folder):
    embedded_imgs_list = []
    img_paths = [os.path.join(img_folder, img_path) for img_path in os.listdir(img_folder)]
    imgs = [Image.open(img_path).convert('RGB') for img_path in img_paths]
    with torch.no_grad():
      processed_imgs = self.clip_processor(text=None, images=imgs, return_tensors="pt", padding=True, use_fast=True)
      img_embeddings = self.clip_model.get_image_features(**processed_imgs.to(device))

      #faiss needs embeedings in f32
      img_embeddings = img_embeddings.astype('float32')
      embedding_dimenstion = img_embeddings.shape[1]

      for an_img_path, an_img_embedding in zip(img_paths, img_embeddings):
        embedded_imgs_list.append([an_img_path, an_img_embedding.cpu().numpy()])
    return embedded_imgs_list

  def encode_text_prompt(self, text):
    with torch.no_grad():
      text_inputs = self.clip_processor(text=text, return_tensors="pt", padding=True)
      text_embeddings = self.clip_model.get_text_features(**text_inputs.to(device))
    return text_embeddings

  def index_imgs(self, embedded_imgs_list):
    pass

In [7]:
clip_img_search_obj = clip_image_search()
img_folder = '/content/drive/MyDrive/image-search-project/image-search-data'
clip_img_search_obj.encode_imgs(img_folder)

[['/content/drive/MyDrive/image-search-project/image-search-data/2023_Subaru_Outback_Premium,_front_right,_09-09-2023.jpg',
  array([ 5.95408916e-01,  4.42018539e-01, -4.12071168e-01, -6.47077978e-01,
          1.18115179e-01,  9.18579549e-02, -1.34224534e-01,  4.13121462e-01,
          3.68002027e-01, -3.21268618e-01,  3.99387553e-02,  2.91329592e-01,
          1.70254037e-02, -6.34506866e-02,  3.60006899e-01,  1.89423695e-01,
          8.02328348e-01,  3.87036145e-01, -1.46725401e-01, -3.39871645e-03,
          1.82914168e-01,  2.44966149e-01,  4.74974841e-01, -5.19604623e-01,
         -3.17894816e-01,  1.39100611e-01,  5.83196580e-02, -4.24343407e-01,
         -2.33254194e-01, -1.50635809e-01, -1.37975261e-01, -2.79160470e-01,
          2.65696049e-02,  5.83281405e-02, -1.63765207e-01,  4.63764071e-02,
         -3.82339776e-01, -2.65418768e-01, -9.37541947e-02,  1.82145476e+00,
          8.79600644e-02,  3.87010664e-01, -5.08700088e-02,  7.04986826e-02,
          3.09625208e-01, -1.