## Image Embeddings

This notebook extracts all the images from the owner's manual and creates image embeddings 
using the CLIP model hosted on Roboflow, which are stored in a Qdrant collection. Metadata containing the image_id is stored as well and is used for image-based search.

In [None]:
!pip install faiss-cpu
!pip install sentence_transformers
!pip install pymupdf
!pip install qdrant-client

In [1]:
import torch
import base64
import requests

from transformers import CLIPProcessor, CLIPModel, CLIPTokenizer
from sentence_transformers.util import cos_sim

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def get_model_info(model_ID, device):
    """Load a pretrained CLIP model together with its processor and tokenizer.

    Args:
        model_ID: Identifier handed to each ``from_pretrained`` call.
        device: Device the model is moved to with ``.to(device)``.

    Returns:
        A ``(model, processor, tokenizer)`` tuple.
    """
    # Tuple elements are evaluated left to right, so the model is loaded
    # first, then the processor, then the tokenizer.
    return (
        CLIPModel.from_pretrained(model_ID).to(device),
        CLIPProcessor.from_pretrained(model_ID),
        CLIPTokenizer.from_pretrained(model_ID),
    )


# Run on the GPU whenever torch reports CUDA as available, otherwise on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier passed to every from_pretrained call.
model_ID = "openai/clip-vit-base-patch32"
model, processor, tokenizer = get_model_info(model_ID=model_ID, device=device)

In [None]:
def save_model(file_path="F:\\psg\\bosch_hackathon\\models"):
    """Save the global CLIP model and tokenizer under ``<file_path>/clip_model``.

    Args:
        file_path: Directory that will contain the ``clip_model`` folder.
    """
    import os

    # os.path.join uses the platform separator: the default Windows path still
    # resolves to ...\models\clip_model, and POSIX paths now work as well.
    target_dir = os.path.join(file_path, "clip_model")

    # save_pretrained serialises weights and config directly, so there is no
    # need to build a Trainer (and its default TrainingArguments) just to save.
    model.save_pretrained(target_dir)
    tokenizer.save_pretrained(target_dir)


def load_model(file_path="F:\\psg\\bosch_hackathon\\models"):
    """Load the CLIP model and tokenizer saved under ``<file_path>/clip_model``.

    The processor is loaded from the global ``model_ID`` rather than from the
    local folder.

    Args:
        file_path: Directory that contains the ``clip_model`` folder.

    Returns:
        A ``(model, processor, tokenizer)`` tuple, with the model moved to the
        global ``device``.
    """
    import os

    # Platform-aware join; a hard-coded "\\" only resolves on Windows.
    source_dir = os.path.join(file_path, "clip_model")

    # Move the model to ``device`` like get_model_info does, so inputs sent to
    # ``device`` by the embedding code and the weights live on the same device.
    model = CLIPModel.from_pretrained(source_dir).to(device)
    processor = CLIPProcessor.from_pretrained(model_ID)
    tokenizer = CLIPTokenizer.from_pretrained(source_dir)

    return model, processor, tokenizer

In [None]:
from PIL import Image

def get_single_image_embedding_clip(image_path):
    """Embed a single image with the locally loaded CLIP model.

    Args:
        image_path: Location of the image, passed to ``Image.open``.

    Returns:
        NumPy array with the output of ``model.get_image_features``
        (presumably one row per image - confirm against the model used).
    """
    # The context manager closes the image file once the processor has
    # turned the pixels into a tensor.
    with Image.open(image_path) as my_image:
        pixel_values = processor(
            text=None,
            images=my_image,
            return_tensors="pt",
        )["pixel_values"].to(device)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        embedding = model.get_image_features(pixel_values)

    return embedding.cpu().detach().numpy()

In [None]:
def encode_image(image_path):
    """Read the file at ``image_path`` and return its content as base64 text."""
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

In [None]:
def get_single_image_embedding_roboflow(image_path):
    """Request a CLIP embedding for one image from the Roboflow inference API.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The first entry of the response's ``"embeddings"`` field, or ``None``
        when the response is not JSON, has no such field, or the field is
        empty.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    import os

    encoded_val = encode_image(image_path)
    infer_clip_payload = {
        "image": {
            "type": "base64",
            "value": encoded_val,
        },
    }

    base_url = "https://infer.roboflow.com"
    # NOTE(security): a credential is committed in source. ROBOFLOW_API_KEY
    # overrides it; the literal is kept only as a fallback so existing runs
    # keep working, and should be rotated and removed.
    api_key = os.environ.get("ROBOFLOW_API_KEY", "pUmnI6Vv3mdDdmDiEtqz")

    res = requests.post(
        f"{base_url}/clip/embed_image",
        params={"api_key": api_key},
        json=infer_clip_payload,
        # Without a timeout a stalled connection would block forever.
        timeout=60,
    )

    # An error page or empty body is not valid JSON; report "no embedding"
    # the same way as a JSON reply without the field.
    try:
        response_body = res.json()
    except ValueError:
        return None

    if not isinstance(response_body, dict):
        return None

    embeddings = response_body.get("embeddings")
    if not embeddings:
        return None
    return embeddings[0]

In [None]:
import io
import pymupdf
import fitz
from qdrant_client import QdrantClient
from qdrant_client.http import models

In [None]:
# Sriram's QDrant
import os

# NOTE(security): the endpoint and API key are committed in source. The
# QDRANT_URL / QDRANT_API_KEY environment variables override them; the
# literals remain only as fallbacks and the key should be rotated and removed.
QDRANT_URL = os.environ.get(
    "QDRANT_URL",
    "https://35ebdc7d-ec99-4ebd-896c-ff5705cf369b.us-east4-0.gcp.cloud.qdrant.io:6333",
)
QDRANT_API_KEY = os.environ.get(
    "QDRANT_API_KEY",
    "9dKJsKOYwT0vGlWPrZXBSIlbUzvRdJ1XkM0_floo8FmYCOHX_Y0y-Q",
)

# Build the client from the constants so the connection settings are defined
# in exactly one place.
qdrant_client = QdrantClient(
    QDRANT_URL,
    prefer_grpc=True,
    api_key=QDRANT_API_KEY,
)

In [None]:
def create_QDrant_collection():
    """Create the ``owners_manual_images`` collection from scratch.

    An existing collection with that name is dropped first; the new one is
    configured for 512-dimensional vectors compared by cosine distance.
    """
    collection_name = "owners_manual_images"

    # recreate_collection is deprecated in qdrant-client; the explicit
    # exists / delete / create sequence is its replacement.
    if qdrant_client.collection_exists(collection_name=collection_name):
        qdrant_client.delete_collection(collection_name=collection_name)

    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),
    )


create_QDrant_collection()

  qdrant_client.recreate_collection(


In [None]:
def _embedding_to_vector(embedding):
    """Return ``embedding`` as a flat list of numbers, or ``None`` if missing."""
    if embedding is None:
        return None
    # NumPy arrays / tensors expose tolist(); plain lists pass through.
    if hasattr(embedding, "tolist"):
        embedding = embedding.tolist()
    values = list(embedding)
    # A batch of one (e.g. [[...]]) is unwrapped to its single row.
    if values and isinstance(values[0], (list, tuple)):
        values = list(values[0])
    return values


def extract_data(file_path, source_file, car_name, start_idx):
    """Extract every image from a PDF, embed it, and upload it to Qdrant.

    Each image is saved under ``/content/Manual_Images/<source_file>/``,
    embedded through ``get_single_image_embedding_roboflow`` and uploaded to
    the ``owners_manual_images`` collection with ``image_id`` and ``car_name``
    as payload metadata. Images for which no embedding is returned are
    reported and skipped.

    Args:
        file_path: Path of the PDF to process.
        source_file: Short name used in image file names and the output folder.
        car_name: Value stored in each point's metadata.
        start_idx: Offset for point ids; the images of a page get the ids
            ``start_idx + 1 .. start_idx + n``.

    Returns:
        ``start_idx`` advanced by the number of images found, suitable as the
        ``start_idx`` of the next call.
    """
    import os

    # The target folder has to exist before images can be written into it.
    output_dir = f"/content/Manual_Images/{source_file}"
    os.makedirs(output_dir, exist_ok=True)

    records_to_upload = []
    # A single document handle serves both page iteration and image
    # extraction, and the context manager closes it when done.
    with pymupdf.open(file_path) as doc:
        for i, page in enumerate(doc):
            print(f"Page no is {i}")

            image_list = page.get_images(full=True)

            if image_list:
                print(f"[+] Found a total of {len(image_list)} images in page {i}")
            else:
                print("[!] No images found on page", i)

            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                image_id = f"image_{source_file}_{i}_{image_index}.{image_ext}"
                img_path = f"{output_dir}/{image_id}"

                # Saving by path lets PIL open and close the output file.
                with Image.open(io.BytesIO(image_bytes)) as image:
                    image.save(img_path)

                # The Roboflow helper already returns one flat embedding (or
                # None); indexing it with [0].tolist() would fail.
                vector = _embedding_to_vector(
                    get_single_image_embedding_roboflow(img_path)
                )
                if vector is None:
                    print(f"[!] No embedding returned for {image_id}, skipping")
                    continue

                chunk_metadata = {
                    "image_id": image_id,
                    "car_name": car_name,
                }
                payload = {"metadata": chunk_metadata}

                record = models.PointStruct(
                    id=start_idx + image_index,
                    vector=vector,
                    payload=payload,
                )
                records_to_upload.append(record)

            # Advance by every image on the page (skipped ones included) so
            # ids stay unique across pages and calls.
            start_idx += len(image_list)

    if records_to_upload:
        qdrant_client.upload_points(
            collection_name="owners_manual_images",
            points=records_to_upload,
        )
    return start_idx

In [None]:
# Process the Hyundai Exter manual with an id offset of 0; extract_data
# returns the advanced offset to pass as start_idx for the next manual.
start_idx = extract_data("/content/hyundai_exter.pdf", "hyundai_exter", "Hyundai Exter", 0)
# Bare trailing expression so the notebook displays the returned offset.
start_idx