<a href="https://colab.research.google.com/github/anagha-h/PLAI-Project/blob/main/VectorDBPLAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install pillow

Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install pandas sentence-transformers pandarallel faiss-cpu



In [4]:
pip install pymupdf

Collecting pymupdf
  Using cached PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Using cached PyMuPDF-1.24.14-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (19.8 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.14
Note: you may need to restart the kernel to use updated packages.


In [8]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import faiss
import fitz  # PyMuPDF for PDF
import numpy as np
import os
import json
import io
#import nltk

In [9]:
# Initialize CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [10]:
#Convert image to embedding
def process_image(image):
    with torch.no_grad():
        image_embedding = model.get_image_features(image_input["pixel_values"])
    return image_embedding

In [11]:
# Function to process text
def process_text(text):
    with torch.no_grad():
        text_embedding = model.get_text_features(text_input["input_ids"])
    return text_embedding

In [13]:
# Folder containing PDF documents
folder_path = "/home/nagarw48/Projects/PLAI/PLAI-Project-3/trainingData/Training_images_solutions_updated"

In [6]:
# Loop through all files in the folder

preprocessed_images= []
image_bank= []
document_text = []
metadata_store = []

for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        document_id = os.path.splitext(filename)[0]
        #checkp
        print(f"Processing file: {filename}")
        doc = fitz.open(pdf_path)

        for page_num in range(len(doc)):
            page = doc[page_num]
            #print(len(doc))

            # Extract images
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                base_image = doc.extract_image(xref)
                image_bytes = base_image["image"]
                image = Image.open(io.BytesIO(image_bytes))
                image_input = processor(images=image, return_tensors="pt")
                preprocessed_images.append(image_input['pixel_values'])
                image_bank.append(image)
                image.close()
                metadata_store.append({
                    "type": "image",
                    "document_id": document_id,
                    "page_num": page_num,
                    "image_index": img_index,
                })
                #checkpoint
                #print("i")

            # Extract text
            text = page.get_text()
            if text.strip():  # Skip empty text
                text_input = processor(text=[text], return_tensors="pt", padding=True, truncation=True)
                document_text.append(text_input)
                metadata_store.append({
                    "type": "text",
                    "document_id": document_id,
                    "page_num": page_num,
                    "content": text,
                })
                # checkpoint
                #print("t")

        doc.close()

Processing file: Grid-world_file66-78.pdf
Processing file: gridworld_set_2_new.pdf
Processing file: gridworld_test_new.pdf
Processing file: PLAI_Output (2).pdf
Processing file: Grid_world_file21-30.pdf
Processing file: last_18_new.pdf
Processing file: Grid-world_file31-40.pdf
Processing file: Grid-world_file1-10.pdf
Processing file: Grid-world_files41-50.pdf
Processing file: Grid-world_file51-65.pdf
Processing file: Grid-world_file11-20.pdf


In [7]:
batch_input = torch.cat(preprocessed_images , dim=0)
image_embedding = process_image(batch_input)
print(image_embedding)

tensor([[ 2.6872e-01, -4.3306e-02, -1.7488e-01, -2.2593e-01,  3.1330e-01,
         -4.0300e-01,  2.9498e-01,  2.1205e-01,  3.0699e-01,  3.6545e-01,
         -3.1153e-03,  4.1484e-01,  1.1744e-01, -2.3069e-01,  7.5065e-02,
         -1.3279e-01,  5.6340e-01, -2.4897e-02, -1.7695e-01,  9.1751e-02,
         -1.8304e-01, -3.3347e-01,  1.2787e-01, -2.3397e-01, -3.8082e-01,
          4.4188e-02, -4.8348e-01,  3.4675e-01,  8.3958e-02, -4.1608e-01,
          6.5728e-01,  2.7235e-01,  3.5981e-01,  7.6991e-02,  2.7617e-01,
         -3.4390e-01,  2.1433e-01,  3.2886e-03,  2.5814e-01, -1.5308e+00,
         -8.8266e-02, -8.4072e-02, -1.8548e-01, -1.5733e-01,  5.8053e-02,
         -3.7138e-01, -8.3881e-01, -1.2056e-01,  2.0676e-02,  1.0401e-01,
         -7.2531e-02, -5.5054e-01,  1.6262e-02, -3.4487e-01, -7.0159e-01,
         -1.1391e-01, -1.3295e-01, -2.9955e-01,  4.8086e-02,  4.8735e-01,
          9.2433e-01,  7.9777e-02, -4.7849e-01, -2.4734e-01, -1.0972e-01,
         -1.1486e-02,  1.1345e-01,  1.

In [8]:
text_embedding =process_text(document_text)
#Checkpoint
#print(text_embedding)

In [9]:
# Initialize FAISS index
index = faiss.IndexFlatL2(512)

In [10]:
#for image
image_embedding_np = image_embedding.cpu().numpy()
image_dimension = image_embedding_np.shape[1]
#checkpoint
#print (image_dimension)

index.add(image_embedding_np)

In [11]:
#for text
text_embedding_np = text_embedding.cpu().numpy()
text_dimension = text_embedding_np.shape[1]
#checkpoint
#print (text_dimension)

index.add(text_embedding_np)

In [12]:
faiss.write_index(index, "vector_db_f.faiss")
with open("metadata_store_f.json", "w") as f:
    json.dump(metadata_store, f)
#checkpoint
#print(metadata_store)