In [1]:
import os
import subprocess
import base64
import dotenv
import os
import time
from pdf2image import convert_from_path
from tqdm import tqdm
from PIL import Image
from io import BytesIO
from google import genai
from google.genai import types
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType, utility
from pymilvus.model.hybrid import BGEM3EmbeddingFunction

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import warnings
warnings.filterwarnings('ignore')

## Reading PPT or PDF image

In [3]:
dotenv.load_dotenv()
GEMINI_API_KEY = os.getenv('GEMINI_API_KEY')
client = genai.Client(api_key=GEMINI_API_KEY)


In [4]:
def pptx_to_pdf(pptx_path, output_dir):
    """Convert PPTX to PDF using LibreOffice."""
    os.makedirs(output_dir, exist_ok=True)
    subprocess.run([
        "libreoffice",
        "--headless",
        "--convert-to", "pdf",
        "--outdir", output_dir,
        pptx_path
    ], check=True)

    filename = os.path.splitext(os.path.basename(pptx_path))[0]
    return os.path.join(output_dir, f"{filename}.pdf")


In [5]:
def pdf_to_images(file_path, output_dir):
    ext = os.path.splitext(file_path)[1].lower()
    if ext in ['.ppt', '.pptx']:
        print ("converting ppt to pdf first")
        file_path = pptx_to_pdf(file_path, "raw_data")
        
    """Convert PDF pages to images using pdf2image."""
    filename = os.path.splitext(os.path.basename(file_path))[0]
    os.makedirs(os.path.join(output_dir,filename), exist_ok=True)
    images = convert_from_path(file_path)
    image_paths = []
    for i, img in tqdm(enumerate(images)):
        image_file = os.path.join(output_dir,filename, f"slide_{i+1}.png")
        img.save(image_file, "PNG")
        image_paths.append(image_file)

    return image_paths

In [6]:
files_path = pdf_to_images("Case Study Summary (1).pptx", "all_doc_images")

converting ppt to pdf first


Error: source file could not be loaded
44it [00:06,  7.22it/s]


### Generating Summary and Embedding

In [7]:
ef = BGEM3EmbeddingFunction(
    model_name='BAAI/bge-m3',
    device='cpu',  # Change to 'cuda' if GPU support is required
    use_fp16=False
)

2025-05-12 21:08:19.662553: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-12 21:08:19.670747: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-12 21:08:19.731976: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-12 21:08:19.767196: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747064299.800564    6400 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747064299.81

In [8]:
def generate_summary(image):
    return "temp summary"
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        config=types.GenerateContentConfig(
            systemInstruction="You are an expert in analyzing document images. Your task is to provide detailed and accurate summaries for each page of the document. The summaries should capture all key topics, specific information, and any references to other pages. The summaries will be used to facilitate precise retrieval of relevant pages in a retrieval-augmented generation (RAG) system."
        ),
        contents=[image]
    )
    return response.text

In [9]:
import numpy as np

In [34]:
def generate_dense_embeddings(text):
    # result = client.models.embed_content(
    #         model="text-embedding-004",
    #         contents=text)
    # embedding = result.embeddings[0].values
    # return embedding
    # print(embedding)
    # print(type(embedding), embedding)
    # print("=======================================================")
    global ef
    embedding = ef([text])['dense'][0].tolist()
    print(type(embedding), embedding )
    return embedding



In [35]:
def generate_sparse_embeddings(text):
    global ef
    sparse_embedding = ef([summary])
    return sparse_embedding['sparse']



In [36]:
metadata = {"image_path": [], "summary": [], "dense_vector": [], "sparse_vector": []}
for path in tqdm(files_path[3:5]):
    image = Image.open(path)
    try:
        summary = generate_summary(image)
        dense_embedding = generate_dense_embeddings(summary)
        sparse_embedding = generate_sparse_embeddings(summary)
    except Exception as E:
        print("Error", E)
        pass
    #     time.sleep(65)
    #     summary = generate_summary(image)
    #     dense_embedding = generate_dense_embeddings(summary)
    #     sparse_embedding = generate_sparse_embeddings(summary)
    
    metadata['image_path'].append(path)
    metadata['summary'].append(summary)
    metadata['dense_vector'].append(dense_embedding)
    metadata['sparse_vector'].append(sparse_embedding)

  0%|                                                     | 0/2 [00:00<?, ?it/s]

<class 'list'> [-0.04651330038905144, 0.02745952270925045, -0.06381236761808395, -0.021016618236899376, -0.029706165194511414, -0.08941186964511871, -0.0408870168030262, 0.020505348220467567, -0.0051467688754200935, 0.023200178518891335, 0.01730448193848133, 0.01456847321242094, 0.016355201601982117, 0.011762797832489014, -0.013069248758256435, -0.013139311224222183, 0.022626018151640892, 0.011705520562827587, 0.011379276402294636, -0.014507769607007504, 0.01126062124967575, 0.034446850419044495, -0.011521917767822742, -0.014672037214040756, 0.000828984600957483, 0.007978038862347603, -0.027107298374176025, 0.007051726337522268, -0.008116387762129307, 0.06196747347712517, 0.05402252823114395, 0.00263989414088428, -0.010114043951034546, -0.05002018064260483, -0.021576672792434692, -0.049181509763002396, -0.0017192831728607416, -0.04207289218902588, -0.030178125947713852, 0.010262641124427319, -0.0023911683820188046, -0.056261807680130005, -0.008012999780476093, -0.03525751829147339, 0.0

 50%|██████████████████████▌                      | 1/2 [00:00<00:00,  1.58it/s]

<class 'list'> [-0.04651330038905144, 0.02745952270925045, -0.06381236761808395, -0.021016618236899376, -0.029706165194511414, -0.08941186964511871, -0.0408870168030262, 0.020505348220467567, -0.0051467688754200935, 0.023200178518891335, 0.01730448193848133, 0.01456847321242094, 0.016355201601982117, 0.011762797832489014, -0.013069248758256435, -0.013139311224222183, 0.022626018151640892, 0.011705520562827587, 0.011379276402294636, -0.014507769607007504, 0.01126062124967575, 0.034446850419044495, -0.011521917767822742, -0.014672037214040756, 0.000828984600957483, 0.007978038862347603, -0.027107298374176025, 0.007051726337522268, -0.008116387762129307, 0.06196747347712517, 0.05402252823114395, 0.00263989414088428, -0.010114043951034546, -0.05002018064260483, -0.021576672792434692, -0.049181509763002396, -0.0017192831728607416, -0.04207289218902588, -0.030178125947713852, 0.010262641124427319, -0.0023911683820188046, -0.056261807680130005, -0.008012999780476093, -0.03525751829147339, 0.0

100%|█████████████████████████████████████████████| 2/2 [00:01<00:00,  1.68it/s]


In [37]:
len(metadata['dense_vector'][0])

1024

### Saving embedding into milvus database

In [38]:
connections.connect("default", host="localhost", port="19530")

In [39]:
utility.list_collections()

['embedding_db']

In [40]:
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="image_path", dtype=DataType.VARCHAR, max_length=512),
    FieldSchema(name="summary", dtype=DataType.VARCHAR, max_length=10_000),
    FieldSchema(name="dense_vec", dtype=DataType.FLOAT_VECTOR, dim=768),
    FieldSchema(name="sparse_vec", dtype=DataType.SPARSE_FLOAT_VECTOR)
]

In [41]:
schema = CollectionSchema(fields, description="schema defined to store embedding")

In [42]:
utility.list_collections()

['embedding_db']

In [44]:
# collection_name = "embedding_db"
# # Drop the collection
# if collection_name in utility.list_collections():
#     collection = Collection(name=collection_name)
#     collection.drop()
#     print(f"Collection '{collection_name}' deleted.")
# else:
#     print(f"Collection '{collection_name}' does not exist.")

In [45]:
collection_name = "embedding_db"
collection = Collection(name=collection_name,schema=schema)

In [46]:
type(metadata['dense_vector'][0])

list

In [47]:
collection.insert([metadata['image_path'], 
                   metadata['summary'], 
                   metadata['dense_vector'], 
                   metadata['sparse_vector']
                  ]
                 )

collection.flush()

In [48]:
collection.num_entities

2