In [4]:
import fitz  # PyMuPDF
import os
import json
import uuid
from langchain.schema import Document
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

In [5]:
# Path of the PDF handed to process_pdf() in the driver cell.
PDF_PATH = "document.pdf"
# Directory that receives extracted image files and chunks.json.
OUTPUT_DIR = "pdf_chunks"
# Directory passed to Chroma as persist_directory.
CHROMA_DB_DIR = "chroma_db"
# Model name passed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"

In [6]:
def save_image(doc, img_info, page_number):
    """Extract one embedded image from ``doc`` and write it into OUTPUT_DIR.

    Args:
        doc: Opened document object; only ``extract_image`` is called on it.
        img_info: Image entry whose first element is the image xref.
        page_number: Page index, used only to build the output file name.

    Returns:
        The path of the image file that was written.
    """
    xref = img_info[0]
    extracted = doc.extract_image(xref)
    payload, extension = extracted["image"], extracted["ext"]

    # The file name encodes both the page and the xref of the image.
    target_path = os.path.join(
        OUTPUT_DIR, f"page_{page_number}_img_{xref}.{extension}"
    )
    with open(target_path, "wb") as out_file:
        out_file.write(payload)
    return target_path

In [21]:
def create_chunks_from_page(blocks, images, page_number, gap_threshold=40):
    """Group a page's text blocks into chunks and attach images to them.

    Blocks are visited in ascending ``y0`` order. Consecutive blocks are
    merged into the same chunk until the vertical gap between the previous
    block's ``y1`` and the next block's ``y0`` exceeds ``gap_threshold``.
    Blocks whose text is empty after stripping are ignored.

    Args:
        blocks: Iterable of tuples laid out as ``(x0, y0, x1, y1, text, ...)``.
            Only indexes 1, 3 and 4 are read, so extra trailing fields are
            tolerated.
        images: Iterable of ``(img_path, img_y)`` pairs.
        page_number: Not used by this function; kept so existing callers
            keep working.
        gap_threshold: Vertical gap above which a new chunk is started.
            Defaults to 40, the value that was previously hard-coded.

    Returns:
        A list of dicts with the keys ``text``, ``table``, ``image_path``,
        ``image_caption`` and ``y`` (the ``y0`` of the chunk's first block).
    """

    def _new_chunk(y):
        # Single place that defines the chunk schema.
        return {"text": "", "table": None, "image_path": None, "image_caption": None, "y": y}

    chunks = []
    current_chunk = _new_chunk(None)
    last_y1 = 0

    for block in sorted(blocks, key=lambda b: b[1]):  # top-to-bottom by y0
        y0, y1 = block[1], block[3]
        text = block[4].strip()
        if not text:
            continue

        if current_chunk["y"] is None:
            current_chunk["y"] = y0

        # Start a new chunk when the gap between two blocks is too large.
        if y0 - last_y1 > gap_threshold and current_chunk["text"]:
            chunks.append(current_chunk)
            current_chunk = _new_chunk(y0)

        current_chunk["text"] += text + "\n"
        last_y1 = y1

    if current_chunk["text"]:
        chunks.append(current_chunk)

    # Attach each image to the chunk whose top edge is nearest to the image.
    # NOTE: a chunk holds a single image_path, so when several images map to
    # the same chunk the last one in ``images`` wins.
    if chunks:
        for img_path, img_y in images:
            closest_chunk = min(chunks, key=lambda c: abs(c["y"] - img_y))
            closest_chunk["image_path"] = img_path
            closest_chunk["image_caption"] = "Ảnh minh hoạ từ trang PDF."

    return chunks

In [22]:
def process_pdf(pdf_path):
    """Split a PDF into text chunks, extract its images and dump the result.

    For every page the text blocks are grouped by create_chunks_from_page(),
    each embedded image is written to OUTPUT_DIR via save_image(), and every
    chunk is tagged with an ``id`` and its ``page`` index. All chunks are
    then written to ``chunks.json`` inside OUTPUT_DIR.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.

    Returns:
        The list of chunk dicts for the whole document.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    all_chunks = []

    doc = fitz.open(pdf_path)
    try:
        for page_number in range(len(doc)):
            page = doc[page_number]
            blocks = page.get_text("blocks")

            # Normalise to (x0, y0, x1, y1, text, block_no), using the
            # enumeration index as block_no.
            block_info = [
                (b[0], b[1], b[2], b[3], b[4], i) for i, b in enumerate(blocks)
            ]

            # Save every image listed for the page.
            images = []
            for img in page.get_images(full=True):
                img_path = save_image(doc, img, page_number)
                # NOTE(review): the real image position is not looked up;
                # every image is placed at the page's vertical centre, so all
                # images of a page are attached to the same chunk.
                y_pos = page.rect.height / 2
                images.append((img_path, y_pos))

            chunks = create_chunks_from_page(block_info, images, page_number)
            for i, chunk in enumerate(chunks):
                # The random suffix makes ids differ between runs.
                chunk["id"] = f"page_{page_number}_chunk_{i}_{uuid.uuid4().hex[:6]}"
                chunk["page"] = page_number
                all_chunks.append(chunk)
    finally:
        # Release the document even if extraction fails part-way through.
        doc.close()

    with open(os.path.join(OUTPUT_DIR, "chunks.json"), "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, indent=2, ensure_ascii=False)

    return all_chunks

In [23]:
def embed_and_store_chunks(chunks):
    """Convert chunk dicts to documents and store them in a Chroma database.

    Each chunk's text is extended with its image caption and table (when
    present). The chunk id, page and (when not None) image path become the
    document metadata. The documents are embedded with the model named by
    EMBEDDING_MODEL_NAME and stored under CHROMA_DB_DIR.

    Args:
        chunks: Iterable of chunk dicts with at least ``text``, ``id`` and
            ``page`` keys.
    """

    def to_document(chunk):
        # Build the page content: text, then optional caption, then table.
        body = chunk["text"]
        caption = chunk.get("image_caption")
        if caption:
            body = body + f"\n[Hình ảnh]: {caption}"
        table = chunk.get("table")
        if table:
            body = body + f"\n[Bảng]: {table}"

        meta = {"chunk_id": chunk["id"], "page": chunk["page"]}

        # Only add image_path when it is not None.
        image_path = chunk.get("image_path")
        if image_path is not None:
            meta["image_path"] = str(image_path)

        return Document(page_content=body, metadata=meta)

    docs = [to_document(chunk) for chunk in chunks]

    embedder = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    vectordb = Chroma.from_documents(docs, embedder, persist_directory=CHROMA_DB_DIR)
    vectordb.persist()
    print("✅ Vectorstore saved to:", CHROMA_DB_DIR)

In [24]:
# Run the pipeline: chunk the PDF, embed and store the chunks, then report
# how many chunks were produced.
chunks = process_pdf(PDF_PATH)
embed_and_store_chunks(chunks)
print(f"✅ Total chunks: {len(chunks)}")

✅ Vectorstore saved to: chroma_db
✅ Total chunks: 29


  vectordb.persist()
