In [10]:
import os, torch
from pypdf import PdfReader
import chromadb
from sentence_transformers import SentenceTransformer

from docx import Document
from pptx import Presentation


# ================================
# 1. Load file (PDF, DOCX, PPTX)
# ================================
def load_file_pages(path):
    ext = os.path.splitext(path)[1].lower()

    # ========== PDF ==========
    if ext == ".pdf":
        reader = PdfReader(path)
        pages = []
        for page in reader.pages:
            text = page.extract_text() or ""
            pages.append(text)
        return pages

    # ========== DOCX ==========
    elif ext == ".docx":
        doc = Document(path)
        pages = []

        buffer = []
        paragraph_count = 0

        for para in doc.paragraphs:
            text = para.text.strip()
            if text:
                buffer.append(text)
                paragraph_count += 1

            # Cho thành "page" sau mỗi 20 đoạn (tùy chỉnh)
            if paragraph_count >= 20:
                pages.append("\n".join(buffer))
                buffer = []
                paragraph_count = 0

        if buffer:
            pages.append("\n".join(buffer))

        return pages

    # ========== PPTX ==========
    elif ext == ".pptx":
        pres = Presentation(path)
        pages = []

        for slide in pres.slides:
            slide_text = []
            for shape in slide.shapes:
                if hasattr(shape, "text"):
                    slide_text.append(shape.text)
            pages.append("\n".join(slide_text))

        return pages

    # ========== Không hỗ trợ ==========
    else:
        raise ValueError("Unsupported file format: only PDF, DOCX, PPTX.")



# ================================
# 2. Chunk 1 trang
# ================================
def chunk_page(text, chunk_size=800, overlap=200):
    words = text.split()
    chunks = []
    start = 0

    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start = end - overlap

    return chunks


# ================================
# 3. Build chunks + metadata
# ================================
def build_chunks(path):
    pages = load_file_pages(path)

    all_chunks = []
    all_ids = []
    all_meta = []

    for page_idx, text in enumerate(pages):
        page_number = page_idx + 1
        chunks = chunk_page(text)

        for ci, c in enumerate(chunks):
            all_chunks.append(c)
            all_ids.append(f"{os.path.basename(path)}_p{page_number}_c{ci}")
            all_meta.append({
                "page": page_number,
                "chunk": ci
            })

    return all_chunks, all_ids, all_meta, pages



# ================================
# 4. Model + Vector DB
# ================================
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")

chroma = chromadb.Client()
collection = chroma.get_or_create_collection(
    name="pdf_docs",
    metadata={"hnsw:space": "cosine"}
)



# ================================
# 5. Index PDF/DOCX/PPTX
# ================================
def index_pdf(path, batch_size=2):
    chunks, ids, metadata, pages = build_chunks(path)

    all_embeds = []

    for i in range(0, len(chunks), batch_size):
        batch = chunks[i : i + batch_size]

        with torch.no_grad():
            vec = model.encode(batch).tolist()

        all_embeds.extend(vec)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        print(f"Đã embed {i + len(batch)}/{len(chunks)} chunks", end="\r")

    collection.add(
        ids=ids,
        documents=chunks,
        metadatas=metadata,
        embeddings=all_embeds
    )

    print(f"\nIndexed {len(chunks)} chunks từ file {path}")
    return pages



# ================================
# 6. Lấy trang lân cận
# ================================
def get_surrounding_pages(page, pages):
    prev_page = pages[page - 2] if page > 1 else None
    this_page = pages[page - 1]
    next_page = pages[page] if page < len(pages) else None

    return prev_page, this_page, next_page



# ============================================================
# 7. SEARCH 2 BƯỚC: TopK → Expand → Re-chunk → Rerank
# ============================================================
def search(query, pages, top_k_first=3, top_k_second=5, chunk_size=800, overlap=200):

    # STEP 1 ───────────────────────────────────────────────
    q_emb = model.encode([query]).tolist()

    result = collection.query(
        query_embeddings=q_emb,
        n_results=top_k_first,
        include=["documents", "metadatas"]
    )

    metas = result["metadatas"][0]

    # STEP 2 ───────────────────────────────────────────────
    expanded_pages = []
    seen = set()

    for meta in metas:
        page = meta["page"]
        prev_page, this_page, next_page = get_surrounding_pages(page, pages)
        candidates = [prev_page, this_page, next_page]

        for p in candidates:
            if p and p not in seen:
                expanded_pages.append(p)
                seen.add(p)

    # STEP 3 ───────────────────────────────────────────────
    big_text = "\n\n".join(expanded_pages)
    words = big_text.split()

    re_chunks = []
    start = 0
    while start < len(words):
        end = start + chunk_size
        chunk = " ".join(words[start:end])
        re_chunks.append(chunk)
        start = end - overlap

    # STEP 4 ───────────────────────────────────────────────
    embeds = model.encode(re_chunks).tolist()

    temp = chromadb.Client().create_collection(
        name="temp_rerank",
        metadata={"hnsw:space": "cosine"},
        get_or_create=True
    )

    temp_ids = [f"rechunk_{i}" for i in range(len(re_chunks))]
    temp.add(ids=temp_ids, embeddings=embeds, documents=re_chunks)

    result2 = temp.query(
        query_embeddings=q_emb,
        n_results=top_k_second,
        include=["documents", "distances"]
    )

    docs2 = result2["documents"][0]
    dists2 = result2["distances"][0]

    # STEP 5 ───────────────────────────────────────────────
    output = []
    for i, (doc, dist) in enumerate(zip(docs2, dists2), start=1):
        output.append({
            "rank": i,
            "matched_chunk": doc,
            "score": 1 - dist,
            "raw_distance": dist
        })

    return output



# # ============================================================
# # 8. RUN
# # ============================================================
# if __name__ == "__main__":
#     pages = index_pdf("test.docx")   # hoặc test.pdf, test.pptx

#     results = search("Mô hình nào được dùng?", pages)

#     for r in results:
#         print("="*80)
#         print("Rank:", r["rank"])
#         print("Score:", r["score"])
#         print(r["matched_chunk"])


In [11]:
pages = index_pdf("test2.docx")

Đã embed 20/20 chunks
Indexed 20 chunks từ file test2.docx


In [12]:
results = search("Điều 20. Trách nhiệm triển khai Quy chế", pages, top_k_first=3, top_k_second=5, chunk_size=300, overlap=100)

for r in results:
    print("="*80)
    print("Rank:", r["rank"])
    print("Score:", r["score"])
    print(r["matched_chunk"])

IndexError: list index out of range