In [None]:
# 1) Cài đặt + import
!python -m pip install -q chromadb sentence-transformers pypdf python-docx

from pathlib import Path
from typing import List, Tuple

from chromadb import PersistentClient
from chromadb.config import Settings
from docx import Document
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

def load_text_with_pages(file_path: Path) -> List[Tuple[int, str]]:
    """Read a document and return its text as ``(page_number, text)`` pairs.

    Args:
        file_path: Path to a ``.pdf``, ``.docx``/``.doc`` or ``.txt`` file.
            The suffix is matched case-insensitively.

    Returns:
        For PDFs, one ``(page_number, text)`` tuple per page, numbered from 1.
        For Word and plain-text files, a single ``(1, text)`` tuple holding
        the whole document.

    Raises:
        ValueError: If the suffix is not supported, or if a ``.doc`` file
            cannot be opened by python-docx.
    """
    suffix = file_path.suffix.lower()
    if suffix == ".pdf":
        reader = PdfReader(str(file_path))
        # `or ""` keeps the page in the list even when no text was extracted.
        return [
            (page_number, page.extract_text() or "")
            for page_number, page in enumerate(reader.pages, start=1)
        ]
    if suffix in {".docx", ".doc"}:
        try:
            doc = Document(str(file_path))
        except Exception as exc:
            # python-docx targets the .docx format; a legacy binary .doc
            # normally fails here with an opaque packaging error, so report
            # something actionable instead. Other failures propagate as-is.
            if suffix == ".doc":
                raise ValueError(
                    f"Could not open legacy .doc file (convert it to .docx first): {file_path}"
                ) from exc
            raise
        # Only non-blank body paragraphs are kept.
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        return [(1, "\n".join(paragraphs))]
    if suffix == ".txt":
        # Undecodable bytes are dropped rather than raising.
        text = file_path.read_text(encoding="utf-8", errors="ignore")
        return [(1, text)]
    raise ValueError(f"Unsupported file type: {suffix}")

def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split text into fixed-size character windows with optional overlap.

    Whitespace is normalised first: every run of whitespace (including
    newlines) collapses to a single space and the ends are trimmed.

    Args:
        text: Raw text to split.
        chunk_size: Maximum number of characters per chunk; must be > 0.
        overlap: Number of characters shared by consecutive chunks; must be
            >= 0. If it is >= ``chunk_size`` the window advances by one
            character at a time.

    Returns:
        The chunks in document order, or an empty list when the text holds
        no non-whitespace characters.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is negative.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0:
        raise ValueError(f"overlap must be non-negative, got {overlap}")

    normalized = " ".join(text.split())
    if not normalized:
        return []

    step = max(1, chunk_size - overlap)
    total = len(normalized)
    chunks = []
    for start in range(0, total, step):
        chunk = normalized[start : start + chunk_size]
        if chunk.strip():
            chunks.append(chunk)
        # Once a window reaches the end of the text, any further window would
        # only repeat a suffix of this one, so stop instead of emitting it.
        if start + chunk_size >= total:
            break
    return chunks

def build_ids(base: str, page: int, count: int) -> List[str]:
    """Generate ``count`` sequential chunk IDs for one page of a document.

    Spaces in ``base`` become underscores; each ID has the form
    ``<base>_p<page>_c<index>`` with the index zero-padded to four digits
    and starting at 0.
    """
    prefix = "_".join(base.split(" "))
    generated = []
    for index in range(count):
        generated.append(f"{prefix}_p{page}_c{index:04d}")
    return generated

def safe_upsert(collection, ids, documents, metadatas, embeddings):
    """Insert or overwrite records in ``collection``.

    Uses ``collection.upsert`` when the object exposes it. Otherwise falls
    back to ``add``; if that raises, the IDs are deleted on a best-effort
    basis (delete errors are ignored) and ``add`` is attempted once more,
    letting any error from that second attempt propagate.
    """
    record = {
        "ids": ids,
        "documents": documents,
        "metadatas": metadatas,
        "embeddings": embeddings,
    }
    if not hasattr(collection, "upsert"):
        try:
            collection.add(**record)
        except Exception:
            # Clear whatever is stored under these IDs, then retry the insert.
            try:
                collection.delete(ids=ids)
            except Exception:
                pass
            collection.add(**record)
        return
    collection.upsert(**record)

In [None]:
# 2) Choose the document from data/ and set the pipeline parameters
DOC_PATH = Path("data/your_file.pdf")  # change this to the file you want to ingest
MODEL_DIR = Path("./all-MiniLM-L6-v2")  # passed to SentenceTransformer(...) in the embedding cell
DB_DIR = Path("./chroma_db")  # directory handed to chromadb's PersistentClient
COLLECTION_NAME = "my_docs"  # collection fetched or created inside that client
CHUNK_SIZE = 800  # maximum characters per chunk (chunk_text slices the string)
CHUNK_OVERLAP = 150  # characters shared between consecutive chunks
TOP_K = 3  # number of results requested per query

In [None]:
# 3) Read the document + preview
assert DOC_PATH.exists(), f"File không tồn tại: {DOC_PATH}"
pages = load_text_with_pages(DOC_PATH)
print(f"Pages read: {len(pages)}")

# Lazily walk every non-blank line of every page, in document order.
nonblank_lines = (
    line.strip()
    for _, page_text in pages
    for line in page_text.splitlines()
    if line.strip()
)

# Keep only the first two non-blank lines for the preview.
preview_segments = []
for stripped_line in nonblank_lines:
    preview_segments.append(stripped_line)
    if len(preview_segments) >= 2:
        break

preview_body = "\n- ".join(preview_segments[:2])
print("Preview:\n- " + preview_body)

In [None]:
# 4) Chunking
# Split every page into overlapping chunks and record, in the same order,
# one metadata dict per chunk (source file and page number).
all_chunks = []
metas = []
for page_num, text in pages:
    chunks = chunk_text(text, CHUNK_SIZE, CHUNK_OVERLAP)
    all_chunks.extend(chunks)
    # Build a fresh dict per chunk: `[meta] * n` would store n references to
    # the same dict, so editing one chunk's metadata would change them all.
    # Chunk IDs are not built here; the persistence cell derives them from `metas`.
    metas.extend({'source': str(DOC_PATH), 'page': page_num} for _ in chunks)

assert len(all_chunks) == len(metas), "every chunk needs exactly one metadata entry"
print(f"Total chunks: {len(all_chunks)}")
for i, c in enumerate(all_chunks[:2]):
    print(f"Chunk {i}: {c[:200]}...")

In [None]:
# 5) Load the local model + compute embeddings
model = SentenceTransformer(str(MODEL_DIR))
embeddings = model.encode(all_chunks, normalize_embeddings=True)

# Work out the shape without truth-testing `embeddings`. A getattr() default
# is evaluated eagerly, and `bool()` on a multi-element NumPy array raises
# "truth value of an array ... is ambiguous", so branch explicitly instead.
if hasattr(embeddings, 'shape'):
    embeddings_shape = embeddings.shape
else:
    embeddings_shape = (len(embeddings), len(embeddings[0]) if len(embeddings) else 0)
print("Embeddings shape:", embeddings_shape)

In [None]:
# 6) Store in ChromaDB (persistent)
DB_DIR.mkdir(parents=True, exist_ok=True)
client = PersistentClient(path=str(DB_DIR), settings=Settings())
collection = client.get_or_create_collection(name=COLLECTION_NAME)

# IDs combine the file stem, the chunk's page and its position in the whole
# document, so re-running the cell overwrites the same records.
page_numbers = [meta['page'] for meta in metas]
ids = [
    f"{DOC_PATH.stem}_p{page}_c{position:04d}"
    for position, page in enumerate(page_numbers)
]

embedding_rows = embeddings.tolist()
safe_upsert(collection, ids, all_chunks, metas, embedding_rows)
print("Collection size:", collection.count())

In [None]:
# 7) Try three sample queries
queries = [
    "Tóm tắt tài liệu",
    "Các ý chính là gì?",
    "Chi tiết quan trọng trong văn bản",
]
for q in queries:
    # Embed the query the same way as the chunks (normalised vectors).
    q_emb = model.encode([q], normalize_embeddings=True).tolist()[0]
    res = collection.query(
        query_embeddings=[q_emb],
        n_results=TOP_K,
        include=["documents", "metadatas", "distances"],
    )
    # Each result field is a list per query; only one query is sent, so take index 0.
    docs = res.get("documents", [[]])[0]
    metas_res = res.get("metadatas", [[]])[0]
    dists = res.get("distances", [[]])[0]
    # Line breaks must be written as "\n": a single-quoted f-string literal
    # cannot span physical lines.
    print(f"\nQuery: {q}")
    for rank, (doc, meta, dist) in enumerate(zip(docs, metas_res, dists), start=1):
        source = meta.get('source', '') if isinstance(meta, dict) else ''
        page = meta.get('page', '?') if isinstance(meta, dict) else '?'
        preview = (doc or '')[:400].replace('\n', ' ')
        print(f"#{rank} | dist={dist:.4f} | source={source} | page={page}\n  {preview}")