In [40]:
!pip install chromadb pypdf sentence-transformers




In [41]:
!pip install chromadb sentence-transformers pypdf nltk




In [42]:
import os
import uuid
from typing import List

import chromadb
from chromadb.config import Settings
from pypdf import PdfReader
from sentence_transformers import SentenceTransformer

In [None]:
PDF_FOLDER = "cvs"      # folder containing your CV PDFs
CHROMA_DIR = "chroma_cv_db"  # where Chroma will store its files
COLLECTION_NAME = "cv_chunks"


MIN_CHUNK_CHARS = 300      # minimum size after merging
MAX_CHUNK_CHARS = 1200     # max safety cap to avoid huge chunks

In [None]:
client = chromadb.PersistentClient(
    path=CHROMA_DIR,
    
)


collection = client.get_or_create_collection(
    name=COLLECTION_NAME,
    
)

f
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [45]:
def read_pdf_text(path: str) -> str:
    reader = PdfReader(path)
    pages_text = []
    for page in reader.pages:
        pages_text.append(page.extract_text() or "")
    return "\n".join(pages_text)


In [None]:
def split_to_paragraphs(text: str) -> List[str]:
    
    raw_paras = [p.strip() for p in text.split("\n\n")]
    return [p for p in raw_paras if p]

In [None]:
from nltk.tokenize import sent_tokenize

def chunk_text_semantic(text: str,
                        max_chars: int = 800,
                        min_chars: int = 300):
 
    sentences = sent_tokenize(text)
    chunks = []
    current = ""

    for sent in sentences:
        if len(current) + len(sent) + 1 <= max_chars:
            current += (" " if current else "") + sent
        else:
            if len(current) >= min_chars:
                chunks.append(current)
                current = sent
            else:
                
                current += (" " if current else "") + sent

    if current.strip():
        chunks.append(current)

    return chunks


In [48]:
def semantic_chunk(text: str) -> List[str]:
    paragraphs = split_to_paragraphs(text)
    if not paragraphs:
        return []
    return merge_paragraphs_to_chunks(paragraphs)

In [None]:
def index_all_pdfs(pdf_dir: str):
    all_docs = []
    all_ids = []
    all_metas = []

    for filename in os.listdir(pdf_dir):
        if not filename.lower().endswith(".pdf"):
            continue

        path = os.path.join(pdf_dir, filename)
        print(f"[+] Reading {path}")
        raw_text = read_pdf_text(path)

        chunks = chunk_text_semantic(raw_text)
        print(f"    -> {len(chunks)} chunks")

        for i, chunk in enumerate(chunks):
            chunk_id = str(uuid.uuid4())
            all_ids.append(chunk_id)
            all_docs.append(chunk)
            all_metas.append({
                "source_file": filename,
                "chunk_index": i
            })

    if not all_docs:
        print("No documents to add!")
        return

    print(f"[+] Embedding {len(all_docs)} chunks...")
    embeddings = embedder.encode(all_docs).tolist()

    print("[+] Saving to Chroma...")
    collection.add(
        ids=all_ids,
        documents=all_docs,
        metadatas=all_metas,
        embeddings=embeddings,
    )
    print("[✓] Done. Chunks in collection:", collection.count())



index_all_pdfs(PDF_FOLDER)


[+] Reading cvs\cv_training.pdf
    -> 3 chunks
[+] Reading cvs\cv`.pdf
    -> 5 chunks
[+] Reading cvs\ToqaAsedahCV (2).pdf
    -> 5 chunks
[+] Embedding 13 chunks...
[+] Saving to Chroma...
[✓] Done. Chunks in collection: 50


In [50]:
def index_all_pdfs(pdf_folder: str = PDF_FOLDER):
    pdf_files = [
        f for f in os.listdir(pdf_folder)
        if f.lower().endswith(".pdf")
    ]

    if not pdf_files:
        print("No PDF files found.")
        return

    for pdf_file in pdf_files:
        pdf_path = os.path.join(pdf_folder, pdf_file)
        index_single_pdf(pdf_path)



In [51]:
if __name__ == "__main__":
    index_all_pdfs()
    print(" Finished indexing all CV PDFs into Chroma.")

Indexing: cv_training.pdf
  -> 1 chunks
  Stored 1 chunks from cv_training.pdf in Chroma.
Indexing: cv`.pdf
  -> 1 chunks
  Stored 1 chunks from cv`.pdf in Chroma.
Indexing: ToqaAsedahCV (2).pdf
  -> 1 chunks
  Stored 1 chunks from ToqaAsedahCV (2).pdf in Chroma.
 Finished indexing all CV PDFs into Chroma.


In [52]:
query = "Senior backend Java developer with Spring Boot and microservices"
query_embed = embedder.encode([query]).tolist()[0]

results = collection.query(
    query_embeddings=[query_embed],
    n_results=5,
)

for doc, meta, dist in zip(
    results["documents"][0],
    results["metadatas"][0],
    results["distances"][0],
):
    print("----")
    print("Source:", meta["source_file"], " | Chunk:", meta["chunk_index"], " | Score:", dist)
    print(doc[:300], "...")


----
Source: cv`.pdf  | Chunk: 3  | Score: 0.648133397102356
The
 
frontend
 
uses
 
EJS
 
templates
 
and
 
Bootstrap
 
for
 
a
 
responsive
 
UI.The
 
project
 
follows
 
MVC
 
architecture
 
with
 
dedicated
 
controllers,
 
models,
 
and
 
routes. Environment
 
variables
 
are
 
managed
 
via
 
dotenv,
 
and
 
the
 
development
 
workflow
 
uses
 
Nodemon ...
----
Source: cv`.pdf  | Chunk: 3  | Score: 0.648133397102356
The
 
frontend
 
uses
 
EJS
 
templates
 
and
 
Bootstrap
 
for
 
a
 
responsive
 
UI.The
 
project
 
follows
 
MVC
 
architecture
 
with
 
dedicated
 
controllers,
 
models,
 
and
 
routes. Environment
 
variables
 
are
 
managed
 
via
 
dotenv,
 
and
 
the
 
development
 
workflow
 
uses
 
Nodemon ...
----
Source: cv`.pdf  | Chunk: 3  | Score: 0.648133397102356
The
 
frontend
 
uses
 
EJS
 
templates
 
and
 
Bootstrap
 
for
 
a
 
responsive
 
UI.The
 
project
 
follows
 
MVC
 
architecture
 
with
 
dedicated
 
controllers,
 
models,
 
and
 
routes. Environment
 
va