# Chunking

In [1]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
import torch
import hashlib
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import ColbertReranker
import ollama
import os
import json
from tqdm.notebook import tqdm
import re, unicodedata
import subprocess


def clean_docling_chunk_strings(chunks):
    """Normalize raw Docling chunk texts before they are tokenized and embedded.

    Every string is processed independently, in this order:
      1. Unicode NFKD normalization; non-breaking spaces become plain spaces.
      2. En/em dashes and curly quotes are mapped to their ASCII equivalents.
      3. Every run of non-whitespace characters starting with "http" is removed.
      4. Runs of spaces/tabs collapse to a single space, and any whitespace-only
         gap that starts and ends with a newline collapses to exactly one blank
         line (a lone single newline is left untouched).
      5. Leading and trailing whitespace is stripped.

    Args:
        chunks: Iterable of chunk strings.

    Returns:
        A new list holding one cleaned string per input, in input order.
    """
    # Built once per call instead of once per chunk.
    ascii_punctuation = str.maketrans("–—‘’“”", "--''\"\"")
    url_pattern = re.compile(r"http\S+")
    blank_run_pattern = re.compile(r"[ \t]+")
    paragraph_gap_pattern = re.compile(r"\n\s*\n")

    def _clean(text):
        text = unicodedata.normalize("NFKD", text).replace("\u00A0", " ")
        text = text.translate(ascii_punctuation)
        # URLs are removed because they blow up the token count.
        text = url_pattern.sub("", text)
        text = blank_run_pattern.sub(" ", text)
        text = paragraph_gap_pattern.sub("\n\n", text)
        return text.strip()

    return [_clean(text) for text in chunks]



# --- Configuration ---------------------------------------------------------
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
MAX_TOKENS = 2000
OLLAMA_MODEL_NAME = "chunker_full_doc"

# --- Docling converter and tokenizer-aware chunker --------------------------
converter = DocumentConverter()
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
    # Optional; without it the HF tokenizer's own maximum is used.
    max_tokens=MAX_TOKENS,
)
# merge_peers is optional and defaults to True; spelled out for clarity.
chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True)

# --- Inputs and resume bookkeeping -----------------------------------------
input_dir = "smoking/policy"
study_names = [name for name in os.listdir(input_dir) if name.endswith('.pdf')]
chunks_with_metadata_file_name = "tobacco_sliding_chunks_with_metadata.json"

# Chunks saved by a previous run (if any); used to skip finished documents.
processed_chunks = []
try:
    with open(chunks_with_metadata_file_name, "r", encoding="utf-8") as f:
        processed_chunks = json.load(f)
except FileNotFoundError:
    print(f"No existing {chunks_with_metadata_file_name} file found, starting fresh.")

# New chunks are appended to a copy so the loaded list stays untouched.
chunks_with_metadata = processed_chunks.copy()
processed_studies = {chunk["document"] for chunk in processed_chunks}

# Only documents without any saved chunk still need processing.
study_names = [name for name in study_names if name not in processed_studies]
print(
    f"Found {len(processed_studies)} studies which are already processed.\n"
    f"Studies which STILL need to be processed: {len(study_names)}:\n"
    f"{study_names}..."
)


Found 4 studies which are already processed.
Studies which STILL need to be processed: 0:
[]...


# Creating chunks and adding Metadata

Each chunk is also enriched with semantic context generated by Ollama (Anthropic-style contextual retrieval).

In [None]:
def _context_window(chunk_index, n_chunks, radius):
    """Return the inclusive (start, end) chunk indices used as context.

    The window is centred on ``chunk_index`` with ``radius`` chunks on each
    side. When it would run past one end of the document, the unused budget is
    shifted to the other side, so the window keeps its size whenever the
    document is long enough and always contains ``chunk_index``.
    """
    start = chunk_index - radius
    end = chunk_index + radius
    if start < 0:
        # Near the start of the document: extend the window to the right.
        end -= start
        start = 0
    last_index = n_chunks - 1
    if end > last_index:
        # Near the end of the document: extend the window to the left.
        start -= end - last_index
        end = last_index
    return max(0, start), end


# Token budget for the context sent to Ollama (loop-invariant, so computed once).
context_length = 16_000  # Reduced window to save memory
# Reserve space for the chunk itself, twice: the context contains the chunk too.
context_length = context_length - 2 * MAX_TOKENS
# Number of neighbouring chunks on EACH side (hence the factor 2); never negative.
total_context_chunk_number = max(0, context_length // (MAX_TOKENS * 2))

for source in tqdm(study_names, desc="Chunking documents..."):
    doc = converter.convert(f"{input_dir}/{source}").document
    chunks = list(chunker.chunk(dl_doc=doc))
    chunks_str = clean_docling_chunk_strings([chunk.text for chunk in chunks])

    # Free up CUDA memory right after we got the results from Docling, so that Ollama can use the entire GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # enumerate() gives the position directly; list.index() is O(n) per chunk and
    # returns the FIRST equal element, which is wrong for duplicated chunks.
    for chunk_index, chunk in enumerate(
        tqdm(chunks, desc=f"Adding context for chunks of {source[:20]}...", leave=False)
    ):
        chunk_text = chunks_str[chunk_index]
        start_index, end_index = _context_window(
            chunk_index, len(chunks), total_context_chunk_number
        )

        # Same prompt layout as before: every context chunk is prefixed by one space.
        entire_doc = "FULL DOCUMENT:\n" + "".join(
            " " + neighbour_text
            for neighbour_text in chunks_str[start_index:end_index + 1]
        )
        ollama_prompt = f"CHUNK:\n{chunk_text}"
        history = [
            {'role': 'user', 'content': entire_doc},
            {'role': 'user', 'content': ollama_prompt},
        ]

        response = ollama.chat(model=OLLAMA_MODEL_NAME, messages=history)
        context = response['message']['content']

        # The generated context is placed AFTER the chunk text (original author's
        # rationale: so that it does not mess up cosine similarity).
        text_to_embed = chunk_text + "\n\n" + context

        # Page numbers of every Docling item that contributed to this chunk.
        pages = sorted({
            prov.page_no
            for doc_item in chunk.meta.doc_items
            for prov in doc_item.prov
        })

        # NOTE(review): the id hashes only the cleaned chunk text, so identical text
        # in two documents yields the same id and id-based de-duplication keeps only
        # one of them -- confirm this is intended.
        chunk_id = hashlib.sha256(chunk_text.encode()).hexdigest()
        chunks_with_metadata.append({
            'text': text_to_embed,
            'original_text': chunk_text,
            'context': context,
            'document': source,
            'pages': pages,
            'id': chunk_id,
        })

    # Free up ollama from GPU memory so that Docling can semantically analyze the next doc even if it's like 100 pages
    subprocess.run(["ollama", "stop", OLLAMA_MODEL_NAME], check=True)

Chunking documents...: 0it [00:00, ?it/s]

In [7]:
# Save the processed chunks in case the VectorDB upload goes wrong.
# Since this is a notebook, partial results from an interrupted chunking run can
# still be saved here. New chunks are merged into the existing file if there is
# one; otherwise the file is created.
if os.path.exists(chunks_with_metadata_file_name):
    print(f"Appending to existing {chunks_with_metadata_file_name} file.")
    with open(chunks_with_metadata_file_name, "r", encoding="utf-8") as f:
        existing_data = json.load(f)
    # Avoid duplicate entries by id
    existing_ids = {chunk['id'] for chunk in existing_data}
    new_chunks = [chunk for chunk in chunks_with_metadata if chunk['id'] not in existing_ids]
    chunks_with_metadata = existing_data + new_chunks

# Write to a temporary file first and then swap it in, so an interruption while
# writing cannot corrupt the previously saved results.
tmp_file_name = chunks_with_metadata_file_name + ".tmp"
with open(tmp_file_name, "w", encoding="utf-8") as f:
    json.dump(chunks_with_metadata, f, ensure_ascii=False, indent=2)
os.replace(tmp_file_name, chunks_with_metadata_file_name)

print(f"Results saved to {chunks_with_metadata_file_name}")

Appending to existing tobacco_sliding_chunks_with_metadata.json file.
Results saved to tobacco_sliding_chunks_with_metadata.json


# Creating Database

In [2]:
# Embedding function from LanceDB's registry, backed by the Hugging Face model
# named in EMBEDDING_MODEL_NAME; runs on the GPU when CUDA is available.
registry = get_registry()
hf = registry.get("huggingface").create(
    name=EMBEDDING_MODEL_NAME,
    trust_remote_code=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
)


# Define model
class MyDocument(LanceModel):
    """LanceDB table schema: one row per context-enriched document chunk.

    The field names match the keys of the chunk dictionaries built in the
    chunking cell and stored in the JSON results file.
    """
    # Text that gets embedded: declared as the source field of the `hf` embedding function.
    text: str = hf.SourceField()
    # Embedding column of the `hf` function, sized to that model's output dimension.
    vector: Vector(hf.ndims()) = hf.VectorField()
    # Cleaned chunk text without the generated context.
    original_text: str
    # Context generated for the chunk by the Ollama model.
    context: str
    # File name of the source document.
    document: str
    pages: list[int]  # Page numbers of the document items the chunk was built from
    id: str  # Chunk identifier (SHA-256 hex digest of the cleaned chunk text)




db = lancedb.connect("./db")
# mode="overwrite" recreates the table on every run, so re-running this cell
# never duplicates rows. Do not comment this line out while the upload loop
# below is still active: every chunk would then be added a second time.
table = db.create_table("my_sliding_tobacco_table", schema=MyDocument, mode="overwrite")

# Load the saved chunks from disk, so this cell does not depend on kernel state
# left behind by the chunking cells.
with open(chunks_with_metadata_file_name, "r", encoding="utf-8") as f:
    chunks_with_metadata = json.load(f)

# Upload in batches with progress bar
batch_size = 100
for batch_start in tqdm(range(0, len(chunks_with_metadata), batch_size), desc="Uploading chunks to VectorDB"):
    table.add(chunks_with_metadata[batch_start:batch_start + batch_size])

# Index based on the chunk's id, used to manually prevent duplicates
table.create_scalar_index("id", replace=True)

reranker = ColbertReranker()
# Full-text index on `text`: used by the reranker as well as the hybrid search's BM25 index
table.create_fts_index("text", replace=True)
table.wait_for_index(["text_idx"])  # Wait for the indexing to finish

<All keys matched successfully>
[90m[[0m2025-10-12T18:00:26Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /home/martin/projects/Quantwise/Quantwise-Chunking/db/my_sliding_tobacco_table.lance, it will be created


Uploading chunks to VectorDB:   0%|          | 0/2 [00:00<?, ?it/s]

<All keys matched successfully>
<All keys matched successfully>


Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


# Example query

In [3]:
prompt = "How were different kinds of tobacco products assessed based on composite odour intensity"

# Hybrid search over the `vector` and `text` columns, reranked with the
# reranker created above; the top 3 hits are returned as a DataFrame.
results = (
    table.search(prompt, query_type="hybrid", vector_column_name="vector", fts_columns="text")
    .rerank(reranker=reranker)
    .limit(3)
    .to_pandas()
)

results

<All keys matched successfully>


Unnamed: 0,text,vector,original_text,context,document,pages,id,_relevance_score
0,A rank rating test was used to assess the inte...,"[-0.12656964, 0.45683643, -3.8943672, -1.77620...",A rank rating test was used to assess the inte...,Details the methodology for assessing whether ...,methodology_technical-assessment_test-products...,[21],df5a96217dca6d7d6c5d976a18c9b98a7a79263efd8080...,1.140071
1,Because the term clearly noticeable has no spe...,"[0.7338957, -0.12551239, -3.632481, -1.7281826...",Because the term clearly noticeable has no spe...,Defines criteria for determining if a detected...,methodology_technical-assessment_test-products...,[28],ae02281e5221ed329315c4c68c244bdb2f9ade29764b6c...,1.024306
2,Sensory analysis through the descriptive profi...,"[0.3010092, -0.24398738, -3.4934468, -1.605134...",Sensory analysis through the descriptive profi...,"Details the methodology for sensory analysis, ...",methodology_technical-assessment_test-products...,[20],b4006357111f7d58b30ea71328ec9a1f790defd0ff60e3...,0.956337


In [4]:
# First column of the first row: the `text` value of the top-ranked hit.
results.iat[0, 0]

'A rank rating test was used to assess the intensity of the overall composite aroma for samples that were flagged as possibly containing a characterising flavour, during descriptive profiling. During rankrating, the composite odour of the sample is evaluated, rather than individual odour elements, reflecting how they are likely to be perceived by consumers. For example, a product may possess a composite aroma of mint chocolate comprising individual odour attributes of spearmint, peppermint, burnt sugar, vanilla and dark chocolate.\nSensory panellists were instructed to rate the intensity of such a specified composite aroma in each flagged sample using a scale of 0 -10. For assessing if a tobacco product has a clearly noticeable flavour, the average intensity of the composite odour of the undiluted test sample is statistically compared with that of a reference product and with cut-off limits considered to represent a clearly noticeable odour intensity.\n\nDetails the methodology for ass