# Chunking

In [1]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
import torch
import hashlib
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import ColbertReranker
import ollama
import os
import json
from tqdm.notebook import tqdm
import re, unicodedata
import subprocess


# Typographic punctuation mapped to plain ASCII. Built once at import time
# instead of once per chunk.
_PUNCTUATION_TABLE = str.maketrans({
    "\u2013": "-",   # en dash
    "\u2014": "-",   # em dash
    "\u2018": "'",   # left single quotation mark
    "\u2019": "'",   # right single quotation mark
    "\u201c": '"',   # left double quotation mark
    "\u201d": '"',   # right double quotation mark
})
# The scheme separator is required so that ordinary words which merely start
# with "http" (e.g. "httpd", "https") are not deleted together with real URLs.
_URL_PATTERN = re.compile(r"https?://\S+")
_HORIZONTAL_WHITESPACE_PATTERN = re.compile(r"[ \t]+")
_BLANK_LINE_RUN_PATTERN = re.compile(r"\n\s*\n")


def clean_docling_chunk_strings(chunks: list[str]) -> list[str]:
    """Return a cleaned copy of every chunk string, in the same order.

    Each chunk goes through these steps:

    1. Unicode NFKD normalization (this also turns a no-break space into a
       regular space) and replacement of dashes / curly quotes with ASCII.
    2. Removal of ``http://`` and ``https://`` URLs, which cost many tokens.
    3. Collapsing runs of spaces/tabs into one space and runs of blank lines
       into a single blank line; single newlines are left untouched.
    4. Stripping leading and trailing whitespace.

    Args:
        chunks: Raw chunk texts. The input is not modified.

    Returns:
        A new list with one cleaned string per input chunk. An empty input
        yields an empty list.
    """
    cleaned_chunks = []

    for chunk in chunks:
        # 1) Normalize Unicode and replace problematic punctuation.
        chunk = unicodedata.normalize("NFKD", chunk).replace("\u00A0", " ")
        chunk = chunk.translate(_PUNCTUATION_TABLE)

        # 2) Remove URLs (massive tokenizer killers).
        chunk = _URL_PATTERN.sub("", chunk)

        # 3) Normalize whitespace but preserve paragraph breaks: any run of
        #    blank lines becomes exactly one blank line.
        chunk = _HORIZONTAL_WHITESPACE_PATTERN.sub(" ", chunk)
        chunk = _BLANK_LINE_RUN_PATTERN.sub("\n\n", chunk)

        # 4) Drop whitespace left over at either end.
        cleaned_chunks.append(chunk.strip())

    return cleaned_chunks



# --- Configuration ---------------------------------------------------------
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
MAX_TOKENS = 2000  # token limit handed to the chunker's tokenizer
OLLAMA_MODEL_NAME = "chunker_full_doc"
CHUNKS_WITH_METADATA_FILE_NAME = "sliding_chunks_with_metadata.json"
INPUT_DIR = "input"

# --- Docling converter and chunker ------------------------------------------
converter = DocumentConverter()
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
    max_tokens=MAX_TOKENS,  # Optional, uses the max token number of the HF tokenizer by default
)
chunker = HybridChunker(
    tokenizer=tokenizer,
    merge_peers=True,  # Optional, defaults to true
)

# --- Resume support: load chunks written by earlier runs --------------------
try:
    with open(CHUNKS_WITH_METADATA_FILE_NAME, "r", encoding="utf-8") as f:
        processed_chunks = json.load(f)
except FileNotFoundError:
    processed_chunks = []
    print(f"No existing {CHUNKS_WITH_METADATA_FILE_NAME} file found, starting fresh.")

# New chunks are appended to a copy so the loaded list stays untouched.
chunks_with_metadata = processed_chunks.copy()

# NOTE(review): a document counts as processed as soon as ONE of its chunks is
# in the file. If partial results of an interrupted run were saved, the rest
# of that document is never generated - confirm this is acceptable.
processed_studies = {chunk["document"] for chunk in processed_chunks}

# os.listdir() returns entries in arbitrary order, so sort for a reproducible
# processing order. The extension check ignores case so "*.PDF" is picked up.
study_names = sorted(
    name
    for name in os.listdir(INPUT_DIR)
    if name.lower().endswith(".pdf") and name not in processed_studies
)
print(f"Found {len(processed_studies)} studies which are already processed.\nStudies which STILL need to be processed: {len(study_names)}:\n{study_names}...")


Found 25 studies which are already processed.
Studies which STILL need to be processed: 0:
[]...


In [None]:
import numpy as np


def _load_original_texts(path):
    """Return the 'original_text' value of every chunk record in the JSON file at *path*."""
    with open(path, "r", encoding="utf-8") as handle:
        records = json.load(handle)
    return [record['original_text'] for record in records]


def _count_tokens(texts):
    """Return, for each text, the number of tokens produced by `tokenizer.tokenizer.tokenize`."""
    tokenize = tokenizer.tokenizer.tokenize
    return [len(tokenize(text)) for text in texts]


# Chunk texts of the two document categories that are compared below.
tobacco_chunks = _load_original_texts("tobacco_sliding_chunks_with_metadata.json")
scientific_chunks = _load_original_texts("sliding_chunks_with_metadata.json")

# Token count of every chunk, per category.
tobacco_chunk_token_length = _count_tokens(tobacco_chunks)
scientific_chunk_token_length = _count_tokens(scientific_chunks)

tobacco_mean = np.mean(np.array(tobacco_chunk_token_length))
scientific_mean = np.mean(np.array(scientific_chunk_token_length))
print(f"Average chunk token count in different categories of documents\nTobacco: \t\t{tobacco_mean}\nScientific papers:\t{scientific_mean}")

# PREVIOUS VALUES
# Average chunk token count in different categories of documents
# Tobacco: 		420.94444444444446
# Scientific papers:	671.1837270341207

# VALUES WITH NEW ALGORITHM ON SCIENTIFIC PAPERS
# Average chunk token count in different categories of documents
# Tobacco: 		420.94444444444446
# Scientific papers:	663.5811518324607

Average chunk token count in different categories of documents
Tobacco: 		420.94444444444446
Scientific papers:	663.5811518324607


# Creating chunks and adding Metadata

Each chunk is also given a short semantic context generated with Ollama, following Anthropic's contextual-retrieval approach.

In [None]:
def _context_window_bounds(chunk_index, chunk_count, radius):
    """Return inclusive (start, end) indices of the context window around a chunk.

    The window ideally spans `radius` chunks on each side of `chunk_index`.
    When it would run past one end of the document, the missing chunks are
    taken from the other side instead, clamped to the valid index range.
    """
    last_index = chunk_count - 1
    wanted_start = chunk_index - radius
    wanted_end = chunk_index + radius

    start = max(0, wanted_start)
    end = min(last_index, wanted_end)

    if wanted_start < 0:  # Near the start of the document: extend the window at the end
        end = min(last_index, end - wanted_start)
    if wanted_end > last_index:  # Near the end of the document: extend the window at the start
        start = max(0, start - (wanted_end - last_index))
    return start, end


# The window size does not depend on the document or chunk, so compute it once.
context_length = 16_000  # Reduce window to save memory
# Reserve space for the chunk itself (twice: the window contains the chunk,
# and the chunk is sent again as its own message).
context_length = context_length - 2 * MAX_TOKENS
# Number of neighbouring chunks on EACH side of the current chunk.
total_context_chunk_number = context_length // (MAX_TOKENS * 2)

for source in tqdm(study_names, desc="Chunking documents..."):
    doc = converter.convert(f"{INPUT_DIR}/{source}").document
    chunks = list(chunker.chunk(dl_doc=doc))
    chunks_str = clean_docling_chunk_strings([chunk.text for chunk in chunks])

    # Free up CUDA memory right after we got the results from Docling, so that Ollama can use the entire GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    chunk_progress = tqdm(chunks, desc=f"Adding context for chunks of {source[:20]}...", leave=False)
    # enumerate() gives the true position of each chunk. Looking the chunk up
    # with chunks.index() is O(n) and returns the FIRST equal chunk, which is
    # the wrong position when a document contains duplicated chunks.
    for chunk_index, chunk in enumerate(chunk_progress):
        chunk_text = chunks_str[chunk_index]

        start_index, end_index = _context_window_bounds(
            chunk_index, len(chunks), total_context_chunk_number
        )
        # Same prompt text as before: every window chunk is preceded by one space.
        entire_doc = "FULL DOCUMENT:\n " + " ".join(chunks_str[start_index:end_index + 1])
        ollama_prompt = f"CHUNK:\n{chunk_text}"
        history = [
            {'role': 'user', 'content': entire_doc},
            {'role': 'user', 'content': ollama_prompt},
        ]

        response = ollama.chat(
            model=OLLAMA_MODEL_NAME,
            messages=history,
        )
        context = response['message']['content']

        # ---- ANTHROPIC'S APPROACH TO CONTEXT ----
        # The context is PREPENDED to the chunk as per Anthropic's original algorithm.
        # (Alternative tried earlier: chunk first, context appended after it.)
        text_to_embed = context + "\n\n" + chunk_text

        # Page numbers of all document items the chunk was built from.
        pages = {
            prov.page_no
            for doc_item in chunk.meta.doc_items
            for prov in doc_item.prov
        }
        # The id is derived from the cleaned chunk text only, so identical
        # text always gets the same id.
        chunk_id = hashlib.sha256(chunk_text.encode()).hexdigest()
        chunks_with_metadata.append({
            'text': text_to_embed,
            'original_text': chunk_text,
            'context': context,
            'document': source,
            'pages': sorted(pages),  # sorted for a deterministic order
            'id': chunk_id,
        })

    # Free up ollama from GPU memory so that Docling can semantically analyze the next doc even if it's like 100 pages
    # NOTE(review): check=True aborts the whole run if `ollama stop` fails - confirm that is intended.
    subprocess.run(["ollama", "stop", OLLAMA_MODEL_NAME], check=True)

In [4]:
# Save the processed chunks in case the VectorDB upload goes wrong.
# Since this is a notebook, this cell can also be run after an interrupted
# chunking run to keep the partial results.
# Chunks already stored in the file are kept; only chunks whose id is not in
# the file yet are appended. If the file does not exist, it is created.
try:
    with open(CHUNKS_WITH_METADATA_FILE_NAME, "r", encoding="utf-8") as f:
        existing_data = json.load(f)
except FileNotFoundError:
    existing_data = []
else:
    print(f"Appending to existing {CHUNKS_WITH_METADATA_FILE_NAME} file.")

# Avoid duplicate entries by id.
# NOTE(review): the id is a hash of the chunk text only, so a chunk whose text
# is identical to an already stored one is dropped here even if it belongs to
# a different document - confirm this is intended.
existing_ids = {chunk['id'] for chunk in existing_data}
new_chunks = [chunk for chunk in chunks_with_metadata if chunk['id'] not in existing_ids]
chunks_with_metadata = existing_data + new_chunks

# Write to a temporary file and swap it in afterwards, so an interruption
# during the dump cannot truncate or corrupt the existing results.
temp_file_name = f"{CHUNKS_WITH_METADATA_FILE_NAME}.tmp"
with open(temp_file_name, "w", encoding="utf-8") as f:
    json.dump(chunks_with_metadata, f, ensure_ascii=False, indent=2)
os.replace(temp_file_name, CHUNKS_WITH_METADATA_FILE_NAME)

print(f"Results saved to {CHUNKS_WITH_METADATA_FILE_NAME}")

Results saved to sliding_chunks_with_metadata.json


# REORDER CONTEXT AND CHUNK

In [None]:
# CONVENIENCE STEP: Prepare the chunks with metadata file for Anthropic's original approach (context PREPENDED to chunk)
# # CHUNKS_WITH_METADATA_FILE_NAME = "tobacco_sliding_chunks_with_metadata.json"
# # if os.path.exists(CHUNKS_WITH_METADATA_FILE_NAME):
# #     with open(CHUNKS_WITH_METADATA_FILE_NAME, "r", encoding="utf-8") as f:
# #         chunks_with_metadata = json.load(f)

# #     for chunk in chunks_with_metadata:
# #         chunk['text'] = chunk['context'] + "\n\n" + chunk['original_text']

# #     with open(f"anthropic_{CHUNKS_WITH_METADATA_FILE_NAME}", "w", encoding="utf-8") as f:
# #         json.dump(chunks_with_metadata, f, ensure_ascii=False, indent=2)


# Creating Database

In [None]:
# Embedding function used by LanceDB; runs on the GPU when one is available.
registry = get_registry()
hf = registry.get("huggingface").create(
    name=EMBEDDING_MODEL_NAME,
    trust_remote_code=True,
    device="cuda" if torch.cuda.is_available() else "cpu",
)


class MyDocument(LanceModel):
    """Row schema of the chunk table.

    The fields mirror the keys of the chunk records stored in the JSON file;
    `vector` is not part of those records and is declared via `hf.VectorField()`.
    """

    text: str = hf.SourceField()  # source text for the embedding function
    vector: Vector(hf.ndims()) = hf.VectorField()  # embedding, sized by the model's dimension count
    original_text: str
    context: str
    document: str
    pages: list[int]  # Any additional metadata
    id: str  # Unique identifier for the chunk


# NOTE(review): the table name says "tobacco" while the data comes from
# CHUNKS_WITH_METADATA_FILE_NAME - confirm the name matches the loaded file.
TABLE_NAME = "my_anthropic_sliding_tobacco_table"

db = lancedb.connect("./db")
# mode="overwrite" replaces any existing table of this name, so every run of
# this cell rebuilds the table from the JSON file. create_table() already
# returns the table, so it does not have to be reopened.
table = db.create_table(TABLE_NAME, schema=MyDocument, mode="overwrite")

# Upload in batches with progress bar
with open(CHUNKS_WITH_METADATA_FILE_NAME, "r", encoding="utf-8") as f:
    chunks_with_metadata = json.load(f)

batch_size = 100
for batch_start in tqdm(range(0, len(chunks_with_metadata), batch_size), desc="Uploading chunks to VectorDB"):
    table.add(chunks_with_metadata[batch_start:batch_start + batch_size])

table.create_scalar_index("id", replace=True)  # Index based on the chunk's id, used to manually prevent duplicates

reranker = ColbertReranker()
table.create_fts_index("text", replace=True)  # Full-text index on "text", the column queried by the hybrid search
table.wait_for_index(["text_idx"])  # Wait for the indexing to finish

<All keys matched successfully>
[90m[[0m2026-01-04T16:41:20Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /home/martin/projects/Quantwise/Quantwise-Chunking/db/my_anthropic_sliding_tobacco_table.lance, it will be created


Uploading chunks to VectorDB:   0%|          | 0/2 [00:00<?, ?it/s]

<All keys matched successfully>
<All keys matched successfully>


Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


# Example query

In [5]:
prompt = "How was stock market data gathered?"

# Hybrid query over the "vector" column and the full-text index on "text",
# reranked with the reranker created above, limited to 3 rows.
hybrid_query = table.search(
    prompt,
    query_type="hybrid",
    vector_column_name="vector",
    fts_columns="text",
)
results = hybrid_query.rerank(reranker=reranker).limit(3).to_pandas()


results

<All keys matched successfully>


Unnamed: 0,text,vector,original_text,context,document,pages,id,_relevance_score
0,Details the collection and organization of dat...,"[0.8555312, 1.101829, -3.8134143, -0.13130203,...",We collected stock market-related information ...,Details the collection and organization of dat...,Stock_Market_Prediction_via_Multi-Source_Multi...,[6],4cf733a743ce1b6eb4e3c41e23b999ed51cd3d280449ef...,1.001428
1,Provides a detailed methodology for stock mark...,"[0.6753454, 0.95625216, -2.8495104, -0.5628190...",Experimental design. Our paper relates to rese...,Provides a detailed methodology for stock mark...,s41598-020-77823-3.pdf,"[4, 5]",4eecb9240c936f76259c30feaf4292800c84483b696ec2...,0.983982
2,This chunk introduces the multi-source data in...,"[0.2004797, 1.7068079, -3.6609893, -0.33517683...","Stock markets are impacted by various factors,...",This chunk introduces the multi-source data in...,Stock_Market_Prediction_via_Multi-Source_Multi...,[3],39c825d2635eebb324ba7686fde1bb602ed93eea99b7c3...,0.859257


In [8]:
# Show the value in the first column of the first returned row.
results.iloc[0,0]

'We collected stock market-related information from Jan. 1, 2015 to Dec. 31, 2016, and separate the information into two data sets, one for the year 2015 and the other for 2016. The data consist of three parts, the historical quantitative data, the news articles and the posts on the social network, which are introduced in detail as follows.\n- GLYPH<15> Quantitative data : the source of quantitative data is Wind, 2 a widely used GLYPH<28>nancial information service provider in China. The data we collect are the average prices, market index change and turnover rate of the Shanghai Composite Index in each trading day.\n- GLYPH<15> News data : we collect the news articles on the macro economy through Wind, and get 38,727 and 39,465 news articles in 2015 and 2016 respectively. The news articles are aggregated by Wind from major GLYPH<28>nancial news websites in China, such as and We process the news titles rather than the whole articles to extract the events, as the main topic of a news ar

In [9]:
# Show the table's statistics (total bytes, row count, index count, fragment stats).
table.stats()

{'total_bytes': 3504703,
 'num_rows': 382,
 'num_indices': 2,
 'fragment_stats': {'num_fragments': 4,
  'num_small_fragments': 4,
  'lengths': {'min': 82,
   'max': 100,
   'mean': 95,
   'p25': 100,
   'p50': 100,
   'p75': 100,
   'p99': 100}}}