# Chunking

In [None]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
import torch
import hashlib
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import ColbertReranker
import ollama
import os
import json
from tqdm.notebook import tqdm

# Embedding model; its tokenizer is also the one used to size the chunks.
embedding_model_name = "nomic-ai/nomic-embed-text-v1.5"
# Token limit handed to the chunker's tokenizer.
MAX_TOKENS = 2000

converter = DocumentConverter()
# max_tokens is optional: by default the max token number of the HF tokenizer is used.
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained(embedding_model_name),
    max_tokens=MAX_TOKENS,
)
# merge_peers is optional and defaults to true.
chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True)

# Every PDF in the "input" folder is a candidate study.
study_names = [entry for entry in os.listdir("input") if entry.endswith('.pdf')]

# Resume from the chunks saved by an earlier run, if there are any.
processed_chunks = []
try:
    with open("chunks_with_metadata.json", "r", encoding="utf-8") as f:
        processed_chunks = json.load(f)
except FileNotFoundError:
    print("No existing chunks_with_metadata.json file found, starting fresh.")

# New chunks are appended to a copy, leaving the loaded list untouched.
chunks_with_metadata = list(processed_chunks)
processed_studies = {entry["document"] for entry in processed_chunks}

# A study counts as processed as soon as at least one of its chunks was saved.
study_names = [name for name in study_names if name not in processed_studies]
print(f"Found {len(processed_studies)} studies which are already processed.\nStudies which STILL need to be processed: {len(study_names)}:\n{study_names}...")


# Creating chunks and adding Metadata

Each chunk is also enriched with a semantic context generated with Ollama (Anthropic-style contextual retrieval).

In [None]:
def _context_window_bounds(chunk_index, chunk_count, radius):
    """Return the inclusive ``(start, end)`` chunk indices of the context window.

    The window is centred on ``chunk_index`` and reaches ``radius`` chunks to
    each side. When it would run past one end of the document, the unused
    budget is moved to the other side, so the window keeps its full size of
    ``2 * radius + 1`` chunks whenever the document is long enough.

    Args:
        chunk_index: Index of the chunk the context is built for.
        chunk_count: Total number of chunks in the document (must be >= 1).
        radius: Number of neighbouring chunks wanted on each side.

    Returns:
        A ``(start, end)`` tuple of valid, inclusive indices.
    """
    start = chunk_index - radius
    end = chunk_index + radius
    if start < 0:
        # At the start of the document: add the missing chunks at the end instead.
        end -= start
        start = 0
    last_index = chunk_count - 1
    if end > last_index:
        # At the end of the document: add the missing chunks at the start instead.
        start -= end - last_index
        end = last_index
    # A document shorter than the window simply yields all of its chunks.
    return max(0, start), end


# The window size is the same for every chunk, so it is computed once.
context_length = 16000  # This is the window we are working with
# Reserve space for the chunk itself (twice, since the context contains the chunk as well).
context_length = context_length - 2 * MAX_TOKENS
# Number of neighbouring chunks on EACH side (hence the 2x: before and after the chunk).
total_context_chunk_number = context_length // (MAX_TOKENS * 2)

for source in tqdm(study_names, desc="Chunking documents..."):
    doc = converter.convert(f"input/{source}").document
    chunks = list(chunker.chunk(dl_doc=doc))

    # enumerate() supplies the position directly; chunks.index(chunk) would scan
    # the list for every chunk and return the first match for duplicated chunks.
    for chunk_index, chunk in enumerate(
        tqdm(chunks, desc=f"Adding context for chunks of {source[:20]}...", leave=False)
    ):
        window_start, window_end = _context_window_bounds(
            chunk_index, len(chunks), total_context_chunk_number
        )

        # Sliding window of neighbouring chunks (each one prefixed with a space).
        entire_doc = "FULL DOCUMENT:\n" + "".join(
            " " + neighbour.text for neighbour in chunks[window_start:window_end + 1]
        )

        ollama_prompt = f"CHUNK:\n{chunk.text}"
        history = [{'role': 'user', 'content': entire_doc}, {'role': 'user', 'content': ollama_prompt}]

        response = ollama.chat(
            model="chunker_full_doc",
            messages=history
        )
        context = response['message']['content']
        # The context is put AFTER the chunk text (original note: to not mess up cosine similarity).
        text_to_embed = chunk.text + "\n\n" + context

        # Page numbers of every document item the chunk was built from.
        pages = {
            prov.page_no
            for doc_item in chunk.meta.doc_items
            for prov in doc_item.prov
        }
        # NOTE: the id is derived from the chunk text only, so identical text in
        # two different documents produces the same id.
        chunk_id = hashlib.sha256(chunk.text.encode()).hexdigest()
        chunks_with_metadata.append({
            'text': text_to_embed,
            'original_text': chunk.text,
            'context': context,
            'document': source,
            'pages': sorted(pages),  # Sorted so the saved output is deterministic
            'id': chunk_id,
        })

In [None]:
# Save the processed chunks in case the VectorDB upload goes wrong.
# Luckily since this is a notebook, if the chunking is interrupted, we can still save the partial results here.
# This file is also what the notebook loads to resume an earlier run, so it is
# written to a temporary file first and then swapped in: an interrupted dump
# can then no longer leave a truncated chunks_with_metadata.json behind.
checkpoint_path = "chunks_with_metadata.json"
temporary_path = checkpoint_path + ".tmp"
with open(temporary_path, "w", encoding="utf-8") as f:
    json.dump(chunks_with_metadata, f, ensure_ascii=False, indent=2)
os.replace(temporary_path, checkpoint_path)

# Creating Database

In [None]:
# Run the embedding model on the GPU when CUDA is available, otherwise on the CPU.
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

# Create the Hugging Face embedding function through LanceDB's registry.
registry = get_registry()
hf = registry.get("huggingface").create(
    name=embedding_model_name,
    trust_remote_code=True,
    device=embedding_device,
)


# Define model
class MyDocument(LanceModel):
    """Schema of one table row: a chunk, its embedding, and its metadata.

    The non-vector field names match the keys of the dicts collected in
    ``chunks_with_metadata``.
    """

    # Declared as the source field of the ``hf`` embedding function; in this
    # notebook it holds the chunk text followed by its generated context.
    text: str = hf.SourceField()
    # Declared as the vector field of ``hf``; its size comes from ``hf.ndims()``.
    vector: Vector(hf.ndims()) = hf.VectorField()
    original_text: str  # Chunk text without the generated context
    context: str  # Context generated for the chunk
    document: str  # File name of the PDF the chunk comes from
    pages: list[int]  # Page numbers of the items the chunk was built from
    id: str  # Unique identifier for the chunk (SHA-256 hex digest of the chunk text)




# Connect to the LanceDB database stored under "./db".
db = lancedb.connect("./db")

# The ingestion steps below are intentionally left commented out; they only
# have to be run when the table is (re)built.
# db.create_table("my_table", schema=MyDocument, mode="overwrite") # Uncomment this line when running this cell for the first time
# table = db.open_table("my_table")

# Upload in batches with progress bar
# batch_size = 100
# for i in tqdm(range(0, len(chunks_with_metadata), batch_size), desc="Uploading chunks to VectorDB"):
#     batch = chunks_with_metadata[i:i+batch_size]
#     table.add(batch)

# table.create_scalar_index("id", replace=True) # Index based on the chunk's id, used to manually prevent duplicates

# Reranker that is applied to the search results of the example query.
reranker = ColbertReranker()
# table.create_fts_index("text", replace=True) # Used by the reranker as well as the hybrid search's BM25 index
# table.wait_for_index(["text_idx"])  # Wait for the indexing to finish

# Example query

In [None]:
table = db.open_table("my_table")
prompt = "How was the stock-market related information collected?"

# Hybrid search over the "vector" and "text" columns, reranked, keeping the
# top 3 hits as a pandas DataFrame.
results = (
    table.search(prompt, query_type="hybrid", vector_column_name="vector", fts_columns="text")
    .rerank(reranker=reranker)
    .limit(3)
    .to_pandas()
)


# Last expression of the cell, so the notebook displays the DataFrame.
results