# Chunking

In [1]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
import torch
import hashlib
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import ColbertReranker
import ollama
import os
import json
from tqdm.notebook import tqdm

embedding_model_name = "nomic-ai/nomic-embed-text-v1.5"
MAX_TOKENS = 2000

converter = DocumentConverter()
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained(embedding_model_name),
    max_tokens=MAX_TOKENS # Optional, uses the max token number of the HF tokenizer by default
)
chunker = HybridChunker(
    tokenizer=tokenizer,
    merge_peers=True #Optional, defaults to true
)

study_names = [f for f in os.listdir("input") if f.endswith('.pdf')]
processed_chunks=[]
try:
    with open("sliding_chunks_with_metadata.json", "r", encoding="utf-8") as f:
        processed_chunks = json.load(f)
except FileNotFoundError:
    print("No existing chunks_with_metadata.json file found, starting fresh.")
    

chunks_with_metadata = processed_chunks.copy()
processed_studies = set(chunk["document"] for chunk in processed_chunks)

study_names = [f for f in study_names if f not in processed_studies]
print(f"Found {len(processed_studies)} studies which are already processed.\nStudies which STILL need to be processed: {len(study_names)}:\n{study_names}...")


Found 25 studies which are already processed.
Studies which STILL need to be processed: 0:
[]...


# Creating chunks and adding Metadata

As well as semantic context with ollama (Anthropic style)

In [2]:
for source in tqdm(study_names, desc="Chunking documents..."):
    # Free up CUDA memory between documents
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        
    entire_doc = ""
    doc = converter.convert(f"input/{source}").document
    chunks = list(chunker.chunk(dl_doc=doc))
    # for chunk in chunks:                  # This is leftover from before we implemented sliding window for context provision
    #     entire_doc += " " +chunk.text

    entire_doc = "FULL DOCUMENT:\n" + entire_doc

    

    for chunk in tqdm(chunks, desc=f"Adding context for chunks of {source[:20]}...", leave=False):    
        entire_doc = ""
        chunk_index = chunks.index(chunk)

        context_length = 16_000 # Reduce window to save memory
        context_length = context_length - 2 * MAX_TOKENS # We need to reserve space for the chunk itself (twice, the context contains the chunk itself)
        total_context_chunk_number = context_length // (MAX_TOKENS*2) # 2x, cuz before and after the chunk

        start_index_original = chunk_index - total_context_chunk_number
        start_index_truncated = max(0, start_index_original) # Avoid index out of bounds

        end_index_original = chunk_index + total_context_chunk_number
        end_index_truncated = min(len(chunks)-1, end_index_original)

        if start_index_original < 0: # We are at the start of the document, so we need to add more chunks at the end
            end_index_truncated = min(len(chunks)-1, end_index_truncated + abs(start_index_original))
        if end_index_original > len(chunks)-1: # We are at the end of the document, so we need to add more chunks at the start
            start_index_truncated = max(0, start_index_truncated + abs(end_index_original - end_index_truncated))

        for i in range(start_index_truncated, end_index_truncated + 1):
            entire_doc += " " + chunks[i].text

        entire_doc = "FULL DOCUMENT:\n" + entire_doc

        ollama_prompt = f"CHUNK:\n{chunk.text}"
        history =  [{'role': 'user', 'content': entire_doc}, {'role': 'user', 'content': ollama_prompt}]

        response = ollama.chat(
            model="chunker_full_doc",
            messages=history
        )
        context = response['message']['content']
        # print(f"Context for chunk: {context}")
        text_to_embed = chunk.text + "\n\n" + context # We put the context AFTER the chunk to not mess up cosine similarity but still

        pages = set(
                prov.page_no
                for doc_item in chunk.meta.doc_items
                for prov in doc_item.prov
            )
        id = hashlib.sha256(chunk.text.encode()).hexdigest()
        chunks_with_metadata.append({'text': text_to_embed, 'original_text':chunk.text, 'context':context, 'document':source, 'pages':list(pages), 'id': id})

Chunking documents...:   0%|          | 0/10 [00:00<?, ?it/s]

Adding context for chunks of srep04487.pdf...:   0%|          | 0/11 [00:00<?, ?it/s]

Adding context for chunks of srep03578.pdf...:   0%|          | 0/10 [00:00<?, ?it/s]

Adding context for chunks of srep01684.pdf...:   0%|          | 0/8 [00:00<?, ?it/s]

Adding context for chunks of s41586-019-1138-y.pd...:   0%|          | 0/33 [00:00<?, ?it/s]

Adding context for chunks of s41598-020-77823-3.p...:   0%|          | 0/13 [00:00<?, ?it/s]

Adding context for chunks of Electron_Paramagneti...:   0%|          | 0/12 [00:00<?, ?it/s]

Adding context for chunks of Thermal_Imagery_for_...:   0%|          | 0/21 [00:00<?, ?it/s]

Adding context for chunks of The_Graph_Database_J...:   0%|          | 0/10 [00:00<?, ?it/s]

Adding context for chunks of The_Application_of_t...:   0%|          | 0/17 [00:00<?, ?it/s]

Adding context for chunks of Realization_of_a_Rub...:   0%|          | 0/11 [00:00<?, ?it/s]

In [3]:
# Save the the processed chunks in case VectorDB upload goes wrong.
# Luckily since this is a notebook, if the chunking is interrupted, we can still save the partial results here.
# Append new chunks to the existing file if it exists, otherwise create it
if os.path.exists("sliding_chunks_with_metadata.json"):
    print("Appending to existing sliding_chunks_with_metadata.json file.")
    with open("sliding_chunks_with_metadata.json", "r", encoding="utf-8") as f:
        existing_data = json.load(f)
    # Avoid duplicate entries by id
    existing_ids = {chunk['id'] for chunk in existing_data}
    new_chunks = [chunk for chunk in chunks_with_metadata if chunk['id'] not in existing_ids]
    chunks_with_metadata = existing_data + new_chunks

with open("sliding_chunks_with_metadata.json", "w", encoding="utf-8") as f:
    json.dump(chunks_with_metadata, f, ensure_ascii=False, indent=2)

print("Results saved to sliding_chunks_with_metadata.json")

Appending to existing sliding_chunks_with_metadata.json file.
Results saved to sliding_chunks_with_metadata.json


# Creating Database

In [2]:
registry = get_registry()
hf = registry.get("huggingface").create(name=embedding_model_name, trust_remote_code=True, device="cuda" if torch.cuda.is_available() else "cpu")


# Define model
class MyDocument(LanceModel):
    text: str = hf.SourceField()
    vector: Vector(hf.ndims()) = hf.VectorField()
    original_text: str
    context: str
    document: str
    pages: list[int]  # Any additional metadata
    id: str  # Unique identifier for the chunk




db = lancedb.connect("./db")
db.create_table("my_sliding_table", schema=MyDocument, mode="overwrite") # Uncomment this line when running this cell for the first time
table = db.open_table("my_sliding_table")

# Upload in batches with progress bar
batch_size = 100
for i in tqdm(range(0, len(chunks_with_metadata), batch_size), desc="Uploading chunks to VectorDB"):
    batch = chunks_with_metadata[i:i+batch_size]
    table.add(batch)

table.create_scalar_index("id", replace=True) # Index based on the chunk's id, used to manually prevent duplicates

reranker = ColbertReranker()
table.create_fts_index("text", replace=True) # Used by the reranker as well as the hybrid search's BM25 index
table.wait_for_index(["text_idx"])  # Wait for the indexing to finish

<All keys matched successfully>


Uploading chunks to VectorDB:   0%|          | 0/4 [00:00<?, ?it/s]

<All keys matched successfully>
<All keys matched successfully>
 Retrying in 2.9652224071656823 seconds (retry 1 of 7) 



Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


# Example query

In [7]:
table = db.open_table("my_table")
prompt = "How was the stock-market related information collected?"
results = table.search(prompt, query_type="hybrid", vector_column_name="vector", fts_columns="text") \
            .rerank(reranker=reranker) \
            .limit(3) \
            .to_pandas()


results



Unnamed: 0,text,vector,original_text,context,document,pages,id,_relevance_score
0,We collected stock market-related information ...,"[0.5996058, 1.1112232, -3.3586943, -0.36808917...",We collected stock market-related information ...,"Details the data collection process, specifica...",Stock_Market_Prediction_via_Multi-Source_Multi...,[6],08aa975b95d5c9c782b71fcb335e2ab61751bb4c4f6bb1...,0.878606
1,Experimental design. Our paper relates to rese...,"[0.62975305, 1.1866554, -2.867439, -0.6627391,...",Experimental design. Our paper relates to rese...,"This section details the experimental design, ...",s41598-020-77823-3.pdf,"[4, 5]",a3a29e665eb3215584b2cfab75bcaa99692dd7f3757763...,0.76636
2,"XI ZHANG 1 , (Member, IEEE), SIYU QU 1 , JIEYU...","[-0.1864265, 1.2888607, -3.9965787, -0.2823099...","XI ZHANG 1 , (Member, IEEE), SIYU QU 1 , JIEYU...",Introduces the core problem and proposed solut...,Stock_Market_Prediction_via_Multi-Source_Multi...,[1],9ed6907515207f4a5e5cd48f826d4d5e8cc95ce11d0377...,0.600578


In [8]:
results.iloc[0,0]

'We collected stock market-related information from Jan. 1, 2015 to Dec. 31, 2016, and separate the information into two data sets, one for the year 2015 and the other for 2016. The data consist of three parts, the historical quantitative data, the news articles and the posts on the social network, which are introduced in detail as follows.\n- GLYPH<15> Quantitative data : the source of quantitative data is Wind, 2 a widely used GLYPH<28>nancial information service provider in China. The data we collect are the average prices, market index change and turnover rate of the Shanghai Composite Index in each trading day.\n- GLYPH<15> News data : we collect the news articles on the macro economy through Wind, and get 38,727 and 39,465 news articles in 2015 and 2016 respectively. The news articles are aggregated by Wind from major GLYPH<28>nancial news websites in China, such as http://GLYPH<28>nance.sina.com.cn and http://www.hexun.com. We process the news titles rather than the whole articl