# Chunking

In [1]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
import torch
import hashlib
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import ColbertReranker
import ollama
import os
import json
from tqdm.notebook import tqdm
import re, unicodedata
import subprocess


def clean_docling_chunk_strings(chunks):
    cleaned_chunks = []
    
    for chunk in chunks:
        # 2️⃣ Normalize Unicode and replace problematic punctuation
        chunk = unicodedata.normalize("NFKD", chunk).replace("\u00A0", " ")
        chunk = chunk.translate(str.maketrans({
            "–": "-", "—": "-", "‘": "'", "’": "'", "“": '"', "”": '"'
        }))

        # 3️⃣ Remove URLs (massive tokenizers killers)
        chunk = re.sub(r"http\S+", "", chunk)

        # 4️⃣ Normalize whitespace but preserve paragraphs
        chunk = re.sub(r"[ \t]+", " ", chunk)
        chunk = re.sub(r"\n\s*\n", "\n\n", chunk)  # merge single newlines, keep double
        chunk = chunk.strip()

        cleaned_chunks.append(chunk)

    return cleaned_chunks



EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"
MAX_TOKENS = 2000
OLLAMA_MODEL_NAME= "anthropic_chunking"
CHUNKS_WITH_METADATA_FILE_NAME = "preprocessed_chunks/anthropic_control_chunks_with_metadata.json"
INPUT_DIR = "input"
TABLE_NAME = "anthropic_control_table"


converter = DocumentConverter()
tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME),
    max_tokens=MAX_TOKENS # Optional, uses the max token number of the HF tokenizer by default
)
chunker = HybridChunker(
    tokenizer=tokenizer,
    merge_peers=True #Optional, defaults to true
)

study_names = [f for f in os.listdir(INPUT_DIR) if f.endswith('.pdf')]
processed_chunks=[]
try:
    with open(CHUNKS_WITH_METADATA_FILE_NAME, "r", encoding="utf-8") as f:
        processed_chunks = json.load(f)
except FileNotFoundError:
    print(f"No existing {CHUNKS_WITH_METADATA_FILE_NAME} file found, starting fresh.")
    

chunks_with_metadata = processed_chunks.copy()
processed_studies = set(chunk["document"] for chunk in processed_chunks)

study_names = [f for f in study_names if f not in processed_studies]
print(f"Found {len(processed_studies)} studies which are already processed.\nStudies which STILL need to be processed: {len(study_names)}:\n{study_names}...")


No existing preprocessed_chunks/anthropic_control_chunks_with_metadata.json file found, starting fresh.
Found 0 studies which are already processed.
Studies which STILL need to be processed: 25:
['Stock_Market_Prediction_via_Multi-Source_Multiple_Instance_Learning.pdf', 'A_Conceptual_Framework_and_Recommendations_for_Open_Data_and_Artifacts_in_Empirical_Software_Engineering.pdf', 'A_Hybrid_Gaze_Distance_Estimation_via_Cross-Reference_of_Vergence_and_Depth.pdf', 'A_Feature_Fusion_Based_Indicator_for_Training-Free_Neural_Architecture_Search.pdf', 'A_Resource_Allocation_Model_Based_on_Trust_Evaluation_in_Multi-Cloud_Environments.pdf', 'Quantitative_Evaluation_of_Line-Edge_Roughness_in_Various_FinFET_Structures_Bayesian_Neural_Network_With_Automatic_Model_Selection.pdf', 'Probabilistic_Artificial_Neural_Network_for_Line-Edge-Roughness-Induced_Random_Variation_in_FinFET.pdf', 'Transformation_of_Non-Euclidean_Space_to_Euclidean_Space_for_Efficient_Learning_of_Singular_Vectors.pdf', 'Ultrahig

# Creating chunks and adding Metadata

As well as semantic context with ollama (Anthropic style)

In [None]:
for source in tqdm(study_names, desc="Chunking documents..."):        
    doc = converter.convert(f"{INPUT_DIR}/{source}").document
    chunks = list(chunker.chunk(dl_doc=doc))
    chunks_str = [chunk.text for chunk in chunks]
    chunks_str = clean_docling_chunk_strings(chunks_str)
    entire_doc = " ".join(chunks_str)
    print(f"{len(entire_doc)=}\n{entire_doc[:500]=}...")

    # Free up CUDA memory right after we got the results from Docling, so that Ollama can use the entire GPU
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    for chunk in tqdm(chunks, desc=f"Adding context for chunks of {source[:20]}...", leave=False):    
        chunk_index = chunks.index(chunk)

        entire_doc = "FULL DOCUMENT:\n" + entire_doc
        ollama_prompt = f"CHUNK:\n{chunks_str[chunk_index]}"
        history =  [{'role': 'user', 'content': entire_doc}, {'role': 'user', 'content': ollama_prompt}]

        response = ollama.chat(
            model=OLLAMA_MODEL_NAME,
            messages=history,
            options={
                "num_ctx": 30_000
            }
        )
        context = response['message']['content']

        # ---- ANTHROPIC'S APPROACH TO CONTEXT ----
        text_to_embed = context + "\n\n" + chunks_str[chunk_index] # The context is PREPENDED to the chunk as per Anthropic's original algporithm
        # print(context)
        pages = set(
                prov.page_no
                for doc_item in chunk.meta.doc_items
                for prov in doc_item.prov
            )
        id = hashlib.sha256(chunks_str[chunk_index].encode()).hexdigest()
        chunks_with_metadata.append({'text': text_to_embed, 'original_text':chunks_str[chunk_index], 'context':context, 'document':source, 'pages':list(pages), 'id': id})
        
    # Free up ollama from GPU memory so that Docling can semantically analyze the next doc even if it's like 100 pages
    subprocess.run(["ollama", "stop", OLLAMA_MODEL_NAME], check=True)
# Total runtime: 71m 34s for 25 documents

Chunking documents...:   0%|          | 0/25 [00:00<?, ?it/s]

[32m[INFO] 2026-01-16 20:25:02,347 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-16 20:25:02,348 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-16 20:25:02,356 [RapidOCR] download_file.py:60: File exists and is valid: /home/martin/projects/Quantwise/Quantwise-Chunking/.venv/lib/python3.14/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-16 20:25:02,357 [RapidOCR] main.py:50: Using /home/martin/projects/Quantwise/Quantwise-Chunking/.venv/lib/python3.14/site-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2026-01-16 20:25:02,555 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2026-01-16 20:25:02,555 [RapidOCR] device_config.py:57: Using GPU device with ID: 0[0m
[32m[INFO] 2026-01-16 20:25:02,556 [RapidOCR] download_file.py:60: File exists and is valid: /home/martin/projects/Quantwise/Quantwise-Chunking/.venv/lib/python3.14/site-packages/rapidocr/models/ch_pt

len(entire_doc)=43341
entire_doc[:500]='Received July 29, 2018, accepted August 27, 2018, date of publication September 13, 2018, date of current version October 8, 2018.\nDigital Object Identifier 10.1 109/ACCESS.2018.2869735 XI ZHANG 1 , (Member, IEEE), SIYU QU 1 , JIEYUN HUANG 1 , BINXING FANG 1 , AND PHILIP YU 2 , (Fellow, IEEE)\n1 Key Laboratory of Trustworthy Distributed Computing and Service, Ministry of Education, Beijing University of Posts and Telecommunications,\nBeijing 100876, China\n2 Department of Computer Science, The Univ'...


Adding context for chunks of Stock_Market_Predict...:   0%|          | 0/18 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=50223
entire_doc[:500]='Per Runeson per.runeson@cs.lth.se Lund University Lund, Sweden Background. Open science aims to improve research accessibility, replicability, and consequently its quality. Empirical software engineering entails both data and artifacts, which may be shared more or less openly, to support transparency. However, the trade-offs involved in balancing the openness against integrity and secrecy concerns need methodological guidance. Aim. We aim to derive such advice, based on our own experiences from '...


Adding context for chunks of A_Conceptual_Framewo...:   0%|          | 0/22 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hRapidOCR returned empty result!


len(entire_doc)=37033
entire_doc[:500]='Date of publication xxxx 00, 0000, date of current version xxxx 00, 0000.\nDigital Object Identifier 10.1 109/ACCESS.2023.0322000 1 Korea Institute of Science and Technology, Seoul, South Korea\n2 KHU-KIST Department of Converging Science and Technology, Kyung Hee University, Seoul, South Korea\nCorresponding author: Min-Koo Kang (e-mail: minkoo@kist.re.kr).\nThis work was financially supported by the Institute of Civil-Military Technology Cooperation Program funded by the Defense Acquisition Progra'...


Adding context for chunks of A_Hybrid_Gaze_Distan...:   0%|          | 0/14 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=47813
entire_doc[:500]='Received August 2, 2021, accepted September 16, 2021, date of publication September 27, 2021, date of current version October 6, 2021.\nDigital Object Identifier 10.1 109/ACCESS.2021.31 1591 1 LINH-TAM TRAN , MUHAMMAD SALMAN ALI , AND SUNG-HO BAE , (Member, IEEE)\nDepartment of Computer Science and Engineering, Kyung Hee University, Yongin 17104, Republic of Korea\nCorresponding author: Sung-Ho Bae (shbae@khu.ac.kr)\nThis work was supported by the Technology Innovation Program or the Industrial Stra'...


Adding context for chunks of A_Feature_Fusion_Bas...:   0%|          | 0/26 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hRapidOCR returned empty result!
RapidOCR returned empty result!


len(entire_doc)=52286
entire_doc[:500]='Received July 9, 2021, accepted July 18, 2021, date of publication July 26, 2021, date of current version August 3, 2021.\nDigital Object Identifier 10.1 109/ACCESS.2021.3100316 A. B. M. BODRUL ALAM 1 , (Member, IEEE), ZUBAIR MD. FADLULLAH 1,2 , (Senior Member, IEEE), AND SALIMUR CHOUDHURY 2 , (Senior Member, IEEE)\n1 Thunder Bay Regional Health Research Institute (TBRHRI), Thunder Bay, ON P7B 7A5, Canada 2 Department of Computer Science, Lakehead University, Thunder Bay, ON P7B 5E1, Canada\nCorres'...


Adding context for chunks of A_Resource_Allocatio...:   0%|          | 0/18 [00:00<?, ?it/s]

RapidOCR returned empty result!
RapidOCR returned empty result!
RapidOCR returned empty result!


len(entire_doc)=26194
entire_doc[:500]='Received February 7, 2022, accepted February 22, 2022, date of publication March 2, 2022, date of current version March 11, 2022.\nDigital Object Identifier 10.1 109/ACCESS.2022.31561 18 SANGHO YU 1 , SANG MIN WON 1 , HYOUNG WON BAAC 1 , DONGHEE SON 1 , AND CHANGHWAN SHIN 2 , (Senior Member, IEEE)\n1 Department of Electrical and Computer Engineering, Sungkyunkwan University, Suwon 16419, Republic of Korea 2 School of Electrical Engineering, Korea University, Seoul 02841, Republic of Korea\nCorrespo'...


Adding context for chunks of Quantitative_Evaluat...:   0%|          | 0/9 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hRapidOCR returned empty result!


len(entire_doc)=26489
entire_doc[:500]='Received May 11, 2021, accepted June 6, 2021, date of publication June 11, 2021, date of current version June 22, 2021.\nDigital Object Identifier 10.1 109/ACCESS.2021.3088461 Department of Electrical and Computer Engineering, Sungkyunkwan University, Suwon 16419, Republic of Korea\nCorresponding author: Changhwan Shin (cshin@skku.edu)\nThis work was supported in part by the National Research Foundation of Korea (NRF) grant funded by the Korea Government Ministry of Science and ICT (MSIT) under Gra'...


Adding context for chunks of Probabilistic_Artifi...:   0%|          | 0/14 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=44706
entire_doc[:500]='Received June 20, 2020, accepted July 6, 2020, date of publication July 9, 2020, date of current version July 22, 2020.\nDigital Object Identifier 10.1 109/ACCESS.2020.3008195 Department of Electronic Engineering, Inha University, Incheon 22212, South Korea\nCorresponding author: Byung Cheol Song (bcsong@inha.ac.kr)\nThis work was supported in part by the Industrial Technology Innovation Program funded by the Ministry of Trade, Industry & Energy (MI, South Korea) (Development on Deep Learning based'...


Adding context for chunks of Transformation_of_No...:   0%|          | 0/21 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hRapidOCR returned empty result!
RapidOCR returned empty result!
RapidOCR returned empty result!
RapidOCR returned empty result!
RapidOCR returned empty result!
RapidOCR returned empty result!
RapidOCR returned empty result!


len(entire_doc)=44643
entire_doc[:500]='Daewoon Seong , Deokmin Jeon , Ruchire Eranga Wijesinghe , Kibeom Park , Hyeree Kim , Euimin Lee , Mansik Jeon , Member, IEEE , and Jeehyun Kim , Member, IEEE\nAbstract -The primary optimization of the imaging speed of optical coherence tomography (OCT) has been keenly studied. In order to overcome the major speed limitation of spectral-domain OCT (SD-OCT), we developed an ultrahigh-speed SD-OCT system, with an A-scan rate of up to 1 MHz, using the method of space-time-division multiplexing (STDM'...


Adding context for chunks of Ultrahigh-Speed_Spec...:   0%|          | 0/13 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hRapidOCR returned empty result!


len(entire_doc)=42424
entire_doc[:500]='Received 22 January 2023, accepted 11 February 2023, date of publication 22 February 2023, date of current version 27 February 2023.\nDigital Object Identifier 10.1 109/ACCESS.2023.3246486 TYLER PHILLIPS 1 , (Member, IEEE), LAURENTIU D. MARINOVICI 2 , (Member, IEEE), CRAIG RIEGER 1 , (Senior Member, IEEE), AND ALICE ORRELL 2\n1 Idaho National Laboratory, Idaho Falls, ID 83415, USA\n2 Pacific Northwest National Laboratory, Richland, WA 99354, USA\nCorresponding author: Laurentiu D. Marinovici (Lauren'...


Adding context for chunks of Scalable_Resilience_...:   0%|          | 0/18 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=28567
entire_doc[:500]='Received: 30 March 2017\nAccepted: 7 June 2017\nPublished: xx xx xxxx Tomasz Ryczkowski, Agata Froncz ak & Piotr Fronczak\nIn this paper, we analyse the gravity model in the global passenger air-transport network. We show that in the standard form, the model is inadequate for correctly describing the relationship between passenger flows and typical geo-economic variables that characterize connected countries. We propose a model for transfer flights that allows exploitation of these discrepancies in'...


Adding context for chunks of s41598-017-06108-z.p...:   0%|          | 0/9 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=61118
entire_doc[:500]="1234567890():,; OPEN Alessandro Spelta 1,2 ✉ , Andrea Flori 3 , Nicolò Pecora 4 , Sergey Buldyrev 5 & Fabio Pammolli 2,3\nWe introduce an indicator that aims to detect the emergence of market instabilities by quantifying the intensity of self-organizing processes arising from stock returns ' comovements. In /uniFB01 nancial markets, phenomena like imitation, herding and positive feedbacks characterize the emergence of endogenous instabilities, which can modify the qualitative and quantitative be"...


Adding context for chunks of s41467-020-15356-z.p...:   0%|          | 0/19 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=53588
entire_doc[:500]='Christoph Schweimer ͷ , Bernhard C. Geiger ͷ * , Meizhu Wang ͷ , Sergiy Gogolenko \u0378 , Imran Mahmood \u0379 , Alireza Jahani \u0379 , Diana Suleimenova \u0379 * & Derek Groen \u0379, ͅ\nAutomated construction of location graphs is instrumental but challenging, particularly in logistics optimisation problems and agent-based movement simulations. Hence, we propose an algorithm for automated construction of location graphs, in which vertices correspond to geographic locations of interest and edges to direct travelling r'...


Adding context for chunks of s41598-021-90943-8.p...:   0%|          | 0/13 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hRapidOCR returned empty result!


len(entire_doc)=29571
entire_doc[:500]='r eceived: 01 November 2016 a ccepted: 22 February 2017 P ublished: 30 March 2017 Yuwang Wang 1 , Yang Liu 1 , Jinli Suo 1 , Guohai Situ 2 , Chang Qiao 1 & Qionghai Dai 1\nComputational ghost imaging (CGI) achieves single-pixel imaging by using a Spatial Light Modulator (SLM) to generate structured illuminations for spatially resolved information encoding. The imaging speed of CGI is limited by the modulation frequency of available SLMs, and sets back its practical applications. This paper propos'...


Adding context for chunks of srep45325.pdf...:   0%|          | 0/9 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=52287
entire_doc[:500]='Daniel S. Zachary *\nResources Centre for Environmental Technologies, Public Research Centre Henri Tudor, 29, avenue J.F. Kennedy, Grand Duchy of Luxembourg.\nThis paper develops a framework to determine the sustainability of a general activity. We define an activity as an action or process that uses one or more resources and that responds either wholly or partially to a demand. A definition for sustainability is developed and is contingent on whether or not an activity can be sustained according '...


Adding context for chunks of srep05215.pdf...:   0%|          | 0/14 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=41726
entire_doc[:500]="SUBJECT AREAS: APPLIED PHYSICS SCIENTIFIC DATA STATISTICAL PHYSICS, THERMODYNAMICS AND NONLINEAR DYNAMICS\nReceived\n12 November 2013 Accepted 10 March 2014 Published 27 March 2014\nCorrespondence and requests for materials should be addressed to M.C. (matthieu. cristelli@roma1.infn.it) Federico Garzarelli 1 , Matthieu Cristelli 2 , Gabriele Pompa 3 , Andrea Zaccaria 2 & Luciano Pietronero 1,2,4\n1 ''Sapienza'', Universita ` di Roma, Dip. Fisica, P. le A. Moro 2, 00185, Roma, Italy, 2 Institute of C"...


Adding context for chunks of srep04487.pdf...:   0%|          | 0/11 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25hRapidOCR returned empty result!


len(entire_doc)=28358
entire_doc[:500]='SUBJECT AREAS: COMPLEX NETWORKS APPLIED PHYSICS COMPUTATIONAL SCIENCE\nCOMPUTER SCIENCE\nReceived\n17 September 2013\nAccepted\n5 December 2013 Published 20 December 2013\nCorrespondence and requests for materials should be addressed to M.A. (M.Alanyali@ warwick.ac.uk) Merve Alanyali 1 , Helen Susannah Moat 2 & Tobias Preis 2\n1 Centre for Complexity Science, University of Warwick, Coventry, CV4 7AL, UK, 2 WarwickBusinessSchool, University of Warwick, Coventry, CV4 7AL, UK.\nThecomplexbehaviorof financi'...


Adding context for chunks of srep03578.pdf...:   0%|          | 0/10 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=27472
entire_doc[:500]='SUBJECT AREAS:\nSTATISTICAL PHYSICS, THERMODYNAMICS AND\nNONLINEAR DYNAMICS APPLIED PHYSICS COMPUTATIONAL SCIENCE\nINFORMATION THEORY AND\nCOMPUTATION\nReceived 25 February 2013\nAccepted 3 April 2013\nPublished 25 April 2013\nCorrespondence and requests for materials should be addressed to T.P. (Tobias.Preis@ wbs.ac.uk)\n* These authors contributed equally to this work. Tobias Preis 1 * , Helen Susannah Moat 2,3 * & H. Eugene Stanley 2 *\n1 Warwick Business School, University of Warwick, Scarman Road, Co'...


Adding context for chunks of srep01684.pdf...:   0%|          | 0/9 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=75251
entire_doc[:500]="iyad Rahwan 1,2,3,34 * , Manuel Cebrian 1,34 , Nick Obradovich 1,34 , Josh Bongard 4 , Jean-François Bonnefon 5 , Cynthia Breazeal 1 , Jacob w . Crandall 6 , Nicholas A. Christakis 7,8,9,10 , iain D. Couzin 11,12,13 , Matthew O. Jackson 14,15,16 , Nicholas R. Jennings 17,18 , ece Kamar 19 , isabel M. Kloumann 20 , Hugo Larochelle 21 , David Lazer 22,23,24 , Richard Mcelreath 25,26 , Alan Mislove 27 , David C. Parkes 28,29 , Alex 'Sandy' Pentland 1 , Margaret e. Roberts 30 , Azim Shariff 31 , Jo"...


Adding context for chunks of s41586-019-1138-y.pd...:   0%|          | 0/25 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=50939
entire_doc[:500]='OPEN Yi Li, Zichuan Mi & Wenjun Jing *\nThis study adopts the textual network to describe the coordination among the interplay of words, where nodes represent words and nodes are connected if the corresponding words have co-occurrence pattern across documents. To study stock movements, we further proposed the sparse laplacian shrinkage logistic model (SLS_L) which can properly take into account the network connectivity structure. By using this approach, we investigated the relationship between Sh'...


Adding context for chunks of s41598-020-77823-3.p...:   0%|          | 0/14 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=40826
entire_doc[:500]='Shigeki Mizushima , Naoki Kuramoto , Kenichi Fujii , and Takahide Umeda\nAbstract -For the future realization of the kilogram using the X-ray crystal density (XRCD) method, isotopically enriched silicon crystals grown by the floating zone method are employed. In this paper, we present quantitative electron paramagnetic resonance (EPR) measurements on 28 Si single crystal A VO28 to increase the reliability of mass deficit correction in the XRCD method. We detected phosphorus impurity in the crysta'...


Adding context for chunks of Electron_Paramagneti...:   0%|          | 0/12 [00:00<?, ?it/s]

RapidOCR returned empty result!
RapidOCR returned empty result!


len(entire_doc)=54647
entire_doc[:500]='Raúl Castilla-Arquillo , Graduate Student Member, IEEE , Anthony Mandow , Member, IEEE , Carlos J. Pérez-del-Pulgar , Member, IEEE , César Álvarez-Llamas , José M. Vadillo , and Javier Laserna\nAbstract -Planetary rover missions on Mars have suffered entrapments and serious mobility incidents due to soil assessment limitations of stereo RGB cameras, which cannot characterize relevant physical phenomena such as thermal behavior that depend on granularity and cohesion. In particular, thermal i'...


Adding context for chunks of Thermal_Imagery_for_...:   0%|          | 0/20 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=17769
entire_doc[:500]='George F. Hurlburt, STEMCorp George K. Thiruvathukal, Loyola University Chicago Maria R. Lee, Shih Chien University, Taiwan\nT he notion of graph reasoning is not new. The heretofore curious and obscure branch of mathematics, graph theory, extends to Leonhard Euler in the 18th century. 1 The application of graph theory to all manner of networks is quite new, however. Graph theory has exploded, largely due to the existence of the Internet.\nAn unintended consequence of the Internet was to reveal ho'...


Adding context for chunks of The_Graph_Database_J...:   0%|          | 0/10 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

len(entire_doc)=46332
entire_doc[:500]='Received 29 September 2023, accepted 20 October 2023, date of publication 23 October 2023, date of current version 1 November 2023.\nDigital Object Identifier 10.1 109/ACCESS.2023.3327131 - 1 College of Ecology and Environment, Nanjing Forestry University, Nanjing 210037, China\n- 2 School of Economics and Management, Nanchang Institute of Science and Technology, Nanchang 330108, China\n3 School of Marxism, Hefei Vocational College of Finance and Economics, Hefei 230601, China\n4 College of Humaniti'...


Adding context for chunks of The_Application_of_t...:   0%|          | 0/17 [00:00<?, ?it/s]

RapidOCR returned empty result!
RapidOCR returned empty result!
RapidOCR returned empty result!


len(entire_doc)=33571
entire_doc[:500]="Jiaqi Cui , Member, IEEE , Gang Ming , Fang Wang , Junyao Li , Pengfei Wang , Songbai Kang , Feng Zhao , Da Zhong , and Ganghua Mei\nAbstract -Lamp-pumped rubidium atomic frequency standard (RAFS) is one of the most commonly utilized atomic frequency standards. Over the past few decades, the RAFS's frequency stability performance has improved rapidly, and the best one has been in the 10 -13 τ -1 / 2 level. In this article, we demonstrate an RAFS with stability in the 10 -14 τ -1 / 2 level for the"...


Adding context for chunks of Realization_of_a_Rub...:   0%|          | 0/12 [00:00<?, ?it/s]

[?25l[?2026h[?25l[1G[K[?25h[?2026l[2K[1G[?25h

In [3]:
# Save the the processed chunks in case VectorDB upload goes wrong.
# Luckily since this is a notebook, if the chunking is interrupted, we can still save the partial results here.
# Append new chunks to the existing file if it exists, otherwise create it
if os.path.exists(CHUNKS_WITH_METADATA_FILE_NAME):
    print(f"Appending to existing {CHUNKS_WITH_METADATA_FILE_NAME} file.")
    with open(CHUNKS_WITH_METADATA_FILE_NAME, "r", encoding="utf-8") as f:
        existing_data = json.load(f)
    # Avoid duplicate entries by id
    existing_ids = {chunk['id'] for chunk in existing_data}
    new_chunks = [chunk for chunk in chunks_with_metadata if chunk['id'] not in existing_ids]
    chunks_with_metadata = existing_data + new_chunks

with open(CHUNKS_WITH_METADATA_FILE_NAME, "w", encoding="utf-8") as f:
    json.dump(chunks_with_metadata, f, ensure_ascii=False, indent=2)

print(f"Results saved to {CHUNKS_WITH_METADATA_FILE_NAME}")

Results saved to preprocessed_chunks/anthropic_control_chunks_with_metadata.json


# Creating Database

In [4]:
registry = get_registry()
hf = registry.get("huggingface").create(name=EMBEDDING_MODEL_NAME, trust_remote_code=True, device="cuda" if torch.cuda.is_available() else "cpu")


# Define model
class MyDocument(LanceModel):
    text: str = hf.SourceField()
    vector: Vector(hf.ndims()) = hf.VectorField()
    original_text: str
    context: str
    document: str
    pages: list[int]  # Any additional metadata
    id: str  # Unique identifier for the chunk


db = lancedb.connect("./db")
db.create_table(TABLE_NAME, schema=MyDocument, mode="overwrite") # Uncomment this line when running this cell for the first time
table = db.open_table(TABLE_NAME)

# Upload in batches with progress bar
with open(CHUNKS_WITH_METADATA_FILE_NAME, "r", encoding="utf-8") as f:
    chunks_with_metadata = json.load(f)

batch_size = 100
for i in tqdm(range(0, len(chunks_with_metadata), batch_size), desc="Uploading chunks to VectorDB"):
    batch = chunks_with_metadata[i:i+batch_size]
    table.add(batch)

table.create_scalar_index("id", replace=True) # Index based on the chunk's id, used to manually prevent duplicates

reranker = ColbertReranker()
table.create_fts_index("text", replace=True) # Used by the reranker as well as the hybrid search's BM25 index
table.wait_for_index(["text_idx"])  # Wait for the indexing to finish

<All keys matched successfully>
[90m[[0m2026-01-16T20:51:59Z [33mWARN [0m lance::dataset::write::insert[90m][0m No existing dataset at /home/martin/projects/Quantwise/Quantwise-Chunking/db/anthropic_control_table.lance, it will be created


Uploading chunks to VectorDB:   0%|          | 0/4 [00:00<?, ?it/s]

<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>
<All keys matched successfully>


Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


# Example query

In [5]:
prompt = "How was stock market data gathered?"
results = table.search(prompt, query_type="hybrid", vector_column_name="vector", fts_columns="text") \
            .rerank(reranker=reranker) \
            .limit(5) \
            .to_pandas()


results

<All keys matched successfully>


Unnamed: 0,text,vector,original_text,context,document,pages,id,_relevance_score
0,Details the data collection process for the st...,"[0.70299774, 1.1682792, -3.7868931, -0.2385918...",We collected stock market-related information ...,Details the data collection process for the st...,Stock_Market_Prediction_via_Multi-Source_Multi...,[6],4cf733a743ce1b6eb4e3c41e23b999ed51cd3d280449ef...,1.036964
1,Introduces the methodology of analyzing Google...,"[0.17444716, 1.9274594, -3.5291111, -0.1920702...","Tobias Preis 1 * , Helen Susannah Moat 2,3 * &...",Introduces the methodology of analyzing Google...,srep01684.pdf,[1],03bb481c7bef2cc1f008113ea1e3259fdf160b601829cc...,1.025862
2,Quantifies the relationship between search vol...,"[0.46575984, 2.3796456, -3.5407345, 0.02398393...","In summary, our results are consistent with th...",Quantifies the relationship between search vol...,srep01684.pdf,[5],c80c4fb4f9449b543440f05257d57a5bee14a10d6f666e...,0.961116
3,"This section details the experimental design, ...","[0.5125915, 1.3523785, -2.9573245, -0.60483754...",Experimental design. Our paper relates to rese...,"This section details the experimental design, ...",s41598-020-77823-3.pdf,"[4, 5]",4eecb9240c936f76259c30feaf4292800c84483b696ec2...,0.949171
4,Quantifies the relationship between changes in...,"[0.46696505, 1.8472215, -2.931459, -0.16067062...",We analyze the performance of a set of 98 sear...,Quantifies the relationship between changes in...,srep01684.pdf,"[1, 2, 3, 4, 5]",727b53bf64c56e682acadb7029b3e2c15e58d4cef4f894...,0.904368


In [6]:
results.iloc[0,0]

'Details the data collection process for the study, outlining the sources and types of data used (quantitative, news, social media) and the timeframe for data gathering.\n\nWe collected stock market-related information from Jan. 1, 2015 to Dec. 31, 2016, and separate the information into two data sets, one for the year 2015 and the other for 2016. The data consist of three parts, the historical quantitative data, the news articles and the posts on the social network, which are introduced in detail as follows.\n- GLYPH<15> Quantitative data : the source of quantitative data is Wind, 2 a widely used GLYPH<28>nancial information service provider in China. The data we collect are the average prices, market index change and turnover rate of the Shanghai Composite Index in each trading day.\n- GLYPH<15> News data : we collect the news articles on the macro economy through Wind, and get 38,727 and 39,465 news articles in 2015 and 2016 respectively. The news articles are aggregated by Wind fro

In [7]:
table.stats()

{'total_bytes': 3488984,
 'num_rows': 377,
 'num_indices': 2,
 'fragment_stats': {'num_fragments': 4,
  'num_small_fragments': 4,
  'lengths': {'min': 77,
   'max': 100,
   'mean': 94,
   'p25': 100,
   'p50': 100,
   'p75': 100,
   'p99': 100}}}