# Chunking

In [1]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
import hashlib
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
from lancedb.rerankers import ColbertReranker
import ollama


embedding_model_name = "nomic-ai/nomic-embed-text-v1.5"
MAX_TOKENS = 2000

source = "Stock_Market_Prediction_via_Multi-Source_Multiple_Instance_Learning.pdf"  # document per local path or URL
converter = DocumentConverter()
doc = converter.convert(source).document

tokenizer = HuggingFaceTokenizer(
    tokenizer=AutoTokenizer.from_pretrained(embedding_model_name),
    max_tokens=MAX_TOKENS # Optional, uses the max token number of the HF tokenizer by default
)


chunker = HybridChunker(
    tokenizer=tokenizer,
    merge_peers=True #Optional, defaults to true
)
chunks = list(chunker.chunk(dl_doc=doc))


  from .autonotebook import tqdm as notebook_tqdm


# Adding Metadata

As well as semantic context with ollama (Anthropic style)

In [2]:
chunks_with_metadata = []
entire_doc = ""
for chunk in chunks:
    entire_doc += " " +chunk.text

entire_doc = "FULL DOCUMENT:\n" + entire_doc

for chunk in chunks:    
    
    ollama_prompt = f"CHUNK:\n{chunk.text}"
    history =  [{'role': 'user', 'content': entire_doc}, {'role': 'user', 'content': ollama_prompt}]

    response = ollama.chat(
        model="chunker_full_doc",
        messages=history
    )
    context = response['message']['content']
    print(f"Context for chunk: {context}")
    text_to_embed = chunk.text + "\n\n" + context # We put the context AFTER the chunk to not mess up cosine similarity but still

    pages = set(
            prov.page_no
            for doc_item in chunk.meta.doc_items
            for prov in doc_item.prov
        )
    id = hashlib.sha256(chunk.text.encode()).hexdigest()
    chunks_with_metadata.append({'text': text_to_embed, 'original_text':chunk.text, 'context':context, 'document':source, 'pages':list(pages), 'id': id})

Context for chunk: Provides contextual information about the publication of the research paper, including its dates and DOI.
Context for chunk: Introduces the core problem and proposed solution: a multi-source multiple instance model for predicting stock market movements by integrating events, sentiments, and quantitative data, with a focus on interpretable predictions.
Context for chunk: Introduces the core problem: predicting stock market movements and highlights the challenge of integrating diverse data sources (quantitative data and qualitative descriptions like news and social media). It also establishes the key approach: utilizing a multi-source multiple instance learning (M-MI) model to fuse event representations and sentiments for improved predictions.
Context for chunk: Summarizes the reliance on single data sources in existing event-driven stock prediction models and highlights the need for integrating multiple sources to overcome this limitation.
Context for chunk: Highlight

# Creating Database

In [3]:
import torch
registry = get_registry()
hf = registry.get("huggingface").create(name=embedding_model_name, trust_remote_code=True, device="cuda" if torch.cuda.is_available() else "cpu") #TODO: Test if there's a point to running this on GPU. LanceDB seems indifferent to the device.


# Define model
class MyDocument(LanceModel):
    text: str = hf.SourceField()
    vector: Vector(hf.ndims()) = hf.VectorField()
    original_text: str
    context: str
    document: str
    pages: list[int]  # Any additional metadata
    id: str  # Unique identifier for the chunk




db = lancedb.connect("./db")
db.create_table("my_table", schema=MyDocument, mode="overwrite") # Uncommend this line when running this cell for the first time
table = db.open_table("my_table")
table.add(chunks_with_metadata) # LanceDB doesn't check for duplicates by default
table.create_scalar_index("id", replace=True) # Index based on the chunk's id, used to manually prevent duplicates

reranker = ColbertReranker()
table.create_fts_index("text", replace=True) # Used by the reranker as well as the hybrid search's BM25 index
table.wait_for_index(["text_idx"])  # Wait for the indexing to finish

<All keys matched successfully>
<All keys matched successfully>


Loading ColBERTRanker model colbert-ir/colbertv2.0 (this message can be suppressed by setting verbose=0)
No device set
Using device cuda
No dtype set
Using dtype torch.float32
Loading model colbert-ir/colbertv2.0, this might take a while...
Linear Dim set to: 128 for downcasting


In [4]:
table.add(chunks_with_metadata) # LanceDB doesn't check for duplicates by default


<All keys matched successfully>


AddResult(version=31)

In [5]:
len(chunks_with_metadata) # Should be the same number of chunks as in the original document

18

# Example query

In [6]:
prompt = "How was the stock-market related information collected?"
results = table.search(prompt, query_type="hybrid", vector_column_name="vector", fts_columns="text") \
            .rerank(reranker=reranker) \
            .limit(3) \
            .to_pandas()


results

<All keys matched successfully>


Unnamed: 0,text,vector,original_text,context,document,pages,id,_relevance_score
0,We collected stock market-related information ...,"[0.5996058, 1.1112232, -3.3586943, -0.36808917...",We collected stock market-related information ...,"Details the data collection process, specifica...",Stock_Market_Prediction_via_Multi-Source_Multi...,[6],08aa975b95d5c9c782b71fcb335e2ab61751bb4c4f6bb1...,0.878606
1,We collected stock market-related information ...,"[0.5996058, 1.1112232, -3.3586943, -0.36808917...",We collected stock market-related information ...,"Details the data collection process, specifica...",Stock_Market_Prediction_via_Multi-Source_Multi...,[6],08aa975b95d5c9c782b71fcb335e2ab61751bb4c4f6bb1...,0.878606
2,Stock markets play important roles in the econ...,"[0.39195225, 1.1913257, -3.0367725, -0.3536288...",Stock markets play important roles in the econ...,Introduces the core problem: predicting stock ...,Stock_Market_Prediction_via_Multi-Source_Multi...,"[1, 2]",7ad78330122f8822c806a60689a342ee4a4dc34abf7a77...,0.586552


In [7]:
tokenizer.count_tokens(entire_doc)  # Check how many tokens the prompt has, should be less than MAX_TOKENS

Token indices sequence length is longer than the specified maximum sequence length for this model (10860 > 8192). Running this sequence through the model will result in indexing errors


10860