In [1]:
from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
from docling.document_converter import DocumentConverter
from docling.chunking import HybridChunker
from transformers import AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import torch
# Embedding model whose Hugging Face tokenizer is used for chunking, and the
# token limit handed to that tokenizer wrapper.
embedding_model_name = "nomic-ai/nomic-embed-text-v1.5"
MAX_TOKENS = 2000

# Document to ingest: a local path or a URL.
source = "Stock_Market_Prediction_via_Multi-Source_Multiple_Instance_Learning.pdf"

# Convert the source and keep the resulting document object.
converter = DocumentConverter()
conversion_result = converter.convert(source)
doc = conversion_result.document

# Wrap the Hugging Face tokenizer for the chunker. `max_tokens` is optional;
# when omitted, the max token number of the HF tokenizer is used.
hf_tokenizer = AutoTokenizer.from_pretrained(embedding_model_name)
tokenizer = HuggingFaceTokenizer(tokenizer=hf_tokenizer, max_tokens=MAX_TOKENS)

# `merge_peers` is optional and defaults to True; it is spelled out explicitly.
chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True)

# Materialize the chunker's output into a list.
chunks = [*chunker.chunk(dl_doc=doc)]


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# model = SentenceTransformer(embedding_model_name, trust_remote_code=True)
# device = "cuda" if torch.cuda.is_available() else "cpu"
# model.to(device)

In [3]:
# Pair each chunk's text with its provenance: the source document and the page
# numbers found on the provenance entries of the chunk's document items.
chunks_with_metadata = [
    {
        'text': chunk.text,
        'document': source,
        # De-duplicate via a set, then sort: iterating a set does not guarantee
        # ascending order, so a plain `list(set)` could store e.g. [8, 1].
        'pages': sorted({
            prov.page_no
            for doc_item in chunk.meta.doc_items
            for prov in doc_item.prov
        }),
    }
    for chunk in chunks
]

In [None]:
import lancedb
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector
import torch

# Fetch the "huggingface" entry from LanceDB's embedding registry and create
# an embedding function for the nomic model.
# TODO: Test if there's a point to running this on GPU. LanceDB seems
# indifferent to the device. To try it, additionally pass:
#     device="cuda" if torch.cuda.is_available() else "cpu"
registry = get_registry()
huggingface_embeddings = registry.get("huggingface")
hf = huggingface_embeddings.create(
    name="nomic-ai/nomic-embed-text-v1.5",
    trust_remote_code=True,
)


# Define model
class MyDocument(LanceModel):
    """Row schema for the LanceDB table holding document chunks.

    Each row carries the chunk text, its embedding vector, and metadata
    describing where the chunk came from.
    """

    # Chunk text, declared through the embedding function's SourceField().
    # NOTE(review): presumably this is the column LanceDB embeds — confirm.
    text: str = hf.SourceField()
    # Embedding column; its dimensionality is taken from hf.ndims().
    vector: Vector(hf.ndims()) = hf.VectorField()
    # Source document identifier stored alongside the chunk.
    document: str
    pages: list[int]  # Any additional metadata


db = lancedb.connect("./db")

# Create the table on the first run and reuse it on later runs, so this cell
# works on a fresh database without commenting lines in or out by hand.
# `exist_ok=True` opens an already existing "my_table" instead of raising.
table = db.create_table("my_table", schema=MyDocument, exist_ok=True)

# Ingest only into an empty table so re-running the cell does not duplicate
# rows. This needs `chunks_with_metadata` only when the table is empty.
if table.count_rows() == 0:
    table.add(chunks_with_metadata)

  from .autonotebook import tqdm as notebook_tqdm
<All keys matched successfully>


In [2]:
# Run the question against the table, keep the first three hits and only the
# text/provenance columns, and return them as a pandas DataFrame.
query = table.search("How was the stock-market related information collected?")
results = (
    query
    .limit(3)
    .select(["text", "document", "pages"])
    .to_pandas()
)

# results

<All keys matched successfully>


In [3]:
# Print the text of every retrieved row, each followed by a dashed divider.
for result in results.itertuples():
    print(f"Text: {result.text}", "-" * 25, sep="\n")

Text: We collected stock market-related information from Jan. 1, 2015 to Dec. 31, 2016, and separate the information into two data sets, one for the year 2015 and the other for 2016. The data consist of three parts, the historical quantitative data, the news articles and the posts on the social network, which are introduced in detail as follows.
- GLYPH<15> Quantitative data : the source of quantitative data is Wind, 2 a widely used GLYPH<28>nancial information service provider in China. The data we collect are the average prices, market index change and turnover rate of the Shanghai Composite Index in each trading day.
- GLYPH<15> News data : we collect the news articles on the macro economy through Wind, and get 38,727 and 39,465 news articles in 2015 and 2016 respectively. The news articles are aggregated by Wind from major GLYPH<28>nancial news websites in China, such as http://GLYPH<28>nance.sina.com.cn and http://www.hexun.com. We process the news titles rather than the whole art

In [4]:
# question = model.encode("What was the frequency of each label in the DocLayNet dataset?", normalize_embeddings=True)
# similarities = []
# for chunk in contextualized_chunks:
#     chunk_embedding = model.encode(chunk, normalize_embeddings=True)
#     similarity = cosine_similarity([question], [chunk_embedding])[0][0]
#     similarities.append({'similarity':similarity, 'chunk': chunk})

# similarities = sorted(similarities, key=lambda x: (-x['similarity']))
# similarities[:3]