In [39]:
from typing import List
import time 
from langchain_community.document_loaders import PyPDFLoader
def load_pdf(files: List[str]) -> List[dict]:
    """ Load given set of pdfs and returns List of dictionaries with pdf content """
    pages = []
    for file in files:
        loader = PyPDFLoader(file)
        for page in loader.lazy_load():
            pages.append(page)
    return pages

files = ['data/kodekskarny2.pdf']
ingested_data = load_pdf(files)


In [88]:
def chunk_data(data: List[dict]) -> List[dict]:
    """ This function chunks page content to text, article_number and paragraph_number """
    import re
    chunks = []
    pattern = r'(Art\.\s+\d+.*?)\s*(?=Art\.\s+\d+|$)'
    pattern_for_paragraphs = r'(?=§\s*\d+\.)'
    for page in ingested_data:
        try:
            text = page.page_content.replace('\xa0', '').replace('\n', '')
            splitted_per_article = re.findall(pattern, text, re.DOTALL)
            for article in splitted_per_article:
                article_number = article.split('[')[0]
                if '§' in article:
                    splitted_per_paragraph = re.split(pattern_for_paragraphs, article)
                    paragraphs = splitted_per_paragraph[1:] # Remove first element of list with article name
                    for paragraph in paragraphs:
                        paragraph_number = paragraph.split('§')[1].split('.')[0].strip()
                        index_of_paragraph_number = paragraph.index(paragraph_number)
                        text = paragraph[index_of_paragraph_number+1:]
                        chunks.append({'article_number': article_number, 'paragraph_number': paragraph_number, 'text': text})
                else:
                    chunks.append({'article_number': article_number, 'paragraph_number': None, 'text': article.split(']')[1]})
        except Exception as e:
            print(e)
            pass
            
    return chunks 
            

In [89]:
chunks = chunk_data(ingested_data)
print(chunks[846])

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
{'article_number': 'Art. 363. ', 'paragraph_number': '2', 'text': '.Ściganie następuje na wniosek dowódcy jednostki.1 Niniejsza ustawa w zakresie swojej regulacji realizuje postanowienia:1) dyrektywy Parlamentu Europejskiego i Rady (UE) 2018/1673 z dnia 23 października 2018 r. w sprawie zwalczania praniapieniędzy za pomocą środków prawnokarnych (Dz. Urz. UE L 284 z 12.11.2018, str. 22);2) dyrektywy Parlamentu Europejskiego i Rady (UE) 2019/713 z dnia 17 kwietnia 2019 r. w sprawie zwalczania fałszowania ioszustw związanych z bezgotówkowymi środkami płatniczymi, zastępującej decyzję ramo

In [91]:
!pip install fastembed
!pip install pymilvus

Collecting pymilvus
  Using cached pymilvus-2.5.4-py3-none-any.whl.metadata (5.7 kB)
Collecting grpcio<=1.67.1,>=1.49.1 (from pymilvus)
  Using cached grpcio-1.67.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Using cached ujson-5.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting pandas>=1.2.4 (from pymilvus)
  Using cached pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Using cached milvus_lite-2.4.11-py3-none-manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting pytz>=2020.1 (from pandas>=1.2.4->pymilvus)
  Using cached pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas>=1.2.4->pymilvus)
  Using cached tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pymilvus-2.5.4-py3-none-any.whl (222 kB)
Using cached grpcio-1.67.1-cp312-cp31

In [105]:
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
from fastembed import TextEmbedding


def create_milvus_collection(host: str = 'localhost', port: str = '19530', collection_name: str = 'kodekskarny_embedd') -> Collection:
    """ This function connect to Milvus DB and create collection, if collection already exists it connects to it """
    connections.connect(alias="default", uri=f"http://{host}:{port}")
    
    if collection_name in utility.list_collections():
        collection = Collection(name=collection_name)
    else:
        pk_field = FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True)
        vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=384) 
        text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=4096)
        article_field = FieldSchema(name="article", dtype=DataType.VARCHAR, max_length=512)
        paragraph_field = FieldSchema(name="paragraph", dtype=DataType.VARCHAR, max_length=512)
    
        schema = CollectionSchema(fields=[pk_field, vector_field, text_field, article_field, paragraph_field], description="Collection for document embeddings")
    
        collection = Collection(name=collection_name, schema=schema)
    

        index_params = {"metric_type": "L2", "index_type": "IVF_FLAT", "params": {"nlist": 128}}
        collection.create_index(field_name="vector", index_params=index_params)

    collection.load()
    
    print(f"Collection '{collection_name}' is ready for insertion!")
    return collection

In [104]:
collection = create_milvus_collection(port='19530')

Collection 'kodekskarny_embedd' is ready for insertion!


In [2]:
def insert_chunks_to_milvus(chunks: List[dict], collection: Collection):
    """Embeds and inserts chunked data into Milvus."""
    
    embedding_model = TextEmbedding()
    print("The model BAAI/bge-small-en-v1.5 is ready to use.")
    
    texts = [chunk["text"] for chunk in chunks]
    articles = [chunk["article_number"] for chunk in chunks]
    paragraphs = [chunk["paragraph_number"] for chunk in chunks]
    
    embeddings_generator = embedding_model.embed(texts)
    embeddings_list = list(embeddings_generator)
    
    entities = [
        {"vector": embedding.tolist(), "text": text, "articles": articles, "paragraphs": paragraphs}
        for embedding, text, articles, paragraphs in zip(embeddings_list, texts, articles, paragraphs)
    ]

    if entities:
        collection.insert(entities)
        print(f"Inserted {len(entities)} records into '{COLLECTION_NAME}'.")
    else:
        print("No records to insert.")

NameError: name 'List' is not defined

In [1]:
insert_chunks_to_milvus(chunks, collection)

NameError: name 'insert_chunks_to_milvus' is not defined