In [10]:
from typing import List
import time 
from langchain_community.document_loaders import PyPDFLoader
def load_pdf(files: List[str]) -> List[dict]:
    """ Load given set of pdfs and returns List of dictionaries with pdf content """
    pages = []
    for file in files:
        loader = PyPDFLoader(file)
        for page in loader.lazy_load():
            pages.append(page)
    return pages

files = ['data/kodekskarny2.pdf']
ingested_data = load_pdf(files)


In [11]:
def chunk_data(data: List[dict]) -> List[dict]:
    """ This function chunks page content to text, article_number and paragraph_number """
    import re
    chunks = []
    pattern = r'(Art\.\s+\d+.*?)\s*(?=Art\.\s+\d+|$)'
    pattern_for_paragraphs = r'(?=§\s*\d+\.)'
    for page in ingested_data:
        try:
            text = page.page_content.replace('\xa0', '').replace('\n', '')
            splitted_per_article = re.findall(pattern, text, re.DOTALL)
            for article in splitted_per_article:
                article_number = article.split('[')[0]
                if '§' in article:
                    splitted_per_paragraph = re.split(pattern_for_paragraphs, article)
                    paragraphs = splitted_per_paragraph[1:] # Remove first element of list with article name
                    for paragraph in paragraphs:
                        paragraph_number = paragraph.split('§')[1].split('.')[0].strip()
                        index_of_paragraph_number = paragraph.index(paragraph_number)
                        text = paragraph[index_of_paragraph_number+1:]
                        chunks.append({'article_number': article_number, 'paragraph_number': paragraph_number, 'text': text})
                else:
                    chunks.append({'article_number': article_number, 'paragraph_number': 0, 'text': article.split(']')[1]})
        except Exception as e:
            print(e)
            pass
            
    return chunks 
            

In [12]:
chunks = chunk_data(ingested_data)

list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range
list index out of range


In [22]:
from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType, utility
from fastembed import TextEmbedding


def create_milvus_collection(host: str = 'localhost', port: str = '19530', collection_name: str = 'kodekskarny_embedd') -> Collection:
    """ This function connect to Milvus DB and create collection, if collection already exists it connects to it """
    connections.connect(alias="default", uri=f"http://{host}:{port}")
    
    if collection_name in utility.list_collections():
        collection = Collection(name=collection_name)
    else:
        pk_field = FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True)
        vector_field = FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=3072) 
        text_field = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=4096)
        article_field = FieldSchema(name="article", dtype=DataType.VARCHAR, max_length=512)
        paragraph_field = FieldSchema(name="paragraph", dtype=DataType.VARCHAR, max_length=512)
    
        schema = CollectionSchema(fields=[pk_field, vector_field, text_field, article_field, paragraph_field], description="Collection for document embeddings")
    
        collection = Collection(name=collection_name, schema=schema)
    

        index_params = {"metric_type": "L2", "index_type": "IVF_FLAT", "params": {"nlist": 128}}
        collection.create_index(field_name="vector", index_params=index_params)

    collection.load()
    
    print(f"Collection '{collection_name}' is ready for insertion!")
    return collection

In [5]:
collection = create_milvus_collection(port='19530')

Collection 'kodekskarny_embedd' is ready for insertion!


In [17]:
def insert_chunks_to_milvus(chunks: List[dict], collection: Collection) -> None:
    """Embeds and inserts chunked data into Milvus."""
    results = collection.query(expr="", output_fields=["count(*)"])
    number_of_rows = int(results[0]['count(*)'])
    if number_of_rows > 0:
        print("Chunks already ingested")
        return
    embedding_model = TextEmbedding()
    print("The model BAAI/bge-small-en-v1.5 is ready to use.")
    
    texts = [chunk["text"] for chunk in chunks]
    articles = [str(chunk["article_number"]) for chunk in chunks]
    paragraphs = [str(chunk["paragraph_number"]) for chunk in chunks]
    
    embeddings_generator = embedding_model.embed(texts)
    embeddings_list = list(embeddings_generator)
    
    entities = [
        {"vector": embedding.tolist(), "text": text, "article": article, "paragraph": paragraph}
        for embedding, text, article, paragraph in zip(embeddings_list, texts, articles, paragraphs)
    ]

    if entities:
        collection.insert(entities)
        print(f"Inserted {len(entities)} records to db'.")
    else:
        print("No records to insert.")

In [19]:
insert_chunks_to_milvus(chunks, collection)

Chunks already ingested


In [29]:
def query_vdb(query: str, collection: Collection) -> None:
    """ Query milvus vector database for similarity search for query """
    
    embedding_model = TextEmbedding()
    query_embedding = list(embedding_model.embed([query]))[0]
    
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    results = collection.search(
        data=[query_embedding],
        anns_field="vector",
        param=search_params,
        limit=5,
        output_fields=["text", "article", "paragraph"]  
    )
    
    for result in results[0]:
        print(f"Score: {result.score}")
        print(f"Text: {result.entity.get('text')}")
        print(f"Article: {result.entity.get('article')}")
        print(f"Paragraph: {result.entity.get('paragraph')}\n")

In [2]:
from openai import AzureOpenAI
import os
client = AzureOpenAI(
  api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version = "2024-02-01",
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
)

def generate_embeddings(text, model="text-embedding-3-large"): # model = "deployment_name"
    return client.embeddings.create(input = [text], model=model).data[0].embedding

print(generate_embeddings('byl sobie pies i jezdzil koleja'))

[0.0002517935063224286, -0.01283576525747776, 0.012894435785710812, -0.01740553230047226, 0.010827936232089996, -0.007842267863452435, -0.012907473370432854, -0.0005867033614777029, -0.00776404095813632, -0.0013021554332226515, -0.019439438357949257, 0.011512423865497112, -0.020612843334674835, 0.0046805888414382935, -0.03202096372842789, 0.01718388870358467, -0.016336428001523018, -0.014550242573022842, -0.0027689137496054173, -0.03350728005170822, 0.06013057380914688, -0.06372901797294617, 0.003223608946427703, -0.020899677649140358, -0.014980492182075977, 7.456021558027714e-05, -0.0068318345583975315, 0.00559975765645504, -0.016310352832078934, 0.011479828506708145, -0.01994791440665722, -0.0008833144675008953, 0.0023761484771966934, -0.0091786477714777, 0.041460368782281876, -0.013468101620674133, 0.05919184908270836, -0.029100485146045685, -0.023076998069882393, 0.01509783323854208, 0.027796700596809387, 0.020091330632567406, -0.003624522825703025, -0.026649368926882744, 0.0045567

In [23]:
collection = create_milvus_collection(port='19530')
utility.drop_collection('kodekskarny_embedd')
collection = create_milvus_collection(port='19530')

Collection 'kodekskarny_embedd' is ready for insertion!
Collection 'kodekskarny_embedd' is ready for insertion!


In [19]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")
total_tokens = 0
for data in chunks:
    if len(tokenizer.encode(data['text'])) > 8000:
        print(len(tokenizer.encode(data['text'])))
    total_tokens += len(tokenizer.encode(data['text']))
print(total_tokens)

68285


In [24]:
def insert_chunks_to_milvus(chunks: List[dict], collection: Collection) -> None:
    """Embeds and inserts chunked data into Milvus."""
    results = collection.query(expr="", output_fields=["count(*)"])
    number_of_rows = int(results[0]['count(*)'])
    if number_of_rows > 0:
        print("Chunks already ingested")
        return
        
    texts = [chunk["text"] for chunk in chunks]
    articles = [str(chunk["article_number"]) for chunk in chunks]
    paragraphs = [str(chunk["paragraph_number"]) for chunk in chunks]
    
    embeddings_list = []
    for chunk in chunks:
        if chunk['text'].strip():
            embeddings_list.append(generate_embeddings(chunk['text'].strip()))
    
    entities = [
        {"vector": embedding, "text": text, "article": article, "paragraph": paragraph}
        for embedding, text, article, paragraph in zip(embeddings_list, texts, articles, paragraphs)
    ]

    if entities:
        collection.insert(entities)
        print(f"Inserted {len(entities)} records to db'.")
    else:
        print("No records to insert.")

In [25]:
insert_chunks_to_milvus(chunks, collection)

Inserted 834 records to db'.


In [44]:
def query_vdb(query: str, collection: Collection) -> None:
    """ Query milvus vector database for similarity search for query """
    
    query_embedding = generate_embeddings(query)
    
    search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
    results = collection.search(
        data=[query_embedding],
        anns_field="vector",
        param=search_params,
        limit=3,
        output_fields=["text", "article", "paragraph"]  
    )

    context = []
    
    for result in results[0]:
        context.append(f"Score: {result.score} Text: {result.entity.get('text')} Article: {result.entity.get('article')} Paragraph: {result.entity.get('paragraph')}\n")
        
    return context

In [45]:
print(query_vdb("broniłem się i zabiłem człowieka", collection))

['Score: 1.0456018447875977 Text: Odpowiada jak za podżeganie, kto w celu skierowania przeciwko innej osobie postępowania karnego nakłania ją dopopełnienia czynu zabronionego; w tym wypadku nie stosuje się art. 22 i 23.Rozdział IIIWyłączenie odpowiedzialności karnej Article: Art. 24.  Paragraph: 0\n', 'Score: 1.0620143413543701 Text: .Kto udziela albo obiecuje udzielić korzyści majątkowej lub osobistej w zamian za pośrednictwo w załatwieniusprawy w podmiocie wskazanym w art. 230 §  1, polegające na bezprawnym wywarciu wpływu na decyzję, działanielub zaniechanie osoby pełniącej funkcję publiczną, w związku z pełnieniem tej funkcji,podlega karze pozbawienia wolności od 6 miesięcy do lat 8. Article: Art. 230a.  Paragraph: 1\n', 'Score: 1.1137981414794922 Text: .W razie przekroczenia granic obrony koniecznej, w szczególności gdy sprawca zastosował sposób obronyniewspółmierny do niebezpieczeństwa zamachu, sąd może zastosować nadzwyczajne złagodzenie kary, a nawetodstąpić od jej wymierzenia.

In [38]:
def chat_completion(prompt: str, model: str = 'gpt-4o-mini') -> str:
    """ It takes user's prompt and return answer from given model from OpenAI """
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages = [ 
            {
            "role": "assistant",
            "content": "Jestem przyjaznym botem, ktory odpowiada na pytania prawnicze korzystajac z dostarczonego kontekstu",
            },
            {
            "role": "user",
            "content": prompt,
            },
            
        ],
        temperature=0.3
    )
    content = completion.choices[0].message.content
    return content

In [50]:
def rag_template(query: str, collection: Collection) -> str:
    context_array = query_vdb(query, collection)
    context = ' '.join(context_array)
    template = f""" 
    Odpowiedz na podane pytanie korzystajac tylko i wylacznie z podanego kontekstu, wybierz z niego najbardziej pasujacy, do odpowiedzi podaj nr artykulu i paragrafu skad czerpales wiedze.
    Kontekst: {context}
    Pytanie: {query}
    """
    return template


In [60]:
print(chat_completion(rag_template("Pobiłem sąsiada bo się pruł", collection)))

Na podstawie podanego kontekstu, nie znalazłem bezpośredniej informacji dotyczącej pobicia sąsiada. Jednakże, jeśli chodzi o odpowiedzialność za przestępstwo, można odwołać się do artykułu dotyczącego ścigania przestępstw, gdy pokrzywdzonym jest osoba najbliższa, co może mieć związek z sytuacją. 

Odpowiedź: Art. 157, Paragraf 5.
