In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.assistant.rag_helpers import similarity_search_for_tenant
from src.assistant.vector_db import get_embedding_model

In [2]:
# Sample question used to exercise the tenant-scoped similarity search.
test_query = "how to release residues from regulatory control"

# Run the search for tenant "default" with cosine similarity, using the
# embedding model returned by get_embedding_model() and the persist
# directory ../database (relative to the notebook's working directory).
# NOTE(review): the effect of `normal=True` is defined in rag_helpers and is
# not visible from this notebook — confirm before changing it.
results = similarity_search_for_tenant(
    tenant_id="default",
    embed_llm=get_embedding_model(),
    persist_directory="../database",
    similarity="cosine",
    normal=True,
    query=test_query
)

# Bare last expression so the notebook displays the returned documents.
results

  return HuggingFaceEmbeddings(model_kwargs={'device': 'cpu'})
  return HuggingFaceEmbeddings(model_kwargs={'device': 'cpu'})
  from .autonotebook import tqdm as notebook_tqdm
  vectorstore = Chroma(


[Document(metadata={'id': 'strlschg_en_bf.pdf:0:102', 'page': 0, 'source': 'strlschg_en_bf.pdf'}, page_content='methods of recovery and disposal laid down by a statutory ordinance in accordance with the second sentence are complied with in their disposal or recovery. the federal government is herewith empowered to issue a statutory ordinance, with the consent of the bundesrat, es\xad tablishing the monitoring limits and methods of recovery and disposal to apply to residues. 3 residues accumulated may not be mixed or diluted prior to their intended disposal or recovery in order to satisfy the monitoring limits in accordance with subsection 2. the first sentence shall also apply to residues accumulated abroad and brought into germany for recovery. 4 if the residues requiring monitoring are stored on the operating premises of the par\xad ty subject to an obligation in accordance with subsection 1, the latter must declare said storage to the competent authority. the competent authority sha

In [3]:
def transform_documents(documents):
    """
    Convert retrieved Document objects into plain content/metadata dictionaries.

    Args:
        documents (iterable): Objects exposing a ``page_content`` attribute and
            a ``metadata`` mapping that contains the keys 'id' and 'source'.

    Returns:
        list: One dictionary per input document, in input order, shaped as
            {"content": <page_content>,
             "metadata": {"name": <metadata['id']>, "path": <metadata['source']>}}

    Raises:
        KeyError: If a document's metadata has no 'id' or no 'source' entry.
    """
    # Build each record in place; other metadata keys (e.g. 'page') are dropped.
    return [
        {
            "content": item.page_content,
            "metadata": {
                "name": item.metadata['id'],
                "path": item.metadata['source'],
            },
        }
        for item in documents
    ]

In [4]:
# Convert the retrieved Document objects into content/metadata dictionaries;
# `tr` is what the summarizer cells further down take as input.
tr=transform_documents(results)
tr

[{'content': 'methods of recovery and disposal laid down by a statutory ordinance in accordance with the second sentence are complied with in their disposal or recovery. the federal government is herewith empowered to issue a statutory ordinance, with the consent of the bundesrat, es\xad tablishing the monitoring limits and methods of recovery and disposal to apply to residues. 3 residues accumulated may not be mixed or diluted prior to their intended disposal or recovery in order to satisfy the monitoring limits in accordance with subsection 2. the first sentence shall also apply to residues accumulated abroad and brought into germany for recovery. 4 if the residues requiring monitoring are stored on the operating premises of the par\xad ty subject to an obligation in accordance with subsection 1, the latter must declare said storage to the competent authority. the competent authority shall be informed without undue delay if such storage ceases. 5 for those residues that are not subje

In [5]:
import ollama
import re
from IPython.display import Markdown, display

In [6]:
def source_summarizer(context_documents, llm_model="deepseek-r1"):
    """
    Summarize a set of retrieved documents with an Ollama chat model.

    Args:
        context_documents (iterable): Dictionaries of the form
            {"content": str, "metadata": {"name": ..., "path": ...}}.
            Any iterable (including a generator) is accepted; it is
            materialized once before use.
        llm_model (str): Model name passed to ``ollama.chat``.

    Returns:
        dict: {"content": <summary text>,
               "metadata": {"name": [...], "path": [...]}}
            where the two lists hold one entry per input document, in input
            order. For empty input the model is not called and the result is
            an empty summary with empty lists.

    Raises:
        KeyError: If a document lacks "content" or the metadata keys
            "name" / "path".
    """
    # NOTE(review): the system prompt asks for "[Document Name](document_path)"
    # citations and also for plain text without markdown, while the context
    # below labels those fields "Source" and "Path". Left unchanged here;
    # confirm the intended citation format.
    system_message = """
    You are an expert summarizer working within a RAG system. Your task is to create a concise, accurate summary of the provided information while properly attributing all facts to their sources.

    Guidelines:
    - Create a clear, coherent summary limited to 3-5 sentences
    - Focus on the most important facts and insights
    - Maintain factual accuracy without adding new information
    - Use neutral, professional language
    - Cite EVERY piece of information using the format [Document Name](document_path)
    - Place citations immediately after the relevant information
    - Ensure each citation is correctly matched to its source
    - Return only the plain text summary without markdown formatting
    """

    # Materialize once: the documents are walked three times below (prompt,
    # names, paths), which would silently exhaust a generator after the first.
    documents = list(context_documents)

    # Nothing to summarize: skip the model call instead of asking it to
    # summarize an empty context.
    if not documents:
        return {"content": "", "metadata": {"name": [], "path": []}}

    formatted_context = "\n".join(
        f"Content: {doc['content']}\nSource: {doc['metadata']['name']}\nPath: {doc['metadata']['path']}"
        for doc in documents
    )

    prompt = f"""
    Here are the documents to summarize:
    
    {formatted_context}
    
    Provide a concise summary with proper citations:
    """

    response = ollama.chat(
        model=llm_model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt}
        ]
    )

    # Treat a missing (None) reply as an empty answer rather than failing later.
    response_content = response["message"]["content"] or ""

    # Remove any <think>...</think> spans from the reply; DOTALL lets a span
    # cover several lines. Surrounding whitespace is trimmed afterwards.
    final_content = re.sub(
        r"<think>.*?</think>", "", response_content, flags=re.DOTALL
    ).strip()

    # Collect the source identifiers of every document that was summarized.
    document_names = [doc['metadata']['name'] for doc in documents]
    document_paths = [doc['metadata']['path'] for doc in documents]

    return {
        "content": final_content,
        "metadata": {
            "name": document_names,
            "path": document_paths
        }
    }

In [7]:
# Summarize the transformed documents with the function's default model.
smry = source_summarizer(tr)
# Bare expression: show the raw result dict (summary text plus source names/paths).
smry

{'content': 'The first document outlines that residues must be disposed or recovered in accordance with monitoring limits (strlschg_en_bf.pdf:0:102). Residues cannot be mixed or diluted unless monitored, and storage declarations are required. Cross-border transportation of residues for disposal is prohibited.\n\nThe second document elaborates by stating that parties under an obligation must declare residue intent to the competent authority without undue delay (strlschg_en_bf.pdf:0:103). Release from monitoring can occur if population protection, worker exposure limits, and waste law compliance are met. Release requires written notice.',
 'metadata': {'name': ['strlschg_en_bf.pdf:0:102', 'strlschg_en_bf.pdf:0:103'],
  'path': ['strlschg_en_bf.pdf', 'strlschg_en_bf.pdf']}}

In [8]:
# Render only the summary text as Markdown rather than the dict repr.
display(Markdown(smry['content']))

The first document outlines that residues must be disposed or recovered in accordance with monitoring limits (strlschg_en_bf.pdf:0:102). Residues cannot be mixed or diluted unless monitored, and storage declarations are required. Cross-border transportation of residues for disposal is prohibited.

The second document elaborates by stating that parties under an obligation must declare residue intent to the competent authority without undue delay (strlschg_en_bf.pdf:0:103). Release from monitoring can occur if population protection, worker exposure limits, and waste law compliance are met. Release requires written notice.

In [9]:
import time

# Ollama model tags to compare on the same transformed documents (`tr`).
# NOTE(review): assumes each tag is available to the local Ollama server — confirm.
models = ["deepseek-r1:1.5b","deepseek-r1:latest", "llama3.1:8b-instruct-q4_0", "llama3.2", "gemma3:4b", "phi4-mini", "mistral:instruct", "mistrallite"]

for model in models:
    # perf_counter is the clock intended for measuring elapsed intervals;
    # time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    summary = source_summarizer(tr, model)
    elapsed_time = time.perf_counter() - start_time
    # A Markdown ATX heading needs a space after '#'; '#LLM' was shown as literal text.
    display(Markdown(f'# LLM: {model}'))
    display(Markdown(summary['content']))
    print(f'Time taken: {elapsed_time:.2f} seconds')
    print('________________________________')

#LLM: deepseek-r1:latest

The document outlines monitoring limits for residues in Germany, requiring them not to be mixed or diluted without authority notification. Residues must be declared for disposal or recovery if they require monitoring. Prohibitions on cross-border transportation are noted, with release conditions ensuring population protection, occupational worker exposure limits, and legal waste permissibility. Release is via written notice, adhering to public dose constraints.

[Document Name](document_path)
The document outlines monitoring limits for residues in Germany, requiring them not to be mixed or diluted without authority notification. Residues must be declared for disposal or recovery if they require monitoring. Prohibitions on cross-border transportation are noted, with release conditions ensuring population protection, occupational worker exposure limits, and legal waste permissibility. Release is via written notice, adhering to public dose constraints.

[Document Name](document_path)
The document outlines monitoring limits for residues in Germany, requiring them not to be mixed or diluted without authority notification. Residues must be declared for disposal or recovery if they require monitoring. Prohibitions on cross-border transportation are noted, with release conditions ensuring population protection, occupational worker exposure limits, and legal waste permissibility. Release is via written notice, adhering to public dose constraints.

[Document Name](document_path)

Time taken: 25.16 seconds
________________________________


#LLM: llama3.1:8b-instruct-q4_0

Residues requiring monitoring must be disposed of or recovered in accordance with statutory ordinances, and their disposal or recovery may not be mixed or diluted to meet monitoring limits (strlschg_en_bf.pdf:0:102). The party subject to an obligation must declare storage of such residues to the competent authority and secure them against loss and unauthorized access prior to disposal or recovery (strlschg_en_bf.pdf:0:103). Release from monitoring is permitted if certain criteria are met, including ensuring the protection of the population and compliance with waste law.

Time taken: 14.54 seconds
________________________________


#LLM: llama3.2

Here is a concise summary of the provided information:

In accordance with the statute, residues requiring monitoring must be recovered or disposed of in accordance with established monitoring limits and methods [strlschg_en_bf.pdf:0:102]. The Federal Government can issue a statutory ordinance with the consent of the Bundesrat to establish these limits and methods. Residues accumulated abroad must be declared for recovery in Germany, and storage on operating premises requires notification to the competent authority [strlschg_en_bf.pdf:0:103]. Prior to disposal or recovery, residues must be secured against loss and unauthorized access, and only passed on to other parties for these purposes [strlschg_en_bf.pdf:0:102]. The cross-border transportation of residues is prohibited.

Time taken: 8.91 seconds
________________________________


#LLM: gemma3:4b

The federal government is authorized to establish monitoring limits and methods for residue recovery and disposal, requiring compliance with statutory ordinance [strlschg_en_bf.pdf:0:102]. Residues accumulated in excess of three may not be mixed or diluted prior to disposal, and parties subject to obligations must declare residue storage and cessation to the competent authority [strlschg_en_bf.pdf:0:103]. Furthermore, the competent authority can require proof of compliance with monitoring limits, including technical procedures and measurement requirements, to ensure public protection with an effective dose limit of 1 msv [strlschg_en_bf.pdf:0:103].

Time taken: 10.38 seconds
________________________________


#LLM: mistral:instruct

Under the German Ordinance on Waste (strlschg\_en\_bf.pdf:0:102), the disposal or recovery of certain residues must comply with methods and monitoring limits established by statutory ordinances with Bundestrat consent [1]. Residues requiring monitoring cannot be mixed, diluted, or passed on for purposes other than disposal or recovery [1]. Operators subject to this obligation must declare storage to the competent authority and secure residues against loss and unauthorized access [1]. The competent authority may require compliance with monitoring limits for non-monitored residues [1]. Cross-border transportation of residues for disposal is prohibited [1]. Operators must declare intended recovery or disposal of monitored residues to the competent authority and may be released from monitoring for specific recoveries or disposals under certain conditions (strlschg\_en\_bf.pdf:0:103). Conditions include ensuring public protection against exposure, maintaining occupational worker body doses below categorical levels, and no reservations on the method of recovery or disposal [2].

[1] strlschg_en_bf.pdf:0:102-102, 102-103
[2] strlschg_en_bf.pdf:0:103, 103-105

Time taken: 25.64 seconds
________________________________


#LLM: mistrallite

Under the Federal Immission Control Act (BImSchG), companies are required to comply with statutory ordinances on monitoring limits and methods of recovery and disposal for residues, issued by the federal government with the consent of the Bundesrat. These residues must not be mixed or diluted prior to their intended disposal or recovery. The party under an obligation in accordance with section 61 subsection 1 is responsible for securing residues against loss and access by unauthorised parties prior to their disposal or recovery. Cross-border transportation of residues into Germany for their disposal is prohibited.
        The first sentence of section 61 applies not only to residues accumulated in Germany, but also to those brought into Germany from abroad for recovery. If the party under an obligation in accordance with section 61 subsection 1 has established that the residues require monitoring, they must declare their intended recovery or disposal without undue delay to the competent authority, stating their nature, mass and specific activity. A declaration may be dispensed with if notification is made in accordance with section 63 subsection 1 due to the nature and specific activity of the residues requiring monitoring.
        At the request of the party under an obligation in accordance with section 61 subsection 1, first sentence, the competent authority shall release residues requiring monitoring from that monitoring for a particular recovery or disposal, provided that 1. the requisite protection of the population against exposure is ensured by means of the measures put in place for recovery or disposal, 2. the body dose of occupational workers in the course of disposal or recovery cannot exceed the levels for categorisation as an occupationally-exposed person, and 3. there are no reservations under the law on waste as to the permissibility of the envisaged method of recovery or disposal and whether it will be complied with. Release from the monitoring obligation is effected by means of notice in writing.
        The criteria for protecting the public are that, as a constraint for the exposure of members of the public caused by disposal or recovery, even without additional measures an effective dose of 1 mSv in a given calendar week must not be exceeded.

Time taken: 36.27 seconds
________________________________
