In [6]:
import meilisearch
import json
import sys
import pathlib
import pickle
import ollama
import uuid

import networkx as nx
import numpy as np

from typing import List
from collections import defaultdict

In [114]:
# pip install langchain-huggingface sentence_transformers
from langchain_huggingface import HuggingFaceEmbeddings
# pip install langchain-community
from langchain_community.vectorstores import Meilisearch
# pip install langchain-ollama
from langchain_ollama.llms import OllamaLLM

In [7]:
sys.path.insert(0, "src")
from data_functions import DataProcessingFactory
from cluster_functions import PhraseClusterFactory, WordEmbeddingClustering
from embedding_functions import SentenceEmbeddingsFactory, cosine

  from tqdm.autonotebook import tqdm





---
#### Meilisearch Configuration

In [147]:
master_key = 'f8_g29iWsZn9334ofVKeDJ5qPbTHggrVAoJpHnhsyqs'
client = meilisearch.Client('http://localhost:7700', master_key)
index_grascco = client.index('grascco')
index_grascco_manual = client.index('grascco_manual')

In [146]:
index_grascco_manual.delete()

TaskInfo(task_uid=154, index_uid='grascco_manual', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2025, 4, 17, 7, 41, 9, 292369))

In [148]:
index_grascco_manual.update_filterable_attributes(
    ["metadata"]
)

TaskInfo(task_uid=155, index_uid='grascco_manual', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2025, 4, 17, 7, 41, 57, 342196))

In [149]:
index_grascco_manual.update_distinct_attribute(
    "metadata.document_id"
)

TaskInfo(task_uid=156, index_uid='grascco_manual', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2025, 4, 17, 7, 42, 44, 422423))

In [90]:
index_grascco.update_settings(
    {
        "distinctAttribute": "document_id",
        "searchableAttributes": [
            "text"
        ],
    }
)

TaskInfo(task_uid=128, index_uid='grascco', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2025, 4, 11, 13, 57, 1, 251390))

In [78]:
index_grascco.update_embedders(
    {
        "grascco_ollama": {
            "source": "ollama",
            "url": "http://172.25.0.2:11434/api/embed",
            "model": "nomic-embed-text",
            "documentTemplate": "{{doc.phrases}}"
        }
    }
)

TaskInfo(task_uid=125, index_uid='grascco', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2025, 4, 11, 13, 25, 29, 448550))

---

In [10]:
meili_embeddings = HuggingFaceEmbeddings(
    model_name="Sahajtomar/German-semantic",
    model_kwargs={
        'device': 'cpu'
    },
    encode_kwargs={
        'normalize_embeddings': False
    },
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: Sahajtomar/German-semantic


In [23]:
embedder_name = "grascco_german_semantic"
meili_embedders = {
    embedder_name: {
        "source": "userProvided",
        "dimensions": 1024
    }
}

---
#### Concept Graphs Load Objects & Build Meilisearch Documents

In [65]:
def meili_doc_generator(data_obj: DataProcessingFactory.DataProcessing, graph_list: List[nx.Graph], language: str = 'de', phrases_as_list: bool = False, exclude_graphs: list = None):
    _format_dict = lambda p,l:  {'de': f'Die relevanten Phrasen in diesem Dokument sind {" und ".join(p)}.'}.get(l if len(p) > 0 else "unknown", "Es gibt keine relevanten Phrasen in diesem Dokument.")
    doc_id_2_phrases = defaultdict(dict)
    doc_id_2_graphs = defaultdict(set)
    for i in range(len(graph_list)):
        if (i is not None) and (i in exclude_graphs): continue
        for n, n_dict in graphs[i].nodes(data=True):
            _label = n_dict['label']
            for doc_dict in n_dict['documents']:
                _id = doc_dict['id']
                for offset in doc_dict['offsets']:
                    doc_id_2_phrases[_id][offset] = _label
                doc_id_2_graphs[_id].add(i)
    _doc_id = None
    _db_doc_id = None
    _part = 0
    for i, doc_part in enumerate(data_obj.processed_docs):
        _current_doc_id = doc_part._.doc_id
        _offset = (doc_part._.offset_in_doc, doc_part._.offset_in_doc + len(str(doc_part)),)
        _phrases = []
        for o, label in doc_id_2_phrases[_current_doc_id].items():
            if o[0] >= _offset[0] and o[1] <= _offset[1]:
                _phrases.append(label)
        if _doc_id != _current_doc_id:
            _doc_id = _current_doc_id
            _db_doc_id = str(uuid.uuid4())
            _part = 0
        _json = {
            "document_id": _db_doc_id,
            "id": f"{_db_doc_id}-{_part}",
            "name": doc_part._.doc_name,
            "text": str(doc_part),
            "graphs": list(doc_id_2_graphs[_doc_id]),
        }
        _part += 1
        if phrases_as_list:
            _json["phrases"] = _phrases
        else:
            _json["phrases"] = _format_dict(['"' + p + '"' for p in _phrases], language)
        yield _json

In [15]:
# process_name = "top_default_text_source_stem"
process_name = "grascco_lokal"
base_path = pathlib.Path("tmp") / pathlib.Path(process_name)

In [16]:
data_path = base_path / pathlib.Path(f"{process_name}_data")
data: DataProcessingFactory.DataProcessing = DataProcessingFactory.load(data_path)

In [17]:
embedding_path = base_path / pathlib.Path(f"{process_name}_embedding")
embeddings: SentenceEmbeddingsFactory.SentenceEmbeddings = SentenceEmbeddingsFactory.load(data_path, embedding_path)

In [18]:
clustering_path = base_path / pathlib.Path(f"{process_name}_clustering")
clustering: PhraseClusterFactory.PhraseCluster = PhraseClusterFactory.load(clustering_path)

In [19]:
# top_default_text_source_stem exclude graphs: 1, 2, 9, 12, 18, 20
graph_path = base_path / pathlib.Path(f"{process_name}_graph.pickle")
graphs: List[nx.Graph] = pickle.load(open(graph_path, "rb"))

---

In [64]:
index_grascco_manual.delete()

TaskInfo(task_uid=149, index_uid='grascco_manual', status='enqueued', type='indexDeletion', enqueued_at=datetime.datetime(2025, 4, 15, 8, 11, 26, 749985))

In [150]:
from itertools import tee
iters = tee(meili_doc_generator(data, graphs, exclude_graphs=[1, 2, 9, 12, 18, 20], phrases_as_list=True), 3)
vectorstore = Meilisearch.from_texts(
    texts=[(" AND ".join(d["phrases"]) if d["phrases"] else d["text"]) for d in iters[0]],
    metadatas=[
        {
            "document_id": d["document_id"],
            "name": d["name"],
            "original_text": d["text"],
            "graphs": d["graphs"],
        } for d in iters[1]
    ],
    ids=[d["id"] for d in iters[2]],
    embedding=meili_embeddings,
    embedders=meili_embedders,
    embedder_name=embedder_name,
    client=client,
    index_name="grascco_manual"
)

In [151]:
# query = "Welche Krankheit korreliert oft mit einer Psychose?"
query = "Welche Verbrennung hat Frau Albers?"
result = vectorstore.similarity_search_with_score(
    query=query,
    embedder_name=embedder_name,
    k=5
)
doc_ids = set(d[0].metadata["document_id"] for d in result)

In [152]:
doc_results = []
_offset = 0
while True:
    doc_result = index_grascco_manual.get_documents(
        {
            "filter": f"metadata.document_id IN {list(doc_ids)}",
            "offset": _offset,
        }
    )
    doc_results.extend(doc_result.results)
    _offset += doc_result.limit
    if _offset >= doc_result.total:
        break

In [153]:
_running_id = None
_first_doc_part = True
_contexts = []
for d in doc_results:
    _doc_id = d.metadata["document_id"]
    if _running_id is not None and _running_id != _doc_id:
        _contexts.append("\n\n---\n\n")
        _first_doc_part = True
    if _first_doc_part:
        _contexts.append(f"Dokument-Name: {d.metadata['name']}")
        _first_doc_part = False
    _running_id = _doc_id
    if d.metadata["text"] != '':
        # _contexts.append("Wichtiger Absatz:")
        _contexts.append(d.metadata["original_text"])
context = "\n".join(_contexts)

In [154]:
from langchain.chains import RetrievalQA
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate

llm = OllamaLLM(model="mistral", base_url="http://localhost:7701", num_ctx=4096)

prompt = """
1. Benutze den folgenden Kontext um die Antwort auf die Frage am Ende zu finden.
2. Bitte erfinde keine Antworten wenn du etwas nicht weißt, sondern antworte mit 'Das kann ich aus den Texten nicht herausfinden.'
3. Du kannst wenn notwendig den Dokument-Namen mit ausgeben, der nach 'Dokument-Name' steht.
4. Halte die Antwort kurz und präzise mit etwa 3,4 Sätzen.

Kontext: {context}

Frage: {question}

Hilfreiche Antwort:"""

PROMPT_FROM_TEMPLATE = PromptTemplate.from_template(prompt)

llm_chain = LLMChain(
    llm=llm,
    prompt=PROMPT_FROM_TEMPLATE,
    callbacks=None,
    verbose=True,
)
response = llm_chain.run(context=context, question=query)



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Benutze den folgenden Kontext um die Antwort auf die Frage am Ende zu finden.
2. Bitte erfinde keine Antworten wenn du etwas nicht weißt, sondern antworte mit 'Das kann ich aus den Texten nicht herausfinden.'
3. Du kannst wenn notwendig den Dokument-Namen mit ausgeben, der nach 'Dokument-Name' steht.
4. Halte die Antwort kurz und präzise mit etwa 3,4 Sätzen.

Kontext: Dokument-Name: Albers
Werte Frau Kollegin, werter geehrter Herr Kollege!
Wir berichten über lhre Patientin Beate Albers (* 4.4.1997), die sich vom 19.3. bis zum 7.5.2029 in unserer stat. Behandlung befand.
Vorgeschichte Befund
•	Verbrennung 1. – 3. Grades, Kopf I Hals,5% v KOF 
•	Handamputation LI
•	Akute Psychose aus dem schizophrenen Formenkreis 
•	Selbstschädigung  
•	Blutungsanamie  
•	Hypokaliämie  
•	Arterieller Hypertonus
•	Symptomatisches Anfallsleiden seit 2007
•	St.p.  Aneurysmablutung  
•	Passagerer Diabetes mellitus  
19.03.20

INFO:httpx:HTTP Request: POST http://localhost:7701/api/generate "HTTP/1.1 200 OK"



[1m> Finished chain.[0m


In [155]:
response

' In dem Text, der von Frau Albers handelt, ist keine Angabe enthalten, welche Verbrennung Frau Albers vornimmt. Das Thema Verbrennung wird nicht erwähnt.'

In [139]:
retriever = vectorstore.as_retriever(
                search_kwargs={'k': 1}
            )

In [140]:
retriever.invoke("Psychose")

MeilisearchApiError: MeilisearchApiError. Error code: invalid_search_embedder. Error message: Cannot find embedder with name `default`.. Hint: It might not be working because you're not up to date with the Meilisearch version that search call requires. Error documentation: https://docs.meilisearch.com/errors#invalid_search_embedder Error type: invalid_request

---

In [80]:
index_grascco.add_documents(
    list(meili_doc_generator(data, graphs, exclude_graphs=[1, 2, 9, 12, 18, 20])),
    primary_key="id"
)

TaskInfo(task_uid=127, index_uid='grascco', status='enqueued', type='documentAdditionOrUpdate', enqueued_at=datetime.datetime(2025, 4, 11, 13, 29, 18, 862253))

In [82]:
def get_result_stats(search_results: dict):
    stats = {
        "count": 0,
        "docs": set()
    }
    for result in search_results["hits"]:
        stats["docs"].add(result["name"])
    stats["count"] = len(stats["docs"])
    return stats

In [69]:
search_result = index_grascco_manual.search(
    "Lungenentzündung",
    {
        "hybrid": {
            "embedder": "grascco_ollama",
            "semanticRatio": 0.7  # 70% semantic, 30% full-text
        },
        "rankingScoreThreshold": 0.5,
        "showRankingScore": True,
        "showRankingScoreDetails": True,
    }
)

In [70]:
search_result

{'hits': [],
 'query': 'Lungenentzündung',
 'processingTimeMs': 1,
 'limit': 20,
 'offset': 0,
 'estimatedTotalHits': 0,
 'semanticHitCount': 0}

In [95]:
get_result_stats(search_result)

{'count': 20,
 'docs': {'Baastrup',
  'Colon_Fake_A',
  'Colon_Fake_D',
  'Colon_Fake_H',
  'Colon_Fake_I',
  'Fabry',
  'Fuss',
  'Gebauer',
  'Jenninger',
  'Koenig',
  'Meyr',
  'Obradovic',
  'Obradovic-2',
  'Obradovic_Fehler',
  'Popovic',
  'Queisser',
  'Stoelzl',
  'Wankel',
  'Weber',
  'Xavier'}}

---

In [None]:
ollama_client = ollama.Client(host="localhost:7701")

In [None]:
elastic_docs = []
for doc in meili_docs:
    doc["phrases"] = list(ollama_client.embed(
        model="nomic-embed-text",
        input=doc["phrases"]
    ).embeddings[0]) if len(doc["phrases"]) > 0 else [0.1] * 768
    elastic_docs.append(doc)


In [None]:
with open('document_dump_embeddings.json', 'w', encoding='utf-8') as json_file:
    json.dump(elastic_docs, json_file, ensure_ascii=False, indent=2)