In [16]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch._sync.client import IndicesClient
from elasticsearch_dsl import Search
import json

import ollama

In [2]:
def delete_index(cli, index_name):
    """Close and then delete the index ``index_name`` using client ``cli``.

    Both requests are sent with ``ignore_unavailable=True`` so the helper is
    safe to re-run: when the index is already gone the calls are no-ops
    instead of raising a "not found" error.

    Args:
        cli: Elasticsearch client exposing the ``indices`` API namespace.
        index_name: Name of the index to remove.

    Returns:
        None.
    """
    cli.indices.close(index=index_name, ignore_unavailable=True)
    cli.indices.delete(index=index_name, ignore_unavailable=True)

In [5]:
# Client for the local Elasticsearch node, with a 120 s request timeout.
client = Elasticsearch(
    hosts='http://localhost:9008',
    request_timeout=120,
)
# DSL search handle bound to the embedding index.
search = Search(
    using=client,
    index='documents_stem_embedding',
)

In [6]:
# Set to True to also write the snapshot to `document_dump.json`.
dump_to_disk = False

# Snapshot every document yielded by `search.scan()`, keyed by its `id`.
document_dump = {doc.id: doc.to_dict() for doc in search.scan()}

if dump_to_disk:
    with open('document_dump.json', 'w', encoding='utf-8') as f:
        json.dump(document_dump, f, ensure_ascii=False, indent=2)

In [63]:
# Destructive: removes the embedding index so it can be recreated from scratch.
delete_index(cli=client, index_name='documents_stem_embedding')

In [45]:
# Analysis chain for the `text` field: whitespace tokenisation followed by
# lowercasing, German stop-word removal and Snowball ("German2") stemming.
_analysis_settings = {
    "analyzer": {
        "medical_analyzer_german": {
            "tokenizer": "whitespace",
            "filter": ["lowercase", "german_stop", "german_snowball"],
        },
    },
    "filter": {
        "german_snowball": {"type": "snowball", "language": "German2"},
        "german_stop": {"type": "stop", "stopwords": "_german_"},
    },
}

# HNSW-indexed embedding vector compared by cosine similarity.
_phrases_vector_field = {
    "type": "dense_vector",
    # Depends on the embeddings being indexed; with versions > 8.11 it need not
    # be declared and is deduced from the first indexed vector.
    "dims": 768,
    "index": True,
    "similarity": "cosine",
    "index_options": {
        "type": "hnsw",
        "ef_construction": 128,
        "m": 24,
    },
}

_field_mappings = {
    "id": {"type": "keyword"},
    "document_id": {"type": "keyword"},
    "type": {"type": "keyword"},
    "name": {
        "type": "text",
        "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
    },
    "text": {"type": "text", "analyzer": "medical_analyzer_german"},
    "phrases": _phrases_vector_field,
}

client.indices.create(
    index='documents_stem_embedding',
    mappings={"properties": _field_mappings},
    settings={"analysis": _analysis_settings},
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents_stem_embedding'})

In [None]:
# Run the custom analyzer on a sample phrase to inspect the tokens it emits.
# `client.indices` is the public accessor for the indices API (the one
# `delete_index` already uses), so no class from the private
# `elasticsearch._sync` module has to be instantiated by hand.
indices_client = client.indices
# NOTE(review): this targets 'documents_stem', not 'documents_stem_embedding' —
# confirm that index exists and defines `medical_analyzer_german`.
indices_client.analyze(
    index='documents_stem',
    body={
        "analyzer": "medical_analyzer_german",
        "text": "Jet-Lavage, Debridement und VAG Wechsel linken Hand"
    }
).body

In [50]:
# Where the documents to index come from: a JSON dump on disk, or None to
# stream them from the live index via `search.scan()`.
from_file = 'document_dump_embeddings.json'
_index = "documents_stem_embedding"

if not from_file:
    # Lazily turn every scanned hit into a bulk "index" action.
    index_actions = (
        {"_index": _index, "_id": hit.id, "_source": hit.to_dict()}
        for hit in search.scan()
    )
else:
    with open(from_file, 'r', encoding='utf-8') as f:
        json_data = json.load(f)
    # The dump may be a mapping (documents are its values) or a plain list.
    # Each document's `phrases` value is additionally stored as `phrases_dot`.
    index_actions = (
        {
            "_index": _index,
            "_id": doc.get("id"),
            "_source": dict(phrases_dot=doc["phrases"], **doc),
        }
        for doc in (
            json_data.values() if isinstance(json_data, dict) else json_data
        )
    )

In [51]:
# Send all prepared actions to Elasticsearch in bulk.
# `index_actions` is a one-shot generator: rebuild it before re-running this cell.
try:
    bulk_result = helpers.bulk(client, index_actions)
except helpers.BulkIndexError as exc:
    # The exception message only reports how many documents failed; print the
    # id, status and reason for each so the offending documents can be fixed.
    # The failure entries may also carry the full document source, which is
    # deliberately not printed.
    for failure in exc.errors:
        for op_type, info in failure.items():
            print(op_type, info.get("_id"), info.get("status"), info.get("error"))
    raise
bulk_result

BulkIndexError: 1 document(s) failed to index.

---

In [52]:
# Query text, embedded with the `nomic-embed-text` model served by Ollama.
_q = "Wahn"
_ollama_client = ollama.Client(host="localhost:7701")
_embed_result = _ollama_client.embed(model="nomic-embed-text", input=_q)
# Keep the first returned embedding as a plain list.
_q_embedding = list(_embed_result.embeddings[0])

In [61]:
client = Elasticsearch(hosts='http://localhost:9008', request_timeout=120)

# Lexical part: query-string match on `text`, weighted 0.9.
_lexical_query = {
    "query_string": {
        "query": _q,
        "fields": ["text"],
        "boost": 0.9,
    },
}

# Highlighting for the `text` field using the unified highlighter.
_highlight = {
    "fields": {
        "text": {
            "fragment_size": 30,
            "number_of_fragments": 0,
        },
    },
    "type": "unified",
}

# Vector part: kNN over the `phrases` embeddings, weighted 0.1.
_knn = {
    "field": "phrases",
    "query_vector": _q_embedding,
    "k": 5,
    "num_candidates": 50,
    "boost": 0.1,
}

response = client.search(
    index='documents_stem_embedding',
    body={
        "size": 10,
        "query": _lexical_query,
        "highlight": _highlight,
        "knn": _knn,
    },
)

In [59]:
# Number of hits returned by the search.
# NOTE(review): this cell's execution count (59) is lower than the search
# cell's (61), so the recorded output may come from an earlier `response`.
len(response.body["hits"]["hits"])

5

In [62]:
# Compact view of each hit: id, score, and the stored text and name.
[
    dict(
        id=hit["_id"],
        score=hit["_score"],
        text=hit["_source"]["text"],
        name=hit["_source"]["name"],
    )
    for hit in response.body["hits"]["hits"]
]

[{'id': 'bcb5eeac-a4c5-4d6c-a1cd-b83dc031ed95',
  'score': 0.08077419,
  'text': '\ufeffARCOS-KLINIK FLENSBURG\nAkademisches Lehrkrankenhaus\nder Otto-Waalkes-Universität Borkum\nUrologische Abteilung\nCHEFARZT: DR. MED. H. BLASENSTEIN\n\nHerrn\nHelge Klabauter\nUrologe\nFriesische Str. 21 a\n24937 Flensburg\n \n\nFlensburg, 27. März 2025\nRote Str. 3 \nTelefon (0461) 708 - 223\n\n\nBetr.: Asger Baastrup, geb. 5.7.54, Schleswiger Str. 95a, 24941 Flensburg \n\n\nSehr geehrte Herr Kollege Klabauter!\n\nHr. Baastrup Asger war vom 8.3. - 22.3.2025 bei uns in stationärer Behandlung.\n\nDiagnosen: Narbige Blasanhalsstenose. nach Prostata-Adenom-Enukleatio 2006.\nBekannte Blindheit nach Maculadeg. bds.\nHerzinsuffizienz\n\nKleines Prostata-Ca Stadium pT1a,· G1.\n\n1 Tag vor der stationären Aufnahme war eine Sachse-Spaltung der narbigen Stenose durchgeführt worden. Der Katheter war herausgefallen. Nach nochmaliger vorübergehender Kathetervers6rgung betrug der Resturin·aber immer noch 150 ml., 