In [1]:
import json
from itertools import islice

from elasticsearch import Elasticsearch, helpers
from elasticsearch._sync.client import IndicesClient
from elasticsearch_dsl import Search

In [2]:
def delete_index(cli, index_name):
    """Close and delete an index, doing nothing when it does not exist.

    Args:
        cli: Client object exposing an ``indices`` namespace with ``exists``,
            ``close`` and ``delete`` methods, each accepting ``index=``.
        index_name: Name of the index to remove.

    Returns:
        None.
    """
    # Guard first: without it, close/delete are issued against a missing index,
    # which makes the cell calling this helper fail on a fresh cluster.
    if not cli.indices.exists(index=index_name):
        return
    cli.indices.close(index=index_name)
    cli.indices.delete(index=index_name)

In [14]:
# Elasticsearch client for the node at localhost:9008.
client = Elasticsearch(hosts='http://localhost:9008')
# Search handle scoped to the 'documents' index and bound to that client.
search = Search(index='documents').using(client)

In [6]:
# Toggle: set to True to also write the collected documents to a JSON file.
dump_to_disk = False

# Collect every hit yielded by search.scan(), keyed by the hit's `id` attribute.
document_dump = {doc.id: doc.to_dict() for doc in search.scan()}

if dump_to_disk:
    # ensure_ascii=False keeps non-ASCII characters verbatim in the UTF-8 file.
    with open('document_dump.json', mode='w', encoding='utf-8') as out_file:
        json.dump(document_dump, out_file, ensure_ascii=False, indent=2)

In [12]:
# Remove the 'documents_test' index before it is created again further down.
delete_index(cli=client, index_name='documents_test')

In [13]:
# Custom token filters referenced by the analyzer below.
german_stop_filter = {"type": "stop", "stopwords": "_german_"}
german_snowball_filter = {"type": "snowball", "language": "German2"}

# Analyzer for the `text` field: whitespace tokenizer followed by lowercasing,
# the German stop filter and the German2 snowball filter, in that order.
medical_analyzer_german = {
    "tokenizer": "whitespace",
    "filter": ["lowercase", "german_stop", "german_snowball"],
}

test_index_settings = {
    "analysis": {
        "analyzer": {"medical_analyzer_german": medical_analyzer_german},
        "filter": {
            "german_snowball": german_snowball_filter,
            "german_stop": german_stop_filter,
        },
    }
}

# Field mappings: `id` as keyword, `name` as text with a keyword sub-field,
# and `text` analyzed with the custom analyzer defined above.
test_index_mappings = {
    "properties": {
        "id": {"type": "keyword"},
        "name": {
            "type": "text",
            "fields": {"keyword": {"type": "keyword", "ignore_above": 256}},
        },
        "text": {"type": "text", "analyzer": "medical_analyzer_german"},
    }
}

client.indices.create(
    index='documents_test',
    mappings=test_index_mappings,
    settings=test_index_settings,
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'documents_test'})

In [None]:
# Run the custom analyzer on a sample phrase to inspect the tokens it produces.
# `client.indices` is reused here instead of constructing a second
# IndicesClient from the private `elasticsearch._sync` module, and the request
# fields are passed as keyword arguments, matching the index-creation call.
# NOTE(review): this targets 'documents_stem', while the analyzer is defined on
# 'documents_test' in this notebook — confirm which index is intended.
indices_client = client.indices
indices_client.analyze(
    index='documents_stem',
    analyzer="medical_analyzer_german",
    text="Jet-Lavage, Debridement und VAG Wechsel linken Hand",
).body

In [15]:
def _bulk_actions(hits, target_index):
    """Yield one bulk index action per hit, addressed to `target_index`."""
    for doc in hits:
        yield {
            "_index": target_index,
            "_id": doc.id,
            "_source": doc.to_dict(),
        }


# One-shot generator of actions that copy each scanned hit into 'documents_test'.
index_actions = _bulk_actions(search.scan(), "documents_test")

In [16]:
# Index only a small sample of the actions. islice stops pulling from the
# generator once SAMPLE_SIZE actions have been taken, so the scan is not read
# to the end (and held in a list) just to keep the first few.
# index_actions is a one-shot generator: re-create it before re-running this cell.
SAMPLE_SIZE = 3
helpers.bulk(client, islice(index_actions, SAMPLE_SIZE))

(3, [])

In [67]:
# Match query for "stationär" on the `text` field of 'documents_stem'.
# The query is passed through the `query` keyword rather than a raw `body`
# dict, matching the keyword-argument style of the index-creation call.
# NOTE(review): the displayed response includes each hit's full `_source`;
# consider trimming the output before sharing the notebook.
client.search(
    index='documents_stem',
    query={
        "match": {
            "text": {
                "query": "stationär",
            }
        }
    },
)

ObjectApiResponse({'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 36, 'relation': 'eq'}, 'max_score': 1.1096418, 'hits': [{'_index': 'documents_stem', '_id': 'af93e481-c461-4c39-8647-995a5a53ae92', '_score': 1.1096418, '_source': {'name': 'Colon_Fake_E', 'text': 'KLINIK FÜR ONKOLOGIE\n\nMünchen, am 17.8.2033  \n\nSehr geehrte Frau Kollegin, sehr geehrter Herr Kollege, \n\nwir berichten Ihnen von unserem gemeinsamen Patienten Euripedes Erler (FN:445544767), geb. 30.12.1987der sich vom 12.7.2033 bis 21.7.2033 in unserer stationären Behandlung befand. \n \n\nHAUPT-UND NEBENDIAGNOSEN\nHauptdiagnose(n), ICD-10\nRektumkarzinom, (02/33), lokal weit fortgeschritten, C20\nTumoranämie, (03/33), D63.0\nChron. Alkoholabusus\nZ.n. akuter Pankreatitis, OP Pankreaspseudozyste 2022\nZ.n. Port-a-cath Implantation  \n\nDEKURS DER TUMORERKRANKUNG\nOperation(en) und Histologie(n):\ndiagnostische PE (21.02.2033)\nHisto: Adenoc