In [1]:

%matplotlib inline

In [2]:
import collections
import getpass
import itertools
import json
import os
import pathlib
import uuid
from typing import Union

import networkx as nx
import owlready2
import requests
from rdflib.extras.external_graph_libs import *
from rdflib.graph import Graph
from tqdm.autonotebook import tqdm

  from tqdm.autonotebook import tqdm


In [3]:
# Base directory holding the SNOMED input files. Set the SNOMED_DATA_DIR environment
# variable to point at another copy of the data; without it the original location is used.
pathlib_data_base = pathlib.Path(
    os.environ.get("SNOMED_DATA_DIR", "C:\\Users\\fra3066mat\\Documents\\data\\snomed")
)

### Parse and store SNOMED taxonomy

In [4]:
def node_to_id(node):
    """Return the text after the last '/' of ``node`` (all of it when there is no '/')."""
    *_, last_segment = node.split('/')
    return last_segment

In [5]:
# Every concept in the taxonomy is a node 'http://snomed.info/id/SNOMED_ID' and a child of
# 'http://www.w3.org/2002/07/owl#Class'.
# 'http://snomed.info/id/138875005' is the top node --> SNOMED CT Concept (SNOMED RT+CTV3).
# Edges point upwards: the neighbors of a node are exactly its direct parent(s).
path = pathlib_data_base / "snomed_taxonomy.rdf"

graph = Graph()
graph.parse(path, format="application/rdf+xml")
nx_graph = rdflib_to_networkx_multidigraph(graph)

# Keep only real concepts: meta nodes (e.g. 'owl#Class') have a non-numeric id, and the
# top level node 'SNOMED CT Concept (SNOMED RT+CTV3)' is dropped explicitly.
id_node_mapping = {}
for n in list(nx_graph.nodes()):  # iterate over a snapshot, nodes are removed on the way
    _id = node_to_id(n)
    if _id.isnumeric() and _id != "138875005":
        id_node_mapping[_id] = n
    else:
        nx_graph.remove_node(n)

In [6]:
def id_to_node(snomed_id: str, mapping_dict: dict = id_node_mapping):
    """Return the graph node registered for ``snomed_id``, or ``None`` for an unknown id."""
    return mapping_dict.get(snomed_id)

In [7]:
def parents_of_id(snomed_id: str, mapping_dict: dict = id_node_mapping):
    """Yield the ids of the graph neighbors (direct parents) of ``snomed_id``.

    Neighbors whose id is not numeric are skipped.
    """
    start_node = id_to_node(snomed_id, mapping_dict)
    neighbor_ids = (node_to_id(parent) for parent in nx_graph.neighbors(start_node))
    yield from (parent_id for parent_id in neighbor_ids if parent_id.isnumeric())

In [8]:
def children_of_id(snomed_id: str, mapping_dict: dict = id_node_mapping):
    """Yield the ids of the graph predecessors (direct children) of ``snomed_id``.

    Predecessors whose id is not numeric are skipped.
    """
    start_node = id_to_node(snomed_id, mapping_dict)
    predecessor_ids = (node_to_id(child) for child in nx_graph.predecessors(start_node))
    yield from (child_id for child_id in predecessor_ids if child_id.isnumeric())

---
### Read SNOMED Interface Terminology

In [9]:
interface_terminology_path = pathlib_data_base / pathlib.Path("./SCT-GIT_de_drugs_nlp.dat")
interface_terminology = collections.defaultdict(list)

# Each line is one tab-separated record: SNOMED id, internal id, concept name, interface name.
# The file is read inside a `with` block so the handle is closed even if a line is malformed.
with interface_terminology_path.open('r', encoding='utf-8') as interface_terminology_file:
    for line in interface_terminology_file:
        # first name in the resulting list for each entry is the preferred name
        snomed_id, internal_id, concept_name, interface_name = line.strip().split("\t")
        interface_terminology[snomed_id].append(interface_name)

# Concepts that have no outgoing (parent) edge left in the graph and do have an interface term.
first_level_concepts = [node_to_id(n) for n in nx_graph.nodes() if
                        nx_graph.out_degree(n) == 0 and node_to_id(n) in interface_terminology]

---
### Create/upload concepts to TOP FW

In [10]:
# Load the SNOMED ontology file through owlready2; it is queried below for the preferred
# labels of concepts that have no interface term.
snomed_ontology_path = pathlib_data_base / "snomed_ontology.owx"
snomed_ontology_graph = owlready2.get_ontology("file://" + str(snomed_ontology_path)).load()

#### Get Preferred Label by id:
`labels = snomed_ontology_graph.search_one(iri = "*40949007").prefLabel`  
Example result:  
`[locstr('Kingdom Fungi macroconidium', 'en'),`  
` locstr('Makroconidium des Reichs Fungi', 'de')]`  

Language:  
`labels[0].lang`  
String Form:  
`str(labels[0])`

In [11]:
def batched(iterable, n, *, strict=False):
    """Yield consecutive tuples of up to ``n`` items taken from ``iterable``.

    batched('ABCDEFG', 3) → ABC DEF G

    Raises ValueError when ``n`` is smaller than one, or, with ``strict=True``, when the
    final tuple holds fewer than ``n`` items.
    """
    if n < 1:
        raise ValueError('n must be at least one')
    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            return
        if strict and len(chunk) != n:
            raise ValueError('batched(): incomplete batch')
        yield chunk

In [12]:
def get_fallback_entry(
        id: str,
        creation_dump: dict,
        snomed_ontology: owlready2.Ontology,
) -> Union[list, None]:
    """Build fallback titles for a concept from the preferred labels in the ontology.

    The id is always appended to ``creation_dump["missing_german_interface_terms"]``.

    :param id: SNOMED concept id to look up.
    :param creation_dump: bookkeeping dict; its "missing_german_interface_terms" list is extended.
    :param snomed_ontology: loaded ontology that is searched for the concept.
    :return: list of ``{"lang": ..., "text": ...}`` dicts, one per preferred label, or ``None``
        when the ontology has no matching entity or the entity has no preferred label
        (callers then fall back to their own default title).
    """
    creation_dump["missing_german_interface_terms"].append(id)

    # NOTE(review): the leading '*' matches every IRI that ends with these digits, so a longer
    # id with the same suffix could match as well — confirm the IRI layout or anchor on '/'.
    entity = snomed_ontology.search_one(iri=f"*{id}")
    if entity is None:
        # search_one found nothing; previously this surfaced as an AttributeError on None.
        return None

    titles = [
        {
            "lang": label.lang.split("-")[0],  # Some lang entries distinguish between "en-us" and "en-gb"
            "text": str(label)
        } for label in entity.prefLabel
    ]
    # An empty title list is of no use to the caller, report it like a missing entity.
    return titles or None

In [13]:
def create_top_fw_concept(
        concept_id: str,
        concept_id_store: dict,
        creation_dump: dict,
        parent_id: Union[str, list] = None,
        title_dict: dict = None,
):
    """Assemble the payload dict describing one SNOMED concept as a "single_concept" entity.

    :param concept_id: SNOMED id of the concept.
    :param concept_id_store: maps SNOMED ids to their interface names (first one is preferred).
    :param creation_dump: bookkeeping dict; ``creation_dump["top_id_store"]`` maps SNOMED ids to
        the generated entity ids and gains an entry when ``concept_id`` is new.
    :param parent_id: one SNOMED id or a list of ids of the parent concepts.
    :param title_dict: ready-made titles that replace the default title when given.
    :return: the payload dict.
    """
    known_ids = creation_dump["top_id_store"]
    # Re-use the id handed out earlier for this concept, otherwise create and remember one.
    if concept_id in known_ids:
        top_id = known_ids[concept_id]
    else:
        top_id = str(uuid.uuid4())
        known_ids[concept_id] = top_id

    has_interface_term = concept_id in concept_id_store
    # Only index the store for known ids, so a defaultdict store is never extended here.
    interface_names = concept_id_store[concept_id] if has_interface_term else None

    if title_dict is not None:
        titles = title_dict
    elif has_interface_term:
        titles = [{"lang": "de", "text": interface_names[0]}]
    else:
        titles = [{"lang": "en", "text": f"No interface term: '{concept_id}'"}]

    concept = {
        "entityType": "single_concept",
        "id": top_id,
        "titles": titles,
        "codes": [
            {
                "uri": f"http://snomed.info/id/{concept_id}",
                "codeSystem": {
                    "uri": "http://snomed.info/id",
                    "shortName": "SNOMED CT"
                },
                "code": concept_id,
            }
        ],
    }

    # Every interface name after the preferred one becomes a German synonym.
    if has_interface_term and len(interface_names) > 1:
        concept["synonyms"] = [{"lang": "de", "text": name} for name in interface_names[1:]]

    if parent_id is not None:
        parent_ids = [parent_id] if isinstance(parent_id, str) else parent_id
        # A parent without a stored entity id is referenced with id None.
        super_concepts = [
            {"entityType": "single_concept", "id": known_ids.get(pid)}
            for pid in parent_ids
        ]
        if super_concepts:
            concept["superConcepts"] = super_concepts

    return concept

In [14]:
def post_to_top_fw(
        post_data: Union[list, dict],
        post_api_token: str = None,
        organisation: str = "imise",
        repository: str = "snomed_interface_terminology",
        api_url: str = "https://top.imise.uni-leipzig.de/api",
        use_keycloak: bool = True,
):
    """POST one entity (dict) or several entities (anything else) to the TOP FW API.

    A dict goes to ``.../entity``, every other payload to ``.../entity/bulk``. The bearer
    token is only sent when ``use_keycloak`` is true and a token was passed.

    :return: the ``requests`` response object; its status is not checked here.
    """
    if isinstance(post_data, dict):
        endpoint = "entity"
    else:
        endpoint = "entity/bulk"
    target_url = f"{api_url.rstrip('/')}/{organisation}/{repository}/{endpoint}"

    auth_headers = {}
    if use_keycloak and post_api_token is not None:
        auth_headers["Authorization"] = "Bearer " + post_api_token

    return requests.post(url=target_url, headers=auth_headers, json=post_data)

In [None]:
# Request an access token with the password grant. `api_token` holds the raw HTTP response;
# the token itself is read later through api_token.json()['access_token'].
username = input("Please give your TOP FW username: ")
# getpass does not echo the password, so it never ends up in the saved notebook output.
password = getpass.getpass("Please give your TOP FW password: ")
api_token = requests.post(
    url="https://top.imise.uni-leipzig.de/auth/realms/top-realm/protocol/openid-connect/token",
    data={
        "client_id": "top-frontend",
        "username": username,
        "password": password,
        "grant_type": "password"
    }
)

In [None]:
info_dump_path = "./tmp/creation_dump_info.json"
post_array_path = "./tmp/post_array.json"

# Resume from the dumps of an earlier run when they exist, otherwise start empty.
# The files are read inside `with` blocks so the handles are closed right away.
if pathlib.Path(info_dump_path).exists():
    with pathlib.Path(info_dump_path).open('rb') as _info_dump_file:
        creation_dump_info = json.load(_info_dump_file)
else:
    creation_dump_info = {
        "top_id_store": {},
        "missing_german_interface_terms": []
    }

# Note: a reloaded post_array is a plain dict whose stratum keys are strings ("0", "1", ...).
if pathlib.Path(post_array_path).exists():
    with pathlib.Path(post_array_path).open('rb') as _post_array_file:
        post_array = json.load(_post_array_file)
else:
    post_array = collections.defaultdict(list)

In [None]:
# Build the upload payload stratum by stratum. Graph edges point from a concept to its
# parents, so the reversed graph is walked top-down: stratum 0 holds the concepts without
# parents and every later stratum only refers to parents from earlier strata.
# Strata are keyed by str(i): that is how they come back from the JSON dump and how they
# are looked up afterwards (post_array["3"], post_array[str(stratum)]).
if post_array:
    # Payload was restored from the dump; rebuilding on top of it would duplicate entries.
    print(f"post_array already holds {len(post_array)} strata - skipping the rebuild.")
else:
    _top_down_graph = nx_graph.reverse(copy=True)
    # A full build visits every concept again, so the list of concepts without a German
    # interface term is recreated instead of being appended to a restored one.
    creation_dump_info["missing_german_interface_terms"] = []
    with tqdm(total=len(nx_graph)) as pbar:
        for i, stratum in enumerate(nx.topological_generations(_top_down_graph)):
            pbar.set_description_str(f"Stratum {str(i).zfill(2)}: ")
            for node in stratum:
                pbar.update(1)
                _node_concept_id = node_to_id(node)
                _snomed_pref_label = None
                if _node_concept_id not in interface_terminology:
                    _snomed_pref_label = get_fallback_entry(id=_node_concept_id, creation_dump=creation_dump_info,
                                                            snomed_ontology=snomed_ontology_graph)
                _parents = None
                if i > 0:
                    _parents = [node_to_id(x) for x in nx_graph[node]]
                # setdefault works for a defaultdict as well as for a plain dict
                post_array.setdefault(str(i), []).append(
                    create_top_fw_concept(_node_concept_id, interface_terminology, creation_dump_info, _parents,
                                          title_dict=_snomed_pref_label)
                )

    # Persist both structures; the target directory is created on first use and the files
    # are closed (and therefore flushed) before the cell finishes.
    for _dump_path, _dump_data in ((info_dump_path, creation_dump_info), (post_array_path, post_array)):
        _dump_file_path = pathlib.Path(_dump_path)
        _dump_file_path.parent.mkdir(parents=True, exist_ok=True)
        with _dump_file_path.open('w', encoding='utf-8') as _dump_file:
            json.dump(_dump_data, _dump_file, ensure_ascii=False)

In [None]:
# Peek at the first 20 entries of stratum "3" whose SNOMED code has no interface term.
[
    entry
    for entry in post_array["3"]
    if entry.get("codes", [{}])[0].get("code") not in interface_terminology
][:20]

In [None]:
# Peek at the first 20 entries of stratum "3" whose first title is German although their
# code was recorded as lacking a German interface term.
# The ids are put into a set once, instead of scanning the list for every entry.
_missing_german_ids = set(creation_dump_info["missing_german_interface_terms"])
[
    entry
    for entry in post_array["3"]
    if entry.get("titles", [{}])[0].get("lang") == "de"
    and entry.get("codes", [{}])[0].get("code") in _missing_german_ids
][:20]

In [None]:
# Upload one stratum to the local instance through the bulk endpoint, batch_size entities
# per request.
stratum = 0
batch_size = 100
with tqdm(total=len(post_array[str(stratum)])) as pbar:
    for batch in batched(post_array[str(stratum)], batch_size):
        # NOTE(review): the response is kept but never checked — confirm that failed
        # batches may be ignored.
        response = post_to_top_fw(list(batch), api_token.json()['access_token'],
                                  repository="snomed_interface_terminology", api_url="http://localhost:8080/",
                                  use_keycloak=False)
        # The final batch can be shorter than batch_size; advance by what was really sent
        # so the bar ends exactly at its total.
        pbar.update(len(batch))

In [None]:
# Upload every stratum in stored order, one entity per request, to the default API URL
# using the bearer token from the login response.
for stratum_n, stratum in post_array.items():
    print(f"-- Stratum {stratum_n} --")
    for entity in tqdm(stratum):
        # NOTE(review): the response is discarded, so a failed upload goes unnoticed — confirm this is intended.
        post_to_top_fw(entity, api_token.json()['access_token'], repository="snomed_interface_terminology")

Enthält die SNOMED Interface Terminology (313.548 Konzepte) von Stefan Schulz.
Contact: Stefan Schulz - stefan.schulz@medunigraz.at

In [17]:
# Build the payload for a single branch only: the concept _root_id and everything below it.
# This run keeps its own bookkeeping dict, separate from creation_dump_info.
_subtree_dump = {
    "top_id_store": {},
    "missing_german_interface_terms": []
}
_subtree_post_array = []
_root_id = "105590001"
# nx_graph edges point from child to parent, so the search runs on the reversed graph.
# NOTE(review): bfs_tree returns a tree, so a concept with several parents presumably keeps
# only one of them here (unlike the full build, which reads the parents from nx_graph) — confirm this is intended.
_subtree = nx.traversal.bfs_tree(nx_graph.reverse(), id_to_node(_root_id))
# Reversed copy of the tree, used below to look up the parent(s) of a node.
_subtree_rev = _subtree.reverse(copy=True)
with tqdm(total=len(_subtree)) as pbar:
    for i, stratum in enumerate(nx.topological_generations(_subtree)):
        pbar.set_description_str(f"Stratum {str(i).zfill(2)}: ")
        for node in stratum:
            pbar.update(1)
            _node_concept_id = node_to_id(node)
            _snomed_pref_label = None
            # Concepts without an interface term get their titles from the ontology labels.
            if _node_concept_id not in interface_terminology:
                _snomed_pref_label = get_fallback_entry(id=_node_concept_id, creation_dump=_subtree_dump,
                                                        snomed_ontology=snomed_ontology_graph)
            # Stratum 0 is the root itself and is created without parents.
            _parents = None
            if i > 0:
                _parents = [node_to_id(x) for x in _subtree_rev[node]]
            _subtree_post_array.append(
                create_top_fw_concept(_node_concept_id, interface_terminology, _subtree_dump, _parents,
                                      title_dict=_snomed_pref_label)
            )

  0%|          | 0/27484 [00:00<?, ?it/s]

In [57]:
# Show only the first entries; displaying the whole list floods the notebook output.
_subtree_post_array[:3]

[{'entityType': 'single_concept',
  'id': '5988d4ca-8344-4ade-a471-e97354176b0d',
  'titles': [{'lang': 'de', 'text': 'Drogen'}],
  'codes': [{'uri': 'http://snomed.info/id/105590001',
    'codeSystem': {'uri': 'http://snomed.info/id', 'shortName': 'SNOMED CT'},
    'code': '105590001'}],
  'synonyms': [{'lang': 'de', 'text': 'Substanzen'},
   {'lang': 'de', 'text': 'Substanz'}]},
 {'entityType': 'single_concept',
  'id': '613a9893-128e-4434-992e-dfc0ee374964',
  'titles': [{'lang': 'de',
    'text': 'Substanz kategorisiert bei Gefährdungscharakteristik'}],
  'codes': [{'uri': 'http://snomed.info/id/438951008',
    'codeSystem': {'uri': 'http://snomed.info/id', 'shortName': 'SNOMED CT'},
    'code': '438951008'}],
  'synonyms': [{'lang': 'de',
    'text': 'Substanz kategorisiert bei Gefahrbesonderheit'},
   {'lang': 'de', 'text': 'Substanz kategorisiert bei Gefahrenbesonderheit'},
   {'lang': 'de',
    'text': 'Substanz kategorisiert bei Gefährdungsbesonderheit'},
   {'lang': 'de', 'te

In [70]:
import time

# Upload the subtree concepts one by one to the local instance, without authentication.
# The list order has parents before their children.
for p_array in tqdm(_subtree_post_array):
    # Short pause between requests.
    # NOTE(review): presumably to avoid overloading the local server — confirm the value.
    time.sleep(0.005)
    # NOTE(review): the response is discarded, so a failed upload goes unnoticed — confirm this is intended.
    post_to_top_fw(
        post_data=p_array,
        post_api_token=None,
        organisation="imise",
        repository="snomed_interface_terminology",
        api_url="http://localhost:8080",
        use_keycloak=False,
    )

100%|██████████| 27484/27484 [17:11<00:00, 26.64it/s]


In [131]:
# Look up the first subtree concept whose first title is "Körpersubstanz"
# (raises IndexError when there is none) ...
node = id_to_node([p.get("codes")[0].get("code") for p in _subtree_post_array if
                   (p.get("titles", [{}])[0].get("text") == "Körpersubstanz")][0])
# ... and restrict the subtree to the nodes reachable from that concept.
_subgraph = nx.subgraph(_subtree, nx.bfs_tree(_subtree, node).nodes())

In [132]:
len(_subgraph)

860

In [15]:
from elasticsearch import Elasticsearch

In [16]:
client = Elasticsearch(hosts='http://localhost:9008')

In [18]:
# Collect the text of every German title and synonym of the subtree concepts, in payload order.
terms = [
    label["text"]
    for concept in _subtree_post_array
    for field in ("titles", "synonyms")
    for label in concept.get(field, [])
    if label["lang"] == "de"
]

In [19]:
len(terms)

96093

In [49]:
# Search the indexed documents for any of the first 19000 terms, each as an exact phrase.
# NOTE(review): the 19000 cut-off is presumably there to stay below a query size limit — confirm.
# Inside a quoted phrase, '\' and '"' are special to the query_string syntax, so both are
# escaped; otherwise a term containing either would break the whole query.
query_string = " OR ".join(
    '"{}"'.format(t.replace("\\", "\\\\").replace('"', '\\"')) for t in terms[:19000]
)
response = client.search(
    index='documents_stem',
    body={
        "query": {
            "query_string": {
                "query": query_string
            }
        },
        "highlight": {
            "fields": {
                "text": {
                    # "pre_tags" : ["<em>"],
                    # "post_tags" : ["</em>"],
                    # number_of_fragments 0: highlight the whole field instead of fragments
                    "number_of_fragments": 0,
                    "fragment_size": 0,
                }
            }
        }
    }
)

In [35]:
# Count the whitespace-separated tokens contained in the first 19000 terms.
_test_list = [_tt for _t in terms[:19000] for _tt in _t.split()]
len(_test_list)

36137

In [50]:
response.get("hits").get("hits")[0]["highlight"]

{'text': ['Sehr geehrter Herr Dr. Albers, \n\nwir berichten über unseren gemeinsamen Patienten Herrn Klaus Neubauer, * 23.11.1999, wohnhaft 73333 Gingen, der sich in der Zeit vom 21.02.2024 bis 25.02.2024 in der stationärer Behandlung in unserer Klinik befand. \n\nDiagnosen (ICD 10): \nPsychische und Verhaltensstörungen durch <em>Opioide</em> und Opiate: Schädlicher Gebrauch (F11.1) \nAnamn. Psychische und Verhaltensstörungen durch Alkohol: schädlicher Gebrauch (F10.2) , (F10.1) \nsonstige abnorme Gewohnheiten und Störungen der Impulkontrolle  \n\nAufnahme Modus / aktuelle Anamnese: \nPatient kommt zur Aufnahme nach Telefonischer Ankündigung durch PIA. Patient in Behandlung bei Fr. OÄ Schönfeld in PIA. \nPatient berichtet im Aufnahme Gespräch dass, ihm geht in der letzten Zeit schlecht, könne nicht schlafen, habe versucht mit der Seroquel-Dosis von alleine hoch zu gehen, nahm gestern 2250 mg Seroquel Prolong, verschriebene Dosis sei 50 mg Seroquel Prolong. \nEr gibt an, seit Ende Janua

In [130]:
[t for t in terms if "mal" in t.lower()]

['malen',
 'malen Produkt',
 'malen Präparat',
 'Pheniraminmaleat',
 'Azatadinmaleat',
 'humane mesenchymale Stammzelle',
 'Mensch mesenchymale Stammzelle',
 'menschliche mesenchymale Stammzelle',
 'Menschen mesenchymale Stammzelle',
 'Pomalidomid',
 'Thymalfasin',
 'Antimalariamittel',
 'Substanz mit Antimalariawirkung',
 'Substanz mit Antimalariawirkungsmechanismus',
 'Substanz mit Antimalariamittelwirkungsmechanismus',
 'Substanz mit Antimalariamittelwirkung',
 'Pafuramidinmaleat',
 'Malarone',
 'Wachsmalstift',
 'Formalindämpfe',
 'Formalin',
 'lysosomales Transportprotein',
 'bovines Serumalbumin',
 'Rinderserumalbumin',
 'Aminoplasmal',
 'Aminoplasmal B.Braun',
 'Aminoplasmal Paed',
 'peroxisomaler Schädigungsmarker',
 'Ohrenschmalz',
 'Omalizumab',
 'Tramal',
 'Tramal Tropfen',
 'Tramal retard',
 'Pizotifenmaleat',
 'Pizotylinmaleat',
 'Thiethylperazinmaleat',
 'Thiethylperazinmalat',
 'Tegaserod Maleat',
 'Sunitinibmalat',
 'Afatinibdimaleat',
 'Neratinibmaleat',
 'Maltosetetra