In [1]:
from pathlib import Path
from shutil import rmtree

import altair as alt
import fsspec
import kuzu
import polars as pl

import ddl


# Location of the SNOMED IPS snapshot files and of the on-disk Kuzu database.
snomed_data = Path("./data/external/snomed-ips/Snapshot/Terminology/")
kuzu_path = Path("./data/internal/kuzu-db/")

# Start from an empty database on every run. `exists` has to be *called*:
# the bare bound method is always truthy, so an uncalled guard runs rmtree
# unconditionally and raises FileNotFoundError when the directory is absent.
if kuzu_path.exists():
    rmtree(kuzu_path)

db = kuzu.Database(kuzu_path)
conn = kuzu.Connection(db)


def read_snomed(path: Path) -> pl.DataFrame:
    """Read a tab-separated SNOMED release file into a typed frame.

    ``effectiveTime`` is parsed from its ``YYYYMMDD`` form into a date and
    ``active`` is cast to a boolean; every other column keeps the dtype
    inferred by ``pl.read_csv``.
    """
    raw = pl.read_csv(path, separator="\t")
    effective_time = pl.col("effectiveTime").cast(pl.String).str.to_date("%Y%m%d")
    is_active = pl.col("active").cast(pl.Boolean)
    return raw.with_columns(effective_time, is_active)


In [2]:
# Description file of the SNOMED IPS snapshot (terms attached to concepts).
description_file = snomed_data / "sct2_Description_IPSSnapshot-en_IPST_20240701.txt"
Description = read_snomed(description_file)

# Two types of descriptions:
# 900000000000003001 | Fully qualified name
# 900000000000013009 | Synonyms
type_frequency = pl.col("typeId").value_counts()
Description.select(type_frequency).unnest("typeId")

typeId,count
i64,u32
900000000000003001,19697
900000000000013009,40417


In [3]:
# load Concept
# One row per "fully qualified name" description.
is_fullname = pl.col("typeId") == 900000000000003001
fullname = Description.filter(is_fullname).select(
    "conceptId", pl.col("term").alias("fullQualifiedName")
)

# All synonym terms of a concept, collected into a single list column.
is_synonym = pl.col("typeId") == 900000000000013009
synonyms = (
    Description.filter(is_synonym)
    .select("conceptId", pl.col("term").alias("synonyms"))
    .group_by("conceptId")
    .agg(pl.col("synonyms"))
)

# Shared join arguments: attach description columns to concepts by id.
join_concept = {"how": "left", "left_on": "id", "right_on": "conceptId"}
concept_file = snomed_data / "sct2_Concept_IPSSnapshot_IPST_20240701.txt"
Concept = (
    read_snomed(concept_file)
    .join(fullname, **join_concept)
    .join(synonyms, **join_concept)
)

# The COPY statement refers to the frame by its variable name `Concept`,
# so that name must stay in sync with the statement text.
conn.execute(ddl.sct.concept + "COPY SCT FROM Concept;")

[<kuzu.query_result.QueryResult at 0x135dc48d0>,
 <kuzu.query_result.QueryResult at 0x135dc4e90>]

In [4]:
# load Relationship. The columns are reordered (sourceId and destinationId
# first) because that ordering is needed for loading in Kuzu.
relationship_columns = [
    "sourceId",
    "destinationId",
    "id",
    "effectiveTime",
    "active",
    "moduleId",
    "relationshipGroup",
    "typeId",
    "characteristicTypeId",
    "modifierId",
]

# Names of the relationship *type* are looked up through its typeId.
join_relationship = {"how": "left", "left_on": "typeId", "right_on": "conceptId"}
relationship_file = snomed_data / "sct2_Relationship_IPSSnapshot_IPST_20240701.txt"
Relationship = (
    read_snomed(relationship_file)
    .select(relationship_columns)
    .join(fullname, **join_relationship)
    .join(synonyms, **join_relationship)
)

In [5]:
# inspect frequency of each type of relationship out of 66,017 relationships
print(Relationship.shape)

# 116680003 | Is A occurs 32,111 times, i.e. accounts for about half
# 363698007 | Finding site 5,497
# 116676008 | Associated morphology 3,818
type_frequency_struct = Relationship.select(pl.col("typeId").value_counts())
type_count = type_frequency_struct.unnest("typeId").sort("count", descending=True)

# Counts span several orders of magnitude, hence the symlog x scale;
# bars are ordered by descending count.
count_axis = alt.X("count:Q").scale(type="symlog")
type_axis = alt.Y("typeId:O").sort("-x")
type_count.plot.bar(count_axis, y=type_axis)

(66017, 12)


In [6]:
# Create and fill one relationship table per (name, typeId) pair listed in
# ddl.sct.top10_relationships.
# The loop variable is `type_id` rather than `id`: loop variables in a
# notebook cell stay bound at global scope, so `id` would keep shadowing the
# builtin for every later cell.
for name, type_id in ddl.sct.top10_relationships:
    # The COPY statement below refers to this frame by the name `Relationship_`.
    Relationship_ = Relationship.filter(pl.col("typeId") == type_id)
    conn.execute(
        f"DROP TABLE IF EXISTS {name};"
        + ddl.sct.relationship(name)
        + f"COPY {name} FROM Relationship_;"
    )


## ICD-10 2019

ICD-10 structure, from top to bottom:

- Chapter
- Group
- Category
  - three-position, e.g. C88
  - four-position, e.g. C88.9


In [7]:
icd_data = Path("./data/external/icd10-2019/")
# Every ICD-10 metadata file is header-less and semicolon-separated, so the
# columns arrive as column_1, column_2, ...
icd_csv = {"has_header": False, "separator": ";"}

# --- raw files ---------------------------------------------------------
ICD10Chapter = pl.read_csv(icd_data / "icd102019syst_chapters.txt", **icd_csv).rename(
    {"column_1": "number", "column_2": "rubric"}
)

# A group's code is built by joining its first two columns with "-".
group_code = pl.concat_str(
    [pl.col("column_1"), pl.col("column_2")], separator="-"
).alias("code")
groups_wide = pl.read_csv(icd_data / "icd102019syst_groups.txt", **icd_csv).with_columns(
    group_code,
    pl.col("column_3").alias("chapter"),
    pl.col("column_4").alias("rubric"),
)

ICD10Code = pl.read_csv(
    icd_data / "icd102019syst_codes.txt", infer_schema_length=10000, **icd_csv
)
# column_1 == 3 / == 4 selects the three- and four-position categories.
codes_level3 = ICD10Code.filter(pl.col("column_1") == 3)
codes_level4 = ICD10Code.filter(pl.col("column_1") == 4)

# --- node tables -------------------------------------------------------
ICD10Group = groups_wide.select("code", "rubric")

expr_category = (
    pl.col("column_7").alias("code"),
    pl.col("column_9").alias("rubric"),
)
ICD10Category3 = codes_level3.select(expr_category)
ICD10Category4 = codes_level4.select(expr_category)

# --- hierarchy edges (child, parent) -------------------------------------
Group_to_Chapter = groups_wide.select("code", "chapter")

# A three-position category is linked to the group whose column_1 equals the
# category's column_5.
Category3_to_group = groups_wide.join(
    codes_level3, left_on="column_1", right_on="column_5"
).select(pl.col("column_7").alias("category3"), pl.col("code").alias("group"))

# The parent of a four-position category is given by its first three characters.
Category4_to_3 = ICD10Category4.select(
    "code", pl.col("code").str.head(3).alias("superclass")
)


In [8]:
# Node tables: (re)create each ICD-10 table from its DDL and load the frame
# that carries the same name as the table (ICD10Chapter, ICD10Group, ...).
for name in ("Chapter", "Group", "Category3", "Category4"):
    table = f"ICD10{name}"
    node_statements = (
        f"DROP TABLE IF EXISTS {table};",
        vars(ddl.icd)[name],
        f"COPY {table} FROM {table};",
    )
    conn.execute("".join(node_statements))

# Hierarchy: Category4 -> Category3 -> Group -> Chapter.
subclass_statements = (
    "DROP TABLE IF EXISTS IsSubClassOf;",
    ddl.icd.IsSubClassOf,
    "COPY IsSubClassOf_ICD10Category4_ICD10Category3 FROM Category4_to_3;",
    "COPY IsSubClassOf_ICD10Category3_ICD10Group FROM Category3_to_group;",
    "COPY IsSubClassOf_ICD10Group_ICD10Chapter FROM Group_to_Chapter;",
)
conn.execute("".join(subclass_statements))


[<kuzu.query_result.QueryResult at 0x135e20e50>,
 <kuzu.query_result.QueryResult at 0x13b28c610>,
 <kuzu.query_result.QueryResult at 0x13b28c6d0>,
 <kuzu.query_result.QueryResult at 0x13b28c750>,
 <kuzu.query_result.QueryResult at 0x13b28c7d0>]

## WHO ANC Profile

- Note we are using value sets with mapping to SNOMED IPS
- Also constraints are not relevant (too detailed for Momcare)
- We do use Measures (downstream)


In [9]:
# Target code systems for which a WHO ANC ConceptMap is downloaded.
systems = [
    "ICD-10",
    "ICD-11",
    "ICF",
    "ICHI",
    "LOINC",
    "SNOMED-CT",
]


def parse_conceptmap(system: str) -> pl.DataFrame:
    """Generate a flattened mapping table from a WHO ANC ConceptMap.

    Downloads ``ConceptMap-{system}.json`` from build.fhir.org and flattens
    its nested ``group`` / ``element`` / ``target`` structure.

    Args:
        system: Name of the target code system; expected to be one of the
            entries of the module-level ``systems`` list.

    Returns:
        A frame holding the string columns of each element prefixed with
        ``who_anc_``, a ``target`` column containing ``system`` with its
        hyphens removed, and the unnested fields of the element's ``target``
        struct.

        NOTE: despite the annotation, ``None`` is returned when ``system`` is
        not listed in ``systems``.
    """

    # Unknown systems are skipped silently rather than raising.
    if system not in systems:
        return None

    # fsspec fetches the JSON over HTTPS; polars reads it from the file object.
    with fsspec.open(
        f"https://build.fhir.org/ig/dhes/smart-anc/ConceptMap-{system}.json"
    ) as f:
        df = pl.read_json(f)

    # Explode the `group` list and turn the struct fields into columns.
    unnest_group = pl.col("group").list.explode().struct.unnest()
    # Same for `element`; the trailing explode presumably flattens the
    # per-element `target` list -- TODO confirm which columns it applies to.
    unnest_element = pl.col("element").list.explode().struct.unnest().list.explode()

    return (
        df.select(unnest_group)
        .select(unnest_element)
        .select(
            # WHO ANC side of the mapping (string columns of the element).
            pl.col(pl.String).name.prefix("who_anc_"),
            # Name of the target system without hyphens, e.g. "SNOMEDCT".
            pl.lit(system.replace("-", "")).alias("target"),
            # Target side of the mapping: the fields of the `target` struct.
            pl.col("target").struct.unnest(),
        )
    )


# Some SNOMED-CT target codes in the WHO ANC mapping are written in exponent
# notation and need fixing. Each such string is replaced by the identifier
# given here; see list below for the affected entries.
errors = {
    "1.56399E+16": 15639921000119107,
    "1.22475E+16": 12247531000119106,
    "1.07437E+16": 10743651000119105,
    "1.07612E+16": 10761341000119105,
    "4.41041E+14": 441041000124100,
}

# One flattened frame per code system, stacked with a diagonal concat, then
# the malformed codes are patched.
conceptmaps = list(map(parse_conceptmap, systems))
fixed_code = pl.col("code").replace(errors)
who = pl.concat(conceptmaps, how="diagonal").with_columns(fixed_code)

In [10]:
# 735 unique WHO ANC codes
unique_who_codes = pl.col("who_anc_code").n_unique()
who.select(unique_who_codes)

who_anc_code
u32
735


In [11]:
# Coverage varies widely; SNOMED is the most complete.
# Multiple WHO ANC codes can map to the same target code.
coverage = who.group_by("target").agg(pl.n_unique("who_anc_code", "code"))
coverage.sort("who_anc_code", descending=True)

target,who_anc_code,code
str,u32,u32
"""SNOMEDCT""",725,429
"""ICD11""",550,225
"""ICD10""",532,188
"""LOINC""",385,145
"""ICHI""",163,42
"""ICF""",100,32


In [12]:
# The same SNOMED or ICD10 concept maps to multiple WHO ANC codes!
# Step 1: find the (target, code) pairs that occur in more than one mapping row.
mapping_key = ["target", "code"]
repeated_targets = (
    who.group_by(mapping_key)
    .agg(pl.col("who_anc_code").count().alias("count_"))
    .filter(pl.col("count_") > 1)
)
# Step 2: join back onto `who` to list the individual mappings of those pairs.
many_to_one = repeated_targets.join(who, on=mapping_key).sort(mapping_key)
many_to_one

target,code,count_,who_anc_code,who_anc_display,equivalence,display
str,str,u32,str,str,str,str
"""ICD10""","""A53.9""",2,"""ANC.B9.DE111""","""Syphilis positive""","""equivalent""","""Syphilis, unspecified"""
"""ICD10""","""A53.9""",2,"""ANC.B9.DE108""","""Syphilis positive""","""equivalent""","""Syphilis, unspecified"""
"""ICD10""","""B18.1""",2,"""ANC.B9.DE72""","""Hepatitis B positive""","""equivalent""","""Chronic viral hepatitis B with…"
"""ICD10""","""B18.1""",2,"""ANC.B9.DE75""","""Hepatitis B positive""","""equivalent""","""Chronic viral hepatitis B with…"
"""ICD10""","""B18.2""",2,"""ANC.B9.DE93""","""Hepatitis C positive""","""equivalent""","""Chronic viral hepatitis C"""
…,…,…,…,…,…,…
"""SNOMEDCT""","""84229001""",3,"""ANC.B7.DE53""","""Gets tired easily""","""equivalent""","""Fatigue (finding)"""
"""SNOMEDCT""","""8517006""",2,"""ANC.B6.DE154""","""Recently quit tobacco products""","""equivalent""","""Ex-smoker (finding)"""
"""SNOMEDCT""","""8517006""",2,"""ANC.B7.DE12""","""Recently quit tobacco products""","""equivalent""","""Ex-smoker (finding)"""
"""SNOMEDCT""","""91175000""",2,"""ANC.B6.DE41""","""Convulsions""","""equivalent""","""Seizure (finding)"""


In [13]:
# SCT mapping has two types of equivalence relationships
# (the counts cover the rows of `many_to_one` only).
equivalence_counts = pl.col("equivalence").value_counts()
many_to_one.group_by("target").agg(equivalence_counts)

target,equivalence
str,list[struct[2]]
"""SNOMEDCT""","[{""equivalent"",390}, {""relatedto"",28}]"
"""ICHI""","[{""equivalent"",141}]"
"""ICF""","[{""equivalent"",78}, {""relatedto"",13}]"
"""LOINC""","[{""relatedto"",95}, {""equivalent"",220}]"
"""ICD10""","[{""equivalent"",438}]"
"""ICD11""","[{""equivalent"",428}]"


In [14]:
# Unique WHO ANC Code
who_unique = who.select(
    code=pl.col("who_anc_code"),
    rubric=pl.col("who_anc_display"),
).unique()

# SNOMED-CT mappings with the target code cast to an integer. Codes that are
# not in SNOMED IPS are identified and excluded in later cells.
# NOTE(review): this starts from `many_to_one`, i.e. only from targets that
# map to more than one WHO ANC code -- confirm that `who` was not intended.
is_snomed = pl.col("target") == "SNOMEDCT"
who_sct = many_to_one.filter(is_snomed).with_columns(pl.col("code").cast(pl.Int64))


In [15]:
# SCT codes in WHO ANC that are not in SNOMED IPS
# For example 720407008|Mother victim of domestic violence
# After the left join, rows without a matching Concept carry a null `active`.
who_sct_with_concept = who_sct.join(
    Concept, left_on="code", right_on="id", how="left"
)
has_no_concept = pl.col("active").is_null()
sct_not_in_ips = who_sct_with_concept.filter(has_no_concept).select("code").unique()

sct_not_in_ips

code
i64
416237000
442084003
55052008
736693005
118246004
…
413672003
271650006
733461000
31868001


In [16]:
# WHO ANC code -> ICD-10 code mappings.
# NOTE(review): built from `many_to_one`, i.e. only targets that map to more
# than one WHO ANC code -- confirm that `who` was not intended.
who_icd10 = many_to_one.filter(pl.col("target") == "ICD10").select(
    "who_anc_code", "code"
)

# mappings to 3- and 4-position ICD10 codes
# Positions are counted without the dot: a four-position code is written like
# "C88.9" (five characters), so comparing the raw string length with 4 never
# selects it and leaves who_icd10_4 without any regular four-position code.
icd10_positions = pl.col("code").str.replace(".", "", literal=True).str.len_chars()
who_icd10_3 = who_icd10.filter(icd10_positions < 4)
who_icd10_4 = who_icd10.filter(icd10_positions == 4)

In [17]:
# Split the SNOMED-CT mappings by equivalence type, keeping only codes that
# are not listed in sct_not_in_ips.
in_ips = ~pl.col("code").is_in(sct_not_in_ips)
is_equivalent = pl.col("equivalence") == "equivalent"
is_related = pl.col("equivalence") == "relatedto"
edge_columns = ("who_anc_code", "code")

who_sct_equivalent = who_sct.filter(is_equivalent & in_ips).select(*edge_columns)

who_sct_related = who_sct.filter(is_related & in_ips).select(*edge_columns)

In [18]:
# load WhoAncCode: drop, recreate from the DDL, then fill from `who_unique`
who_anc_code_statements = (
    "DROP TABLE IF EXISTS WhoAncCode;",
    ddl.who_anc.WhoAncCode,
    "COPY WhoAncCode FROM who_unique;",
)
conn.execute("".join(who_anc_code_statements))

[<kuzu.query_result.QueryResult at 0x13b9148d0>,
 <kuzu.query_result.QueryResult at 0x13b916190>,
 <kuzu.query_result.QueryResult at 0x13b914810>]

In [19]:
# load EquivalentTo: WHO ANC code -> SNOMED concept / ICD-10 category (3 and 4)
equivalent_to_statements = (
    "DROP TABLE IF EXISTS EquivalentTo;",
    ddl.who_anc.EquivalentTo,
    "COPY EquivalentTo_WhoAncCode_SCT FROM who_sct_equivalent;",
    "COPY EquivalentTo_WhoAncCode_ICD10Category3 FROM who_icd10_3;",
    "COPY EquivalentTo_WhoAncCode_ICD10Category4 FROM who_icd10_4;",
)
conn.execute("".join(equivalent_to_statements))


[<kuzu.query_result.QueryResult at 0x13b916e10>,
 <kuzu.query_result.QueryResult at 0x13b916b90>,
 <kuzu.query_result.QueryResult at 0x13b916f50>,
 <kuzu.query_result.QueryResult at 0x13b916fd0>,
 <kuzu.query_result.QueryResult at 0x13b917050>]

In [21]:
# load RelatedTo: WHO ANC code -> SNOMED concept with "relatedto" equivalence
related_to_statements = (
    "DROP TABLE IF EXISTS RelatedTo;",
    ddl.who_anc.RelatedTo,
    "COPY RelatedTo FROM who_sct_related;",
)
conn.execute("".join(related_to_statements))


[<kuzu.query_result.QueryResult at 0x13b537790>,
 <kuzu.query_result.QueryResult at 0x115e55c10>,
 <kuzu.query_result.QueryResult at 0x11555b450>]

In [None]:
# SNOMED-CT codes with exponent notation
# Each entry is a WHO ANC element (code, display) together with its target
# mapping; the target "code" strings are the keys of the `errors` dict used
# to patch the `who` frame. Kept as reference data: no cell in the visible
# part of the notebook reads `sct_errors`.
sct_errors = [
    {
        "code": "ANC.B5.DE24",
        "display": "Leg cramps",
        "target": [
            {
                "code": "1.56399E+16",
                "display": "Cramp of muscle of bilateral lower limbs (finding)",
                "equivalence": "relatedto",
            }
        ],
    },
    {
        "code": "ANC.B5.DE33",
        "display": "Pain - Leg",
        "target": [
            {
                "code": "1.22475E+16",
                "display": "Pain in bilateral lower legs (finding)",
                "equivalence": "relatedto",
            }
        ],
    },
    {
        "code": "ANC.B7.DE30",
        "display": "Leg cramps",
        "target": [
            {
                "code": "1.56399E+16",
                "display": "Cramp of muscle of bilateral lower limbs (finding)",
                "equivalence": "relatedto",
            }
        ],
    },
    {
        "code": "ANC.B7.DE35",
        "display": "Pain - Leg",
        "target": [
            {
                "code": "1.22475E+16",
                "display": "Pain in bilateral lower legs (finding)",
                "equivalence": "relatedto",
            }
        ],
    },
    {
        "code": "ANC.B7.DE56",
        "display": "Leg cramps",
        "target": [
            {
                "code": "1.56399E+16",
                "display": "Cramp of muscle of bilateral lower limbs (finding)",
                "equivalence": "relatedto",
            }
        ],
    },
    {
        "code": "ANC.B7.DE61",
        "display": "Pain - Leg",
        "target": [
            {
                "code": "1.22475E+16",
                "display": "Pain in bilateral lower legs (finding)",
                "equivalence": "relatedto",
            }
        ],
    },
    {
        "code": "ANC.B8.DE90",
        "display": "Mucopurulent cervicitis",
        "target": [
            {
                "code": "1.07437E+16",
                "display": "Inflammation of cervix in pregnancy (disorder)",
                "equivalence": "relatedto",
            }
        ],
    },
    {
        "code": "ANC.End.17",
        "display": "Preterm Birth",
        "target": [
            {
                "code": "1.07612E+16",
                "display": "Preterm labour with preterm delivery (finding)",
                "equivalence": "equivalent",
            }
        ],
    },
    # NOTE(review): the ANC.B10.DE61 entry appears twice, identically --
    # confirm whether the duplicate is present in the upstream ConceptMap.
    {
        "code": "ANC.B10.DE61",
        "display": "Counselling conducted on healthy eating and keeping physically active",
        "target": [
            {
                "code": "4.41041E+14",
                "display": "Counseling about nutrition (procedure)",
                "equivalence": "equivalent",
            }
        ],
    },
    {
        "code": "ANC.B10.DE61",
        "display": "Counselling conducted on healthy eating and keeping physically active",
        "target": [
            {
                "code": "4.41041E+14",
                "display": "Counseling about nutrition (procedure)",
                "equivalence": "equivalent",
            }
        ],
    },
    {
        "code": "ANC.B10.DE69",
        "display": "Counselling conducted on balanced energy and protein dietary supplementation",
        "target": [
            {
                "code": "4.41041E+14",
                "display": "Counseling about nutrition (procedure)",
                "equivalence": "equivalent",
            }
        ],
    },
]
