In [3]:
from pathlib import Path

import altair as alt
import kuzu
import polars as pl

from ddl import snomed


snomed_data = Path("./data/external/snomed-ips/Snapshot/Terminology/")
kuzu_path = Path("./data/internal/kuzu-db/")
if kuzu_path.exists:
    for file in kuzu_path.glob("*"):
        file.unlink(file)
    kuzu_path.rmdir()
    kuzu_path.mkdir()

db = kuzu.Database(kuzu_path)
conn = kuzu.Connection(db)


def read_snomed(path: Path) -> pl.DataFrame:
    return pl.read_csv(path, separator="\t").with_columns(
        pl.col("effectiveTime").cast(pl.String).str.to_date("%Y%m%d"),
        pl.col("active").cast(pl.Boolean),
    )


In [4]:
Description = read_snomed(
    snomed_data / "sct2_Description_IPSSnapshot-en_IPST_20240701.txt"
)

# Two type of descriptions:
# 900000000000003001 | Fully qualified name
# 900000000000013009 | Synonyms
Description.select(pl.col("typeId").value_counts()).unnest("typeId")

typeId,count
i64,u32
900000000000013009,40417
900000000000003001,19697


In [20]:
Concept = (
    read_snomed(snomed_data / "sct2_Concept_IPSSnapshot_IPST_20240701.txt")
    .join(
        Description.filter(pl.col("typeId") == 900000000000003001).select(
            pl.col("conceptId"), pl.col("term").alias("fullQualifiedName")
        ),
        left_on="id",
        right_on="conceptId",
    )
    .join(
        Description.filter(pl.col("typeId") == 900000000000013009)
        .select(pl.col("conceptId"), pl.col("term").alias("synonyms"))
        .group_by(pl.col("conceptId"))
        .agg("synonyms"),
        left_on="id",
        right_on="conceptId",
    )
)
Concept

id,effectiveTime,active,moduleId,definitionStatusId,fullQualifiedName,synonyms
i64,date,bool,i64,i64,str,list[str]
38464002,2024-07-01,true,999991001000101,900000000000074008,"""Structure of median nerve (bod…","[""Median nerve"", ""Structure of median nerve""]"
256317002,2024-07-01,true,999991001000101,900000000000074008,"""Grapes (substance)""","[""Grapes"", ""Grape""]"
12461007,2024-07-01,true,999991001000101,900000000000074008,"""Structure of pelvic diaphragm …","[""Pelvic diaphragm"", ""Structure of pelvic diaphragm""]"
71049009,2024-07-01,true,999991001000101,900000000000074008,"""Structure of deep venous syste…","[""Deep venous system of upper extremity"", ""Deep veins of upper extremity"", … ""Structure of deep venous system of upper limb""]"
48409008,2024-07-01,true,999991001000101,900000000000074008,"""Respiratory crackles (finding)""","[""Respiratory crackles"", ""Rales"", ""Respiratory crepitations""]"
…,…,…,…,…,…,…
127021009,2024-07-01,true,999991001000101,900000000000073002,"""Neoplasm of adrenal gland (dis…","[""Neoplasm of adrenal gland"", ""Adrenal tumour"", … ""Tumor of adrenal gland""]"
235810006,2024-07-01,true,999991001000101,900000000000074008,"""Bolus obstruction of intestine…","[""Bolus obstruction of intestine""]"
297220005,2024-07-01,true,999991001000101,900000000000074008,"""Structure of bone marrow of il…","[""Iliac crest marrow structure"", ""Iliac crest bone marrow"", … ""Structure of bone marrow of iliac crest""]"
373373000,2024-07-01,true,999991001000101,900000000000074008,"""Poorly differentiated histolog…","[""Poorly differentiated histological grade finding""]"


In [19]:
Description.filter(pl.col("typeId") == 900000000000013009).select(
    pl.col("conceptId"), pl.col("term").alias("synonyms")
).group_by(pl.col("conceptId")).agg("synonyms")


conceptId,synonyms
i64,list[str]
373623009,"[""Osteoarthritis of glenohumeral joint"", ""Degenerative joint disease of glenohumeral joint""]"
161445009,"[""H/O: diabetes mellitus"", ""Pre-existing diabetes mellitus"", ""History of diabetes mellitus""]"
280106006,"[""Entire right fallopian tube""]"
386087005,"[""Collection of aspirated sputum""]"
69195002,"[""Degeneration of cervical intervertebral disc""]"
…,…
415859000,"[""Yersinia enterocolitica serogroup O:20""]"
112638000,"[""Displacement"", ""Deviation""]"
16213931000119103,"[""Specimen from dome of urinary bladder obtained by biopsy""]"
782044000,"[""Pegaspargase 750 unit/mL solution for injection""]"


In [None]:
# load Concepts

conn.execute(snomed["Concept"] + "COPY concept FROM Concept;")


# load Relationship, note we need to change ordering of columns for loading in Kuzu
Relationship = read_snomed(
    snomed_data / "sct2_Relationship_IPSSnapshot_IPST_20240701.txt"
).select(
    pl.col(
        [
            "sourceId",
            "destinationId",
            "id",
            "effectiveTime",
            "active",
            "moduleId",
            "relationshipGroup",
            "typeId",
            "characteristicTypeId",
            "modifierId",
        ]
    )
)

conn.execute(
    "DROP TABLE IF EXISTS relationship;"
    + snomed["Relationship"]
    + "COPY Relationship FROM Relationship;"
)

RuntimeError: Binder exception: Concept already exists in catalog.

In [8]:
Concept

id,effectiveTime,active,moduleId,definitionStatusId
i64,date,bool,i64,i64
109006,2024-07-01,true,999991001000101,900000000000074008
111002,2024-07-01,true,999991001000101,900000000000074008
140004,2024-07-01,true,999991001000101,900000000000073002
219006,2024-07-01,true,999991001000101,900000000000074008
252000,2024-07-01,true,999991001000101,900000000000074008
…,…,…,…,…
900000000000536009,2024-07-01,true,999991001000101,900000000000074008
900000000000537000,2024-07-01,true,999991001000101,900000000000074008
900000000000548007,2024-07-01,true,999991001000101,900000000000074008
900000000000549004,2024-07-01,true,999991001000101,900000000000074008


typeId,count
i64,u32
900000000000003001,19697
900000000000013009,40417


In [None]:
full_name_id = 900000000000003001

Description_fullname = Description.filter(pl.col("typeId") == full_name_id)
Description_fullname

typeId,count
i64,u32
900000000000003001,19697
900000000000013009,40417


In [None]:
# inspect frequency of each type of relationship out of 66,017 relationships
print(df.shape)

# 116680003 | Is A occurs 32,111 times i.e accounts for half
# 363698007 | Finding site 5,497
# 116676008 | Associated morphology 3,818
type_count = df.select(pl.col("typeId").value_counts()).unnest("typeId")
type_count.plot.bar(
    alt.X("count:Q").scale(type="symlog"), y=alt.Y("typeId:O").sort("-x")
)

(66017, 10)


In [11]:
type_count.select(pl.col("count")).sum()

count
u32
66017


In [12]:
df.shape

(66017, 10)

In [None]:
ddl_concept = """
DROP TABLE IF EXISTS concept;
CREATE NODE TABLE concept(
    id INT64,
    effectiveTime DATE,
    active BOOLEAN,
    moduleId INT64,
    definitionStatusId INT64,
    PRIMARY KEY (id)
);
COPY concept FROM df;
"""

conn.execute(ddl_concept)

[<kuzu.query_result.QueryResult at 0x110759f10>,
 <kuzu.query_result.QueryResult at 0x10ee63290>,
 <kuzu.query_result.QueryResult at 0x10ee60b10>]

In [None]:
ddl_relationship = """
DROP TABLE IF EXISTS relationship;
CREATE NODE TABLE relationship(
id INT64,
effectiveTime DATE,
active BOOLEAN,
moduleId INT64,
sourceId INT64,
destinationId INT64,
relationshipGroup INT64,
typeId INT64,
characteristicTypeId INT64,
modifierId INT64,
"""