In [1]:
from pathlib import Path
from shutil import rmtree

import altair as alt
import kuzu
import polars as pl

from ddl import sct


# Input directory with the SNOMED snapshot files and output directory for the
# on-disk Kuzu database.
snomed_data = Path("./data/external/snomed-ips/Snapshot/Terminology/")
kuzu_path = Path("./data/internal/kuzu-db/")

# Start every run from an empty database so the notebook survives
# "Restart & Run All". `exists` has to be *called*: the bare bound method is
# always truthy, which made `rmtree` raise FileNotFoundError whenever the
# database directory was not there yet.
if kuzu_path.exists():
    rmtree(kuzu_path)

db = kuzu.Database(kuzu_path)
conn = kuzu.Connection(db)


def read_snomed(path: Path) -> pl.DataFrame:
    """Load a tab-separated SNOMED file into a polars DataFrame.

    Two columns are normalised after reading:

    * ``effectiveTime`` is cast to a string and parsed as a ``%Y%m%d`` date.
    * ``active`` is cast to a boolean.

    Every other column keeps the dtype inferred by ``pl.read_csv``.
    """
    raw = pl.read_csv(path, separator="\t")
    effective_time = pl.col("effectiveTime").cast(pl.String).str.to_date("%Y%m%d")
    is_active = pl.col("active").cast(pl.Boolean)
    return raw.with_columns(effective_time, is_active)


In [2]:
Description = read_snomed(
    snomed_data.joinpath("sct2_Description_IPSSnapshot-en_IPST_20240701.txt")
)

# There are two types of description:
#   900000000000003001 | Fully qualified name
#   900000000000013009 | Synonyms
(
    Description
    .select(pl.col("typeId").value_counts())
    .unnest("typeId")
)

typeId,count
i64,u32
900000000000003001,19697
900000000000013009,40417


In [3]:
# Load Concept: attach the fully qualified name and the list of synonyms to
# every concept row, then copy the result into the Kuzu `concept` table.
fullname = (
    Description
    .filter(pl.col("typeId") == 900000000000003001)
    .select("conceptId", pl.col("term").alias("fullQualifiedName"))
)
synonyms = (
    Description
    .filter(pl.col("typeId") == 900000000000013009)
    .select("conceptId", pl.col("term").alias("synonyms"))
    .group_by("conceptId")
    .agg(pl.col("synonyms"))  # one list of synonym terms per concept
)

join_concept = {"how": "left", "left_on": "id", "right_on": "conceptId"}
Concept = read_snomed(
    snomed_data / "sct2_Concept_IPSSnapshot_IPST_20240701.txt"
).join(fullname, **join_concept).join(synonyms, **join_concept)

# The query refers to the DataFrame by its Python variable name, so the
# variable must stay named `Concept`.
conn.execute(sct.concept + "COPY concept FROM Concept;")

[<kuzu.query_result.QueryResult at 0x1373f5010>,
 <kuzu.query_result.QueryResult at 0x1373f5090>]

In [4]:
# Load Relationship. The columns are reordered so that `sourceId` and
# `destinationId` come first, as needed for loading into Kuzu. The name and
# synonyms of each relationship type are then joined on via `typeId`.
join_relationship = {"how": "left", "left_on": "typeId", "right_on": "conceptId"}
relationship_columns = [
    "sourceId",
    "destinationId",
    "id",
    "effectiveTime",
    "active",
    "moduleId",
    "relationshipGroup",
    "typeId",
    "characteristicTypeId",
    "modifierId",
]
Relationship = (
    read_snomed(snomed_data / "sct2_Relationship_IPSSnapshot_IPST_20240701.txt")
    .select(relationship_columns)
    .join(fullname, **join_relationship)
    .join(synonyms, **join_relationship)
)

In [5]:
# Inspect how often each relationship type occurs among the 66,017 relationships.
print(Relationship.shape)

# 116680003 | Is A                  : 32,111 times, i.e. about half of all rows
# 363698007 | Finding site          :  5,497
# 116676008 | Associated morphology :  3,818
type_count = (
    Relationship
    .select(pl.col("typeId").value_counts())
    .unnest("typeId")
    .sort("count", descending=True)
)

# Symlog x-scale keeps the rare types visible next to the dominant "Is A".
type_count.plot.bar(
    alt.X("count:Q").scale(type="symlog"),
    y=alt.Y("typeId:O").sort("-x"),
)

(66017, 12)


In [6]:
# Create one Kuzu relationship table per entry in `sct.top10_relationships`
# and fill it with the rows of that relationship type.
#
# The loop variable is called `type_id` rather than `id`: at notebook top level
# `id` would rebind the builtin `id()` for every later cell.
for name, type_id in sct.top10_relationships:
    # The COPY statement refers to this DataFrame by its Python variable name,
    # so the variable must stay named `Relationship_`.
    Relationship_ = Relationship.filter(pl.col("typeId") == type_id)
    conn.execute(
        f"DROP TABLE IF EXISTS {name};"
        + sct.relationship(name)
        + f"COPY {name} FROM Relationship_;"
    )
