In [2]:
from IPython.display import display, Markdown, HTML
import pandas as pd

In [3]:
icdo = (
    pd.read_csv("../resources/Copy-of-ICD-O-3.2_MFin_17042019_web.csv", skiprows=1, dtype=str)
    .rename(columns=lambda s: s.replace(" ", "_").replace('3.2', ''))
    .assign(Code_references=lambda df: df.Code_reference.str.strip(' ').str.lstrip('(').str.rstrip(')').str.split(', '))
)
icdo.head()

Unnamed: 0,ICDO,Level,Term,Code_reference,obs,See_also,See_note,Includes,Excludes,Other_text,Code_references
0,,1,MORPHOLOGY,,,,,,,,
1,800,2,"Neoplasms, NOS",,,,,,,,
2,8000/0,Preferred,"Neoplasm, benign",,,,,,,,
3,8000/0,Synonym,"Tumor, benign",,,,,,,,
4,8000/0,Synonym,"Unclassified tumor, benign",,,,,,,,


In [4]:
counts = icdo.Level.value_counts().to_frame()
counts.loc["TOTAL"] = counts["count"].sum()
counts

Unnamed: 0_level_0,count
Level,Unnamed: 1_level_1
Synonym,1188
Preferred,1143
Related,558
2,49
3,10
1,1
TOTAL,2949


In [5]:
mrconso = pd.read_csv("../mrconso.csv", dtype=str)
mrconso

Unnamed: 0,cui,sab,code,str,lat,tty
0,C3653992,ATC,A,ALIMENTARY TRACT AND METABOLISM,ENG,PT
1,C3653992,ATC,A,ALIMENTARY TRACT AND METABOLISM DRUGS,ENG,RXN_PT
2,C3653755,ATC,A01,STOMATOLOGICAL PREPARATIONS,ENG,PT
3,C3653755,ATC,A01A,STOMATOLOGICAL PREPARATIONS,ENG,PT
4,C3653508,ATC,A01AA,Caries prophylactic agents,ENG,PT
...,...,...,...,...,...,...
7330059,C5884998,SRC,V-SNOMEDCT_US_2024_03_01,SNOMEDCT_US_2024_03_01,ENG,VAB
7330060,C5884998,SRC,V-SNOMEDCT_US_2024_03_01,"US Edition of SNOMED CT, 2024_03_01",ENG,VPT
7330061,C0700119,SRC,V-SRC,Metathesaurus Source Terminology Names,ENG,RPT
7330062,C0700119,SRC,V-SRC,Source Terminology Names (UMLS),ENG,SSN


In [6]:
refs = icdo.Code_references.explode().dropna().str.removesuffix('._').drop_duplicates().to_frame('code')

for sab in ['ICD10', 'ICD10AM', 'ICD10CM']:
    refs = (
        pd.merge(refs, mrconso[mrconso.sab == sab][['code', 'cui']], on='code', how='left')
        .rename(columns={'cui': 'cui_'+sab})
    )

refs = (
    refs
    .groupby('code')
    .agg(set)
    .reset_index()
)

refs

Unnamed: 0,code,cui_ICD10,cui_ICD10AM,cui_ICD10CM
0,C07,{C0747273},{C0747273},{C0747273}
1,C08,{C0153362},{C0153362},"{C0153362, C4290059}"
2,C11,{C0153392},{C0153392},{C0153392}
3,C15,{C0546837},{C0546837},{C0546837}
4,C16,{C0024623},{C0024623},{C0024623}
...,...,...,...,...
79,C75.2,{C0496843},{C0496843},{C0496843}
80,C75.3,{C0153655},{C0153655},{C0153655}
81,C75.4,{C0153656},{C0153656},{C0153656}
82,C75.5,{C0438413},{C0438413},{C0438413}


In [21]:
icdo_pref = (
    icdo[icdo.Level == "Preferred"]
    [['ICDO', 'Term', 'obs', 'Code_references']]
    .fillna({'obs': ''})
)

def acc_dict2(vals):
    res = {}
    for val in vals:
        [cui, code_term] = val.split('|')
        if code_term not in res:
            res[code_term] = set()
        res[code_term].add(cui)
    return res

def acc_dict3(vals):
    res = {}
    for val in vals:
        [sab, cui, code] = val.split('|')
        if sab not in res:
            res[sab] = {}
        if code not in res[sab]:
            res[sab][code] = set()
        res[sab][code].add(cui)
    return res

print('Exact term match')
icdo_pref_term_exact = (
    pd.merge(
        icdo_pref,
        mrconso[mrconso.code != "NOCODE"],
        left_on='Term',
        right_on='str',
        how='left',
    )
    .assign(sab_cui_code=lambda df: df.sab + '|' + df.cui + '|' + df.code)
    .groupby('ICDO')
    .sab_cui_code.agg(lambda s: acc_dict3(v for v in s if not pd.isna(v)))
)
print(len(icdo_pref_term_exact.index.drop_duplicates()))

print("Term match without NOS")
icdo_pref_term_no_nos = (
    pd.merge(
        icdo_pref.assign(Term_no_NOS=lambda df: df.Term.str.removesuffix(', NOS')),
        mrconso[mrconso.code != "NOCODE"],
        left_on='Term_no_NOS',
        right_on='str',
        how='left',
    )
    .assign(sab_cui_code=lambda df: df.sab + '|' + df.cui + '|' + df.code)
    .groupby('ICDO')
    .sab_cui_code.agg(lambda s: acc_dict3(v for v in s if not pd.isna(v)))
)
print(len(icdo_pref_term_no_nos.index.drop_duplicates()))

print('From code reference (ICD10CM)')
icdo_pref_code_ref = (
    pd.merge(
        icdo_pref.explode('Code_references').assign(Code_reference=lambda df: df.Code_references.fillna('').str.removesuffix('._')),
        mrconso[mrconso.sab == 'ICD10CM'][['code', 'cui', 'str']],
        left_on='Code_reference',
        right_on='code',
        how='left')
    .assign(cui_code=lambda df: df.cui + '|' + df.code + ':' + df.str)
    .groupby('ICDO')
    .cui_code.agg(lambda s: acc_dict2(v for v in s if not pd.isna(v)))
)
print(len(icdo_pref_code_ref.index.drop_duplicates()))

print('From code reference without suffix ._ (ICD10CM)')
icdo_pref_code_ref_no_wildcard = (
    pd.merge(
        icdo_pref.explode('Code_references').assign(Code_reference=lambda df: df.Code_references.fillna('').str.removesuffix('._')),
        mrconso[mrconso.sab == 'ICD10CM'][['code', 'cui', 'str']],
        left_on='Code_reference',
        right_on='code',
        how='left')
    .assign(cui_code=lambda df: df.cui + '|' + df.code + ':' + df.str)
    .groupby('ICDO')
    .cui_code.agg(lambda s: acc_dict2(v for v in s if not pd.isna(v)))
)
print(len(icdo_pref_code_ref_no_wildcard.index.drop_duplicates()))


print('ICD10AM with code prefix M')
icdo_icd10am = (
    pd.merge(
        icdo_pref.assign(code=lambda df: 'M' + df.ICDO),
        mrconso[mrconso.sab == 'ICD10AM'][['code', 'cui', 'str']],
        on='code',
        how='inner')
    .assign(cui_code_str=lambda df: df.cui + '|' + df.code + ':' + df.str)
    .groupby('ICDO')
    .cui_code_str.agg(lambda s: acc_dict2(v for v in s if not pd.isna(v)))
)
print(len(icdo_icd10am.index.drop_duplicates()))


Exact term match
1143
Term match without NOS
1143
From code reference (ICD10CM)
1143
From code reference without suffix ._ (ICD10CM)
1143
ICD10AM with code prefix M
709


In [None]:
icdo_manual = (
    pd.read_csv("ICDO3-manual.csv")
    .set_index('code')
    .cui
)

for ix, row in icdo_pref.iterrows():
    cui = None
    match = None
    umls_sab = None
    umls_code = None
    cands = []

    # Manual mapping
    manual_cui = icdo_manual.get(row.ICDO)
    if manual_cui:
        match = 'manual'
        cui = manual_cui
        cands.append(f'manual {cui1}')

    # Exact term match
    term_exact = icdo_pref_term_exact[row.ICDO]
    for sab in 'ICD10AM MTH SNOMEDCT_US ICD10CM ICD10 ICD10 MSH ICD10DUT MTH ICPC2P DMDICD10 HPO ORPHANET SNMI ICD10AE ICD9CM MDR SNM MEDLINEPLUS MTHICD9 RCDLNC '.split():
        try:
            cuis_by_code = term_exact[sab]
            for code, cuis in term_exact[sab].items():
                for cui1 in cuis:
                    cands.append(f'exact term {cui1} ({sab}:{code})')
                    if cui is None:
                        match = f'exact term'
                        cui = cui1
                        umls_sab = sab
                        umls_code = code
        except KeyError:
            pass

    # ICD10AM code adding prefix M
    icd10am = icdo_icd10am.get(row.ICDO)
    if icd10am:
        for code_term, cuis in icd10am.items():
            code, term = code_term.split(":")
            for cui1 in cuis:
                cands.append(f'manual {cui1}')
                if cui is None:
                    match = f'ICD10AM without M'
                    cui = cui1
                    umls_sab = 'ICD10AM'
                    umls_code = code

    # code reference
    for [code_term, cuis] in icdo_pref_code_ref.get(row.ICDO, {}).items():
        code, term = code_term.split(':')
        for cui1 in cuis:
            cands.append(f'code reference {cui1} ({code_term})')
            if cui is None:
                match = f'code reference'
                cui = cui1
                umls_sab = 'ICD10'
                umls_code = code

    # code reference without suffix ._
    for [code_term, cuis] in icdo_pref_code_ref_no_wildcard.get(row.ICDO, {}).items():
        for cui1 in cuis:
            cands.append(f'code reference no wildcard {cui1} ({code_term})')

    # term without suffix NOS
    term_no_nos = icdo_pref_term_no_nos[row.ICDO]
    for sab in 'ICD10AM MTH SNOMEDCT_US ICD10CM ICD10 ICD10 MSH ICD10DUT MTH ICPC2P DMDICD10 HPO ORPHANET SNMI ICD10AE ICD9CM MDR SNM MEDLINEPLUS MTHICD9 RCDLNC '.split():
        try:
            cuis_by_code = term_no_nos[sab]
            for code, cuis in term_no_nos[sab].items():
                for cui1 in cuis:
                    cands.append(f'term without NOS {cui1} ({sab}:{code})')
        except KeyError:
            pass
    
    icdo_pref.loc[ix, 'cui'] = cui or pd.NA
    icdo_pref.loc[ix, 'match'] = match or pd.NA
    icdo_pref.loc[ix, 'umls_sab'] = umls_sab or pd.NA
    icdo_pref.loc[ix, 'umls_code'] = umls_code or pd.NA
    icdo_pref.loc[ix, 'rel'] = 'EQ'
    icdo_pref.loc[ix, 'candidates'] = ', '.join(cands) or pd.NA

display(HTML(
    pd.DataFrame([
        ("CUI", (~icdo_pref.cui.isna()).sum()),
        ("No CUI", icdo_pref.cui.isna().sum()),
        ("No candidate", (icdo_pref.cui.isna() & icdo_pref.candidates.isna()).sum()),
    ]).set_index(0)[1].to_frame('count').to_html()
))

icdo_pref[icdo_pref.cui.isna()].to_csv("ICDO3-missing.csv", index=False)

(
    icdo_pref[~icdo_pref.cui.isna()]
    .drop('Code_references', axis=1)
    .rename(columns={'ICDO': 'code', 'Term': 'term'})
    [['code','term','rel','umls_code','umls_sab','cui','match']]
    .to_csv("ICDO3.csv", index=False)
)

icdo_pref

Unnamed: 0_level_0,count
0,Unnamed: 1_level_1
CUI,1075
No CUI,68
No candidate,44


Unnamed: 0,ICDO,Term,obs,Code_references,cui,criterion,umls_sab,umls_code,rel,candidates
2,8000/0,"Neoplasm, benign",,,C0086692,exact term,ICD10AM,M8000/0,EQ,"exact term C0086692 (ICD10AM:M8000/0), exact t..."
5,8000/1,"Neoplasm, uncertain whether benign or malignant",,,C0677041,exact term,ICD10AM,M8000/1,EQ,"exact term C0677041 (ICD10AM:M8000/1), exact t..."
10,8000/3,"Neoplasm, malignant",,,C0006826,exact term,ICD10AM,M8000/3,EQ,"exact term C0006826 (ICD10AM:M8000/3), exact t..."
16,8000/6,"Neoplasm, metastatic",,,C2939420,exact term,ICD10AM,M8000/6,EQ,"exact term C2939420 (ICD10AM:M8000/6), exact t..."
21,8000/9,"Neoplasm, malignant, uncertain whether primary...",,,C0334224,exact term,ICD10AM,M8000/9,EQ,"exact term C0334224 (ICD10AM:M8000/9), exact t..."
...,...,...,...,...,...,...,...,...,...,...
2936,9985/3,Myelodysplastic syndrome with multilineage dys...,,,C0796466,exact term,SNOMEDCT_US,128836009,EQ,"exact term C0796466 (SNOMEDCT_US:128836009), e..."
2939,9986/3,Myelodysplastic syndrome with isolated del (5q),,,C1292779,exact term,SNOMEDCT_US,128837000,EQ,"exact term C1292779 (SNOMEDCT_US:128837000), t..."
2941,9987/3,"Therapy-related myelodysplastic syndrome, NOS",,,,,,,EQ,term without NOS C1292780 (SNOMEDCT_US:1288380...
2944,9989/3,"Myelodysplastic syndrome, NOS",,,C3463824,exact term,SNOMEDCT_US,4227006,EQ,"exact term C3463824 (SNOMEDCT_US:4227006), exa..."
