In [1]:
# ---
# Notebook: 02_review_ct_mri_procedures.ipynb
# Purpose: Review top 50 procedures containing CT and MRI variants
# ---

import pandas as pd

# --- Load summary CSV ---
df = pd.read_csv("../data/processed/snomed_procedure_summary.csv")
df.columns = df.columns.str.strip()
df['SNOMED Procedure'] = df['SNOMED Procedure'].astype(str).str.strip()

# --- Define CT and MRI keyword patterns ---
ct_keywords = [
    "CT", "Ct", "ct", 
    "Computed Tomography", "computed tomography", 
    "CT scan", "ct scan"
]

mri_keywords = [
    "MRI", "Mri", "mri", 
    "Magnetic Resonance Imaging", "magnetic resonance imaging", 
    "MR scan", "MR ", "Mr scan"
]

# --- Filter function ---
def filter_top_procedures(df, keywords, label, top_n=50):
    pattern = "|".join([fr"\b{kw}\b" for kw in keywords])
    filtered_df = df[df['SNOMED Procedure'].str.contains(pattern, case=False, regex=True)].copy()
    filtered_df = filtered_df.sort_values(by='Count', ascending=False).head(top_n).reset_index(drop=True)
    
    print(f"\n🔍 Top {top_n} procedures for: {label}")
    display(
        filtered_df.style
        .set_caption(f"Top {top_n} {label} Procedures (by Count)")
        .format({"Count": "{:,.0f}", "% of Total": "{:.2f}%"})
        .background_gradient(subset='Count', cmap='Greens')
        .set_table_styles([
            {"selector": "caption", "props": [("caption-side", "top"), ("font-weight", "bold")]}
        ])
    )

# --- Display CT and MRI results ---
filter_top_procedures(df, ct_keywords, "CT")
filter_top_procedures(df, mri_keywords, "MRI")



🔍 Top 50 procedures for: CT


Unnamed: 0,SNOMED Procedure,Count,% of Total
0,Computed tomography of entire head (procedure),9281,2.53%
1,Computed tomography of entire head (procedure) (408754009),5410,1.48%
2,"Computed tomography of thorax, abdomen and pelvis with contrast (procedure)",4687,1.28%
3,Computed tomography of abdomen and pelvis with contrast (procedure),3271,0.89%
4,"Computed tomography of thorax, abdomen and pelvis with contrast (procedure) (433761009)",2705,0.74%
5,Computed tomography of abdomen and pelvis with contrast (procedure) (432370003),2354,0.64%
6,Computed tomography of chest (procedure),2050,0.56%
7,Computed tomography angiography of pulmonary artery with contrast (procedure),2042,0.56%
8,Positron emission tomography with computed tomography fluorodeoxyglucose imaging of whole body (procedure) (443271005),1554,0.42%
9,Computed tomography of chest (procedure) (169069000),1139,0.31%



🔍 Top 50 procedures for: MRI


Unnamed: 0,SNOMED Procedure,Count,% of Total
0,Magnetic resonance imaging of head (procedure),3377,0.92%
1,Magnetic resonance imaging of head (procedure) (241601008),1954,0.53%
2,Magnetic resonance imaging of lumbar and sacral spine (procedure),1550,0.42%
3,Magnetic resonance imaging of lumbar and sacral spine (procedure) (433141005),1443,0.39%
4,Magnetic resonance imaging of head with contrast (procedure) (432874000),845,0.23%
5,Magnetic resonance imaging of head with contrast (procedure),817,0.22%
6,Magnetic resonance imaging of spine (procedure),810,0.22%
7,Magnetic resonance imaging of cervical spine (procedure),805,0.22%
8,Magnetic resonance imaging of cervical spine (procedure) (241646009),683,0.19%
9,Magnetic resonance imaging of internal auditory meatus (procedure),557,0.15%


In [2]:
import re, pandas as pd

df = pd.read_csv("../data/processed/snomed_procedure_summary.csv")

# Step A – clean duplicates
df['proc_clean'] = (
    df['SNOMED Procedure']
    .str.replace(r"\s*\(\d+\)$", "", regex=True)
    .str.strip()
    .str.lower()
)

# Step B – CT filter without PET/SPECT/CBCT
ct_pattern   = r"\b(ct|computed tomography)\b"
ct_excl      = r"(pet|positron emission|single photon|cone beam)"
df_ct        = df[df['proc_clean'].str.contains(ct_pattern) &
                  ~df['proc_clean'].str.contains(ct_excl)]

# Step C – MRI filter
mri_pattern  = r"\b(mri|magnetic resonance)\b"
df_mri       = df[df['proc_clean'].str.contains(mri_pattern)]

# Step D – collapse duplicates and show head-only tables
for label, sub in [("CT", df_ct), ("MRI", df_mri)]:
    sub2 = (sub.groupby('proc_clean', as_index=False)
                 .agg({'Count':'sum', '% of Total':'sum'})
                 .sort_values('Count', ascending=False)
                 .head(50))
    display(sub2.style.set_caption(f"Top 50 {label} (de-duplicated)"))


  df_ct        = df[df['proc_clean'].str.contains(ct_pattern) &
  ~df['proc_clean'].str.contains(ct_excl)]
  df_mri       = df[df['proc_clean'].str.contains(mri_pattern)]


Unnamed: 0,proc_clean,Count,% of Total
64,computed tomography of entire head (procedure),14691,4.01
167,"computed tomography of thorax, abdomen and pelvis with contrast (procedure)",7392,2.02
42,computed tomography of abdomen and pelvis with contrast (procedure),5625,1.53
57,computed tomography of chest (procedure),3189,0.87
24,computed tomography angiography of pulmonary artery with contrast (procedure),2324,0.64
16,computed tomography angiography of coronary arteries (procedure),1541,0.42
56,computed tomography of cervical spine (procedure),1539,0.41
165,computed tomography of thorax with contrast (procedure),1342,0.36
204,low dose computed tomography of thorax (procedure),1315,0.36
59,"computed tomography of chest, abdomen and pelvis (procedure)",1203,0.33


Unnamed: 0,proc_clean,Count,% of Total
104,magnetic resonance imaging of head (procedure),5331,1.45
156,magnetic resonance imaging of lumbar and sacral spine (procedure),2993,0.81
107,magnetic resonance imaging of head with contrast (procedure),1662,0.45
78,magnetic resonance imaging of cervical spine (procedure),1488,0.41
230,magnetic resonance imaging of spine (procedure),1346,0.37
114,magnetic resonance imaging of internal auditory meatus (procedure),1094,0.3
19,magnetic resonance cholangiopancreatography (procedure),894,0.24
206,magnetic resonance imaging of right knee (procedure),782,0.21
136,magnetic resonance imaging of left knee (procedure),743,0.2
187,magnetic resonance imaging of prostate (procedure),727,0.19
