# clean data

In [29]:
import re

def clean_text(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Newlines, carriage returns and tabs are treated like any other
    whitespace, and leading/trailing whitespace is removed.

    Args:
        text: The value to normalise.

    Returns:
        The normalised string, or ``text`` itself, unchanged, when it is not
        a ``str``.
    """
    if isinstance(text, str):
        # `\s+` already matches "\n", "\r" and "\t", so one substitution
        # handles line breaks, tabs and repeated spaces together.
        return re.sub(r"\s+", " ", text).strip()
    return text

In [34]:
import pandas as pd 
import swifter

# Load the MIMIC-IV discharge notes. Only the free-text column is used below,
# so `usecols` skips the metadata columns (note_id, subject_id, hadm_id,
# note_type, note_seq, charttime, storetime) instead of parsing and then
# dropping them.
discharge = pd.read_csv(
    "/data2/simon/data/physionet.org/files/mimic-iv-note/2.2/note/discharge.csv.gz",
    usecols=["text"],
)
# Whitespace-normalise every note into `clinical_text`, then discard the raw column.
discharge["clinical_text"] = discharge["text"].swifter.apply(clean_text)
discharge = discharge.drop(columns="text")
discharge

Pandas Apply: 100%|██████████| 331793/331793 [02:40<00:00, 2067.34it/s]


Unnamed: 0,clinical_text
0,Name: ___ Unit No: ___ Admission Date: ___ Dis...
1,Name: ___ Unit No: ___ Admission Date: ___ Dis...
2,Name: ___ Unit No: ___ Admission Date: ___ Dis...
3,Name: ___ Unit No: ___ Admission Date: ___ Dis...
4,Name: ___ Unit No: ___ Admission Date: ___ Dis...
...,...
331788,Name: ___ Unit No: ___ Admission Date: ___ Dis...
331789,Name: ___ Unit No: ___ Admission Date: ___ Dis...
331790,Name: ___ Unit No: ___ Admission Date: ___ Dis...
331791,Name: ___ Unit No: ___ Admission Date: ___ Dis...


In [38]:
# Mean number of whitespace-separated tokens per discharge note.
avg_token_len = (
    discharge["clinical_text"]
    .swifter.apply(lambda note: len(note.split()))
    .mean()
)
print(f"Average token count: {avg_token_len:.2f}")

Average token count: 1600.27


In [35]:
# Total footprint of `discharge` in bytes (deep=True asks pandas to inspect
# object-dtype contents), then scaled by 1024**2; the label prints this as "MB".
mem_bytes = discharge.memory_usage(deep=True).sum()
mem_mb = mem_bytes / (1 << 20)
print(f"DataFrame memory usage: {mem_mb:.2f} MB")

DataFrame memory usage: 3241.58 MB


In [36]:
# Load the MIMIC-IV radiology notes. Only the free-text column is used below,
# so `usecols` skips the metadata columns (note_id, subject_id, hadm_id,
# note_type, note_seq, charttime, storetime) instead of parsing and then
# dropping them.
radiology = pd.read_csv(
    "/data2/simon/data/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv.gz",
    usecols=["text"],
)
# Whitespace-normalise every note into `clinical_text`, then discard the raw column.
radiology["clinical_text"] = radiology["text"].swifter.apply(clean_text)
radiology = radiology.drop(columns="text")
radiology

Pandas Apply: 100%|██████████| 2321355/2321355 [02:06<00:00, 18290.93it/s]


Unnamed: 0,clinical_text
0,EXAMINATION: CHEST (PA AND LAT) INDICATION: __...
1,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE O...
2,"INDICATION: ___ HCV cirrhosis c/b ascites, hiv..."
3,EXAMINATION: Ultrasound-guided paracentesis. I...
4,EXAMINATION: Paracentesis INDICATION: ___ year...
...,...
2321350,"HISTORY: ___, with left occipital bleeding. As..."
2321351,INDICATION: ___ female intubated for head blee...
2321352,HISTORY: ___ woman with left occipital hemorrh...
2321353,PORTABLE CHEST OF ___ COMPARISON: ___ radiogra...


In [40]:
# Mean number of whitespace-separated tokens per radiology note.
avg_token_len = (
    radiology["clinical_text"]
    .swifter.apply(lambda note: len(note.split()))
    .mean()
)
print(f"Average token count: {avg_token_len:.2f}")

Pandas Apply:   0%|          | 0/2321355 [00:00<?, ?it/s]

Pandas Apply: 100%|██████████| 2321355/2321355 [00:22<00:00, 102076.62it/s]

Average token count: 168.74





In [37]:
# Total footprint of `radiology` in bytes (deep=True asks pandas to inspect
# object-dtype contents), then scaled by 1024**2; the label prints this as "MB".
mem_bytes = radiology.memory_usage(deep=True).sum()
mem_mb = mem_bytes / (1 << 20)
print(f"DataFrame memory usage: {mem_mb:.2f} MB")

DataFrame memory usage: 2642.46 MB


In [23]:
import gzip
from lxml import etree
import pandas as pd
import glob

def extract_article_data(article):
    """Build a ``{"pmid", "clinical_text"}`` record from one article element.

    Args:
        article: An ElementTree-style element (anything offering ``find``,
            ``findall`` and ``itertext``), searched for ``PMID``,
            ``ArticleTitle`` and ``Abstract/AbstractText`` descendants.

    Returns:
        dict with two keys:
            ``"pmid"``: stripped text of the first ``PMID`` descendant, or
                ``None`` when it is missing or empty.
            ``"clinical_text"``: the title followed by all abstract sections,
                joined with single spaces; just the title or just the
                abstract when only one exists; ``None`` when neither does.
    """
    def full_text(elem):
        # `elem.text` holds only the text *before* the first child element, so
        # a title or abstract containing inline markup (<i>, <sub>, <sup>, ...)
        # would be cut off there. `itertext()` walks the whole subtree.
        if elem is None:
            return None
        text = "".join(elem.itertext()).strip()
        return text or None

    pmid = full_text(article.find(".//PMID"))
    title = full_text(article.find(".//ArticleTitle"))

    # Structured abstracts carry several <AbstractText> sections; keep the
    # non-empty ones in document order.
    sections = (full_text(part) for part in article.findall(".//Abstract/AbstractText"))
    abstract = " ".join(section for section in sections if section) or None

    # Concatenate title + abstract as clinical text, skipping whichever is missing.
    clinical_text = " ".join(piece for piece in (title, abstract) if piece) or None

    return {
        "pmid": pmid,
        "clinical_text": clinical_text
    }

def parse_pubmed_xml(xml_file):
    """Stream one gzip-compressed PubMed XML file, yielding a record per article.

    Args:
        xml_file: Path to a ``.xml.gz`` file containing ``PubmedArticle`` elements.

    Yields:
        The dict returned by ``extract_article_data`` for each ``PubmedArticle``
        element, in document order.
    """
    with gzip.open(xml_file, 'rb') as f:
        context = etree.iterparse(f, events=("end",), tag='PubmedArticle')
        for _, elem in context:
            yield extract_article_data(elem)
            # Release the article's own subtree ...
            elem.clear()
            # ... and detach the already-processed siblings before it. clear()
            # alone leaves every emptied element hanging off the root, so
            # memory would still grow with the number of articles in the file.
            parent = elem.getparent()
            if parent is not None:
                while elem.getprevious() is not None:
                    del parent[0]

# Collect entries into a DataFrame
records = []
# glob returns matches in arbitrary order; sorting keeps the row order
# reproducible once the pattern is widened to several files.
for file in sorted(glob.glob("../data/pubmed/pubmed25n0001.xml.gz")):
    print(f"Parsing {file}")
    for record in parse_pubmed_xml(file):
        if record["clinical_text"]:  # skip empty ones
            records.append(record)

# Name the columns explicitly so the frame keeps its schema even when the glob
# matched nothing (a relative path is easy to get wrong); otherwise the empty
# frame has no columns and later `df["clinical_text"]` lookups raise KeyError.
df = pd.DataFrame(records, columns=["pmid", "clinical_text"])
print(df.head())


Parsing ../data/pubmed/pubmed25n0001.xml.gz
  pmid                                      clinical_text
0    1  Formate assay in body fluids: application in m...
1    2  Delineation of the intimate details of the bac...
2    3  Metal substitutions incarbonic anhydrase: a ha...
3    4  Effect of chloroquine on cultured fibroblasts:...
4    5  Atomic models for the polypeptide backbones of...


In [41]:
# Mean number of whitespace-separated tokens per PubMed title+abstract record.
avg_token_len = (
    df["clinical_text"]
    .swifter.apply(lambda entry: len(entry.split()))
    .mean()
)
print(f"Average token count: {avg_token_len:.2f}")

Pandas Apply: 100%|██████████| 30000/30000 [00:00<00:00, 169119.02it/s]

Average token count: 85.30





In [42]:
# Total footprint of `df` in bytes (deep=True asks pandas to inspect
# object-dtype contents), then scaled by 1024**2; the label prints this as "MB".
mem_bytes = df.memory_usage(deep=True).sum()
mem_mb = mem_bytes / (1 << 20)
print(f"DataFrame memory usage: {mem_mb:.2f} MB")

DataFrame memory usage: 20.21 MB


# Extrapolation: 1200 × 20.21 MB (the DataFrame size measured above) ≈ 24 GB in memory