In [1]:
import sys
sys.path.append("../") # go to parent dir

In [2]:
import pandas as pd
import sys
import spacy
import re
import time
import scispacy
import glob
import os
from tqdm import tqdm
tqdm.pandas()
from note_processing.heuristic_tokenize import sent_tokenize_rules 

In [3]:
# OUTPUT_DIR = '/mnt/data01/mimic-3/benchmark-small/test/345' #this path will contain tokenized notes. This dir will be the input dir for create_pretrain_data.sh

#this is the path to mimic data if you're reading from a csv. Else uncomment the code to read from database below
MIMIC_NOTES_PATHS = ['/mnt/data01/mimic-3/benchmark-small/test',
                     '/mnt/data01/mimic-3/benchmark-small/train']  

DEVICE = -1  # -1 is CPU otherwise the GPU device id

In [4]:
all_files = []

for path in MIMIC_NOTES_PATHS:
    files = glob.glob(path + "/*/*_notes_sent.csv")
    all_files += files

print(f"\nTotal note files: {len(all_files)}")

li = []

for filename in tqdm(all_files, desc="Load note files"):
    df = pd.read_csv(filename, index_col=None, header=0)
    df["filename"] = filename
    li.append(df)

notes = pd.concat(li, axis=0, ignore_index=True)
notes.describe(include="all")

print(f"Total notes: {len(notes)}")


Load note files: 100%|██████████| 41/41 [00:00<00:00, 218.30it/s]
Total note files: 41

Total notes: 1126


In [5]:
# Add patient to the table
notes["PATIENT_ID"] = notes["filename"].apply(lambda x: int(re.findall(r'/[0-9]+/', x)[-1][1:-1]))

# Add episode to the table
notes["EPISODE_ID"] = notes["filename"].apply(lambda x: int(re.findall(r'episode[0-9]+_', x)[-1][7:-1]))


In [6]:
notes[notes["EPISODE_ID"] > 5].head(5)

Unnamed: 0,Hours,CATEGORY,DESCRIPTION,TEXT,filename,PATIENT_ID,EPISODE_ID
616,3.721389,Physician,Physician Resident Admission Note,Chief Complaint:\nCC: Hypertensive Urgecny and...,/mnt/data01/mimic-3/benchmark-small/train/109/...,109,16
617,3.721389,Physician,Physician Resident Admission Note,Chief Complaint:\nCC: Hypertensive Urgecny and...,/mnt/data01/mimic-3/benchmark-small/train/109/...,109,16
618,7.371389,Physician,Physician Attending Admission Note - MICU,Chief Complaint:\nHTN urgency I saw and exa...,/mnt/data01/mimic-3/benchmark-small/train/109/...,109,16
619,7.371389,Physician,Physician Attending Admission Note - MICU,Chief Complaint:\nHTN urgency I saw and exa...,/mnt/data01/mimic-3/benchmark-small/train/109/...,109,16
620,10.004722,Nursing,Nursing Progress Note,Admitted from ED with hypertensive crisis.\nSB...,/mnt/data01/mimic-3/benchmark-small/train/109/...,109,16


In [7]:
notes[["Hours", "CATEGORY"]].groupby("CATEGORY").agg(['count'])

Unnamed: 0_level_0,Hours
Unnamed: 0_level_1,count
CATEGORY,Unnamed: 1_level_2
General,23
Nursing,502
Nursing/other,50
Nutrition,13
Pharmacy,2
Physician,333
Radiology,119
Rehab Services,8
Respiratory,75
Social Work,1


In [8]:
# Restrict the number of notes for processing

#category = ["Nursing", "Nursing/other", 'General', 'Physician ']  # or None
category = ["Nursing/other"]  # or None

if category != None:
    notes = notes[notes['CATEGORY'].isin(category)]

print('Number of notes: %d' %len(notes.index))

# nlp = spacy.load('en_core_sci_md', disable=['tagger','ner', 'lemmatizer'])
# nlp.add_pipe('sbd_component', before='parser')  

Number of notes: 50


In [9]:
# Load the pre-trained Bio_ClinicalBERT model
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


In [10]:
# Don't bother trying to run the pipeline without a GPU
import numpy as np
from transformers import pipeline
pipe = pipeline('feature-extraction', model=model, 
                tokenizer=tokenizer, device=DEVICE)

In [11]:
# import torch 
# print(torch.__version__)
# print(torch.cuda.current_device())
# print(torch._C._cuda_getCompiledVersion(), 'cuda compiled version')
# print(torch.version.cuda)

1.8.1+cu102


RuntimeError: No CUDA GPUs are available

In [12]:
features = pipe(['Respiratory CAre Pt received from ED intubated for airway protection.And then another sentenc',
                  'Coughing and gagging with Sx, swallowing frequently with irritation of ETT.']  ,
                pad_to_max_length=True)
features = np.squeeze(features)
features = features[:,0,:]
features

array([[-0.03993183,  0.28045335, -0.22612181, ..., -0.36056122,
         0.02098022, -0.07117227],
       [ 0.26059052,  0.28010404, -0.19884744, ..., -0.41535103,
         0.55648178, -0.36985204]])

In [13]:
import time

def get_embeddings(text, pipe):
    sents = text.split('\n')[:-1]
    #sents = list(map(lambda x: x[:50], sents))
    start_idx = 0
    while True:
        try:
            sent_features = pipe(sents[start_idx:] ,pad_to_max_length=True)
        except BaseException as e:
            start_idx += 1

            if start_idx >= len(sents):
                print("\nError in get_embeddings()")
                print('# of sentences: '+ str(len(sents)))
                sent_len = [len(x) for x in sents]
                print(sent_len)
                sent_features = None
                break

            print("Dropping sentence: " + sents[start_idx-1])
            continue
        break

    if sent_features is not None:
        try:
            sent_features = np.squeeze(sent_features)[:,0,:]
        except BaseException as e:
            print(f"Error squeezing sent_features - {e}")
            sent_features = None
    
    return sent_features

In [14]:
notes["bert_embeddings"] = notes["TEXT"].progress_apply(get_embeddings, args=(pipe,))


100%|██████████| 50/50 [01:05<00:00,  1.31s/it]


In [15]:
for i in range(10):
    print(notes["bert_embeddings"].iloc[i].shape)

(7, 768)
(47, 768)
(5, 768)
(71, 768)
(43, 768)
(5, 768)
(51, 768)
(45, 768)
(6, 768)
(28, 768)


In [16]:
notes.describe(include="all")

Unnamed: 0,Hours,CATEGORY,DESCRIPTION,TEXT,filename,PATIENT_ID,EPISODE_ID,bert_embeddings
count,50.0,50,50,50,50,50.0,50.0,50
unique,,1,1,50,7,,,50
top,,Nursing/other,Report,Nursing Addendum Pt. stated this eve that she ...,/mnt/data01/mimic-3/benchmark-small/train/191/...,,,"[[0.2806047201156616, 0.03838098421692848, -0...."
freq,,50,50,1,20,,,1
mean,45.648222,,,,,223.7,2.6,
std,31.821967,,,,,72.249356,1.726149,
min,4.174444,,,,,109.0,1.0,
25%,21.070764,,,,,191.0,2.0,
50%,44.101111,,,,,191.0,2.0,
75%,62.639236,,,,,222.0,3.0,


In [17]:
notes.head(1)

Unnamed: 0,Hours,CATEGORY,DESCRIPTION,TEXT,filename,PATIENT_ID,EPISODE_ID,bert_embeddings
3,7.351111,Nursing/other,Report,Respiratory CAre Pt received from ED intubated...,/mnt/data01/mimic-3/benchmark-small/test/345/e...,345,1,"[[-0.2339353859424591, -0.19446542859077454, -..."


In [18]:
notes["bert_embeddings_list"] = notes["bert_embeddings"].apply(lambda x: x.tolist())

In [19]:
type(notes["bert_embeddings_list"].iloc[0])

list

In [21]:
df.memory_usage(index=False, deep=True)

filenames = list(notes["filename"].unique().tolist())
for filename in tqdm(filenames, desc="Getting array sizes"):
    df = notes[notes["filename"] == filename][["Hours", "CATEGORY", "DESCRIPTION", "bert_embeddings", "bert_embeddings_list"]]
    size = 0
    note_num = 0
    for i in range(len(df)):
        array = df["bert_embeddings_list"].iloc[i]
        size += sys.getsizeof(array)
        note_num += 1

    print(f"{note_num} notes use {size} bytes of memory")

Getting array sizes: 100%|██████████| 7/7 [00:00<00:00, 362.03it/s]11 notes use 3504 bytes of memory
2 notes use 480 bytes of memory
20 notes use 5776 bytes of memory
7 notes use 1880 bytes of memory
6 notes use 1528 bytes of memory
2 notes use 248 bytes of memory
2 notes use 632 bytes of memory



In [22]:
# Write out a new notes file with the embeddings
# aflanders: This is going to take too long and take up too much space
# The embeddings will be longer than the notes themselves. Each patient/episode
# can go from 500Kb to 18Mb

# np.set_printoptions(threshold=sys.maxsize)

filenames = list(notes["filename"].unique().tolist())
for filename in tqdm(filenames, desc="Writing embedding files"):
    df = notes[notes["filename"] == filename][["Hours", "CATEGORY", "DESCRIPTION", "bert_embeddings_list"]]
    df = df.set_index("Hours")
    write_file = filename.replace("_notes_sent.csv", "_notes_bert.parquet")
    df.to_parquet(write_file)
    # with open(write_file, "w") as f:
        # df.to_parquet(f)

Writing embedding files: 100%|██████████| 7/7 [00:00<00:00, 22.81it/s]


In [23]:
df = pd.read_parquet(write_file)
print(write_file)
df.head()

/mnt/data01/mimic-3/benchmark-small/train/109/episode8_notes_bert.parquet


Unnamed: 0_level_0,CATEGORY,DESCRIPTION,bert_embeddings_list
Hours,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11.429167,Nursing/other,Report,"[[0.3830723762512207, 0.3420313596725464, 0.01..."
25.8625,Nursing/other,Report,"[[0.28190192580223083, 0.214814230799675, -0.1..."
