In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
import string
import torch
import spacy
import os

# Data Preprocessing

In [2]:
#The pre-processing pipeline inherits from the original ClinicalBERT with minor changes. see https://github.com/kexinhuang12345/clinicalBERT
data_dir = "/mnt/sdc/niallt/mimic_iii/"

save_dir = "/mnt/sdc/niallt/mimic_iii/processed/"
df_notes = pd.read_csv(f"{data_dir}/NOTEEVENTS.csv")

  df_notes = pd.read_csv(f"{data_dir}/NOTEEVENTS.csv")


### Avoiding data leakage into likely downstream tasks

We have a problem with mimic-iii data in that there are many downstream tasks and papers that have utilised these notes - with some of the more popular PLMs used having seen all of the notes during pre-training. 

An attempt to avoid the leakage for our ICD9-triage task involves isolating the test sets already created for those tasks and splitting on the HADMIDs found in the test sets. This should leave us with a language modelling training dataset that shares not notes with downstream test sets.

In [3]:
# read in all test sets for the prompt learning paper
test_dir_1 = "/mnt/sdc/niallt/mimic3-icd9-data/intermediary-data/notes2diagnosis-icd-test.csv"
test_dir_2 = "/mnt/sdc/niallt/mimic3-icd9-data/intermediary-data/notes2diagnosis-icd-validate.csv"
test_dir_3 = "/mnt/sdc/niallt/mimic-readmission/discharge/test.csv"

test_dfs = pd.concat([pd.read_csv(dir) for dir in [test_dir_1, test_dir_2, test_dir_3]])

In [5]:
test_dfs.shape

(21796, 7)

In [19]:
test_dfs.head()

Unnamed: 0.1,HADM_ID,TEXT,CATEGORY,ICD9_CODE,Unnamed: 0,ID,Label
0,114601.0,: : Service: NEUROLOGY Allergies: Patient reco...,Discharge summary Discharge summary,43311,,,
1,169625.0,: : Service: CCU HISTORY OF PRESENT ILLNESS: T...,Discharge summary,486,,,
2,138492.0,: : : Sex: F Service: SURGERY Allergies: Penic...,Discharge summary Discharge summary,2381,,,
3,115918.0,: : Service: MEDICINE Allergies: Vancomycin / ...,Discharge summary Discharge summary,56881,,,
4,173668.0,: : : Sex: M Service: UROLOGY Allergies: Patie...,Discharge summary,1890,,,


In [6]:
df_test_ids = test_dfs.HADM_ID.unique()

In [21]:
len(df_test_ids)

18737

In [4]:
# held-out dataset if you want to fine-tune on MIMIC data, it is better to exclude them prior to the training
# exclude all data in test set for re-admission task where we are interested in 
# df_test_ids = pd.read_csv('discharge/test.csv').ID.unique()
train_df_notes = df_notes[~df_notes.HADM_ID.isin(df_test_ids)]
test_df_notes = df_notes[df_notes.HADM_ID.isin(df_test_ids)]

NameError: name 'df_test_ids' is not defined

: 

In [6]:
train_df_notes.shape

(1170217, 11)

In [7]:
test_df_notes.shape

(912963, 11)

In [26]:
train_df_notes.CATEGORY.value_counts()

Nursing/other        515874
Radiology            237373
Nursing              141452
Physician             89750
ECG                   88670
Discharge summary     38324
Echo                  21541
Respiratory           20102
Nutrition              5898
General                5344
Rehab Services         3360
Social Work            1741
Case Management         655
Pharmacy                 69
Consult                  64
Name: CATEGORY, dtype: int64

In [14]:
test_df_notes.CATEGORY.value_counts()

Nursing/other        306623
Radiology            284906
ECG                  120381
Nursing               82104
Physician             51874
Echo                  24253
Discharge summary     21328
Respiratory           11637
Nutrition              3520
General                2957
Rehab Services         2071
Social Work             929
Case Management         312
Pharmacy                 34
Consult                  34
Name: CATEGORY, dtype: int64

In [27]:
len(train_df_notes.HADM_ID.unique())

39625

In [6]:
# add the num words per doc

train_df_notes['num_words'] = train_df_notes['TEXT'].apply(lambda x: len(x.split()))
test_df_notes['num_words'] = test_df_notes['TEXT'].apply(lambda x: len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_notes['num_words'] = train_df_notes['TEXT'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_notes['num_words'] = test_df_notes['TEXT'].apply(lambda x: len(x.split()))


In [9]:
train_df_notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,num_words
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,110
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...,1943
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...,2099
5,179,53181,170490.0,2172-03-08,,,Discharge summary,Report,,,Admission Date: [**2172-3-5**] D...,1165
7,181,42130,114236.0,2150-03-01,,,Discharge summary,Report,,,Admission Date: [**2150-2-25**] ...,1524


In [16]:
train_df_notes.iloc[0]['TEXT']

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [10]:
# group and get mean
train_df_notes.groupby(["CATEGORY"]).agg({"ROW_ID":"size", "num_words":"mean"})

Unnamed: 0_level_0,ROW_ID,num_words
CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1
Case Management,655,162.305344
Consult,64,844.390625
Discharge summary,38324,1438.02411
ECG,88670,29.481223
Echo,21541,319.098974
General,5344,213.254865
Nursing,141452,262.019971
Nursing/other,515874,131.215973
Nutrition,5898,318.542896
Pharmacy,69,300.304348


In [None]:
#Choose interested categories, for more information, please refer to 
category_list = ['Discharge summary', 'Echo', 'Nursing', 'Physician ',
       'Rehab Services', 'Respiratory ', 'Nutrition',
       'General', 'Pharmacy', 'Consult', 'Radiology',
       'Nursing/other']

Why do we remove ECG?

In [12]:
train_df_notes[train_df_notes['CATEGORY']=="ECG"].iloc[2]['TEXT']

'Sinus rhythm with A-V conduction delay. Infero-posterolateral myocardial\ninfarction with ST-T wave configuration consistent with acute process.\nSince the previous tracing of [**2146-10-22**] the findings as outlined are now\npresent.\nTRACING #1\n\n'

In [None]:
subset_train_df_notes = train_df_notes[train_df_notes.CATEGORY.isin(category_list)]
subset_test_df_notes = test_df_notes[test_df_notes.CATEGORY.isin(category_list)]

In [16]:
subset_train_df_notes.shape

(1079151, 11)

In [17]:
subset_test_df_notes.shape

(791341, 11)

In [17]:
def preprocess1(x: str) -> str:
    y=re.sub('\\[(.*?)\\]','',x) 
    y=re.sub('[0-9]+\. ','',y) 
    y=re.sub('dr\.','doctor',y)
    y=re.sub('m\.d\.','md',y)
    y=re.sub('admission date:','',y)
    y=re.sub('discharge date:','',y)
    y=re.sub('birth date:','',y)
    y=re.sub('date of birth:','',y)
    y=re.sub('chief complaint:','',y)
    y = re.sub('"','', y)
    y=re.sub('--|__|==','',y)
    #more substituion can be made to align with general knowledge such as "p.o." to "by mouth"
    
    # remove, spaces
    y = y.translate(str.maketrans("", ""))
    y = " ".join(y.split())
    return y

def preprocessing(df_notes): 
    df_notes['TEXT']=df_notes['TEXT'].fillna(' ')
    df_notes['TEXT']=df_notes['TEXT'].str.replace('\n',' ')
    df_notes['TEXT']=df_notes['TEXT'].str.replace('\r',' ')
    df_notes['TEXT']=df_notes['TEXT'].apply(str.strip)
    #We use uncased text which is also used in PubMedBERT
    df_notes['TEXT']=df_notes['TEXT'].str.lower()

    df_notes['TEXT']=df_notes['TEXT'].apply(lambda x: preprocess1(x))
    df_notes['TEXT']= df_notes['TEXT'].str.replace('"','')

    return df_notes

train_df_notes_processed = preprocessing(subset_train_df_notes)
test_df_notes_processed = preprocessing(subset_test_df_notes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes['TEXT']=df_notes['TEXT'].fillna(' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes['TEXT']=df_notes['TEXT'].str.replace('\n',' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes['TEXT']=df_notes['TEXT'].str.replace('\r',' ')
A value is trying to be set on a copy of a slic

We want to save the IDs that have been used to split the data

In [18]:
# if outputdir doesn't exist - make it
grouped_save_dir = f"{save_dir}/HADM_ID_split/"
if not os.path.exists(grouped_save_dir):
    os.makedirs(grouped_save_dir)

In [19]:
unique_ids_df = pd.DataFrame({"HADM_ID":train_df_notes.HADM_ID.unique()})


In [20]:
# write to file
unique_ids_df.to_csv(f"{grouped_save_dir}/TRAIN_HADM_IDs.csv", index=None)

Now save the pre-processed train and test notes. 

The point here is we now have a large subset for both training/testing the LM training and a separate test set to use for both testing downstream tasks

In [22]:
grouped_save_dir

'/mnt/sdg/niallt/mimic_iii/processed//HADM_ID_split/'

In [21]:
# to reuse the processed data in other tasks and save time
train_df_notes_processed.to_csv(f"{grouped_save_dir}/train_df_notes_interim_preprocessed.csv", index = None)
test_df_notes_processed.to_csv(f"{grouped_save_dir}/test_df_notes_interim_preprocessed.csv", index = None)

In [41]:
# reload that 
train_df_notes_processed = pd.read_csv(f"{grouped_save_dir}/train_df_notes_interim_preprocessed.csv")

  df_notes_processed = pd.read_csv(f"{grouped_save_dir}/train_df_notes_interim_preprocessed.csv")


In [10]:
train_df_notes_processed.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,service: addendum: radiologic studies: radiolo...
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,date of birth: sex: f service: micu and then t...
2,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,date of birth: sex: m service: medicine allerg...
3,179,53181,170490.0,2172-03-08,,,Discharge summary,Report,,,date of birth: sex: f service: neurosurgery al...
4,181,42130,114236.0,2150-03-01,,,Discharge summary,Report,,,date of birth: sex: m service: neurosurgery al...


In [12]:
category_list

['Discharge summary',
 'Echo',
 'Nursing',
 'Physician ',
 'Rehab Services',
 'Respiratory ',
 'Nutrition',
 'General',
 'Pharmacy',
 'Consult',
 'Radiology',
 'Nursing/other']

In [18]:
train_df_notes_processed[train_df_notes_processed['CATEGORY']=="General"]['TEXT']

308752     clinician: nurse pt w/ prostate ca, end-stage ...
309201     title: physical therapy / rehab services depar...
309391     clinician: attending i supervised the resident...
309404     clinician: attending patient has had poor urin...
309665     clinician: attending patient with worsening hy...
                                 ...                        
2059634    - hct stable - gi recs: intubate for scope. ma...
2059638    title: critical care present for the key porti...
2059640    title: intensivist note cvicu hpi: hd11 pod 9-...
2059642    7 bjbjqpqp 8::! (::: d4!!!!!!$ #h& e z{{{ !tp0...
2059643    title: addendum respiratory care 7a-7p psv ini...
Name: TEXT, Length: 5344, dtype: object

In [11]:
train_df_notes_processed.loc[4]['TEXT']

"sex: m service: medicine allergies: patient recorded as having no known allergies to drugs attending: mr. was seen at after a mechanical fall from a height of 10 feet. ct scan noted unstable fracture of c6-7 & posterior elements. major surgical or invasive procedure: anterior cervical osteotomy, c6-c7, with decompression and excision of ossification of the posterior longitudinal ligament. anterior cervical deformity correction. interbody reconstruction. anterior cervical fusion, c5-c6-cplate instrumentation, c5-c6-ccervical laminectomy c6-c7, tposterior cervical arthrodesis c4-tcervical instrumentation c4-tarthrodesis augmentation with autograft, allograft and demineralized bone matrix. history of present illness: mr. is a 82 year old male who had a slip and fall of approximately 10 feet from a balcony. he was ambulatory at the scene. he presented to the ed here at . ct scan revealed unstable c spine fracture. he was intubated secondary to agitation. patient admitted to trauma surgery

In [36]:
# save just the text to file for LM training
train_df_notes_processed['TEXT'].to_csv(f"{save_dir}/HADM_ID_split/train.txt", header = None, index = None, sep = "\t", mode = 'a')


In [35]:
# write just sample
n_rows = 250000

train_df_notes_processed['TEXT'].head(n_rows).to_csv(f"{save_dir}/HADM_ID_split/train_{n_rows}.txt", header = None, index = None, sep = "\t", mode = 'a')

In [13]:
test_df_notes_processed['TEXT'].to_csv(f"{save_dir}/HADM_ID_split/test.txt", header = None, index = None, sep = "\t", mode = 'a')

In [12]:
n_rows = 1000
test_df_notes_processed['TEXT'].head(n_rows).to_csv(f"{save_dir}/HADM_ID_split/test_{n_rows}.txt", header = None, index = None, sep = "\t", mode = 'a')

In [29]:

df_text_train = pd.read_csv("train.txt", sep = "\t", header = None)

In [None]:
df

In [34]:
df_text_train.head(15)

Unnamed: 0,0
0,service: addendum: radiologic studies: radiolo...
1,date of birth: sex: f service: micu and then t...
2,date of birth: sex: m service: medicine allerg...
3,date of birth: sex: f service: neurosurgery al...
4,date of birth: sex: m service: neurosurgery al...
5,date of birth: sex: f service: neurosurgery al...
6,date of birth: sex: f service: neurosurgery al...
7,date of birth: sex: m service: cardiac surgery...
8,date of birth: sex: m service: cardiac surgery...
9,date of birth: sex: m service: trauma surgery ...


In [35]:
with open("./train_500.txt") as f:
    lines = f.readlines()

In [36]:
lines

['service: addendum: radiologic studies: radiologic studies also included a chest ct, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. this also moderate-sized left pleural effusion. head ct: head ct showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history. abdominal ct: abdominal ct showed lesions of t10 and sacrum most likely secondary to osteoporosis. these can be followed by repeat imaging as an outpatient. , md dictated by: medquist36 d: 12:11 t: 12:21 job#:\n',
 '"date of birth: sex: f service: micu and then to medicine history of present illness: this is an 81-year-old female with a history of emphysema (not on home o2), who presents with three days of shortness of breath thought by her primary care doctor to be a copd flare. two days prior to admission, she was started on a prednisone taper and one day prior to admission she required oxygen at home in order to maintain oxygen

Below splits the texts into sentences and has a row per sentence - this seems a little unneccessary and does not pair well with the other PLMs we have trained

In [13]:
from spacy.lang.en import English
nlp = English()
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7feb410ac380>

In [14]:
def toSentence(x):
    doc = nlp(x)
    text=[]
    try:
        for sent in doc.sents:
            st=str(sent).strip() 
            if len(st)<30:
                #Merging too-short sentences to appropriate length, this is inherited from ClinicalBERT with changes in merged length 
                if len(text)!=0:
                    text[-1]=' '.join((text[-1],st))
                else:
                    text=[st]
            else:
                text.append((st))
    except:
        print(doc)
    return text

pretrain_sent=df_notes_processed['TEXT'].apply(lambda x: toSentence(x))

In [18]:
# pretrain_sent.values

In [30]:

# file=open(f"{grouped_save_dir}/sentences_train.txt",'w')
file=open(f"./sentences_train.txt",'w')
# pretrain_sent = pretrain_sent.values
# #random sample 500,000 documents 
# pretrain_sent = np.random.choice(pretrain_sent,500000)


In [19]:
pretrain_sent = pretrain_sent.values

In [21]:
pretrain_sent[0]

['service: addendum: radiologic studies: radiologic studies also included a chest ct, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis.',
 'this also moderate-sized left pleural effusion.',
 'head ct: head ct showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history.',
 'abdominal ct: abdominal ct showed lesions of t10 and sacrum most likely secondary to osteoporosis.',
 'these can be followed by repeat imaging as an outpatient. ,',
 'md dictated by: medquist36 d: 12:11 t: 12:21 job#:']

In [31]:
#write the txt file for building dataset, empty lines between docs (for NSP task)
for i in tqdm(range(len(pretrain_sent))):
    if len(pretrain_sent[i]) > 0:
        # remove the one token note
        note = pretrain_sent[i]
        for sent in note:
            file.write(sent+'\n')
        file.write('\n')

100%|██████████| 1079151/1079151 [00:06<00:00, 155963.21it/s]


In [11]:
processed_sentences =  pd.read_csv(f"./sentences_train.txt", sep = "\t", header = None)

### load icd9-triage data

In [9]:
triage_data_dir = "/mnt/sdc/niallt/mimic3-icd9-data/intermediary-data/triage/"

In [45]:
triage_train_df = pd.read_csv(f"{triage_data_dir}/train.csv")
triage_valid_df = pd.read_csv(f"{triage_data_dir}/valid.csv")
triage_test_df = pd.read_csv(f"{triage_data_dir}/test.csv")

In [11]:
triage_train_df

Unnamed: 0,text,label,triage-category
0,: : : Sex: F Service: CARDIOTHORACIC Allergies...,4240,Cardiology
1,: : : Sex: F Service: NEONATOLOGY HISTORY: wee...,V3001,Obstetrics
2,: : : Sex: M Service: CARDIOTHORACIC Allergies...,41041,Cardiology
3,: : : Sex: F Service: MEDICINE Allergies: Peni...,51881,Respiratory
4,: : : Sex: M Service: ADMISSION DIAGNOSIS: . S...,41401,Cardiology
...,...,...,...
9554,: : : Sex: F Service: MEDICINE Allergies: Pati...,5849,AcuteMedicine
9555,: : : Sex: F Service: MEDICINE Allergies: Peni...,486,Respiratory
9556,Unit No: : : : Sex: F Service: Neonatology was...,V3001,Obstetrics
9557,: : Service: CARDIOTHORACIC Allergies: Penicil...,41071,Cardiology


In [61]:
# remove
def get_cat_text_co_occurence(df):
    n_true = 0
    cat_counts = []
    text_examples = []
    offending_text = []

    for idx, row in df.iterrows():
        category = row["triage-category"].lower()
        text = row["text"].lower()
        if category in text:
            n_true += 1
            # print(f"category: {category}, text: {text}")
            # print(text.find(category))
            # print(f"part with category: {text[text.find(category):]}")
            
            cat_counts.append(category)
            text_examples.append(text)
            offending_text.append(text[text.find(category):])
            # break   
        # break
    counts_df = pd.DataFrame({"category":cat_counts, 
                              "text":text_examples,
                              "offender":offending_text})
    
    return n_true, counts_df



In [62]:
n_true, counts_df = get_cat_text_co_occurence(triage_train_df)

In [63]:
counts_df

Unnamed: 0,category,text,offender
0,cardiology,: : : sex: f service: cardiothoracic allergies...,cardiology report echo study date of *** repor...
1,respiratory,: : : sex: f service: medicine allergies: peni...,"respiratory failure; etiology was unclear, but..."
2,cardiology,: : service: csu history of present illness: t...,cardiology fellow. her sheaths were pulled as ...
3,cardiology,: : service: micu chief complaint: lightheaded...,cardiology will be consulted to: identify the ...
4,oncology,: : : sex: m service: neurosurgery allergies: ...,oncology pre-operatively and had a speech and ...
...,...,...,...
2741,respiratory,: : service: medicine allergies: iodine; iodin...,respiratory status noted at - for hrs. shallow...
2742,cardiology,: : : sex: m service: medicine allergies: aten...,cardiology service for workup of flash pulmona...
2743,neurology,: : : sex: m service: neurology allergies: pat...,neurology allergies: patient recorded as havin...
2744,respiratory,: : : sex: f service: medicine allergies: peni...,respiratory distress: respiratory distress lik...


In [48]:
triage_train_df.head()

Unnamed: 0,text,label,triage-category
0,: : : Sex: F Service: CARDIOTHORACIC Allergies...,4240,Cardiology
1,: : : Sex: F Service: NEONATOLOGY HISTORY: wee...,V3001,Obstetrics
2,: : : Sex: M Service: CARDIOTHORACIC Allergies...,41041,Cardiology
3,: : : Sex: F Service: MEDICINE Allergies: Peni...,51881,Respiratory
4,: : : Sex: M Service: ADMISSION DIAGNOSIS: . S...,41401,Cardiology


In [49]:
# try removing the category from the text
counts_df.category.value_counts()

cardiology          1160
respiratory         1026
neurology            377
oncology             105
gastroenterology      74
obstetrics             4
Name: category, dtype: int64

# try removing the most structured bit

- lets try super aggressive removal of the exact class tokens

In [64]:
class_names = triage_train_df["triage-category"].unique()

In [72]:
class_names.

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [79]:
# make lower
class_names_lower = [x.lower() for x in class_names]
class_names_regex_list = "|".join(class_names_lower)


def preprocess_triage(x):
    y=re.sub('\\[(.*?)\\]','',x) 
    y=re.sub('[0-9]+\. ','',y) 
    y=re.sub('dr\.','doctor',y)
    y=re.sub('m\.d\.','md',y)
    y=re.sub('admission date:','',y)
    y=re.sub('discharge date:','',y)
    y=re.sub('birth date:','',y)
    y=re.sub('date of birth:','',y)
    y=re.sub('chief complaint:','',y)
    y=re.sub('history of present illness:','',y)
    y=re.sub(r"service:\s+\w+",'',y)
    y=re.sub(class_names_regex_list,'',y)


    y = re.sub('"','', y)
    y=re.sub('--|__|==','',y)
    #more substituion can be made to align with general knowledge such as "p.o." to "by mouth"
    
    # remove, spaces
    y = y.translate(str.maketrans("", ""))
    y = " ".join(y.split())
    return y

def preprocessing_triage(df_notes): 
    df_notes['text']=df_notes['text'].fillna(' ')
    df_notes['text']=df_notes['text'].str.replace('\n',' ')
    df_notes['text']=df_notes['text'].str.replace('\r',' ')
    df_notes['text']=df_notes['text'].apply(str.strip)
    #We use uncased text which is also used in PubMedBERT
    df_notes['text']=df_notes['text'].str.lower()

    df_notes['text']=df_notes['text'].apply(lambda x: preprocess_triage(x))
    df_notes['text']= df_notes['text'].str.replace('"','')

    return df_notes

train_triage_processed = preprocessing_triage(triage_train_df.copy())
valid_triage_processed = preprocessing_triage(triage_valid_df.copy())
test_triage_processed = preprocessing_triage(triage_test_df.copy())

In [83]:
train_triage_processed.head(10)

Unnamed: 0,text,label,triage-category
0,: : : sex: f allergies: patient recorded as ha...,4240,Cardiology
1,: : : sex: f history: week gestation age femal...,V3001,Obstetrics
2,: : : sex: m allergies: patient recorded as ha...,41041,Cardiology
3,: : : sex: f allergies: penicillins / ativan /...,51881,Respiratory
4,: : : sex: m diagnosis: . shortness of breath ...,41401,Cardiology
5,: : : sex: f diagnosis: subarachnoid hemorrhag...,430,Neurology
6,: : : sex: m is s a year old priest who presen...,41401,Cardiology
7,: : this -year-old female was admitted from ho...,41401,Cardiology
8,: : : sex: f surgery shortness of breath with ...,41401,Cardiology
9,: : : sex: m allergies: aspirin : hypotension ...,4280,Cardiology


In [80]:
new_count, new_counts_df = get_cat_text_co_occurence(train_triage_processed)

In [81]:
new_count

0

In [82]:
new_counts_df.category.value_counts()

Series([], Name: category, dtype: int64)

In [84]:
# write to file
save_path = "/mnt/sdc/niallt/mimic3-icd9-data/intermediary-data/triage/no_category_in_text/"
if not os.path.exists(save_path):
    os.makedirs(save_path)

train_triage_processed.to_csv(f"{save_path}/train.csv", index=False)
valid_triage_processed.to_csv(f"{save_path}/valid.csv", index=False)
test_triage_processed.to_csv(f"{save_path}/test.csv", index=False)


In [85]:
# double check fewshot dataset created
fewshot_df = pd.read_csv("/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split//icd9-triage//no_category_in_text//fewshot_16/train.csv")

In [86]:
fewshot_df

Unnamed: 0,text,triage-category,label
0,: : : sex: m is a -year-old white male with un...,Neurology,3
1,: : : sex: f allergies: ace inhibitors / clona...,Respiratory,2
2,unit no: : : : sex: m entification: baby is a ...,Obstetrics,1
3,: : : sex: f service:neonatology primary diagn...,Obstetrics,1
4,: : : sex: m allergies: no known allergies / a...,Gastroenterology,4
...,...,...,...
107,: : : sex: m allergies: no known allergies / a...,Cardiology,0
108,: : : sex: f allergies: patient recorded as ha...,Gastroenterology,4
109,: : : sex: f allergies: baclofen : the patient...,AcuteMedicine,5
110,: : : sex: m allergies: patient recorded as ha...,Respiratory,2


In [87]:
fs_count, fs_counts_df = get_cat_text_co_occurence(fewshot_df)

In [88]:
fs_count

0

# Train Tokenizer
Only when you pretrain from scratch!

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [None]:
paths = [str(x) for x in Path(data_path).glob("*.txt")]

In [None]:
tokenizer = ByteLevelBPETokenizer()

In [None]:
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
tokenizer.save_model(".", "Tokenizer_Name")

# Clinical-PubMedBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import TextDataset
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
from transformers import pipeline

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

In [None]:
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='your text data path',
    block_size=128,
    # You can also use 512 block_size to train the model, also adjust batch size.
)

In [None]:
# Use Whole Word Masking instead of ordinary masking
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# we use 5000 steps to warm-up, other optimization parameters are default
training_args = TrainingArguments(
    output_dir="your_output_directory",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=2_500,
    save_total_limit=3,
    prediction_loss_only=True,
    warmup_steps = 5000
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("your_model_directory")

In [None]:
#You can try some examples to check the learned model!
fill_mask = pipeline(
    "fill-mask",
    model="your_model_directory",
    tokenizer=tokenizer
)

# MIMIC - IV

In [2]:
data_dir = "/mnt/sdg/niallt/mimic_4/note/"

In [4]:
!ls $data_dir

discharge.csv  discharge_detail.csv  radiology.csv  radiology_detail.csv


In [5]:
discharge_df = pd.read_csv(f"{data_dir}/discharge.csv")

In [6]:
discharge_df.head(
)

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...


In [7]:
discharge_df.shape

(331794, 8)

In [8]:
discharge_df.note_type.value_counts()

DS    331794
Name: note_type, dtype: int64