In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import spacy
import nltk
import re
from tqdm import tqdm
import string

import os

from dataset_processing import encode_classes

# Data Preprocessing

In [2]:
#The pre-processing pipeline inherits from the original ClinicalBERT with minor changes. see https://github.com/kexinhuang12345/clinicalBERT
data_dir = "/mnt/sdc/niallt/mimic_iii/"

save_dir = "/mnt/sdc/niallt/mimic_iii/processed/"
df_notes = pd.read_csv(f"{data_dir}/NOTEEVENTS.csv")

  df_notes = pd.read_csv(f"{data_dir}/NOTEEVENTS.csv")


### Avoiding data leakage into likely downstream tasks

We have a problem with mimic-iii data in that there are many downstream tasks and papers that have utilised these notes - with some of the more popular PLMs used having seen all of the notes during pre-training. 

An attempt to avoid the leakage for our ICD9-triage task involves isolating the test sets already created for those tasks and splitting on the HADMIDs found in the test sets. This should leave us with a language modelling training dataset that shares not notes with downstream test sets.

In [3]:
# read in all test sets for the prompt learning paper
test_dir_1 = "/mnt/sdc/niallt/mimic3-icd9-data/intermediary-data/notes2diagnosis-icd-test.csv"
test_dir_2 = "/mnt/sdc/niallt/mimic3-icd9-data/intermediary-data/notes2diagnosis-icd-validate.csv"
test_dir_3 = "/mnt/sdc/niallt/mimic-readmission/discharge/test.csv"

test_dfs = pd.concat([pd.read_csv(dir) for dir in [test_dir_1, test_dir_2, test_dir_3]])

In [5]:
test_dfs.shape

(21796, 7)

In [19]:
test_dfs.head()

Unnamed: 0.1,HADM_ID,TEXT,CATEGORY,ICD9_CODE,Unnamed: 0,ID,Label
0,114601.0,: : Service: NEUROLOGY Allergies: Patient reco...,Discharge summary Discharge summary,43311,,,
1,169625.0,: : Service: CCU HISTORY OF PRESENT ILLNESS: T...,Discharge summary,486,,,
2,138492.0,: : : Sex: F Service: SURGERY Allergies: Penic...,Discharge summary Discharge summary,2381,,,
3,115918.0,: : Service: MEDICINE Allergies: Vancomycin / ...,Discharge summary Discharge summary,56881,,,
4,173668.0,: : : Sex: M Service: UROLOGY Allergies: Patie...,Discharge summary,1890,,,


In [5]:
df_test_ids = test_dfs.HADM_ID.unique()

In [8]:
len(df_test_ids)

18737

In [6]:
# held-out dataset if you want to fine-tune on MIMIC data, it is better to exclude them prior to the training
# exclude all data in test set for re-admission task where we are interested in 
# df_test_ids = pd.read_csv('discharge/test.csv').ID.unique()
train_df_notes = df_notes[~df_notes.HADM_ID.isin(df_test_ids)]
test_df_notes = df_notes[df_notes.HADM_ID.isin(df_test_ids)]

In [10]:
train_df_notes.shape

(1170217, 11)

In [7]:
test_df_notes.shape

(912963, 11)

In [6]:
train_df_notes.CATEGORY.value_counts()

Nursing/other        515874
Radiology            237373
Nursing              141452
Physician             89750
ECG                   88670
Discharge summary     38324
Echo                  21541
Respiratory           20102
Nutrition              5898
General                5344
Rehab Services         3360
Social Work            1741
Case Management         655
Pharmacy                 69
Consult                  64
Name: CATEGORY, dtype: int64

In [7]:
test_df_notes.CATEGORY.value_counts()

Nursing/other        306623
Radiology            284906
ECG                  120381
Nursing               82104
Physician             51874
Echo                  24253
Discharge summary     21328
Respiratory           11637
Nutrition              3520
General                2957
Rehab Services         2071
Social Work             929
Case Management         312
Pharmacy                 34
Consult                  34
Name: CATEGORY, dtype: int64

In [27]:
len(train_df_notes.HADM_ID.unique())

39625

In [7]:
# add the num words per doc

train_df_notes['num_words'] = train_df_notes['TEXT'].apply(lambda x: len(x.split()))
test_df_notes['num_words'] = test_df_notes['TEXT'].apply(lambda x: len(x.split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_notes['num_words'] = train_df_notes['TEXT'].apply(lambda x: len(x.split()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_notes['num_words'] = test_df_notes['TEXT'].apply(lambda x: len(x.split()))


In [9]:
train_df_notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,num_words
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,Admission Date: [**2151-7-16**] Dischar...,110
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,Admission Date: [**2118-6-2**] Discharg...,1943
4,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,Admission Date: [**2162-3-3**] D...,2099
5,179,53181,170490.0,2172-03-08,,,Discharge summary,Report,,,Admission Date: [**2172-3-5**] D...,1165
7,181,42130,114236.0,2150-03-01,,,Discharge summary,Report,,,Admission Date: [**2150-2-25**] ...,1524


In [16]:
train_df_notes.iloc[0]['TEXT']

'Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]\n\nDictated By:[**Hospital 1807**]\nMEDQUIST36\n\nD:  [**2151-8-5**]  12:11\nT:  [**2151-8-5**]  12:21\nJOB#:  [**Job Number 1808**]\n'

In [10]:
# group and get mean
train_df_notes.groupby(["CATEGORY"]).agg({"ROW_ID":"size", "num_words":"mean"})

Unnamed: 0_level_0,ROW_ID,num_words
CATEGORY,Unnamed: 1_level_1,Unnamed: 2_level_1
Case Management,655,162.305344
Consult,64,844.390625
Discharge summary,38324,1438.02411
ECG,88670,29.481223
Echo,21541,319.098974
General,5344,213.254865
Nursing,141452,262.019971
Nursing/other,515874,131.215973
Nutrition,5898,318.542896
Pharmacy,69,300.304348


In [8]:
#Choose interested categories, for more information, please refer to 
category_list = ['Discharge summary', 'Echo', 'Nursing', 'Physician ',
       'Rehab Services', 'Respiratory ', 'Nutrition',
       'General', 'Pharmacy', 'Consult', 'Radiology', 'ECG',
       'Nursing/other']

Why do we remove ECG?

In [12]:
train_df_notes[train_df_notes['CATEGORY']=="ECG"].iloc[2]['TEXT']

'Sinus rhythm with A-V conduction delay. Infero-posterolateral myocardial\ninfarction with ST-T wave configuration consistent with acute process.\nSince the previous tracing of [**2146-10-22**] the findings as outlined are now\npresent.\nTRACING #1\n\n'

In [9]:
subset_train_df_notes = train_df_notes[train_df_notes.CATEGORY.isin(category_list)]
subset_test_df_notes = test_df_notes[test_df_notes.CATEGORY.isin(category_list)]

In [11]:
subset_train_df_notes.shape

(1167821, 12)

In [12]:
subset_test_df_notes.shape

(911722, 12)

In [15]:
subset_test_df_notes.shape

(791341, 12)

In [10]:
def contains_multiple_spaces(s):
    return bool(re.search(r" {2,}", s))

In [11]:
def find_all_multi_space_records(df):
    n_with_multi_spaces = 0
    idxs = []
    for idx, data in tqdm(df.iterrows()):
        if contains_multiple_spaces(data['TEXT']):
            n_with_multi_spaces += 1
            idxs.append(idx)
    n_with_multi_spaces
    
    return idxs, n_with_multi_spaces
    

In [16]:
idxs, n_with_multi_spaces = find_all_multi_space_records(subset_train_df_notes)


1167821it [00:57, 20375.38it/s]


811023

In [18]:
idxs[:10]

[0, 1, 4, 5, 7, 8, 9, 12, 13, 14]

In [19]:
# look at those wuith multiple spaces
subset_train_df_notes.iloc[idxs[:10]]['TEXT']

0     Admission Date:  [**2151-7-16**]       Dischar...
1     Admission Date:  [**2118-6-2**]       Discharg...
7     Admission Date:  [**2150-2-25**]              ...
8     Admission Date:  [**2118-8-10**]              ...
12    Admission Date:  [**2143-4-25**]     Discharge...
13    Admission Date:  [**2143-4-25**]     Discharge...
14    Admission Date:  [**2199-2-8**]     Discharge ...
19    Admission Date:  [**2130-2-3**]              D...
20    Admission Date:  [**2131-6-28**]              ...
21    Admission Date:  [**2131-7-5**]              D...
Name: TEXT, dtype: object

In [12]:
def preprocess1(x):
    y=re.sub('\\[(.*?)\\]','',x) 
    y=re.sub('[0-9]+\. ','',y) 
    y=re.sub('dr\.','doctor',y)
    y=re.sub('m\.d\.','md',y)
    y=re.sub('admission date:','',y)
    y=re.sub('discharge date:','',y)
    y=re.sub('birth date:','',y)
    y=re.sub('date of birth:','',y)
    y=re.sub('chief complaint:','',y)
    y=re.sub('service:','',y)
    y = re.sub('"','', y)
    y=re.sub('--|__|==','',y)
    #more substituion can be made to align with general knowledge such as "p.o." to "by mouth"
    
    # remove, spaces
    y = y.translate(str.maketrans("", ""))
    y = " ".join(y.split())
    return y

def preprocessing(df_notes): 
    df_notes['TEXT']=df_notes['TEXT'].fillna(' ')
    df_notes['TEXT']=df_notes['TEXT'].str.replace('\n',' ')
    df_notes['TEXT']=df_notes['TEXT'].str.replace('\r',' ')
    df_notes['TEXT']=df_notes['TEXT'].apply(str.strip)
    #We use uncased text which is also used in PubMedBERT
    df_notes['TEXT']=df_notes['TEXT'].str.lower()

    df_notes['TEXT']=df_notes['TEXT'].apply(lambda x: preprocess1(x))
    df_notes['TEXT']= df_notes['TEXT'].str.replace('"','')
    
    # remove any nas
    df_notes = df_notes.dropna(subset=['TEXT'])

    return df_notes

train_df_notes_processed = preprocessing(subset_train_df_notes)
test_df_notes_processed = preprocessing(subset_test_df_notes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes['TEXT']=df_notes['TEXT'].fillna(' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes['TEXT']=df_notes['TEXT'].str.replace('\n',' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_notes['TEXT']=df_notes['TEXT'].str.replace('\r',' ')
A value is trying to be set on a copy of a slic

### check how well whitespaces were removed

In [22]:
idxs, n_with_multi_spaces = find_all_multi_space_records(train_df_notes_processed)

1167821it [01:20, 14437.00it/s]


In [24]:
n_with_multi_spaces

0

We want to save the IDs that have been used to split the data

In [13]:
# if outputdir doesn't exist - make it
grouped_save_dir = f"{save_dir}/HADM_ID_split/"
if not os.path.exists(grouped_save_dir):
    os.makedirs(grouped_save_dir)

In [14]:
unique_ids_df = pd.DataFrame({"HADM_ID":train_df_notes.HADM_ID.unique()})


In [15]:
# write to file
unique_ids_df.to_csv(f"{grouped_save_dir}/TRAIN_HADM_IDs.csv", index=None)

Now save the pre-processed train and test notes. 

The point here is we now have a large subset for both training/testing the LM training and a separate test set to use for both testing downstream tasks

In [22]:
grouped_save_dir

'/mnt/sdg/niallt/mimic_iii/processed//HADM_ID_split/'

In [16]:
# to reuse the processed data in other tasks and save time
train_df_notes_processed.to_csv(f"{grouped_save_dir}/train_df_notes_interim_preprocessed.csv", index = None)
test_df_notes_processed.to_csv(f"{grouped_save_dir}/test_df_notes_interim_preprocessed.csv", index = None)

In [None]:
# reload that 
# train_df_notes_processed = pd.read_csv(f"{grouped_save_dir}/train_df_notes_interim_preprocessed.csv")

  df_notes_processed = pd.read_csv(f"{grouped_save_dir}/train_df_notes_interim_preprocessed.csv")


In [32]:
train_df_notes_processed.CATEGORY.value_counts()

Nursing/other        515874
Radiology            237373
Nursing              141452
Physician             89750
Discharge summary     38324
Echo                  21541
Respiratory           20102
Nutrition              5898
General                5344
Rehab Services         3360
Pharmacy                 69
Consult                  64
Name: CATEGORY, dtype: int64

In [36]:
train_df_notes_processed.sample(250_000).to_csv(f"{grouped_save_dir}/train_df_notes_interim_preprocessed_250000.csv", index = None)

In [25]:
train_df_notes_processed.shape

(1079151, 12)

In [38]:
test_df_notes_processed.sample(10_000).to_csv(f"{grouped_save_dir}/test_df_notes_interim_preprocessed_10000.csv", index = None)

In [12]:
category_list

['Discharge summary',
 'Echo',
 'Nursing',
 'Physician ',
 'Rehab Services',
 'Respiratory ',
 'Nutrition',
 'General',
 'Pharmacy',
 'Consult',
 'Radiology',
 'Nursing/other']

In [18]:
train_df_notes_processed[train_df_notes_processed['CATEGORY']=="General"]['TEXT']

308752     clinician: nurse pt w/ prostate ca, end-stage ...
309201     title: physical therapy / rehab services depar...
309391     clinician: attending i supervised the resident...
309404     clinician: attending patient has had poor urin...
309665     clinician: attending patient with worsening hy...
                                 ...                        
2059634    - hct stable - gi recs: intubate for scope. ma...
2059638    title: critical care present for the key porti...
2059640    title: intensivist note cvicu hpi: hd11 pod 9-...
2059642    7 bjbjqpqp 8::! (::: d4!!!!!!$ #h& e z{{{ !tp0...
2059643    title: addendum respiratory care 7a-7p psv ini...
Name: TEXT, Length: 5344, dtype: object

### Save just the text to files

- First we save a document per line

- Then we sample a sentence per line

In [11]:
train_df_notes_processed.loc[4]['TEXT']

"sex: m service: medicine allergies: patient recorded as having no known allergies to drugs attending: mr. was seen at after a mechanical fall from a height of 10 feet. ct scan noted unstable fracture of c6-7 & posterior elements. major surgical or invasive procedure: anterior cervical osteotomy, c6-c7, with decompression and excision of ossification of the posterior longitudinal ligament. anterior cervical deformity correction. interbody reconstruction. anterior cervical fusion, c5-c6-cplate instrumentation, c5-c6-ccervical laminectomy c6-c7, tposterior cervical arthrodesis c4-tcervical instrumentation c4-tarthrodesis augmentation with autograft, allograft and demineralized bone matrix. history of present illness: mr. is a 82 year old male who had a slip and fall of approximately 10 feet from a balcony. he was ambulatory at the scene. he presented to the ed here at . ct scan revealed unstable c spine fracture. he was intubated secondary to agitation. patient admitted to trauma surgery

In [13]:
# save just the text to file for LM training
train_df_notes_processed['TEXT'].to_csv(f"{save_dir}/HADM_ID_split/with_echo_train.txt", header = None, index = None, sep = "\t", mode = 'a')


In [14]:
# write just sample
n_rows = 250000

train_df_notes_processed['TEXT'].sample(n_rows).to_csv(f"{save_dir}/HADM_ID_split/with_echo_train_{n_rows}.txt", header = None, index = None, sep = "\t", mode = 'a')

In [15]:
test_df_notes_processed['TEXT'].to_csv(f"{save_dir}/HADM_ID_split/with_echo_test.txt", header = None, index = None, sep = "\t", mode = 'a')

In [16]:
n_rows = 1000
test_df_notes_processed['TEXT'].sample(n_rows).to_csv(f"{save_dir}/HADM_ID_split/with_echo_test_{n_rows}.txt", header = None, index = None, sep = "\t", mode = 'a')

In [29]:

df_text_train = pd.read_csv("train.txt", sep = "\t", header = None)

In [None]:
df

In [34]:
df_text_train.head(15)

Unnamed: 0,0
0,service: addendum: radiologic studies: radiolo...
1,date of birth: sex: f service: micu and then t...
2,date of birth: sex: m service: medicine allerg...
3,date of birth: sex: f service: neurosurgery al...
4,date of birth: sex: m service: neurosurgery al...
5,date of birth: sex: f service: neurosurgery al...
6,date of birth: sex: f service: neurosurgery al...
7,date of birth: sex: m service: cardiac surgery...
8,date of birth: sex: m service: cardiac surgery...
9,date of birth: sex: m service: trauma surgery ...


In [35]:
with open("./train_500.txt") as f:
    lines = f.readlines()

In [34]:
# lines

### use nltk to create a big old list of sentences as our training data

In [12]:
# combine all rows of "text" column
train_text = ' '.join(train_df_notes_processed['TEXT'].tolist())



In [19]:
len(train_text) # num chars not words

1784855134

In [35]:
%%timeit
# split the combined text into sentences - this takes absolutely ages - test on 10k and time 
# this 
sentences = nltk.sent_tokenize(train_text[:100000])

30.3 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [37]:
# full loop will take a long time - may be best to use a script and tmux to do this
all_train_sentences = nltk.sent_tokenize(train_text)

In [23]:
len(sentences)

100

#### Save to text file

In [None]:
# save to file
with open("./all_train_sentences.txt", "w") as f:
    for s in all_train_sentences:
        f.write(s + "\n")



### spacy - we also add a clause that combines too short sentences

In [17]:
from spacy.lang.en import English
nlp = English()
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x7f67076fbcc0>

In [18]:
def toSentence(x):
    doc = nlp(x)
    text=[]
    try:
        for sent in doc.sents:
            st=str(sent).strip() 
            if len(st)<30:
                #Merging too-short sentences to appropriate length, this is inherited from ClinicalBERT with changes in merged length 
                if len(text)!=0:
                    text[-1]=' '.join((text[-1],st))
                else:
                    text=[st]
            else:
                text.append((st))
    except:
        print(doc)
    return text



In [19]:
%%timeit
training_sentences =train_df_notes_processed['TEXT'][:10].apply(lambda x: toSentence(x))

31.2 ms ± 280 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [21]:
training_sentences =train_df_notes_processed['TEXT'][:10].apply(lambda x: toSentence(x))

In [36]:
docs = list(nlp.pipe(train_df_notes_processed['TEXT'][:10]))

In [37]:
sentences = [sent for doc in docs for sent in doc.sents]

In [42]:
for s in sentences:
    print(s.text)

addendum: radiologic studies: radiologic studies also included a chest ct, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis.
this also moderate-sized left pleural effusion.
head ct: head ct showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history.
abdominal ct: abdominal ct showed lesions of t10 and sacrum most likely secondary to osteoporosis.
these can be followed by repeat imaging as an outpatient. ,
md dictated by: medquist36 d: 12:11 t: 12:21 job#:
sex: f micu and then to medicine history of present illness: this is an 81-year-old female with a history of emphysema (not on home o2), who presents with three days of shortness of breath thought by her primary care doctor to be a copd flare.
two days prior to admission, she was started on a prednisone taper and one day prior to admission she required oxygen at home in order to maintain oxygen saturation greater than 90%.
she has also

In [27]:
# join list of lists
training_sentences = [item for sublist in training_sentences for item in sublist]

In [28]:
training_sentences

['addendum: radiologic studies: radiologic studies also included a chest ct, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis.',
 'this also moderate-sized left pleural effusion.',
 'head ct: head ct showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history.',
 'abdominal ct: abdominal ct showed lesions of t10 and sacrum most likely secondary to osteoporosis.',
 'these can be followed by repeat imaging as an outpatient. ,',
 'md dictated by: medquist36 d: 12:11 t: 12:21 job#:',
 'sex: f micu and then to medicine history of present illness: this is an 81-year-old female with a history of emphysema (not on home o2), who presents with three days of shortness of breath thought by her primary care doctor to be a copd flare.',
 'two days prior to admission, she was started on a prednisone taper and one day prior to admission she required oxygen at home in order to maintain oxygen saturation 

In [28]:
training_sentences[0]

['addendum: radiologic studies: radiologic studies also included a chest ct, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis.',
 'this also moderate-sized left pleural effusion.',
 'head ct: head ct showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history.',
 'abdominal ct: abdominal ct showed lesions of t10 and sacrum most likely secondary to osteoporosis.',
 'these can be followed by repeat imaging as an outpatient. ,',
 'md dictated by: medquist36 d: 12:11 t: 12:21 job#:']

In [31]:
train_text[:10000]

"addendum: radiologic studies: radiologic studies also included a chest ct, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis. this also moderate-sized left pleural effusion. head ct: head ct showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history. abdominal ct: abdominal ct showed lesions of t10 and sacrum most likely secondary to osteoporosis. these can be followed by repeat imaging as an outpatient. , md dictated by: medquist36 d: 12:11 t: 12:21 job#: sex: f micu and then to medicine history of present illness: this is an 81-year-old female with a history of emphysema (not on home o2), who presents with three days of shortness of breath thought by her primary care doctor to be a copd flare. two days prior to admission, she was started on a prednisone taper and one day prior to admission she required oxygen at home in order to maintain oxygen saturation greater than 90%. she has als

In [34]:
%%timeit
train_spacy_sentences = toSentence(train_text[:100000])

67.5 ms ± 191 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# pretrain_sent.values

In [30]:

# file=open(f"{grouped_save_dir}/sentences_train.txt",'w')
file=open(f"./sentences_train.txt",'w')
# pretrain_sent = pretrain_sent.values
# #random sample 500,000 documents 
# pretrain_sent = np.random.choice(pretrain_sent,500000)


In [19]:
pretrain_sent = pretrain_sent.values

In [21]:
pretrain_sent[0]

['service: addendum: radiologic studies: radiologic studies also included a chest ct, which confirmed cavitary lesions in the left lung apex consistent with infectious process/tuberculosis.',
 'this also moderate-sized left pleural effusion.',
 'head ct: head ct showed no intracranial hemorrhage or mass effect, but old infarction consistent with past medical history.',
 'abdominal ct: abdominal ct showed lesions of t10 and sacrum most likely secondary to osteoporosis.',
 'these can be followed by repeat imaging as an outpatient. ,',
 'md dictated by: medquist36 d: 12:11 t: 12:21 job#:']

In [31]:
#write the txt file for building dataset, empty lines between docs (for NSP task)
for i in tqdm(range(len(pretrain_sent))):
    if len(pretrain_sent[i]) > 0:
        # remove the one token note
        note = pretrain_sent[i]
        for sent in note:
            file.write(sent+'\n')
        file.write('\n')

100%|██████████| 1079151/1079151 [00:06<00:00, 155963.21it/s]


In [11]:
processed_sentences =  pd.read_csv(f"./sentences_train.txt", sep = "\t", header = None)

# Train Tokenizer
Only when you pretrain from scratch!

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [None]:
paths = [str(x) for x in Path(data_path).glob("*.txt")]

In [None]:
tokenizer = ByteLevelBPETokenizer()

In [None]:
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
tokenizer.save_model(".", "Tokenizer_Name")

# Clinical-PubMedBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import TextDataset
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
from transformers import pipeline

In [None]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

In [None]:
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='your text data path',
    block_size=128,
    # You can also use 512 block_size to train the model, also adjust batch size.
)

In [None]:
# Use Whole Word Masking instead of ordinary masking
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# we use 5000 steps to warm-up, other optimization parameters are default
training_args = TrainingArguments(
    output_dir="your_output_directory",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_steps=2_500,
    save_total_limit=3,
    prediction_loss_only=True,
    warmup_steps = 5000
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("your_model_directory")

In [None]:
#You can try some examples to check the learned model!
fill_mask = pipeline(
    "fill-mask",
    model="your_model_directory",
    tokenizer=tokenizer
)

# MIMIC - IV

In [2]:
data_dir = "/mnt/sdg/niallt/mimic_4/note/"

In [4]:
!ls $data_dir

discharge.csv  discharge_detail.csv  radiology.csv  radiology_detail.csv


In [5]:
discharge_df = pd.read_csv(f"{data_dir}/discharge.csv")

In [6]:
discharge_df.head(
)

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...


In [7]:
discharge_df.shape

(331794, 8)

In [8]:
discharge_df.note_type.value_counts()

DS    331794
Name: note_type, dtype: int64

### Prepare for pretraining

In [4]:
data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/train_df_notes_interim_preprocessed.csv"
test_data_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/test_df_notes_interim_preprocessed_10000.csv"
train_df = pd.read_csv(data_dir)
test_df = pd.read_csv(test_data_dir)

  train_df = pd.read_csv(data_dir)


In [5]:
cat_col = "CATEGORY"
# now encode the labels - and sort by the value counts rather than string value - this well help keep ordering when subetting by class frequency
class_list, idx_to_class, class_to_idx = encode_classes(train_df,
                                                        label_col=cat_col,
                                                        sort_by_value_count=True)

In [6]:
class_list, idx_to_class, class_to_idx

(['Nursing/other',
  'Radiology',
  'Nursing',
  'Physician ',
  'ECG',
  'Discharge summary',
  'Echo',
  'Respiratory ',
  'Nutrition',
  'General',
  'Rehab Services',
  'Pharmacy',
  'Consult'],
 {0: 'Nursing/other',
  1: 'Radiology',
  2: 'Nursing',
  3: 'Physician ',
  4: 'ECG',
  5: 'Discharge summary',
  6: 'Echo',
  7: 'Respiratory ',
  8: 'Nutrition',
  9: 'General',
  10: 'Rehab Services',
  11: 'Pharmacy',
  12: 'Consult'},
 {'Nursing/other': 0,
  'Radiology': 1,
  'Nursing': 2,
  'Physician ': 3,
  'ECG': 4,
  'Discharge summary': 5,
  'Echo': 6,
  'Respiratory ': 7,
  'Nutrition': 8,
  'General': 9,
  'Rehab Services': 10,
  'Pharmacy': 11,
  'Consult': 12})

In [7]:
# convert create label column from categories mapped to label int
train_df['label'] = train_df[cat_col].map(class_to_idx)
test_df['label'] = test_df[cat_col].map(class_to_idx)
    

In [8]:
train_df

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,num_words,label
0,174,22532,167853.0,2151-08-04,,,Discharge summary,Report,,,addendum: radiologic studies: radiologic studi...,110,5
1,175,13702,107527.0,2118-06-14,,,Discharge summary,Report,,,sex: f micu and then to medicine history of pr...,1943,5
2,178,26880,135453.0,2162-03-25,,,Discharge summary,Report,,,sex: m medicine allergies: patient recorded as...,2099,5
3,179,53181,170490.0,2172-03-08,,,Discharge summary,Report,,,sex: f neurosurgery allergies: no known allerg...,1165,5
4,181,42130,114236.0,2150-03-01,,,Discharge summary,Report,,,sex: m neurosurgery allergies: no known allerg...,1524,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1167816,2070657,31097,115637.0,2132-01-21,2132-01-21 03:27:00,2132-01-21 03:38:00,Nursing/other,Report,17581.0,,npn #1 infant remains in ra with o2 sats >96%....,143,0
1167817,2070658,31097,115637.0,2132-01-21,2132-01-21 09:50:00,2132-01-21 09:53:00,Nursing/other,Report,19211.0,,"neonatology dol #5, cga 36 weeks. cvr: continu...",95,0
1167818,2070659,31097,115637.0,2132-01-21,2132-01-21 16:42:00,2132-01-21 16:44:00,Nursing/other,Report,20104.0,,family meeting note family meeting held with b...,81,0
1167819,2070660,31097,115637.0,2132-01-21,2132-01-21 18:05:00,2132-01-21 18:16:00,Nursing/other,Report,16023.0,,npn 1800 #1 resp: remains in ra w/ rr 20's-o2s...,328,0


In [9]:
# subset columns we need
n_test_samples = 10000
train_df_subset = train_df[["TEXT","CATEGORY","label"]].dropna()
test_df_subset = test_df[["TEXT","CATEGORY","label"]].sample(n_test_samples).dropna()

In [10]:
train_df_subset.shape, test_df.shape

((1167774, 3), (10000, 13))

In [11]:
# save to file

save_dir = "/mnt/sdc/niallt/mimic_iii/processed/HADM_ID_split/"



In [12]:
train_df_subset.to_csv(f"{save_dir}/lm_pretraining_train.csv", index= None)
test_df_subset.to_csv(f"{save_dir}/lm_pretraining_test_{n_test_samples}.csv", index=None)


In [5]:
# read back in
reloaded_train_df = pd.read_csv(f"{save_dir}/lm_pretraining_train_250000.csv")

In [6]:
n_test_samples = 1000
reloaded_test_df = pd.read_csv(f"{save_dir}/lm_pretraining_test_{n_test_samples}.csv")
reloaded_test_df

Unnamed: 0,TEXT,CATEGORY,label
0,10:42 am mr head w & w/o contrast clip # reaso...,Radiology,1
1,t-sicu npn 1900-0700 neuro: sedated on propofo...,Nursing/other,0
2,3:59 pm ct chest w/o contrast clip # reason: r...,Radiology,1
3,11:08 am ct chest w/contrast; lab reconstructi...,Radiology,1
4,cardiac perfusion clip # reason: liver tx eval...,Radiology,1
...,...,...,...
995,"npnote #remains in ra ir, bbs clear, equal, ea...",Nursing/other,0
996,npn 1900-0700 neuro: slept fair once given ser...,Nursing/other,0
997,np note pe: pale pink sl jaundiced premature i...,Nursing/other,0
998,npn neuro: pt conts on fent and midaz - 30 mcg...,Nursing/other,0


In [24]:
train_df_subset.CATEGORY.value_counts()

Nursing/other        119875
Radiology             54743
Nursing               32855
Physician             20649
Discharge summary      8911
Echo                   4930
Respiratory            4600
Nutrition              1339
General                1256
Rehab Services          796
Pharmacy                 23
Consult                  17
Name: CATEGORY, dtype: int64

In [28]:
train_df_subset.query("CATEGORY == 'Rehab Services'")

Unnamed: 0,TEXT,CATEGORY,label
830,title: passy-muir valve evaluation / dispense ...,Rehab Services,9
1339,title: bedside swallow evaluation patient was ...,Rehab Services,9
1435,title: rehab services received consult and app...,Rehab Services,9
1514,"consult receive and appreciated, patient remai...",Rehab Services,9
1750,attending physician: date: medical diagnosis /...,Rehab Services,9
...,...,...,...
248623,attending physician: referral date: medical di...,Rehab Services,9
248625,history attending md: referral date: reason fo...,Rehab Services,9
249259,attending physician: referral date: medical di...,Rehab Services,9
249494,attending physician: referral date: medical di...,Rehab Services,9


In [9]:

# create new class "other" 
cat_col = "CATEGORY"
n_classes_keep = 4
classes_to_keep = list(reloaded_train_df[cat_col].value_counts().keys()[:n_classes_keep])



In [23]:
classes_to_keep

['Nursing/other',
 'Radiology',
 'Nursing',
 'Physician ',
 'Discharge summary',
 'Echo',
 'Respiratory ',
 'Nutrition']

In [11]:
# get new DFs with only the top N classes in

train_df = reloaded_train_df[reloaded_train_df[cat_col].isin(classes_to_keep)]
val_df = reloaded_test_df[reloaded_test_df[cat_col].isin(classes_to_keep)]

In [14]:
train_df.CATEGORY.value_counts()

Nursing/other    119875
Radiology         54743
Nursing           32855
Physician         20649
Name: CATEGORY, dtype: int64