In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import spacy
import re
from tqdm import tqdm
import string
import torch

# Data Preprocessing

In [None]:
# The pre-processing pipeline is inherited from the original ClinicalBERT with minor changes;
# see https://github.com/kexinhuang12345/clinicalBERT
# Directory that holds the MIMIC CSV files (replace with your local path).
path = 'mimic_data_directory'
# Read the notes table from `path`. The original left `path` unused and read
# '/NOTEEVENTS.csv' from the filesystem root instead.
df_notes = pd.read_csv(path + '/NOTEEVENTS.csv')

In [None]:
# Held-out data: the IDs listed in the re-admission test split are removed from
# the notes here, so that a model later fine-tuned on MIMIC data has not seen
# those admissions during pre-training.
df_test_ids = pd.read_csv('discharge/test.csv')['ID'].unique()
df_notes = df_notes.loc[~df_notes['HADM_ID'].isin(df_test_ids)]

In [None]:
# Note categories kept for pre-training (matched against the CATEGORY column).
# NOTE(review): the trailing spaces in 'Physician ' and 'Respiratory ' are kept
# exactly as written; presumably they match the raw CATEGORY values - confirm
# against the data before "tidying" them.
category_list = [
    'Discharge summary',
    'Echo',
    'Nursing',
    'Physician ',
    'Rehab Services',
    'Respiratory ',
    'Nutrition',
    'General',
    'Pharmacy',
    'Consult',
    'Radiology',
    'Nursing/other',
]

In [None]:
# Keep only the notes whose CATEGORY appears in category_list.
df_notes = df_notes.loc[df_notes['CATEGORY'].isin(category_list)]

In [None]:
def preprocess1(x):
    """Remove bracketed spans and boilerplate from one note and tidy whitespace.

    Applied in order: delete ``[...]`` spans, delete numbered-list markers such
    as ``"1. "``, rewrite ``dr.`` as ``doctor`` and ``m.d.`` as ``md``, drop the
    ``admission date:`` / ``discharge date:`` labels, delete ``--``, ``__`` and
    ``==``, then collapse every run of whitespace to a single space.

    All literal patterns are lowercase, so the caller is expected to lowercase
    the text first.

    Args:
        x: Note text (str).

    Returns:
        The cleaned text, without leading or trailing whitespace.
    """
    # Raw strings: "\." and "\[" are invalid escape sequences in ordinary
    # string literals and produce a warning on recent Python versions.
    y = re.sub(r'\[(.*?)\]', '', x)
    y = re.sub(r'[0-9]+\. ', '', y)
    y = re.sub(r'dr\.', 'doctor', y)
    y = re.sub(r'm\.d\.', 'md', y)
    y = re.sub(r'admission date:', '', y)
    y = re.sub(r'discharge date:', '', y)
    y = re.sub(r'--|__|==', '', y)
    # More substitutions can be made to align with general knowledge, such as
    # "p.o." to "by mouth".

    # The original also ran y.translate(str.maketrans("", "")) here; an empty
    # translation table changes nothing, so that call was dropped.
    # Collapse whitespace runs and trim both ends.
    return " ".join(y.split())

def preprocessing(df_notes):
    """Return a copy of ``df_notes`` whose ``TEXT`` column has been cleaned.

    Steps, in order: missing texts become a single space, newlines and carriage
    returns become spaces, surrounding whitespace is stripped, the text is
    lowercased, and ``preprocess1`` is applied to every note.

    The input frame is left untouched. The original assigned into
    ``df_notes['TEXT']`` step by step, which modified the caller's frame and
    raised SettingWithCopyWarning when that frame was a filtered slice.

    Args:
        df_notes: DataFrame with a ``TEXT`` column of note strings.

    Returns:
        A new DataFrame with the same columns and the cleaned ``TEXT``.
    """
    text = (
        df_notes['TEXT']
        .fillna(' ')
        # regex=False: plain literal replacement, independent of the pandas
        # version's default for the `regex` argument.
        .str.replace('\n', ' ', regex=False)
        .str.replace('\r', ' ', regex=False)
        .str.strip()
        # We use uncased text, which is also used in PubMedBERT.
        .str.lower()
        .apply(preprocess1)
    )
    return df_notes.assign(TEXT=text)

# Run the cleaning pipeline over the filtered notes.
df_notes_processed= preprocessing(df_notes)
# Save the processed notes so other tasks can reuse them without repeating the cleaning.
# NOTE(review): the output name has no '.csv' extension and to_csv also writes
# the index by default - confirm that downstream readers expect both.
df_notes_processed.to_csv('df_notes_processed')

In [None]:
from spacy.lang.en import English
# Blank English pipeline with only the 'sentencizer' component added; toSentence
# below uses it to split each note into sentences.
nlp = English()
nlp.add_pipe('sentencizer')

In [None]:
def toSentence(x, min_length=30, pipeline=None):
    """Split one note into sentences, merging fragments shorter than ``min_length``.

    Merging too-short sentences is inherited from ClinicalBERT, with a changed
    merge length.

    Args:
        x: Note text.
        min_length: A sentence with fewer characters than this (after
            stripping) is appended to the previous sentence, separated by a
            space, instead of standing alone. Defaults to 30, the value that
            was previously hard-coded.
        pipeline: Callable that turns text into an object exposing ``.sents``.
            Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        List of sentence strings. A short fragment that comes first has no
        previous sentence to merge into, so it is kept as its own entry. If
        sentence iteration fails, the sentences collected so far are returned.
    """
    doc = (nlp if pipeline is None else pipeline)(x)
    text = []
    try:
        for sent in doc.sents:
            st = str(sent).strip()
            if len(st) < min_length and text:
                # Too short to stand alone: glue it onto the previous sentence.
                text[-1] = ' '.join((text[-1], st))
            else:
                text.append(st)
    except Exception as err:
        # Best effort, as before: keep what was collected and report the
        # failure. Unlike the original bare `except:`, this no longer swallows
        # KeyboardInterrupt, and it prints a short excerpt instead of the
        # whole note.
        print(f"toSentence: sentence splitting failed ({err!r}) for text starting {str(x)[:80]!r}")
    return text

# Sentence-split every processed note; each entry is the list returned by toSentence.
pretrain_sent = df_notes_processed['TEXT'].apply(toSentence)

In [None]:
data_path = 'put your data path here'
# Number of documents to keep, and the seed that makes the draw repeatable.
N_SAMPLED_DOCS = 500_000
SAMPLE_SEED = 42

# The original also opened
# data_path + '/clinical_sentences_pretrain_wo_ECG_30_length_down_sampled.txt'
# for writing here. Nothing was written to that handle before `file` is
# re-bound in the next cell, so the open only leaked a handle and left an
# empty file behind; it was removed.

# np.asarray keeps this cell re-runnable: it accepts both the Series produced
# by the sentence-splitting step and the array left by a previous run
# (the original `.values` fails on an array).
pretrain_sent = np.asarray(pretrain_sent, dtype=object)

# Randomly sample up to N_SAMPLED_DOCS documents. Indices are drawn without
# replacement so no document is duplicated (np.random.choice defaults to
# sampling with replacement), and the size is capped at the number available.
rng = np.random.default_rng(SAMPLE_SEED)
sample_size = min(N_SAMPLED_DOCS, len(pretrain_sent))
sampled_idx = rng.choice(len(pretrain_sent), size=sample_size, replace=False)
pretrain_sent = pretrain_sent[sampled_idx]


In [None]:
# Output file for the sampled sentences. The encoding is set explicitly so the
# bytes on disk do not depend on the platform's default locale encoding.
file=open(data_path + '/clinical_sentences_pretrain_wo_ECG_30_length_truncated_500000.txt','w', encoding='utf-8')

In [None]:
# Write the txt file for building the dataset: one sentence per line, with an
# empty line between documents (for the NSP task).
try:
    for note in tqdm(pretrain_sent):
        # Skip notes that produced no sentences at all.
        if len(note) > 0:
            for sent in note:
                file.write(sent + '\n')
            file.write('\n')
finally:
    # Close (and thereby flush) the file so that later cells reading it see the
    # complete contents; the original never closed the handle.
    file.close()

# Train Tokenizer
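Only needed when you pretrain from scratch! Skip this section if you continue pre-training from an existing checkpoint, as the Clinical-PubMedBERT section below does.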

In [None]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

In [None]:
# Collect every .txt file directly inside data_path as tokenizer training input.
paths = list(map(str, Path(data_path).glob("*.txt")))

In [None]:
# Create a byte-level BPE tokenizer; it is trained on the text files in the next cell.
tokenizer = ByteLevelBPETokenizer()

In [None]:
# Train on the files in `paths` with a vocabulary size of 52,000, a minimum
# frequency of 2, and the listed special tokens.
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])
# Save the trained tokenizer files into the current directory, using
# "Tokenizer_Name" as the file-name prefix.
tokenizer.save_model(".", "Tokenizer_Name")

# Clinical-PubMedBERT

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import TextDataset
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
from transformers import pipeline

In [None]:
# Load the uncased PubMedBERT tokenizer and masked-LM model as the starting
# point for pre-training on the clinical sentences.
# NOTE: this re-binds `tokenizer`, replacing the tokenizer object from the
# "Train Tokenizer" section if that section was run.
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

In [None]:
# Build the training dataset from the sentence text file using the tokenizer
# loaded above and a block size of 128.
# NOTE(review): recent transformers releases mark TextDataset as deprecated in
# favour of the `datasets` library - confirm against the installed version.
dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='your text data path',
    block_size=128,
    # You can also use 512 block_size to train the model, also adjust batch size.
)

In [None]:
# Use Whole Word Masking instead of ordinary masking.
# Masking probability is 0.15.
data_collator = DataCollatorForWholeWordMask(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
# Optimisation settings: 5000 warm-up steps; all other optimisation parameters
# keep their defaults.
training_args = TrainingArguments(
    output_dir="your_output_directory",
    overwrite_output_dir=True,
    per_device_train_batch_size=16,
    num_train_epochs=5,
    warmup_steps=5000,
    prediction_loss_only=True,
    # Checkpointing: save every 2500 steps, with a total limit of 3 checkpoints.
    save_steps=2500,
    save_total_limit=3,
)

# Tie together the model, the training arguments, the masking collator and the
# text dataset.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [None]:
# Run training with the trainer configured above.
trainer.train()

In [None]:
# Save the trained model to "your_model_directory".
# NOTE(review): no tokenizer was passed to the Trainer, so presumably only the
# model is written here - confirm whether the tokenizer should be saved alongside.
trainer.save_model("your_model_directory")

In [None]:
# You can try some examples to check the learned model!
# This builds a fill-mask pipeline from the saved model directory and the
# tokenizer loaded above; the cell itself does not run any example yet.
# To try one, call fill_mask(...) on a sentence that contains tokenizer.mask_token.
fill_mask = pipeline(
    "fill-mask",
    model="your_model_directory",
    tokenizer=tokenizer
)