In [1]:
# Mount Google Drive at /content/drive so the data directory used by this notebook is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# The preprocessing code in this cell is adapted from https://github.com/kaggarwal/ClinicalNotesICU

from nltk import sent_tokenize, word_tokenize
import re
import pandas as pd

# Heading names that start a new section of a report. Each of these only counts
# as a heading when it is immediately followed by a colon; "FINAL REPORT" is
# recognised on its own. Matching is case-insensitive.
_SECTION_NAMES = (
    'ABDOMEN AND PELVIS',
    'CLINICAL HISTORY',
    'CLINICAL INDICATION',
    'COMPARISON',
    'COMPARISON STUDY DATE',
    'EXAM',
    'EXAMINATION',
    'FINDINGS',
    'HISTORY',
    'IMPRESSION',
    'INDICATION',
    'MEDICAL CONDITION',
    'PROCEDURE',
    'REASON FOR EXAM',
    'REASON FOR STUDY',
    'REASON FOR THIS EXAMINATION',
    'TECHNIQUE',
)

SECTION_TITLES = re.compile(
    '(' + '|'.join(_SECTION_NAMES) + '):|FINAL REPORT',
    flags=re.I | re.M,
)
    
    
def getSentences(t):
    """Return every sentence produced by `preprocess_mimic(t)` as a list."""
    return [sentence for sentence in preprocess_mimic(t)]
    
def pattern_repl(matchobj):
    """
    Build the replacement text for a regex match: a run of spaces as wide as
    the matched text (and never shorter than a single space).
    """
    matched_text = matchobj.group(0)
    return ' '.ljust(len(matched_text))
    
def clean_text(text):
    """
    Blank out the parts of a report that should not reach the tokenizer.

    Three things are replaced by spaces: every `[**...**]` placeholder, every
    underscore, and everything from the position returned by `find_end` up to
    the end of the text. Replacing rather than deleting keeps the result the
    same length as the input.

    :param text: raw report text
    :return: the cleaned text
    """
    # Each [**...**] placeholder becomes an equally wide run of spaces.
    text = re.sub(r'\[\*\*.*?\*\*\]', pattern_repl, text)
    # Underscores become single spaces.
    text = text.replace('_', ' ')

    # Keep everything before the cut-off and pad the removed tail with spaces
    # (no padding is added when the cut-off is the end of the text).
    end = find_end(text)
    return text[:end] + ' ' * (len(text) - end)

def preprocess_mimic(text):
    """
    Lazily turn one MIMIC-III report into lower-cased, tokenized sentences.

    Pipeline:
    1. clean the text (placeholders and signature are blanked out)
    2. cut the cleaned text into sections at the known headings
    3. split each section into sentences and each sentence into word tokens
    4. join the tokens with single spaces and lowercase the result

    Yields one string per sentence.
    """
    cleaned = clean_text(text)
    for section in split_heading(cleaned):
        for sentence in sent_tokenize(section):
            tokens = word_tokenize(sentence)
            yield ' '.join(tokens).lower()
            
def split_heading(text):
    """
    Cut a report into sections at every SECTION_TITLES match.

    Yields, in document order, the text found before each heading, the heading
    itself, and finally whatever follows the last heading. Every piece is
    stripped of surrounding whitespace and empty pieces are skipped.
    """
    cursor = 0
    for heading in SECTION_TITLES.finditer(text):
        title_start, title_end = heading.span()
        # First the body preceding this heading, then the heading text itself.
        for piece in (text[cursor:title_start], text[title_start:title_end]):
            piece = piece.strip()
            if piece:
                yield piece
        cursor = title_end

    # Remainder after the last heading (or the whole text if none matched).
    tail = text[cursor:].strip()
    if tail:
        yield tail
            
# Markers that introduce the trailing signature/footer of a report.
# Compiled once here instead of on every call to find_end.
_REPORT_END_PATTERNS = (
    re.compile(r'BY ELECTRONICALLY SIGNING THIS REPORT', re.I),
    # The dot is escaped so that only a literal "DR." on an indented line
    # counts; unescaped it also matched ordinary indented words such as
    # "Drainage" and cut the report short.
    re.compile(r'\n {3,}DR\.', re.I),
    re.compile(r'[ ]{1,}RADLINE ', re.I),
    # The leading .* moves the cut back to the start of the line that
    # contains the phrase.
    re.compile(r'.*electronically signed on', re.I),
    re.compile(r'M\[0KM\[0KM'),
)


def find_end(text):
    """
    Find the end of the report.

    Searches `text` for each known end-of-report marker and returns the
    smallest start offset among those that match.

    :param text: report text
    :return: index at which the report body ends; `len(text)` when no marker
        is present
    """
    ends = [len(text)]
    for pattern in _REPORT_END_PATTERNS:
        matchobj = pattern.search(text)
        if matchobj:
            ends.append(matchobj.start())
    return min(ends)

In [9]:
# Directory on the mounted Drive that the notebook's pickle files are read from and written to.
PREPROCESS = "/content/drive/MyDrive/Colab Notebooks/ConvolutionMedicalNer/data/"

In [10]:
import pandas as pd
import os
import numpy as np
import re
import preprocess

In [11]:
# Load the notes table from its pickle and show its dimensions.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
clinical_notes = pd.read_pickle(os.path.join(PREPROCESS, "sub_notes.p"))
clinical_notes.shape

(181483, 20)

In [12]:
# Keep only the notes that have a subject id, a chart time and a text body.
required_columns = ['SUBJECT_ID', 'CHARTTIME', 'TEXT']
sub_notes = clinical_notes.dropna(subset=required_columns)

In [13]:
# Dimensions after the null filtering, for comparison with the freshly loaded table.
sub_notes.shape

(181483, 20)

In [14]:
# Narrow the frame to the identifier, time and text columns. The explicit
# .copy() makes sub_notes an independent DataFrame, so adding or filling
# columns on it later does not trigger SettingWithCopyWarning.
sub_notes = sub_notes[['SUBJECT_ID', 'HADM_ID_y', 'CHARTTIME', 'TEXT']].copy()

In [15]:
# Create the output column up front; filling it with None gives it object
# dtype, so each cell can hold a Python list.
sub_notes['preprocessed_text'] = None

In [16]:
# The NLTK sentence tokenizer needs the 'punkt' data package. Download it here
# so the cell also works on a fresh runtime; when the package is already
# present NLTK only reports that it is up to date.
# NOTE(review): newer NLTK releases look for 'punkt_tab' instead - confirm
# against the installed version.
import nltk
nltk.download('punkt')

# Preprocess every note. Series.map calls getSentences once per TEXT value and
# returns a Series on the same index, so the assignment also works when index
# labels are not unique (a per-row `.at` write would not).
sub_notes['preprocessed_text'] = sub_notes['TEXT'].map(getSentences)
# or `sub_notes['preprocessed_text'] = sub_notes['TEXT'].map(preprocess.getSentences)`

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### Save notes

In [17]:
# Write the notes table, including the preprocessed_text column, to the data directory.
pd.to_pickle(sub_notes, os.path.join(PREPROCESS, "preprocessed_notes.p"))

### Additional preprocessing

In [None]:
# sub_notes = pd.read_pickle(os.path.join(PREPROCESS, "preprocessed_notes.p"))

# def preprocess1(x):
#     y=re.sub('\\[(.*?)\\]','',x) #remove de-identified brackets
#     y=re.sub('[0-9]+\.','',y) #remove 1.2. since the segmenter segments based on this
#     y=re.sub('dr\.','doctor',y)
#     y=re.sub('m\.d\.','md',y)
#     y=re.sub('admission date:','',y)
#     y=re.sub('discharge date:','',y)
#     y=re.sub('--|__|==','',y)
#     return y

# def preprocessing(df_less_n): 
#     df_less_n['preprocessed_text_v2']=df_less_n['preprocessed_text'].fillna(' ')
#     df_less_n['preprocessed_text_v2']=df_less_n['preprocessed_text_v2'].str.replace('\n',' ')
#     #df_less_n['preprocessed_text_v2']=df_less_n['preprocessed_text_v2'].str.replace('\r',' ')
#     #df_less_n['preprocessed_text_v2']=df_less_n['preprocessed_text_v2'].apply(str.strip)
#     #df_less_n['preprocessed_text_v2']=df_less_n['preprocessed_text_v2'].str.lower()

#     df_less_n['preprocessed_text_v2']=df_less_n['preprocessed_text_v2'].apply(lambda x: preprocess1(x))
#     return df_less_n  # needed: without it the assignment below would set sub_notes to None

# sub_notes = preprocessing(sub_notes)