In [None]:
# default_exp preprocessing

In [None]:
#export
import nltk
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
import string
import re

# Preprocessing

> Methods for preprocessing raw text data

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
def decontracted(text):
    """Expands common English contractions in `text` and replaces `/` with a space.

    Credit: https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python

    Matching is case-sensitive. Apostrophe-less spellings (`dont`, `im`, `ive`, ...) are
    only expanded when they stand alone as a word, so words that merely contain them
    (`him`, `give`, `orthodontist`) are left untouched and surrounding whitespace is kept.

    Returns the rewritten string.
    """
    # Order matters: rules are applied top to bottom, each on the output of the previous one.
    contractions = [
        # Irregular forms first, otherwise the generic "n't" rule would yield "wo not" / "ca not"
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"hadn't", "had not"),
        # Apostrophe-less spellings: \b keeps these from firing inside longer words
        (r"\bdoesnt\b", "does not"),
        (r"\byoure\b", "you are"),
        (r"\bdont\b", "do not"),
        (r"\bim\b", "i am"),
        (r"\bive\b", "i have"),
        # Generic suffix rules
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    ]

    # Replace any contractions with their decontracted equivalent
    for pattern, expansion in contractions:
        text = re.sub(pattern, expansion, text)

    # Treat slashes as word separators (e.g. "and/or" -> "and or")
    return text.replace("/", " ")

In [None]:
#export
def token_filter(token, stop_words):
    """Returns False for stop words and tokens with no alpha characters, otherwise, True

    `token` is the string to test; `stop_words` is any container supporting `in`.
    """
    # re.search (not re.match) so a letter anywhere in the token counts, e.g. "1st".
    # The class is spelled out as [a-zA-Z] because the range [A-z] also covers the
    # ASCII symbols [ \ ] ^ _ ` that sit between "Z" and "a".
    has_alpha = re.search('[a-zA-Z]', token) is not None
    return has_alpha and token not in stop_words

In [None]:
# token_filter tests
stop_words = ['stop_word']

# Tokens that start with a letter and are not stop words must be kept
kept_tokens = ['token', 'TOKEN', 'under_score', 'a1c']
assert all(token_filter(t, stop_words) == True for t in kept_tokens)

# Stop words, purely numeric tokens and whitespace must be rejected
dropped_tokens = ['stop_word', '1', ' ']
assert all(token_filter(t, stop_words) == False for t in dropped_tokens)

In [None]:
#export
def tokenize_and_stem(text):
    """Parse out sentences, remove contractions, tokenize by white space, and remove all punctuation, and lemmatize tokens

    Returns a tuple `(split_sent, raw_sent)`:

    - `split_sent`: one list per sentence holding its lowercased, lemmatized tokens that pass `token_filter`
    - `raw_sent`: the sentences exactly as returned by `nltk.sent_tokenize`, in the same order
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Built once and reused for every sentence
    tokenizer = WhitespaceTokenizer()
    custom_stop_words = {"patient","mrs","hi","ob","1am","4month","o2","ed","ecmo","m3","ha","3rd","ai","csicu","wa","first",
                         "second","third","fourth","etc","eg","thus",",",".","'","(",")","!","...","'m","'s",'"',"?", "`",
                         "say","many","things","new","much","get","really","since","way","also","one","two","three","four",
                         "five","six","week","day","month","year","would","could","should","like","im","thing","v","u","d","g"}
    stop_words = set(stopwords.words('english')) | custom_stop_words
    # Deletes ASCII punctuation and curly double quotes; the ' ' -> ' ' mapping is a no-op
    table = str.maketrans(' ', ' ', string.punctuation+"“"+"”")
    raw_sent = list(nltk.sent_tokenize(text))
    split_sent = []

    # For each sentence in document get back the list of tokenized words with contractions normalized and punctuation removed
    for s in raw_sent:
        # Contractions are expanded before punctuation is stripped, while the apostrophes are still present
        tokenized = tokenizer.tokenize(decontracted(s).translate(table))

        # Lowercase BEFORE lemmatizing: the WordNet lemmatizer looks words up as given, so a
        # capitalized token (e.g. sentence-initial "Patients") would come back unlemmatized
        # and then fail to match stop words such as "patient".
        lemma = [lemmatizer.lemmatize(t.lower()) for t in tokenized]

        # Filter out stopwords (including some punctuation) and tokens with no alpha characters
        split_sent.append([token for token in lemma if token_filter(token, stop_words)])

    return split_sent, raw_sent

In [None]:
#export
def tokenize_and_stem_original(text):
    """
    Original method which parses POS and LOC. `tokenize_and_stem` is a stripped down version of this.

    Parse out sentences, remove contractions, tokenize by white space, and remove all punctuation, and lemmatize tokens

    Returns a 10-tuple, in this order:

    - `split_sent`, `split_sent_pos`, `split_sent_loc`: per-sentence lists of the kept tokens, their
      `nltk.pos_tag` entries, and their indices within the sentence's full token list
    - `my_tokens`, `my_pos_tags`, `my_loc`: the same three, flattened across all sentences
    - `full_sent`, `full_pos`, `full_loc`: per-sentence lists of all lowercased lemmas, tags and indices
      before any filtering
    - `raw_sent`: the sentences as returned by `nltk.sent_tokenize`
    """
    lemmatizer = nltk.WordNetLemmatizer()
    # Built once and reused for every sentence
    tokenizer = WhitespaceTokenizer()
    custom_stop_words = {"patient","mrs","hi","ob","1am","4month","o2","ed","ecmo","m3","ha","3rd","ai","csicu","wa","first",
                         "second","third","fourth","etc","eg","thus",",",".","'","(",")","!","...","'m","'s",'"',"?", "`",
                         "say","many","things","new","much","get","really","since","way","also","one","two","three","four",
                         "five","six","week","day","month","year","would","could","should","like","im","thing","v","u","d","g"}
    stop_words = set(stopwords.words('english')) | custom_stop_words
    # Deletes ASCII punctuation and curly double quotes; the ' ' -> ' ' mapping is a no-op
    table = str.maketrans(' ', ' ', string.punctuation+"“"+"”")

    my_tokens, my_pos_tags, my_loc = [], [], []
    full_sent, full_pos, full_loc = [], [], []
    split_sent, split_sent_pos, split_sent_loc = [], [], []
    raw_sent = []

    # For each sentence in document get back the list of tokenized words with contractions normalized and punctuation removed
    for s in nltk.sent_tokenize(text):
        raw_sent.append(s)
        tokenized = tokenizer.tokenize(decontracted(s).translate(table))
        # POS-tag the tokens as written (original case), before lemmatizing/lowercasing
        tags = nltk.pos_tag(tokenized)

        # Lowercase BEFORE lemmatizing: the WordNet lemmatizer looks words up as given, so a
        # capitalized token (e.g. sentence-initial "Patients") would come back unlemmatized
        # and then fail to match stop words such as "patient".
        lemmas = [lemmatizer.lemmatize(t.lower()) for t in tokenized]
        positions = list(range(len(lemmas)))

        # Keep tokens that are not stop words and contain at least one letter
        # (drops numeric tokens and raw punctuation), carrying tag and position along
        filtered_tokens, filtered_tags, filtered_loc = [], [], []
        for lemma, tag, pos in zip(lemmas, tags, positions):
            if lemma in stop_words or not re.search('[a-zA-Z]', lemma):
                continue
            filtered_tokens.append(lemma)
            filtered_tags.append(tag)
            filtered_loc.append(pos)

        split_sent.append(filtered_tokens)
        split_sent_pos.append(filtered_tags)
        split_sent_loc.append(filtered_loc)

        # extend() instead of `a = a + b`, which re-copies the whole accumulator every sentence
        my_tokens.extend(filtered_tokens)
        my_pos_tags.extend(filtered_tags)
        my_loc.extend(filtered_loc)

        full_sent.append(lemmas)
        full_pos.append(tags)
        full_loc.append(positions)

    return split_sent, split_sent_pos, split_sent_loc, my_tokens, my_pos_tags, my_loc, full_sent, full_pos, full_loc, raw_sent

In [None]:
#hide
# Runs nbdev's `notebook2script`; the cell output recorded below lists each notebook it converted.
from nbdev.export import notebook2script
notebook2script()

Converted Core.ipynb.
Converted helpers.ipynb.
Converted index.ipynb.
Converted nlp_helpers.ipynb.
Converted preprocessing.ipynb.
Converted Sandbox.ipynb.
