# Building a Pneumonia Vocabulary

In [10]:
import nltk
import string
import pickle as pkl
import pandas as pd
from string import punctuation
from nltk.corpus import stopwords
from collections import Counter

## NLP Helper Packages

#### We also need to load some nltk resources. These resources will help us clean the data.

In [11]:
# Text contains many high-frequency "stop words" that are there mainly for fluency.
# Download NLTK's stop-word lists so those words can be filtered out of the tokens.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jferraro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Syntactic NLP Processing

#### We want to customize tokenization so that we have more control over our text and can remove textual noise specific to these reports, such as dates, ages, etc.

In [12]:
# Cache for NLTK's English stop words; filled the first time it is needed so
# the word list is not re-read and re-built for every document.
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words('english'))
    return _STOP_WORDS


# turn a doc into clean tokens
def tokenize_doc(doc):
    """Split a document into cleaned, lower-cased word tokens.

    Processing steps, in order:
      1. split the text on whitespace;
      2. delete every ``string.punctuation`` character from each piece
         (punctuation is removed, not used as a split point, so ``x-ray``
         becomes ``xray``);
      3. drop pieces that are not purely alphabetic, which discards numbers,
         dates and mixed letter/digit pieces;
      4. lower-case what remains;
      5. drop NLTK English stop words;
      6. drop single-character tokens.

    Args:
        doc: The document text as a string.

    Returns:
        A list of tokens in document order; repeated words are kept.
    """
    # split into tokens by white space
    pieces = doc.split()
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    stop_words = _english_stop_words()

    tokens = []
    for piece in pieces:
        word = piece.translate(table)
        # skip anything that is not alphabetic once punctuation is gone
        if not word.isalpha():
            continue
        word = word.lower()
        # filter out stop words and short tokens
        if word in stop_words or len(word) <= 1:
            continue
        tokens.append(word)
    return tokens

# save list to file
def save_list(lines, filename):
    """Write an iterable of strings to a text file, one item per line.

    The items are joined with a newline; no trailing newline is written.
    An existing file at ``filename`` is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
    """
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # the context manager closes the file even if the write fails
    with open(filename, 'w') as handle:
        handle.write(data)

## Retrieve our Corpus

#### Let's pull in our corpus that we had serialized out to disk.

In [13]:
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('data/differential-corpus.pkl', 'rb') as file:
    corpus = pkl.load(file)

## Data Structure Transformation

#### Let's build a dataframe so we can select case types easily.

In [14]:
# Collect one record per case and build the frame in a single step.
# Calling DataFrame.append inside the loop copies the whole frame on every
# iteration, and the method no longer exists in pandas 2.0+.
records = []
for case, document in corpus.items():
    # The label is derived from the case identifier (e.g. 'PNA1' -> 'PNA').
    if 'PNA' in case:
        label = 'PNA'
    elif 'COPD' in case:
        label = 'COPD'
    else:  # CHF
        label = 'CHF'
    records.append({'case': case, 'document': document, 'label': label})

# Passing `columns` keeps the three columns (and their order) even when the
# corpus is empty.
df = pd.DataFrame(records, columns=['case', 'document', 'label'])
df.head()

Unnamed: 0,case,document,label
0,PNA1,\n\n\n DATE: [**2996-12-2**] 10:25 AM\n ...,PNA
1,PNA2,\n\n\n DATE: [**2850-2-14**] 10:22 PM\n ...,PNA
2,PNA3,\n\n\n DATE: [**2631-10-3**] 9:52 AM\n ...,PNA
3,PNA4,\n\n\n DATE: [**2584-11-21**] 11:17 AM\n ...,PNA
4,PNA5,\n\n\n DATE: [**2584-11-21**] 11:17 AM\n ...,PNA


## Building a Vocabulary

#### Using a data frame makes it easy to select between different cohorts.

In [15]:
# Counter mapping each token to the number of times it occurs; the counts are
# used to decide which words are frequent enough to keep in the vocabulary.
vocab = Counter()

In [16]:
# Restrict to the pneumonia (PNA) cases and add every token of every report
# to the running vocabulary counts.
is_pna = df['label'] == 'PNA'
df_pna = df[is_pna]
for report_text in df_pna['document']:
    vocab.update(tokenize_doc(report_text))
# Show the 100 most frequent tokens with their counts.
print(vocab.most_common(100))

[('right', 1479), ('left', 1311), ('chest', 1304), ('clip', 1235), ('reason', 1143), ('contrast', 1102), ('ct', 1036), ('lobe', 947), ('pleural', 858), ('pneumonia', 842), ('lower', 702), ('lung', 655), ('number', 646), ('report', 646), ('date', 641), ('final', 638), ('radiology', 633), ('effusion', 616), ('examination', 614), ('bilateral', 555), ('underlying', 542), ('upper', 538), ('old', 534), ('year', 505), ('small', 502), ('within', 500), ('condition', 499), ('medical', 496), ('pulmonary', 493), ('ap', 487), ('impression', 471), ('tube', 469), ('portable', 468), ('effusions', 417), ('seen', 413), ('interval', 399), ('prior', 391), ('pm', 375), ('opacities', 358), ('indication', 351), ('diagnosis', 350), ('comparison', 346), ('unchanged', 339), ('please', 335), ('consolidation', 332), ('admitting', 328), ('study', 319), ('abdomen', 317), ('evidence', 305), ('evaluate', 297), ('eval', 296), ('iv', 286), ('man', 285), ('opacity', 277), ('new', 275), ('change', 274), ('optiray', 274),

In [17]:
# Keep only the tokens seen at least `min_occurrence` times, reporting the
# vocabulary size before and after the cut.
min_occurrence = 2
print('Before : %d' % len(vocab))
tokens = [word for word, count in vocab.items() if count >= min_occurrence]
print('After : %d' % len(tokens))

Before : 3375
After : 2619


In [18]:

# save the kept tokens to the pneumonia vocabulary file, one token per line
save_list(tokens, 'data/pna_vocab.txt')