In [1]:
import datetime as dt
import os
import sys

import numpy as np
import pandas as pd
from scipy import interp
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_curve, average_precision_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
import sqlalchemy as sa
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('../')
from utilities import sql_utils as su
from utilities import model_eval_utils as meu

DWH = os.getenv('MIMIC_DWH')
engine = create_engine(DWH)

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.set_option('display.float_format', lambda x: '%.3f' % x)

  """)


In [2]:
QUERY = """
select
  subject_id,
  hadm_id,
  chartdate,
  text
from mimiciii.noteevents
limit 200000
"""
with engine.connect() as conn:
    df = pd.read_sql(QUERY, conn)

In [3]:
df.shape

(200000, 4)

In [4]:
df.head()

Unnamed: 0,subject_id,hadm_id,chartdate,text
0,14139,114588.0,2198-06-06,[**2198-6-6**] 4:00 PM\n CHEST (PORTABLE AP); ...
1,1563,,2172-03-18,[**2172-3-18**] 4:00 PM\n CHEST (PA & LAT) ...
2,8182,,2194-04-16,[**2194-4-16**] 12:04 PM\n ART DUP EXT LO UNI;...
3,8297,113537.0,2115-05-01,[**2115-5-1**] 12:54 PM\n RENAL TRANSPLANT U.S...
4,20473,,2126-05-29,[**2126-5-29**] 12:56 PM\n CT HEAD W/ & W/O CO...


In [5]:
data_text = df[['text']]
data_text['index'] = data_text.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
data_text.head()

Unnamed: 0,text,index
0,[**2198-6-6**] 4:00 PM\n CHEST (PORTABLE AP); ...,0
1,[**2172-3-18**] 4:00 PM\n CHEST (PA & LAT) ...,1
2,[**2194-4-16**] 12:04 PM\n ART DUP EXT LO UNI;...,2
3,[**2115-5-1**] 12:54 PM\n RENAL TRANSPLANT U.S...,3
4,[**2126-5-29**] 12:56 PM\n CT HEAD W/ & W/O CO...,4


In [7]:
documents = data_text

In [8]:
documents.head()

Unnamed: 0,text,index
0,[**2198-6-6**] 4:00 PM\n CHEST (PORTABLE AP); ...,0
1,[**2172-3-18**] 4:00 PM\n CHEST (PA & LAT) ...,1
2,[**2194-4-16**] 12:04 PM\n ART DUP EXT LO UNI;...,2
3,[**2115-5-1**] 12:54 PM\n RENAL TRANSPLANT U.S...,3
4,[**2126-5-29**] 12:56 PM\n CT HEAD W/ & W/O CO...,4


In [9]:
print(len(documents))
print(documents[:5])

200000
                                                text  index
0  [**2198-6-6**] 4:00 PM\n CHEST (PORTABLE AP); ...      0
1  [**2172-3-18**] 4:00 PM\n CHEST (PA & LAT)    ...      1
2  [**2194-4-16**] 12:04 PM\n ART DUP EXT LO UNI;...      2
3  [**2115-5-1**] 12:54 PM\n RENAL TRANSPLANT U.S...      3
4  [**2126-5-29**] 12:56 PM\n CT HEAD W/ & W/O CO...      4


## Data Preprocessing

1. Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
2. Words that have fewer than 3 characters are removed.
3. All stopwords are removed.
4. Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
5. Words are stemmed — words are reduced to their root form.

### Loading gensim and nltk libraries

In [10]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# import nltk.stem as stemmer
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

stemmer = SnowballStemmer('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/VincentLa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
def lemmatize_stemming(text):
    """
    Lemmatize: lemmatized — words in third person are changed to first person
    
    Verbs in past and future tenses are changed into present.
    """
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [12]:
def preprocess(text):
    """
    Preprocess Text:
    
    Remove words in "STOPWORDS" and remove words 3 letters or less
    """
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [13]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')

words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['[**2163-9-22**]', '4:41', 'PM\n', 'CT', 'C-SPINE', 'W/CONTRAST;', 'CT', '100CC', 'NON', 'IONIC', 'CONTRAST', '', '', '', '', '', '', '', '', '', '', '', '', '', 'Clip', '#', '[**Clip', 'Number', '(Radiology)', '56611**]\n', 'CT', 'RECONSTRUCTION\n', 'Reason:', '?fluid', 'collection\n', '', 'Contrast:', 'OPTIRAY', 'Amt:', '100\n', '______________________________________________________________________________\n', '[**Hospital', '2**]', 'MEDICAL', 'CONDITION:\n', '', '52', 'year', 'old', 'man', 'with', 'cervical', 'fx/meningitis', 'now', 'w/AMS\n', 'REASON', 'FOR', 'THIS', 'EXAMINATION:\n', '', '?fluid', 'collection\n', 'No', 'contraindications', 'for', 'IV', 'contrast\n', '______________________________________________________________________________\n', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'FINAL', 'REPORT\n', 'HISTORY:', '', 'Status', 'post', 'cervical', 'spine', 'fractures

In [None]:
processed_docs = documents['text'].map(preprocess)
processed_docs[:10]

## Bag of Words on the Data set
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [None]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

### Gensim filter_extremes
Filter out tokens that appear in

1. less than 15 documents (absolute number) or
2. more than 0.5 documents (fraction of total corpus size, not absolute number).
3. after the above two steps, keep only the first 100000 most frequent tokens.

In [None]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

## Gensim doc2bow (Bag of Words)
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [None]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

In [None]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

## TF IDF 
Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [None]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

## Running LDA using Bag of Words
Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [None]:
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=10,
                                       id2word=dictionary,
                                       passes=2,
                                       workers=2)

In [None]:
type(lda_model)

For each topic, we will explore the words occuring in that topic and its relative weight.

In [None]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

## Running LDA using TF-IDF


In [None]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

## Performance evaluation by classifying sample document using LDA Bag of Words model

In [None]:
processed_docs[4310]

In [None]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))

## Performance evaluation by classifying sample document using LDA TF-IDF model.

In [None]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))

## Testing model on unseen document


In [None]:
unseen_document = """
Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]


Service:
ADDENDUM:

RADIOLOGIC STUDIES:  Radiologic studies also included a chest
CT, which confirmed cavitary lesions in the left lung apex
consistent with infectious process/tuberculosis.  This also
moderate-sized left pleural effusion.

HEAD CT:  Head CT showed no intracranial hemorrhage or mass
effect, but old infarction consistent with past medical
history.

ABDOMINAL CT:  Abdominal CT showed lesions of
T10 and sacrum most likely secondary to osteoporosis. These can
be followed by repeat imaging as an outpatient.



                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]

Dictated By:[**Hospital 1807**]
MEDQUIST36

D:  [**2151-8-5**]  12:11
T:  [**2151-8-5**]  12:21
JOB#:  [**Job Number 1808**]

"""
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))