In [1]:
import datetime as dt
import os
import sys

import numpy as np
import pandas as pd
from scipy import interp
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_curve, average_precision_score, precision_recall_curve
from sklearn.model_selection import StratifiedKFold
import sqlalchemy as sa
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
%matplotlib inline

sys.path.append('../')
from utilities import sql_utils as su
from utilities import model_eval_utils as meu

DWH = os.getenv('MIMIC_DWH')
engine = create_engine(DWH)

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000
pd.set_option('display.float_format', lambda x: '%.3f' % x)

  """)


In [2]:
QUERY = """
select
  subject_id,
  hadm_id,
  chartdate,
  text
from mimiciii.noteevents
limit 200000
"""
with engine.connect() as conn:
    df = pd.read_sql(QUERY, conn)

In [3]:
df.shape

(200000, 4)

In [4]:
df.head()

Unnamed: 0,subject_id,hadm_id,chartdate,text
0,22532,167853.0,2151-08-04,Admission Date: [**2151-7-16**] Dischar...
1,13702,107527.0,2118-06-14,Admission Date: [**2118-6-2**] Discharg...
2,13702,167118.0,2119-05-25,Admission Date: [**2119-5-4**] D...
3,13702,196489.0,2124-08-18,Admission Date: [**2124-7-21**] ...
4,26880,135453.0,2162-03-25,Admission Date: [**2162-3-3**] D...


In [5]:
data_text = df[['text']]
data_text['index'] = data_text.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
data_text.head()

Unnamed: 0,text,index
0,Admission Date: [**2151-7-16**] Dischar...,0
1,Admission Date: [**2118-6-2**] Discharg...,1
2,Admission Date: [**2119-5-4**] D...,2
3,Admission Date: [**2124-7-21**] ...,3
4,Admission Date: [**2162-3-3**] D...,4


In [7]:
documents = data_text

In [8]:
documents.head()

Unnamed: 0,text,index
0,Admission Date: [**2151-7-16**] Dischar...,0
1,Admission Date: [**2118-6-2**] Discharg...,1
2,Admission Date: [**2119-5-4**] D...,2
3,Admission Date: [**2124-7-21**] ...,3
4,Admission Date: [**2162-3-3**] D...,4


In [9]:
print(len(documents))
print(documents[:5])

200000
                                                text  index
0  Admission Date:  [**2151-7-16**]       Dischar...      0
1  Admission Date:  [**2118-6-2**]       Discharg...      1
2  Admission Date:  [**2119-5-4**]              D...      2
3  Admission Date:  [**2124-7-21**]              ...      3
4  Admission Date:  [**2162-3-3**]              D...      4


## Data Preprocessing

1. Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
2. Words that have fewer than 3 characters are removed.
3. All stopwords are removed.
4. Words are lemmatized — words in third person are changed to first person and verbs in past and future tenses are changed into present.
5. Words are stemmed — words are reduced to their root form.

### Loading gensim and nltk libraries

In [10]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
# import nltk.stem as stemmer
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

stemmer = SnowballStemmer('english')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/VincentLa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
def lemmatize_stemming(text):
    """
    Lemmatize: lemmatized — words in third person are changed to first person
    
    Verbs in past and future tenses are changed into present.
    """
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

In [12]:
def preprocess(text):
    """
    Preprocess Text:
    
    Remove words in "STOPWORDS" and remove words 3 letters or less
    """
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [13]:
doc_sample = documents[documents['index'] == 4310].values[0][0]
print('original document: ')

words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['Admission', 'Date:', '', '[**2171-5-6**]', '', '', '', '', 'Discharge', 'Date:', '', '[**2171-5-8**]\n\nDate', 'of', 'Birth:', '', '', '[**2147-8-13**]', '', '', '', '', 'Sex:', '', 'F\n\nService:', '', '[**Hospital1', '212**]\n\nCHIEF', 'COMPLAINT:', '', 'Hypoxia,', 'hemoptysis,', 'fever.\n\nHISTORY', 'OF', 'PRESENT', 'ILLNESS:', '', 'This', 'is', 'a', '23', 'year', 'old', 'woman', 'with\nSLE,', 'lupus', 'nephritis,', 'end', 'stage', 'renal', 'disease', 'on\nhemodialysis,', 'warm', 'antibody', 'hemolytic', 'anemia', 'on', '50', 'mg', 'of\nprednisone', 'times', 'one', 'month', 'who', 'was', 'in', 'her', 'usual', 'state', 'of\nhealth', 'until', 'Friday,', '[**2171-5-3**],', 'when', 'she', 'developed', 'a', 'cough.\nShe', 'was', 'otherwise', 'well', 'and', 'was', 'able', 'to', 'undergo', 'all', 'her', 'normal\nweekend', 'activities', 'until', 'the', 'morning', 'of', '[**5-6**]', 'when', 'she\ndeveloped', 'fever,', 'chills,', 'a', 'few', 'teaspoons', 'of', 'hemoptysi

['admiss', 'date', 'discharg', 'date', 'date', 'birth', 'servic', 'hospit', 'chief', 'complaint', 'hypoxia', 'hemoptysi', 'fever', 'histori', 'present', 'ill', 'year', 'woman', 'lupus', 'nephriti', 'stage', 'renal', 'diseas', 'hemodialysi', 'warm', 'antibodi', 'hemolyt', 'anemia', 'prednison', 'time', 'month', 'usual', 'state', 'health', 'friday', 'develop', 'cough', 'abl', 'undergo', 'normal', 'weekend', 'activ', 'morn', 'develop', 'fever', 'chill', 'teaspoon', 'hemoptysi', 'right', 'upper', 'quadrant', 'pain', 'bring', 'emerg', 'room', 'sat', 'room', 'improv', 'liter', 'respiratori', 'rate', 'give', 'ceftriaxon', 'lopressor', 'tylenol', 'right', 'upper', 'quadrant', 'short', 'breath', 'improv', 'past', 'medic', 'histori', 'notabl', 'diagnos', 'treat', 'prednison', 'warm', 'antibodi', 'hemolyt', 'anemia', 'diagnos', 'prednison', 'taper', 'initi', 'time', 'month', 'stage', 'renal', 'diseas', 'hemodialysi', 'pneumococc', 'sepsi', 'status', 'post', 'intub', 'sickl', 'cell', 'trait', 'sta

In [14]:
processed_docs = documents['text'].map(preprocess)
processed_docs[:10]

0    [admiss, date, discharg, date, servic, addendu...
1    [admiss, date, discharg, date, date, birth, se...
2    [admiss, date, discharg, date, servic, cardiot...
3    [admiss, date, discharg, date, servic, medicin...
4    [admiss, date, discharg, date, date, birth, se...
5    [admiss, date, discharg, date, date, birth, se...
6    [admiss, date, discharg, date, servic, medicin...
7    [admiss, date, discharg, date, date, birth, se...
8    [admiss, date, discharg, date, date, birth, se...
9    [admiss, date, discharg, date, date, birth, se...
Name: text, dtype: object

## Bag of Words on the Data set
Create a dictionary from ‘processed_docs’ containing the number of times a word appears in the training set.

In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 abdomin
1 addendum
2 admiss
3 apex
4 cavitari
5 chest
6 confirm
7 consist
8 date
9 dictat
10 discharg


### Gensim filter_extremes
Filter out tokens that appear in

1. less than 15 documents (absolute number) or
2. more than 0.5 documents (fraction of total corpus size, not absolute number).
3. after the above two steps, keep only the first 100000 most frequent tokens.

In [16]:
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

## Gensim doc2bow (Bag of Words)
For each document we create a dictionary reporting how many
words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.

In [17]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[4310]

[(0, 2),
 (2, 2),
 (5, 1),
 (8, 1),
 (9, 7),
 (11, 1),
 (12, 4),
 (15, 3),
 (16, 3),
 (18, 2),
 (23, 1),
 (25, 1),
 (26, 4),
 (27, 1),
 (30, 2),
 (32, 1),
 (33, 1),
 (37, 1),
 (39, 3),
 (40, 2),
 (41, 5),
 (46, 1),
 (47, 1),
 (52, 1),
 (53, 1),
 (62, 1),
 (68, 1),
 (80, 1),
 (81, 1),
 (84, 3),
 (86, 1),
 (87, 1),
 (98, 1),
 (102, 1),
 (104, 1),
 (105, 1),
 (114, 2),
 (117, 1),
 (119, 2),
 (120, 1),
 (124, 1),
 (125, 2),
 (138, 2),
 (146, 2),
 (149, 1),
 (154, 1),
 (162, 1),
 (172, 1),
 (179, 2),
 (184, 1),
 (193, 1),
 (194, 1),
 (195, 1),
 (196, 1),
 (198, 1),
 (202, 1),
 (204, 1),
 (213, 2),
 (218, 1),
 (220, 1),
 (223, 3),
 (226, 1),
 (227, 1),
 (231, 2),
 (238, 1),
 (248, 1),
 (254, 3),
 (261, 1),
 (263, 1),
 (265, 1),
 (268, 1),
 (270, 1),
 (283, 1),
 (286, 1),
 (288, 1),
 (296, 1),
 (308, 3),
 (324, 1),
 (328, 3),
 (331, 11),
 (332, 1),
 (337, 1),
 (340, 2),
 (341, 1),
 (342, 2),
 (345, 3),
 (346, 3),
 (348, 5),
 (350, 1),
 (352, 2),
 (364, 1),
 (369, 1),
 (370, 1),
 (371, 3),
 (3

In [18]:
bow_doc_4310 = bow_corpus[4310]
for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                               dictionary[bow_doc_4310[i][0]], 
bow_doc_4310[i][1]))

Word 0 ("abdomin") appears 2 time.
Word 2 ("admiss") appears 2 time.
Word 5 ("chest") appears 1 time.
Word 8 ("dictat") appears 1 time.
Word 9 ("discharg") appears 7 time.
Word 11 ("effus") appears 1 time.
Word 12 ("follow") appears 4 time.
Word 15 ("histori") appears 3 time.
Word 16 ("hospit") appears 3 time.
Word 18 ("includ") appears 2 time.
Word 23 ("like") appears 1 time.
Word 25 ("mass") appears 1 time.
Word 26 ("medic") appears 4 time.
Word 27 ("medquist") appears 1 time.
Word 30 ("number") appears 2 time.
Word 32 ("outpati") appears 1 time.
Word 33 ("past") appears 1 time.
Word 37 ("repeat") appears 1 time.
Word 39 ("secondari") appears 3 time.
Word 40 ("servic") appears 2 time.
Word 41 ("show") appears 5 time.
Word 46 ("abl") appears 1 time.
Word 47 ("abnorm") appears 1 time.
Word 52 ("adequ") appears 1 time.
Word 53 ("admit") appears 1 time.
Word 62 ("allergi") appears 1 time.
Word 68 ("appear") appears 1 time.
Word 80 ("bilater") appears 1 time.
Word 81 ("birth") appears 1 t

## TF IDF 
Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, then apply transformation to the entire corpus and call it ‘corpus_tfidf’. Finally we preview TF-IDF scores for our first document.

In [19]:
from gensim import corpora, models
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.20984918688538046),
 (1, 0.1734458382975791),
 (2, 0.05878068974385455),
 (3, 0.13768462239052914),
 (4, 0.2888172222311172),
 (5, 0.07264987984801156),
 (6, 0.15198630791070009),
 (7, 0.1724289077557035),
 (8, 0.11261352342192697),
 (9, 0.058850794221396274),
 (10, 0.1336591513998094),
 (11, 0.05421785856883934),
 (12, 0.06450866721623826),
 (13, 0.20657681946685544),
 (14, 0.13409115367756516),
 (15, 0.06229248355414667),
 (16, 0.05931060352926453),
 (17, 0.07302414488312661),
 (18, 0.08682731994405998),
 (19, 0.08223894821140666),
 (20, 0.1333669640980876),
 (21, 0.16233409514367825),
 (22, 0.2374998012980466),
 (23, 0.08257282324492556),
 (24, 0.07908867735750663),
 (25, 0.09376000101596972),
 (26, 0.05975402634300554),
 (27, 0.11535716578158435),
 (28, 0.06666382785366787),
 (29, 0.20786951323489367),
 (30, 0.1643090375098856),
 (31, 0.2128710642947576),
 (32, 0.0991744671072435),
 (33, 0.06721596082119884),
 (34, 0.1086420714397059),
 (35, 0.13476007304855736),
 (36, 0.310

## Running LDA using Bag of Words
Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’

In [20]:
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=10,
                                       id2word=dictionary,
                                       passes=2,
                                       workers=2)

In [21]:
type(lda_model)

gensim.models.ldamulticore.LdaMulticore

For each topic, we will explore the words occuring in that topic and its relative weight.

In [22]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.050*"valv" + 0.036*"mild" + 0.032*"aortic" + 0.027*"mitral" + 0.025*"ventricular" + 0.024*"leaflet" + 0.019*"right" + 0.018*"size" + 0.018*"systol" + 0.017*"wall"
Topic: 1 
Words: 0.035*"blood" + 0.012*"patient" + 0.010*"hospit" + 0.009*"cultur" + 0.008*"discharg" + 0.007*"tablet" + 0.007*"daili" + 0.005*"negat" + 0.005*"continu" + 0.005*"final"
Topic: 2 
Words: 0.051*"patient" + 0.014*"discharg" + 0.011*"histori" + 0.010*"hospit" + 0.008*"medic" + 0.008*"time" + 0.008*"blood" + 0.008*"admiss" + 0.007*"arteri" + 0.007*"status"
Topic: 3 
Words: 0.050*"tablet" + 0.023*"daili" + 0.016*"discharg" + 0.013*"pain" + 0.012*"blood" + 0.011*"refil" + 0.011*"disp" + 0.011*"medic" + 0.010*"histori" + 0.009*"hospit"
Topic: 4 
Words: 0.015*"right" + 0.009*"hospit" + 0.009*"patient" + 0.009*"discharg" + 0.008*"head" + 0.007*"bilater" + 0.006*"medic" + 0.006*"histori" + 0.006*"hemorrhag" + 0.006*"tablet"
Topic: 5 
Words: 0.015*"patient" + 0.010*"discharg" + 0.009*"bleed" + 0.009*"li

## Running LDA using TF-IDF


In [23]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.006*"blood" + 0.006*"reappear" + 0.005*"cultur" + 0.005*"tablet" + 0.004*"final" + 0.004*"analyz" + 0.004*"discharg" + 0.004*"hospit" + 0.004*"negat" + 0.003*"daili"
Topic: 1 Word: 0.020*"tablet" + 0.011*"daili" + 0.006*"discharg" + 0.006*"blood" + 0.006*"disp" + 0.005*"refil" + 0.005*"hospit" + 0.005*"releas" + 0.004*"histori" + 0.004*"capsul"
Topic: 2 Word: 0.023*"infant" + 0.012*"life" + 0.009*"immun" + 0.008*"feed" + 0.008*"newborn" + 0.008*"screen" + 0.007*"gestat" + 0.007*"babi" + 0.007*"discharg" + 0.007*"deliveri"
Topic: 3 Word: 0.008*"tablet" + 0.005*"blood" + 0.005*"discharg" + 0.005*"hospit" + 0.005*"daili" + 0.004*"histori" + 0.003*"medic" + 0.003*"patient" + 0.003*"admiss" + 0.002*"seizur"
Topic: 4 Word: 0.009*"tablet" + 0.006*"fractur" + 0.006*"discharg" + 0.006*"mar" + 0.005*"blood" + 0.005*"pain" + 0.004*"hospit" + 0.004*"medic" + 0.004*"daili" + 0.003*"histori"
Topic: 5 Word: 0.012*"discharg" + 0.010*"postop" + 0.009*"patient" + 0.009*"number" + 0.008*

## Performance evaluation by classifying sample document using LDA Bag of Words model

In [24]:
processed_docs[4310]

['admiss',
 'date',
 'discharg',
 'date',
 'date',
 'birth',
 'servic',
 'hospit',
 'chief',
 'complaint',
 'hypoxia',
 'hemoptysi',
 'fever',
 'histori',
 'present',
 'ill',
 'year',
 'woman',
 'lupus',
 'nephriti',
 'stage',
 'renal',
 'diseas',
 'hemodialysi',
 'warm',
 'antibodi',
 'hemolyt',
 'anemia',
 'prednison',
 'time',
 'month',
 'usual',
 'state',
 'health',
 'friday',
 'develop',
 'cough',
 'abl',
 'undergo',
 'normal',
 'weekend',
 'activ',
 'morn',
 'develop',
 'fever',
 'chill',
 'teaspoon',
 'hemoptysi',
 'right',
 'upper',
 'quadrant',
 'pain',
 'bring',
 'emerg',
 'room',
 'sat',
 'room',
 'improv',
 'liter',
 'respiratori',
 'rate',
 'give',
 'ceftriaxon',
 'lopressor',
 'tylenol',
 'right',
 'upper',
 'quadrant',
 'short',
 'breath',
 'improv',
 'past',
 'medic',
 'histori',
 'notabl',
 'diagnos',
 'treat',
 'prednison',
 'warm',
 'antibodi',
 'hemolyt',
 'anemia',
 'diagnos',
 'prednison',
 'taper',
 'initi',
 'time',
 'month',
 'stage',
 'renal',
 'diseas',
 'hem

In [25]:
for index, score in sorted(lda_model[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.48047584295272827	 
Topic: 0.051*"patient" + 0.014*"discharg" + 0.011*"histori" + 0.010*"hospit" + 0.008*"medic" + 0.008*"time" + 0.008*"blood" + 0.008*"admiss" + 0.007*"arteri" + 0.007*"status"

Score: 0.2322763204574585	 
Topic: 0.015*"patient" + 0.010*"discharg" + 0.009*"bleed" + 0.009*"liver" + 0.009*"abdomin" + 0.009*"hospit" + 0.008*"histori" + 0.008*"blood" + 0.007*"medic" + 0.006*"pain"

Score: 0.16941964626312256	 
Topic: 0.023*"tablet" + 0.016*"daili" + 0.015*"hospit" + 0.013*"discharg" + 0.009*"patient" + 0.009*"histori" + 0.009*"medic" + 0.008*"admiss" + 0.008*"blood" + 0.007*"time"

Score: 0.06561106443405151	 
Topic: 0.035*"blood" + 0.012*"patient" + 0.010*"hospit" + 0.009*"cultur" + 0.008*"discharg" + 0.007*"tablet" + 0.007*"daili" + 0.005*"negat" + 0.005*"continu" + 0.005*"final"

Score: 0.05099734663963318	 
Topic: 0.018*"infant" + 0.017*"discharg" + 0.012*"life" + 0.011*"hospit" + 0.010*"week" + 0.010*"blood" + 0.010*"namepattern" + 0.009*"feed" + 0.009*"adm

## Performance evaluation by classifying sample document using LDA TF-IDF model.

In [26]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.7945289015769958	 
Topic: 0.012*"patient" + 0.007*"discharg" + 0.006*"hospit" + 0.005*"number" + 0.005*"histori" + 0.005*"blood" + 0.004*"namepattern" + 0.004*"unit" + 0.004*"admiss" + 0.004*"medic"

Score: 0.1946193277835846	 
Topic: 0.006*"blood" + 0.006*"reappear" + 0.005*"cultur" + 0.005*"tablet" + 0.004*"final" + 0.004*"analyz" + 0.004*"discharg" + 0.004*"hospit" + 0.004*"negat" + 0.003*"daili"


## Testing model on unseen document


In [29]:
unseen_document = """
Admission Date:  [**2151-7-16**]       Discharge Date:  [**2151-8-4**]


Service:
ADDENDUM:

RADIOLOGIC STUDIES:  Radiologic studies also included a chest
CT, which confirmed cavitary lesions in the left lung apex
consistent with infectious process/tuberculosis.  This also
moderate-sized left pleural effusion.

HEAD CT:  Head CT showed no intracranial hemorrhage or mass
effect, but old infarction consistent with past medical
history.

ABDOMINAL CT:  Abdominal CT showed lesions of
T10 and sacrum most likely secondary to osteoporosis. These can
be followed by repeat imaging as an outpatient.



                            [**First Name8 (NamePattern2) **] [**First Name4 (NamePattern1) 1775**] [**Last Name (NamePattern1) **], M.D.  [**MD Number(1) 1776**]

Dictated By:[**Hospital 1807**]
MEDQUIST36

D:  [**2151-8-5**]  12:11
T:  [**2151-8-5**]  12:21
JOB#:  [**Job Number 1808**]

"""
bow_vector = dictionary.doc2bow(preprocess(unseen_document))
for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print(index)
    print(type(lda_model.print_topic(index, 5)))
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

4
<class 'str'>
Score: 0.5339741110801697	 Topic: 0.015*"right" + 0.009*"hospit" + 0.009*"patient" + 0.009*"discharg" + 0.008*"head"
2
<class 'str'>
Score: 0.40144088864326477	 Topic: 0.051*"patient" + 0.014*"discharg" + 0.011*"histori" + 0.010*"hospit" + 0.008*"medic"
0
<class 'str'>
Score: 0.05208202451467514	 Topic: 0.050*"valv" + 0.036*"mild" + 0.032*"aortic" + 0.027*"mitral" + 0.025*"ventricular"
