In [1]:
# Libraries:
import pandas as pd
import numpy as np
import os
import re
import time
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import nltk
from nltk import FreqDist
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from collections import defaultdict

In [4]:
# Reading in the ICD Diagonoses dataset (which connects ICD9 code to their appropriate titles in diagonoses)
ICD_Diagonoses = pd.read_csv(r"D:/Mimic data/mimic-iii-clinical-database-1.4/MimicFinal/D_ICD_DIAGNOSES.csv")

In [5]:
ICD_Diagonoses.head()

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac..."
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified"
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h..."
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h..."
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou..."


In [9]:
ICD_Diagonoses.shape

(14567, 4)

In [10]:
ICD_Diagonoses['DATASET_TYPE'] = 'DIAGONOSES'

In [11]:
# Reading the Procedure Dataset (which connects ICD9 code to their appropriate titles in procedures)
ICD_Procedures = pd.read_csv(r"D:/Mimic data/mimic-iii-clinical-database-1.4/MimicFinal/D_ICD_PROCEDURES.csv")

In [12]:
ICD_Procedures.shape

(3882, 4)

In [13]:
ICD_Procedures['DATASET_TYPE'] = 'PROCEDURES'

In [14]:
# Concatinating Short and Long titles for both Diagonoses and Procedure for each ICD9 codes
Proc_Diag = pd.concat([ICD_Diagonoses, ICD_Procedures])

In [15]:
Proc_Diag['ALL_TITLE'] = Proc_Diag['SHORT_TITLE'] + ". " + Proc_Diag['LONG_TITLE']

In [16]:
print(Proc_Diag.shape)
Proc_Diag.head()

(18449, 6)


Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE,DATASET_TYPE,ALL_TITLE
0,174,1166,TB pneumonia-oth test,"Tuberculous pneumonia [any form], tubercle bac...",DIAGONOSES,TB pneumonia-oth test. Tuberculous pneumonia [...
1,175,1170,TB pneumothorax-unspec,"Tuberculous pneumothorax, unspecified",DIAGONOSES,TB pneumothorax-unspec. Tuberculous pneumothor...
2,176,1171,TB pneumothorax-no exam,"Tuberculous pneumothorax, bacteriological or h...",DIAGONOSES,TB pneumothorax-no exam. Tuberculous pneumotho...
3,177,1172,TB pneumothorx-exam unkn,"Tuberculous pneumothorax, bacteriological or h...",DIAGONOSES,TB pneumothorx-exam unkn. Tuberculous pneumoth...
4,178,1173,TB pneumothorax-micro dx,"Tuberculous pneumothorax, tubercle bacilli fou...",DIAGONOSES,TB pneumothorax-micro dx. Tuberculous pneumoth...


In [17]:
print(len(Proc_Diag.ICD9_CODE.unique()))
print(len(Proc_Diag.ROW_ID.unique()))

18376
14567


In [18]:
# Checking duplicates
#Proc_Diag[Proc_Diag.duplicated(subset=['ICD9_CODE'])]

In [21]:
# Reading the 'NOTE EVENTS' table (we only read in the required columns as the dataset can be huge and difficult to manage)
NOTEEVENTS = pd.read_csv(r"D:/Mimic data/mimic-iii-clinical-database-1.4/MimicFinal/NOTEEVENTS.csv",
                         usecols = ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT'])

In [22]:
print(NOTEEVENTS.shape)
NOTEEVENTS.head()

(2083180, 5)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CATEGORY,TEXT
0,174,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...
1,175,13702,107527.0,Discharge summary,Admission Date: [**2118-6-2**] Discharg...
2,176,13702,167118.0,Discharge summary,Admission Date: [**2119-5-4**] D...
3,177,13702,196489.0,Discharge summary,Admission Date: [**2124-7-21**] ...
4,178,26880,135453.0,Discharge summary,Admission Date: [**2162-3-3**] D...


In [23]:
# Reading the Diagonoses ICD dataframe.. that connects the hospital and patient id to associated ICD9 codes
Diagonoses_ICD = pd.read_csv(r"D:/Mimic data/mimic-iii-clinical-database-1.4/MimicFinal/DIAGNOSES_ICD.csv")

In [24]:
print(Diagonoses_ICD.shape)
Diagonoses_ICD.head()

(651047, 5)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254


In [25]:
# Merge the NOTEEVENTS and DIAGNOSES_ICD dataframes on Subjet ID and HAMD_ID (hospital ID)
NOTE_ICD_df = pd.merge(NOTEEVENTS[['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT']], 
                       Diagonoses_ICD[['SUBJECT_ID','HADM_ID', 'ICD9_CODE']],  
                       how='left', on=['SUBJECT_ID','HADM_ID'])

In [26]:
NOTE_ICD_df.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,ICD9_CODE
0,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,1193
1,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,4254
2,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,42731
3,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,2639
4,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,2762


In [27]:
#NOTE_ICD_df.TEXT[0]

In [28]:
# Merging the diagnoses and procedure titles to note events merged dataset
NOTE_ICD_df = NOTE_ICD_df.merge(Proc_Diag[['ICD9_CODE', 'DATASET_TYPE', 'ALL_TITLE']], how="left", on='ICD9_CODE')
print(NOTE_ICD_df.shape)
NOTE_ICD_df.head()

(25967884, 7)


Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,ICD9_CODE,DATASET_TYPE,ALL_TITLE
0,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,1193,DIAGONOSES,Pulmon TB NOS-micro dx. Pulmonary tuberculosis...
1,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
2,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,42731,DIAGONOSES,Atrial fibrillation. Atrial fibrillation
3,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,2639,DIAGONOSES,Protein-cal malnutr NOS. Unspecified protein-c...
4,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,2762,DIAGONOSES,Acidosis. Acidosis


In [29]:
# Removing duplicates (rows)
NOTE_ICD_df = NOTE_ICD_df[~NOTE_ICD_df.duplicated()]
print(NOTE_ICD_df.shape)

(25339733, 7)


In [92]:
#" ".join(NOTE_ICD_df[NOTE_ICD_df.ICD9_CODE == '01193']['TEXT'])

In [96]:
#NOTE_ICD_df[NOTE_ICD_df.ICD9_CODE == '01193']['TEXT'][1159345]#[1157904]

'Atrial fibrillation with a slow ventricular response. Left ventricular\nhypertrophy. Prior lateral myocardial infarction. T wave inversion in\nleads VI-V4 which may represent active anterior ischemic process. Compared to\nthe previous tracing of [**2151-7-16**] the ventricular response has slowed. The\nT waves are no longer tall and peaked and left bundle-branch block is no longer\nrecorded. However, the present tracing is different from the aspect of the\nanterior T wave abnormalities compared to the previous tracing of [**2147-3-2**]\nsimilarly in the absence of left bundle-branch block. Clinical correlation is\nsuggested.\nTRACING #1\n\n'

In [94]:
#NOTE_ICD_df[NOTE_ICD_df.ICD9_CODE == '01193']['TEXT'][1157912]

'Atrial fibrillation with a slow ventricular response. More prominent T wave\ninvresion in leads VI-V2. Rule out active anterior ischemic process and/or\ninfarction. Clinical correlation is suggested.\nTRACING #2\n\n'

In [97]:
#NOTE_ICD_df[NOTE_ICD_df.ICD9_CODE == '01193']['ALL_TITLE'].unique()

array([ 'Pulmon TB NOS-micro dx. Pulmonary tuberculosis, unspecified, tubercle bacilli found (in sputum) by microscopy'], dtype=object)

In [30]:
# building a test set
test = NOTE_ICD_df[NOTE_ICD_df.ICD9_CODE.isin(['01193', '4254', '42731'])]
test.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,ICD9_CODE,DATASET_TYPE,ALL_TITLE
0,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,1193,DIAGONOSES,Pulmon TB NOS-micro dx. Pulmonary tuberculosis...
1,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
2,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,42731,DIAGONOSES,Atrial fibrillation. Atrial fibrillation
52,26880,135453.0,Discharge summary,Admission Date: [**2162-3-3**] D...,42731,DIAGONOSES,Atrial fibrillation. Atrial fibrillation
74,42130,114236.0,Discharge summary,Admission Date: [**2150-2-25**] ...,42731,DIAGONOSES,Atrial fibrillation. Atrial fibrillation


In [22]:
test.ICD9_CODE.value_counts()

42731    434222
4254      55490
01193        32
Name: ICD9_CODE, dtype: int64

In [25]:
#test[test.ICD9_CODE == '4254']

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT,ICD9_CODE,DATASET_TYPE,ALL_TITLE
1,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
214,9805,177212.0,Discharge summary,Admission Date: [**2131-6-28**] ...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
225,9805,177212.0,Discharge summary,Admission Date: [**2131-7-5**] D...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
2042,55116,186183.0,Discharge summary,Admission Date: [**2179-1-29**] ...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
2458,96950,176286.0,Discharge summary,Admission Date: [**2103-6-19**] ...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
2869,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
3610,13054,184484.0,Discharge summary,Admission Date: [**2134-12-12**] Discha...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
3944,14515,168174.0,Discharge summary,Admission Date: [**2166-10-4**] Discharge...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
4491,23201,131913.0,Discharge summary,Admission Date: [**2181-10-5**] ...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...
4813,262,106019.0,Discharge summary,Admission Date: [**2153-9-25**] ...,4254,DIAGONOSES,Prim cardiomyopathy NEC. Other primary cardiom...


In [23]:
# Count the frequency for each ICD9 code appearance
icd9_freq = pd.DataFrame(NOTE_ICD_df['ICD9_CODE'].value_counts())

In [24]:
# Number of ICD9 codes that appear from 1 to 1000 times
len(list(icd9_freq[icd9_freq.ICD9_CODE.between(1, 1000)].index))

5174

In [82]:
#test[test['ICD9_CODE'] == '4254']['CATEGORY'].value_counts()

In [None]:
# NOTE: for more sophisticated models, CATEGORY and also be used

In [25]:
# Finding the stopwords
stop_words = pd.read_csv('english_stopwords.txt', sep=" ", header=None)

In [49]:
#'to' in list(stop_words[0])

True

### Text Preprocessing Step

In [86]:
#nltk.download()
#stop_words = set(stopwords.words('english')) 

# Function to remove Punctuation and keep everything in lower case
def rem_Punct(sent):
    sent = re.sub(r'\d+', ' ', sent)
    sent = re.sub(r'_', " ", sent)
    tokenizer = RegexpTokenizer(r'\w+')
    word_tokens = tokenizer.tokenize(sent)
    all_tokens = [w.lower() for w in word_tokens if not w.lower() in list(stop_words[0])]
    return(all_tokens) # taking unique & then returning a list

### Applying the pre-processing function to ICD9 code (can be used in an apply function)

In [27]:
def icd9_text(data, icd9code):
    df = data[data.ICD9_CODE == icd9code]
    listoftitles = df.ALL_TITLE.unique()
    listoftitles = [x for x in listoftitles if str(x) != 'nan']
    alltitle_string = " ".join(listoftitles)
    listoftextnotes = df.TEXT.unique()
    textnotes_string = " ".join(listoftextnotes)
    
    finaldoc = alltitle_string + textnotes_string
    finaldoc = rem_Punct(finaldoc)
    
    return(finaldoc)

### Applying the processing step to ICD9 code but outputing a dictionary

In [28]:
def icd9_text2(data, icd9_list):
    finaldict = defaultdict()
    df = data[data.ICD9_CODE.isin(icd9_list)]
    for index, row in df.iterrows():
        if row['ICD9_CODE'] not in list(finaldict.keys()):
            finaldict[row['ICD9_CODE']] = str(row['ALL_TITLE']) + row['TEXT']
        else:
            finaldict[row['ICD9_CODE']] = finaldict[row['ICD9_CODE']] + " " + str(row['ALL_TITLE']) + row['TEXT']
    print("moving to NLP step")
    for k,v in finaldict.items():
        finaldict[k] = list(set(rem_Punct(finaldict[k])))
    return(dict(finaldict))

**Due to high number of ICD9 codes. it takes a long time to run. Therefore we are using just those ICD9 code that appear at max 100 times in the dataset. To test our methodology and functionality.**

In [87]:
# for 6000+ icd9 codes which had number of rows between 1 & 100
s = time.time()
ICD9_tokens = icd9_text2(NOTE_ICD_df, icd9_list=list(icd9_freq[icd9_freq.ICD9_CODE.between(1, 100)].index))
e = time.time()
print((e-s)/60)

12.718841032187145


In [88]:
print(len(ICD9_tokens.keys()))
list(ICD9_tokens.keys())[0:3]

2627


['01193', '6950', '71828']

In [110]:
# Creating a Corpus from the above dictionary (which is essentially a list of all the words)
corpus = []
for k,v in ICD9_tokens.items():
    corpus.extend(v)

In [111]:
len(corpus)

4186361

In [112]:
# Counting the frequency of each word in the corpus
freq_dict = {} 
for item in corpus: 
    if (item in freq_dict): 
        freq_dict[item] += 1
    else: 
        freq_dict[item] = 1

In [113]:
"microscopy" in list(freq_dict.keys())

True

In [114]:
freq_dict['microscopy']

19


# Top 500 words
#max(freq_dict.iterkeys(), key=(lambda key: freq_dict[key]))[0:5]
sorted(freq_dict, key=freq_dict.get, reverse=True)[0:500]

# bottom 500 words
sorted(freq_dict, key=freq_dict.get, reverse=False)[0:500]

## Note:
The preprocessing step can be further improved here, as we need some words which are just a amalgamation of two words. This needs to be done manually for best results!

In [99]:
# Creating a dataframe out of the dictionary 
ICD9_df =  pd.DataFrame(list(ICD9_tokens.items()), columns=['ICD9_CODE', 'WORDS_ASSOCIATED'])
ICD9_df.head()

Unnamed: 0,ICD9_CODE,WORDS_ASSOCIATED
0,1193,"[paranasal, equal, boost, necessary, total, di..."
1,6950,"[ppx, total, displayed, erythemaprobable, unle..."
2,71828,"[reagarding, gnr, sitess, equal, appropriate, ..."
3,9899,"[ssi, gnr, ppx, paranasal, equal, ronchi, payi..."
4,4572,"[ppx, m, bilaterally, doppler, lv, dullness, m..."


In [100]:
#ICD9_df.columns

In [101]:
# Creating the embedding out of all the 8.9k+ distinct words
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
ICD9_corpus = ICD9_df.join(pd.DataFrame(mlb.fit_transform(ICD9_df.pop('WORDS_ASSOCIATED')),
                           columns=mlb.classes_,
                           index=ICD9_df.index))

In [117]:
'consciousnessy' in list(ICD9_corpus.columns)

False

In [118]:
'onot' in sorted(freq_dict, key=freq_dict.get, reverse=False)[0:500]

False

In [119]:
# As we do not need all the words, we limit the embedding to 500 most frequent and least frequent words.
ICD9_corpus2 = ICD9_corpus[['ICD9_CODE'] + sorted(freq_dict, key=freq_dict.get, reverse=True)[0:500] + sorted(freq_dict, key=freq_dict.get, reverse=False)[0:500]]
ICD9_corpus2.head(20)

Unnamed: 0,ICD9_CODE,name,last,hospital,history,first,condition,date,medical,present,...,nbh,moniutor,hyptonic,democycline,allieviating,niride,ersistent,unintelligle,unsuscessful,tongight
0,01193,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,6950,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,71828,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,9899,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,4572,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
5,40401,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
6,72283,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
7,53450,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
8,36589,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9,1952,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [2]:
#The above hot encoding can be used as input for the model which can be used to evaluate embeddings. 

----------

### Testing

In [53]:
# s = time.time()
# test2 = pd.DataFrame(columns=['ICD9_CODE', 'TEXT'])
# #test2 = default
# for icd9 in list(test[test.ICD9_CODE == '01193'].ICD9_CODE.unique()):
#     test2 = test2.append({'ICD9_CODE':icd9,
#                           'TEXT':icd9_text(test, icd9)
#                          }, ignore_index = True)
# e = time.time()
# print((e-s)/60)

0.0025971531867980957


In [121]:
#test2.head()

In [55]:
# s = time.time()
# test2 = {}
# #test2 = default
# for icd9 in list(test[test.ICD9_CODE == '01193'].ICD9_CODE.unique()):
#     test2[icd9] = icd9_text(test, icd9)
# e = time.time()
# print((e-s)/60)  
# #     test2 = test2.append({'ICD9_CODE':icd9,
# #                           'TEXT':icd9_text(test, icd9)
# #                          }, ignore_index = True)

0.00142135222752889
