In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 

#from imblearn.over_sampling import SMOTE
from keras.models import load_model

from keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.metrics import f1_score, confusion_matrix
from random import seed
from random import choice
from sklearn.metrics import precision_score

In [2]:
clinical_text_df = pd.read_csv("mtsamples.csv")

In [6]:
clinical_text_df.head()

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [5]:
if "Unnamed: 0" in clinical_text_df.columns:
    clinical_text_df.drop("Unnamed: 0", axis=1, inplace=True) 

In [7]:
clinical_text_df.shape

(4999, 5)

In [8]:
print(clinical_text_df.columns)
clinical_text_df.head(5)

Index(['description', 'medical_specialty', 'sample_name', 'transcription',
       'keywords'],
      dtype='object')


Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [11]:
# get the uniqye number of medical speciality
len(clinical_text_df["medical_specialty"].unique())

40

In [12]:
# Check the number of samples in each unique medical_speciality
clinical_text_df["medical_specialty"].value_counts()

 Surgery                          1103
 Consult - History and Phy.        516
 Cardiovascular / Pulmonary        372
 Orthopedic                        355
 Radiology                         273
 General Medicine                  259
 Gastroenterology                  230
 Neurology                         223
 SOAP / Chart / Progress Notes     166
 Obstetrics / Gynecology           160
 Urology                           158
 Discharge Summary                 108
 ENT - Otolaryngology               98
 Neurosurgery                       94
 Hematology - Oncology              90
 Ophthalmology                      83
 Nephrology                         81
 Emergency Room Reports             75
 Pediatrics - Neonatal              70
 Pain Management                    62
 Psychiatry / Psychology            53
 Office Notes                       51
 Podiatry                           47
 Dermatology                        29
 Cosmetic / Plastic Surgery         27
 Dentistry               

In [8]:
## Create medical _speciality classes
data_categories  = clinical_text_df.groupby(clinical_text_df['medical_specialty'])
i = 1
print('===========Original Categories =======================')
for catName,dataCategory in data_categories:
    print('Class:'+str(i)+' '+catName + ' : '+ str(len(dataCategory)) )
    i = i+1
print('==================================')

Class:1  Allergy / Immunology : 7
Class:2  Autopsy : 8
Class:3  Bariatrics : 18
Class:4  Cardiovascular / Pulmonary : 372
Class:5  Chiropractic : 14
Class:6  Consult - History and Phy. : 516
Class:7  Cosmetic / Plastic Surgery : 27
Class:8  Dentistry : 27
Class:9  Dermatology : 29
Class:10  Diets and Nutritions : 10
Class:11  Discharge Summary : 108
Class:12  ENT - Otolaryngology : 98
Class:13  Emergency Room Reports : 75
Class:14  Endocrinology : 19
Class:15  Gastroenterology : 230
Class:16  General Medicine : 259
Class:17  Hematology - Oncology : 90
Class:18  Hospice - Palliative Care : 6
Class:19  IME-QME-Work Comp etc. : 16
Class:20  Lab Medicine - Pathology : 8
Class:21  Letters : 23
Class:22  Nephrology : 81
Class:23  Neurology : 223
Class:24  Neurosurgery : 94
Class:25  Obstetrics / Gynecology : 160
Class:26  Office Notes : 51
Class:27  Ophthalmology : 83
Class:28  Orthopedic : 355
Class:29  Pain Management : 62
Class:30  Pediatrics - Neonatal : 70
Class:31  Physical Medicine - 

In [9]:
clinical_text_df = clinical_text_df[clinical_text_df['transcription'].notna()]
clinical_text_df

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."
...,...,...,...,...,...,...
4994,4994,Patient having severe sinusitis about two to ...,Allergy / Immunology,Chronic Sinusitis,"HISTORY:, I had the pleasure of meeting and e...",
4995,4995,This is a 14-month-old baby boy Caucasian who...,Allergy / Immunology,Kawasaki Disease - Discharge Summary,"ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...","allergy / immunology, mucous membranes, conjun..."
4996,4996,A female for a complete physical and follow u...,Allergy / Immunology,Followup on Asthma,"SUBJECTIVE: , This is a 42-year-old white fema...",
4997,4997,Mother states he has been wheezing and coughing.,Allergy / Immunology,Asthma in a 5-year-old,"CHIEF COMPLAINT: , This 5-year-old male presen...",


In [10]:
def get_sentence_word_count(text_list):
    sent_count = 0
    word_count = 0
    vocab = {}
    for text in text_list:
        sentences=sent_tokenize(str(text).lower())
        sent_count = sent_count + len(sentences)
        for sentence in sentences:
            words=word_tokenize(sentence)
            
            for word in words:
                if(word in vocab.keys()):
                    vocab[word] = vocab[word] +1
                else:
                    vocab[word] =1 
    word_count = len(vocab.keys())
    return sent_count,word_count

In [11]:
#clinical_text_df = clinical_text_df[clinical_text_df['transcription'].notna()]
sent_count,word_count= get_sentence_word_count(clinical_text_df['transcription'].tolist())
print("Number of sentences in transcriptions column: "+ str(sent_count))
print("Number of unique words in transcriptions column: "+str(word_count))

Number of sentences in transcriptions column: 140214
Number of unique words in transcriptions column: 35805


In [12]:
filtered_data_categories = data_categories.filter(lambda x:x.shape[0] > 20)
final_data_categories = filtered_data_categories.groupby(filtered_data_categories['medical_specialty'])
i=1
print('============Reduced Categories ======================')
for catName,dataCategory in final_data_categories:
    print('Cat:'+str(i)+' '+catName + ' : '+ str(len(dataCategory)) )
    i = i+1

print('============ Reduced Categories ======================')


Cat:1  Cardiovascular / Pulmonary : 372
Cat:2  Consult - History and Phy. : 516
Cat:3  Cosmetic / Plastic Surgery : 27
Cat:4  Dentistry : 27
Cat:5  Dermatology : 29
Cat:6  Discharge Summary : 108
Cat:7  ENT - Otolaryngology : 98
Cat:8  Emergency Room Reports : 75
Cat:9  Gastroenterology : 230
Cat:10  General Medicine : 259
Cat:11  Hematology - Oncology : 90
Cat:12  Letters : 23
Cat:13  Nephrology : 81
Cat:14  Neurology : 223
Cat:15  Neurosurgery : 94
Cat:16  Obstetrics / Gynecology : 160
Cat:17  Office Notes : 51
Cat:18  Ophthalmology : 83
Cat:19  Orthopedic : 355
Cat:20  Pain Management : 62
Cat:21  Pediatrics - Neonatal : 70
Cat:22  Physical Medicine - Rehab : 21
Cat:23  Podiatry : 47
Cat:24  Psychiatry / Psychology : 53
Cat:25  Radiology : 273
Cat:26  SOAP / Chart / Progress Notes : 166
Cat:27  Surgery : 1103
Cat:28  Urology : 158


In [13]:
data = filtered_data_categories[['transcription', 'medical_specialty']]
data = data.drop(data[data['transcription'].isna()].index)
data.shape

(4821, 2)

In [14]:
data

Unnamed: 0,transcription,medical_specialty
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary
...,...,...
4972,"INDICATION: , Chest pain.,TYPE OF TEST: , Aden...",Cardiovascular / Pulmonary
4973,"CHIEF COMPLAINT: , Chest pain.,HISTORY OF PRES...",Cardiovascular / Pulmonary
4974,"HISTORY OF PRESENT ILLNESS: , The patient is a...",Cardiovascular / Pulmonary
4975,"HISTORY OF PRESENT ILLNESS: , Mr. ABC is a 60-...",Cardiovascular / Pulmonary


In [15]:
data

Unnamed: 0,transcription,medical_specialty
3,"2-D M-MODE: , ,1. Left atrial enlargement wit...",Cardiovascular / Pulmonary
4,1. The left ventricular cavity size and wall ...,Cardiovascular / Pulmonary
7,"2-D ECHOCARDIOGRAM,Multiple views of the heart...",Cardiovascular / Pulmonary
9,"DESCRIPTION:,1. Normal cardiac chambers size....",Cardiovascular / Pulmonary
11,"2-D STUDY,1. Mild aortic stenosis, widely calc...",Cardiovascular / Pulmonary
...,...,...
4972,"INDICATION: , Chest pain.,TYPE OF TEST: , Aden...",Cardiovascular / Pulmonary
4973,"CHIEF COMPLAINT: , Chest pain.,HISTORY OF PRES...",Cardiovascular / Pulmonary
4974,"HISTORY OF PRESENT ILLNESS: , The patient is a...",Cardiovascular / Pulmonary
4975,"HISTORY OF PRESENT ILLNESS: , Mr. ABC is a 60-...",Cardiovascular / Pulmonary


In [16]:
transcription = data['transcription'].tolist()

In [17]:
labels = data['medical_specialty'].tolist()

In [18]:
len(labels)

4821

In [19]:
len(transcription)

4821

In [20]:
sting = transcription[214]

In [21]:
labels[214]

' Surgery'

In [22]:
if 'Radiology' in sting.split():
    print('yes')
else:
    print("no")

no


In [24]:
trainx = data['transcription']
trainy = data['medical_specialty']

In [25]:
trainy.shape, trainx.shape

((4821,), (4821,))

In [26]:
filtered_data_categories['medical_specialty'] =filtered_data_categories['medical_specialty'].apply(lambda x:str.strip(x))
mask = filtered_data_categories['medical_specialty'] == 'Surgery'
filtered_data_categories = filtered_data_categories[~mask]
final_data_categories = filtered_data_categories.groupby(filtered_data_categories['medical_specialty'])
mask = filtered_data_categories['medical_specialty'] == 'SOAP / Chart / Progress Notes'
filtered_data_categories = filtered_data_categories[~mask]
mask = filtered_data_categories['medical_specialty'] == 'Office Notes'
filtered_data_categories = filtered_data_categories[~mask]
mask = filtered_data_categories['medical_specialty'] == 'Consult - History and Phy.'
filtered_data_categories = filtered_data_categories[~mask]
mask = filtered_data_categories['medical_specialty'] == 'Emergency Room Reports'
filtered_data_categories = filtered_data_categories[~mask]
mask = filtered_data_categories['medical_specialty'] == 'Discharge Summary'
filtered_data_categories = filtered_data_categories[~mask]

'''
mask = filtered_data_categories['medical_specialty'] == 'Pediatrics - Neonatal'
filtered_data_categories = filtered_data_categories[~mask]
'''
mask = filtered_data_categories['medical_specialty'] == 'Pain Management'
filtered_data_categories = filtered_data_categories[~mask]
mask = filtered_data_categories['medical_specialty'] == 'General Medicine'
filtered_data_categories = filtered_data_categories[~mask]


mask = filtered_data_categories['medical_specialty'] == 'Neurosurgery'
filtered_data_categories.loc[mask, 'medical_specialty'] = 'Neurology'
mask = filtered_data_categories['medical_specialty'] == 'Nephrology'
filtered_data_categories.loc[mask, 'medical_specialty'] = 'Urology'

i=1
print('============Reduced Categories======================')
for catName,dataCategory in final_data_categories:
    print('Cat:'+str(i)+' '+catName + ' : '+ str(len(dataCategory)) )
    i = i+1

print('============Reduced Categories======================')


data = filtered_data_categories[['transcription', 'medical_specialty']]
data = data.drop(data[data['transcription'].isna()].index)
data.shape

Cat:1 Cardiovascular / Pulmonary : 372
Cat:2 Consult - History and Phy. : 516
Cat:3 Cosmetic / Plastic Surgery : 27
Cat:4 Dentistry : 27
Cat:5 Dermatology : 29
Cat:6 Discharge Summary : 108
Cat:7 ENT - Otolaryngology : 98
Cat:8 Emergency Room Reports : 75
Cat:9 Gastroenterology : 230
Cat:10 General Medicine : 259
Cat:11 Hematology - Oncology : 90
Cat:12 Letters : 23
Cat:13 Nephrology : 81
Cat:14 Neurology : 223
Cat:15 Neurosurgery : 94
Cat:16 Obstetrics / Gynecology : 160
Cat:17 Office Notes : 51
Cat:18 Ophthalmology : 83
Cat:19 Orthopedic : 355
Cat:20 Pain Management : 62
Cat:21 Pediatrics - Neonatal : 70
Cat:22 Physical Medicine - Rehab : 21
Cat:23 Podiatry : 47
Cat:24 Psychiatry / Psychology : 53
Cat:25 Radiology : 273
Cat:26 SOAP / Chart / Progress Notes : 166
Cat:27 Urology : 158


(2498, 2)

## Use Spacy

In [27]:
import spacy
import en_ner_bionlp13cg_md

In [28]:

#pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.0/en_ner_bionlp13cg_md-0.5.0.tar.gz


In [29]:
nlp = en_ner_bionlp13cg_md.load()

In [30]:
def process_Text( text):
    wordlist=[]
    doc = nlp(text)
    for ent in doc.ents:
        wordlist.append(ent.text)
    return ' '.join(wordlist)   

In [31]:
def clean_text(text ): 
    text = text.translate(str.maketrans('', '', string.punctuation))
    text1 = ''.join([w for w in text if not w.isdigit()]) 
    REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
    #BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
    
    text2 = text1.lower()
    text2 = REPLACE_BY_SPACE_RE.sub('', text2) # replace REPLACE_BY_SPACE_RE symbols by space in text
    #text2 = BAD_SYMBOLS_RE.sub('', text2)
    return text2

def clearup(document):
    document = document.translate(string.punctuation)
    numbers = re.search('[0-9]+', document) # finds the numbers in the string
    document = re.sub('\(\d+.\d+\)|\d-\d|\d', '', document) \
        .replace('.', '').replace(',', '').replace(',', '').replace(':', '').replace('~', '') \
        .replace('!', '').replace('@', '').replace('#', '').replace('$', '').replace('/', '') \
        .replace('%', '').replace('(', '').replace(')', '').replace('?', '') \
        .replace('—', '').replace(';', '').replace('&quot', '').replace('&lt', '') \
        .replace('^', '').replace('"', '').replace('{', '').replace('}', '').replace('\\', '').replace('+', '') \
        .replace('&gt', '').replace('&apos', '').replace('*', '').strip().lower().split()
    # return re.sub('[l]+', ' ', str(document)).strip()
    return document

def lemmatize_text(text):
    wordlist=[]
    lemmatizer = WordNetLemmatizer() 
    sentences=sent_tokenize(text)
    
    intial_sentences= sentences[0:1]
    final_sentences = sentences[len(sentences)-2: len(sentences)-1]
    for sentence in intial_sentences:
        words=word_tokenize(sentence)
        
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))
    for sentence in final_sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))       
    return ' '.join(wordlist) 

In [32]:
data['transcription'] = data['transcription'].apply(process_Text)

In [33]:
data['transcription']

3       Left atrial left ventricular pericardial aorti...
4       left ventricular wall wall left atrium right a...
7       heart vessels intracardiac vessel Cardiac peri...
9       Normal cardiac pericardial intracardiac mitral...
11      systolic tricuspid mitral aortic heart tricusp...
                              ...                        
4972    Adenosine nuclear patient heart blood heart bl...
4973    patient patient coronary artery patient shortn...
4974    patient sublingual nitroglycerin nitroglycerin...
4975    gentleman stress sublingual nitroglycerin pati...
4976    heart valvular insufficiency.,HISTORY patient ...
Name: transcription, Length: 2498, dtype: object

In [34]:
trainx_index = list(data['transcription'].index)


In [35]:
data['transcription'].shape

(2498,)

In [36]:
data['transcription'] = data['transcription'].apply(lemmatize_text)

In [37]:
data['transcription']

3       Left atrial left ventricular pericardial aorti...
4       left ventricular wall wall left atrium right a...
7       heart vessel intracardiac vessel Cardiac peric...
9       Normal cardiac pericardial intracardiac mitral...
11      systolic tricuspid mitral aortic heart tricusp...
                              ...                        
4972    Adenosine nuclear patient heart blood heart bl...
4973    patient patient coronary artery patient shortn...
4974    patient sublingual nitroglycerin nitroglycerin...
4975    gentleman stress sublingual nitroglycerin pati...
4976    heart valvular insufficiency. , HISTORY patien...
Name: transcription, Length: 2498, dtype: object

In [38]:
data['transcription'] = data['transcription'].apply(clearup)

In [39]:
data['transcription']

3       [left, atrial, left, ventricular, pericardial,...
4       [left, ventricular, wall, wall, left, atrium, ...
7       [heart, vessel, intracardiac, vessel, cardiac,...
9       [normal, cardiac, pericardial, intracardiac, m...
11      [systolic, tricuspid, mitral, aortic, heart, t...
                              ...                        
4972    [adenosine, nuclear, patient, heart, blood, he...
4973    [patient, patient, coronary, artery, patient, ...
4974    [patient, sublingual, nitroglycerin, nitroglyc...
4975    [gentleman, stress, sublingual, nitroglycerin,...
4976    [heart, valvular, insufficiency, history, pati...
Name: transcription, Length: 2498, dtype: object

In [40]:
#data['transcription'] = data['transcription'].apply(clean_text)
#data['transcription']

In [41]:
doc2 = data['transcription'].tolist()

In [43]:
len(doc2)

2498

In [44]:
doc2[4]

['systolic', 'tricuspid', 'mitral', 'aortic', 'heart', 'tricuspid', 'heart']

In [45]:
import pickle

In [47]:
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences


In [41]:
def size(alist):
    return len(alist)
def prep_data_CNN(documents):
    """
    Prepare the padded docs and vocab_size for CNN training
    """
    
    t = Tokenizer()
    docs = list(filter(None, documents))
    print("Size of the documents in prep_data {}".format(len(documents)))
    t.fit_on_texts(documents)

    vocab_size = len(t.word_counts)
    print("Vocab size {}".format(vocab_size))
    encoded_docs = t.texts_to_sequences(documents)
    print("Size of the encoded documents {}".format(len(encoded_docs)))
    e_lens = []
    for i in range(len(encoded_docs)):
        e_lens.append(len(encoded_docs[i]))
    lens_edocs = list(map(size, encoded_docs))
    max_length = np.average(lens_edocs)
    sequence_length = 300  # Can use this instead of the above average max_length value
    max_length = sequence_length
    padded_docs = pad_sequences(
        encoded_docs, maxlen=int(max_length), padding='post')
    print("Length of a padded row {}".format(padded_docs.shape))
    print("max_length {} and min_length {} and average {}".format(
        max_length, min(lens_edocs), np.average(lens_edocs)))
    return padded_docs, max_length, vocab_size, t.word_index

In [42]:
padded_docs2, max_length2, vocab_size2, word_index2 = prep_data_CNN(doc2)

Vocab size 6024
Size of the encoded documents 2498
Length of a padded row (2498, 300)
max_length 300 and min_length 0 and average 46.3306645316253


In [43]:
padded_docs2

array([[   9,  130,    9, ...,    0,    0,    0],
       [   9,   74,   36, ...,    0,    0,    0],
       [  10,   26,  828, ...,    0,    0,    0],
       ...,
       [   1, 1618,  505, ...,    0,    0,    0],
       [ 244,  276, 1618, ...,    0,    0,    0],
       [  10,  665, 2249, ...,    0,    0,    0]])