## NLP pipeline

In [1]:
import pymysql
import pandas as pd
import getpass

In [2]:
import glob
import re
import os

In [3]:
from PyRuSH.RuSH import RuSH

from pyConTextNLP import pyConTextGraph
from pyConTextNLP.utils import get_document_markups

from DocumentClassifier import FeatureInferencer
from DocumentClassifier import DocumentInferencer
from DocumentClassifier import DocumentClassifier

from nlp_pneumonia_utils import markup_sentence
from nlp_pneumonia_utils import read_doc_annotations
from nlp_pneumonia_utils import list_errors

from itemData import get_item_data

In [4]:
from pipeUtils import Annotation
from pipeUtils import Document
from wrapperPyConText import ConTextPipe
from wrapperPyConText import convertMarkupsAnnotations

In [5]:
# from visual import Vis
from visual import snippets_markup
from visual import view_pycontext_outputs
from visual import display_doc_text
from visual import convertMarkups2DF

# packages for interaction
from IPython.display import display, HTML
import ipywidgets

In [6]:
# begin to define MyPipe class
class MyPipe:
    def __init__(self, sentence_rules, target_rules, context_rules, feature_inference_rule, document_inference_rule):
        # initiate necessary components here
        self.sentence_segmenter = RuSH(sentence_rules)
        self.feature_inferencer = FeatureInferencer(feature_inference_rule)
        self.document_inferencer = DocumentInferencer(document_inference_rule)

        self.targets=get_item_data(target_rules)
        self.modifiers=get_item_data(context_rules)
        
        self.target_rules_abi=self.getTargetRegexes()
        
    def process(self, doc_text, doc):        
        #process your input doc_text, return the required results                                   
        sentences = self.sentence_segmenter.segToSentenceSpans(doc_text)
             
        # initiate a pyConTextGraph to hold the pyConText output
        context_doc = pyConTextGraph.ConTextDocument()
        
        for sentence in sentences:
            # print(sentence)
            sentence_text = doc_text[sentence.begin:sentence.end].lower()
            # Process every sentence by adding markup
            m = markup_sentence(sentence_text, modifiers=self.modifiers, targets=self.targets)
            doc.annotations.extend(convertMarkupsAnnotations(m, offset=sentence.begin))
            context_doc.addMarkup(m)
            context_doc.getSectionMarkups()
            #print(doc.toString())
                       
          
        ## Apply document classification
        
        # convert graphic markups into dataframe    
        markups = get_document_markups(context_doc)
        annotations, relations, doc_txt = convertMarkups2DF(markups) 
        
        # apply inferences for document classication
        inferenced_types = self.feature_inferencer.process(annotations, relations)
        doc_class = self.document_inferencer.process(inferenced_types)
                        
        return doc_class, context_doc, annotations, relations
    
    def abi_extraction(self, document):
        document_id = document.document_id
        ann_index=0
        for reg in self.target_rules_abi:
            for match in reg.finditer(document.text):
                ann_id = 'NLP_'+ str(document_id) + '_' + str(ann_index)
                ann_index=ann_index+1
                new_annotation = Annotation(start_index=int(match.start()), 
                                    end_index=int(match.end()), 
                                    type='abi_annotation',
                                    ann_id = ann_id
                                    )
                new_annotation.spanned_text = document.text[new_annotation.start_index:new_annotation.end_index]
                print(new_annotation.spanned_text)

                # find the numeric value from the string/sentence
                
                if re.findall(r'\b\d+\.\d+\b', new_annotation.spanned_text):
                    str_abi_value = re.findall(r'\b\d+\.\d+\b', new_annotation.spanned_text)
                    if len(str_abi_value) > 0:
                        extracted_abi_value = str_abi_value[0]
                        new_annotation.attributes["ABI-Value"] = extracted_abi_value     
                        # convert to float, compare to a threshold
                        try:
                            float_abi_value = float(extracted_abi_value)
                             # create new attribute if +ve pad if above threshold 1 and below = 0
                            if float_abi_value < 0.90:
                                 new_annotation.attributes["PAD positive"] = 1
                            else:
                                new_annotation.attributes["PAD negative"] = 0  
                            
                            document.annotations.append(new_annotation)
                            
                        except:
                            continue
        
        return document 
    
    def getTargetRegexes(self):
        target_regexes = []
        regexes = [r'(\babi\b|ankle brachial index)( \w{0,20}){0,10} \d+\.\d+']
        for reg in regexes:
            target_regexes.append(re.compile(reg, re.IGNORECASE))
        return target_regexes
          
        
         
           

Once your pipeline class is defined, you can use it multiple times for different set of rules.

In [7]:
# configure your rules 

# ------------- SENTENCE RULES

sentence_rules='KB/rush_rules.tsv'

# ------------- TARGET RULES

target_rules_pad='KB/pad_targets.yml'
target_rules_abi='KB/abi_targets.yml'

# ------------- CONTEXT RULES 

context_rules='KB/pad_modifiers.yml'

# ------------- FEATURE INFERENCE RULES

feature_inference_rule='KB/pad_featurer_inferences.csv'

# ------------- DOCUMENT INFERENCE RULES

document_inference_rule='KB/pad_doc_inferences.csv'


Select documents from the MIMIC database, write a script to process all of them, and output a dictionary which uses document name as keys and document level classification as values.

In [8]:
#initiate an instance of MyPipe for PAD
myPipe = MyPipe(sentence_rules, target_rules_pad, context_rules, feature_inference_rule, document_inference_rule)

In [None]:
#initiate an instance of MyPipe for ABI
# myPipe_abi = MyPipe(sentence_rules, target_rules_abi, context_rules, feature_inference_rule, document_inference_rule)

In [None]:
try:
    conn.close()
except:
    print("Conn already closed!")

In [None]:
conn = pymysql.connect(host="mysql.chpc.utah.edu",
                       port=3306,user="mimicclass",
                       passwd=getpass.getpass("Enter MySQL passwd for user:"),db='mimic3')
 
cursor = conn.cursor()

In [None]:
df_docs_text = pd.read_sql('SELECT subject_id, text from NOTEEVENTS \
                        where \
                        (subject_id in (select subject_id from DIAGNOSES_ICD where ICD9_CODE in ("443.9")) AND text like "% ABI %")\
                        OR (subject_id in (select subject_id from DIAGNOSES_ICD where ICD9_CODE in ("443.9")) AND lower(text) like "% peripheral artery disease %")\
                        OR (subject_id in (select subject_id from DIAGNOSES_ICD where ICD9_CODE in ("443.9")) AND lower(text) like "% peripheral vascular disease %")\
                        OR (subject_id in (select subject_id from DIAGNOSES_ICD where ICD9_CODE in (443.9")) AND lower(text) like "% peripheral arterial occlusive disease %")\
                        OR (subject_id in (select subject_id from DIAGNOSES_ICD where ICD9_CODE in ("443.9")) AND lower(text) like "% ankle brachial index %")\
                        OR (subject_id in (select subject_id from DIAGNOSES_ICD where ICD9_CODE in ("443.9")) AND lower(text) like "% pvd %")\
                           LIMIT 75',conn)


In [None]:

# write to BRAT folder

unid = 'uxxxxxxx'
path = "/home/"+str(unid)+"/BRAT/"+str(unid)+"/Project_1"

for index, row in df_docs_text.iterrows():
    new_file_path_txt = path+"/"+str(row.subject_id) + "_" + str(index) + ".txt" 
    new_file_path_ann = path+"/"+str(row.subject_id) + "_" + str(index) + ".ann" 
    f=open(new_file_path_txt, "w")
    f.write(row.text)
    f.close()
    f=open(new_file_path_ann, "w")
    f.write("")
    f.close()
    


### Error Analysis - Validation

In [9]:
path_1 = "/home/u0410167/BRAT/uxxxxxxx/training_data"
path_2 = "/home/u0410167/BRAT/uxxxxxxx/testing_data"

In [10]:
test_docs=dict()
test_doc_paths = glob.glob(str(path_1+'/*.txt')) 
for d in test_doc_paths:
    doc = Document()
    
    #print(d)
    doc.load_document_from_file(d)
    if (len(doc.text.strip()))==0:
        continue
    #print(str(d[:-3])+'ann')
    doc.load_annotations_from_brat(str(d[:-3])+'ann')
    #print(os.path.basename(d))
    test_docs[os.path.basename(d)]=doc
    myPipe.process(doc.text, doc)
    myPipe.abi_extraction(doc)


test_docs 

ABI 1.13
ABI 1.02
ABI 1.13
ABI 1.02
ABI is 0.76
ABI is 1.0
ABI 1.13
ABI 1.02
ABI was 0.88
ABI 0.81
ABI 0.62
ABI 0.47
ABI 0.28
ABI 1.13
ABI 1.02


{'10757_24.txt': <pipeUtils.Document at 0x7faf97df8fd0>,
 '10757_25.txt': <pipeUtils.Document at 0x7faf97e50f28>,
 '13948_3.txt': <pipeUtils.Document at 0x7faf97cf2f98>,
 '16013_22.txt': <pipeUtils.Document at 0x7fb0418957f0>,
 '16022_88.txt': <pipeUtils.Document at 0x7faf97de7208>,
 '16490_67.txt': <pipeUtils.Document at 0x7faf980912b0>,
 '18123_40.txt': <pipeUtils.Document at 0x7faf97b94c50>,
 '19583_69.txt': <pipeUtils.Document at 0x7faf97e16358>,
 '19583_70.txt': <pipeUtils.Document at 0x7faf97df8240>,
 '19583_71.txt': <pipeUtils.Document at 0x7faf9805fcf8>,
 '19583_72.txt': <pipeUtils.Document at 0x7faf97d01400>,
 '19632_41.txt': <pipeUtils.Document at 0x7faf97cf2160>,
 '19689_8.txt': <pipeUtils.Document at 0x7faf97deaef0>,
 '21458_33.txt': <pipeUtils.Document at 0x7faf9805fc88>,
 '21458_35.txt': <pipeUtils.Document at 0x7faf97ce7400>,
 '21_92.txt': <pipeUtils.Document at 0x7faf980813c8>,
 '21_93.txt': <pipeUtils.Document at 0x7faf97d1bfd0>,
 '2243_59.txt': <pipeUtils.Document at 

In [11]:
[(anno.type, anno.getSpan()) for anno in test_docs['10757_24.txt'].annotations]

[('PAD', (567, 570)),
 ('PAD_DOC', (0, 14)),
 ('PAD', (9079, 9106)),
 ('pad_nlp', (567, 570)),
 ('historical', (506, 511)),
 ('pad_nlp', (2638, 2641)),
 ('pad_nlp', (6733, 6736)),
 ('historical', (6668, 6673)),
 ('pad_nlp', (9125, 9128)),
 ('pad_nlp', (9079, 9106)),
 ('historical', (8935, 8940)),
 ('conj', (9037, 9042))]

In [12]:
test_docs['10757_24.txt'].annotations[7].attributes

{}

### Validation for PAD

In [13]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).compare_types_by_span('PAD','pad_nlp', False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
265050688447726460978795573442514302308 pad_nlp 392 395   || T2 PAD 392 395 PVD [Negation:Affirmed]
265248045800549493443741091422801278308 pad_nlp 11631 11634   || T1 PAD 11631 11634 PVD [Negation:Affirmed]
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708] || T1 PAD 567 570 PVD [Negation:Affirmed]
266395153920638770226162721249499067748 pad_nlp 9079 9106   || T3 PAD 9079 9106 Peripheral Vascular Disease [Negation:Affirmed]
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068] || T1 PAD 417 420 PVD [Negation:Affirmed]
267847345133840099550712434830316189028 pad_nlp 1389 1392   || T3 PAD 1389 1392 PVD [Negation:Affirmed]
268699642092087298162374983876055709028 pad_nlp 528 555   || T4 PAD 528 555 peripheral vascular disease [Negation:Affirmed]
268704388651303527738840213094120338788 pad_nlp 762 765   || T2 PAD 762 765 PVD [N

#### Negation for PAD mention level

In [14]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('PAD', 'Negation', 'Negated'),('pad_nlp', 'Negation', 'Negated')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_nlp', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

#for a in tp_list_total:
#    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
266283858159346857397928190385264571748 pad_nlp 2638 2641  
266364422108681112232253561486603236708 pad_nlp 6733 6736  [historical:266365905259883379260653312629353526628]
266395113514275887951350548542084396388 pad_nlp 9125 9128  
274038896892424959282777617072435541348 pad_nlp 432 435  [acute:274039319970812785454340366597130335588]
274064153246071256468315687012923651428 pad_nlp 1877 1880  
274178351134956066799236269392058955108 pad_nlp 7569 7572  [historical:274179400115827755659066007913961403748]
281833261316248641414234975234795287908 pad_nlp 10203 10206  [historical:281833513261805436774828522704557356388]
281853570663427547934533704291024418148 pad_nlp 11082 11085  
281909538229709249405253159672981271908 pad_nlp 13223 13248  
282721172918109252130932574157089846628 pad_nlp 5490 5493  
282763199496914943648809069545545577828 pad_nlp 7585 7588  
284740517348105692430858920235508217188 pad_nlp 14631 14655  [historical:284740

#### Affirmation for PAD mention level -- not right since the pipeline doesn't do affirmation attribute

In [15]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('PAD', 'Negation', 'Affirmed'),('pad_nlp', 'Negation', 'Affirmed')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_nlp', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

#for a in tp_list_total:
#    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 0 FP = 117 FN = 87
265050688447726460978795573442514302308 pad_nlp 392 395  
265248045800549493443741091422801278308 pad_nlp 11631 11634  
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708]
266283858159346857397928190385264571748 pad_nlp 2638 2641  
266364422108681112232253561486603236708 pad_nlp 6733 6736  [historical:266365905259883379260653312629353526628]
266395113514275887951350548542084396388 pad_nlp 9125 9128  
266395153920638770226162721249499067748 pad_nlp 9079 9106  
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068]
267847345133840099550712434830316189028 pad_nlp 1389 1392  
268699642092087298162374983876055709028 pad_nlp 528 555  
268704388651303527738840213094120338788 pad_nlp 762 765  
268836957174230520541718610509020050788 pad_nlp 5710 5713  
269711693924946634241717779007813236068 pad_nlp 462 489  [historical:269712246145239358664150806009147077988]
26

### Validation for ABI

In [16]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).compare_types_by_span('ABI_value','abi_annotation', False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 15 FP = 0 FN = 9
Precision= 1.0
Recall= 0.625
NLP_75568_27.txt_0 abi_annotation 2547 2555 ABI 1.13 [ABI-Value:1.13][PAD negative:0] || T3 ABI_value 2551 2555 1.13 
NLP_75568_27.txt_1 abi_annotation 2562 2570 ABI 1.02 [ABI-Value:1.02][PAD negative:0] || T5 ABI_value 2566 2570 1.02 
NLP_75568_37.txt_0 abi_annotation 2750 2758 ABI 1.13 [ABI-Value:1.13][PAD negative:0] || T3 ABI_value 2754 2758 1.13 
NLP_75568_37.txt_1 abi_annotation 2765 2773 ABI 1.02 [ABI-Value:1.02][PAD negative:0] || T5 ABI_value 2769 2773 1.02 
NLP_13948_3.txt_0 abi_annotation 7787 7798 ABI is 0.76 [ABI-Value:0.76][PAD positive:1] || T5 ABI_value 7794 7798 0.76 
NLP_13948_3.txt_1 abi_annotation 7981 7991 ABI is 1.0 [ABI-Value:1.0][PAD negative:0] || T6 ABI_value 7988 7991 1.0 
NLP_75568_28.txt_0 abi_annotation 2750 2758 ABI 1.13 [ABI-Value:1.13][PAD negative:0] || T2 ABI_value 2754 2758 1.13 
NLP_75568_28.txt_1 abi_annotation 2765 2773 ABI 1.02 [ABI-Value:1.02][PAD negative:0] || T4 ABI_value 2769 2773 1.02 
NLP_

### ????  ABI doesn't have negations, we need to find the values associated with ABI

In [17]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('PAD', 'Negation', 'Negated'),('pad_nlp', 'DEFINITE_NEGATED_EXISTENCE'.lower(), 'Negated')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_nlp', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
265050688447726460978795573442514302308 pad_nlp 392 395   || T2 PAD 392 395 PVD [Negation:Affirmed]
265248045800549493443741091422801278308 pad_nlp 11631 11634   || T1 PAD 11631 11634 PVD [Negation:Affirmed]
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708] || T1 PAD 567 570 PVD [Negation:Affirmed]
266395153920638770226162721249499067748 pad_nlp 9079 9106   || T3 PAD 9079 9106 Peripheral Vascular Disease [Negation:Affirmed]
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068] || T1 PAD 417 420 PVD [Negation:Affirmed]
267847345133840099550712434830316189028 pad_nlp 1389 1392   || T3 PAD 1389 1392 PVD [Negation:Affirmed]
268699642092087298162374983876055709028 pad_nlp 528 555   || T4 PAD 528 555 peripheral vascular disease [Negation:Affirmed]
268704388651303527738840213094120338788 pad_nlp 762 765   || T2 PAD 762 765 PVD [N

#### We use ABI value to tell if the it is PAD positive or PAD negative

In [18]:
# ABI value catching PAD positive

tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('ABI', 'ABI-Value', 'Normal'),('abi_annotation', 'ABI-Value', 'Normal')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_nlp', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
265050688447726460978795573442514302308 pad_nlp 392 395   || T2 PAD 392 395 PVD [Negation:Affirmed]
265248045800549493443741091422801278308 pad_nlp 11631 11634   || T1 PAD 11631 11634 PVD [Negation:Affirmed]
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708] || T1 PAD 567 570 PVD [Negation:Affirmed]
266395153920638770226162721249499067748 pad_nlp 9079 9106   || T3 PAD 9079 9106 Peripheral Vascular Disease [Negation:Affirmed]
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068] || T1 PAD 417 420 PVD [Negation:Affirmed]
267847345133840099550712434830316189028 pad_nlp 1389 1392   || T3 PAD 1389 1392 PVD [Negation:Affirmed]
268699642092087298162374983876055709028 pad_nlp 528 555   || T4 PAD 528 555 peripheral vascular disease [Negation:Affirmed]
268704388651303527738840213094120338788 pad_nlp 762 765   || T2 PAD 762 765 PVD [N

In [19]:
# for normal ABI values

tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('ABI', 'ABI-Value', 'Normal'),('abi_annotation', 'ABI-Value', 'Normal')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_nlp', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
265050688447726460978795573442514302308 pad_nlp 392 395   || T2 PAD 392 395 PVD [Negation:Affirmed]
265248045800549493443741091422801278308 pad_nlp 11631 11634   || T1 PAD 11631 11634 PVD [Negation:Affirmed]
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708] || T1 PAD 567 570 PVD [Negation:Affirmed]
266395153920638770226162721249499067748 pad_nlp 9079 9106   || T3 PAD 9079 9106 Peripheral Vascular Disease [Negation:Affirmed]
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068] || T1 PAD 417 420 PVD [Negation:Affirmed]
267847345133840099550712434830316189028 pad_nlp 1389 1392   || T3 PAD 1389 1392 PVD [Negation:Affirmed]
268699642092087298162374983876055709028 pad_nlp 528 555   || T4 PAD 528 555 peripheral vascular disease [Negation:Affirmed]
268704388651303527738840213094120338788 pad_nlp 762 765   || T2 PAD 762 765 PVD [N

In [20]:
# for abnormal ABI values

tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('ABI', 'ABI-Value', 'Abnormal'),('abi_annotation', 'ABI-Value', 'Abnormal')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_nlp', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
265050688447726460978795573442514302308 pad_nlp 392 395   || T2 PAD 392 395 PVD [Negation:Affirmed]
265248045800549493443741091422801278308 pad_nlp 11631 11634   || T1 PAD 11631 11634 PVD [Negation:Affirmed]
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708] || T1 PAD 567 570 PVD [Negation:Affirmed]
266395153920638770226162721249499067748 pad_nlp 9079 9106   || T3 PAD 9079 9106 Peripheral Vascular Disease [Negation:Affirmed]
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068] || T1 PAD 417 420 PVD [Negation:Affirmed]
267847345133840099550712434830316189028 pad_nlp 1389 1392   || T3 PAD 1389 1392 PVD [Negation:Affirmed]
268699642092087298162374983876055709028 pad_nlp 528 555   || T4 PAD 528 555 peripheral vascular disease [Negation:Affirmed]
268704388651303527738840213094120338788 pad_nlp 762 765   || T2 PAD 762 765 PVD [N

In [21]:
# for missing ABI values

tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
attributes_to_compare=[]
# To compare attributes, create a list of tuples for each pair to compare:
# attributes_to_compare.append[(A1_type, A1_att_name, A1_att_value),(A2_type, A2_att_name, A2_att_value)]
attributes_to_compare.append([('ABI', 'ABI-Value', 'Missing'),('abi_annotation', 'ABI-Value', 'Missing')])

for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).\
    compare_types_by_span_and_attributes('PAD','pad_nlp', attributes_to_compare , False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
265050688447726460978795573442514302308 pad_nlp 392 395   || T2 PAD 392 395 PVD [Negation:Affirmed]
265248045800549493443741091422801278308 pad_nlp 11631 11634   || T1 PAD 11631 11634 PVD [Negation:Affirmed]
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708] || T1 PAD 567 570 PVD [Negation:Affirmed]
266395153920638770226162721249499067748 pad_nlp 9079 9106   || T3 PAD 9079 9106 Peripheral Vascular Disease [Negation:Affirmed]
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068] || T1 PAD 417 420 PVD [Negation:Affirmed]
267847345133840099550712434830316189028 pad_nlp 1389 1392   || T3 PAD 1389 1392 PVD [Negation:Affirmed]
268699642092087298162374983876055709028 pad_nlp 528 555   || T4 PAD 528 555 peripheral vascular disease [Negation:Affirmed]
268704388651303527738840213094120338788 pad_nlp 762 765   || T2 PAD 762 765 PVD [N

### Validation Document level

In [22]:
tp_total = 0
fp_total = 0
fn_total = 0
tp_list_total = []
fp_list_total= []
fn_list_total = []
for doc_id in test_docs.keys():
    tp, fp, fn, tp_list, fp_list, fn_list = (test_docs.get(doc_id)).compare_types_by_span('PAD','pad_nlp', False)
    tp_total = tp_total + tp
    fp_total = fp_total + fp
    fn_total = fn_total + fn
    tp_list_total.extend(tp_list)
    fp_list_total.extend(fp_list)
    fn_list_total.extend(fn_list)

print('TP =',tp_total, 'FP =',fp_total, 'FN =',fn_total)

if tp_total > 0 :
    precision = tp_total / (tp_total + fp_total)
    print('Precision=',round(precision,3))

if tp_total > 0 :
    recall = tp_total / (tp_total + fn_total)
    print('Recall=',round(recall,3))

for a in tp_list_total:
    print(a[0].toString(),'||', a[1].toString())
for a in fp_list_total:
    print(a.toString())
for a in fn_list_total:
    print(a.toString())

TP = 86 FP = 31 FN = 1
Precision= 0.735
Recall= 0.989
265050688447726460978795573442514302308 pad_nlp 392 395   || T2 PAD 392 395 PVD [Negation:Affirmed]
265248045800549493443741091422801278308 pad_nlp 11631 11634   || T1 PAD 11631 11634 PVD [Negation:Affirmed]
266244882657079590199692422374336279908 pad_nlp 567 570  [historical:266245995812762915613635611666838500708] || T1 PAD 567 570 PVD [Negation:Affirmed]
266395153920638770226162721249499067748 pad_nlp 9079 9106   || T3 PAD 9079 9106 Peripheral Vascular Disease [Negation:Affirmed]
267831718963347411195408860156991419748 pad_nlp 417 420  [family:267833669560708512383400413209048692068] || T1 PAD 417 420 PVD [Negation:Affirmed]
267847345133840099550712434830316189028 pad_nlp 1389 1392   || T3 PAD 1389 1392 PVD [Negation:Affirmed]
268699642092087298162374983876055709028 pad_nlp 528 555   || T4 PAD 528 555 peripheral vascular disease [Negation:Affirmed]
268704388651303527738840213094120338788 pad_nlp 762 765   || T2 PAD 762 765 PVD [N

### System Deployment

In [23]:
try:
    conn.close()
except:
    print("Conn already closed!")

Conn already closed!


In [24]:
conn = pymysql.connect(host="mysql.chpc.utah.edu",
                       port=3306,user="mimicclass",
                       passwd=getpass.getpass("Enter MySQL passwd for user:"),db='mimic3')
 
cursor = conn.cursor()

Enter MySQL passwd for user:········


In [25]:
pad_data = pd.read_sql("""SELECT subject_id, text FROM NOTEEVENTS limit 500 """,conn)

In [26]:
pad_data.count()

subject_id    500
text          500
dtype: int64

In [28]:
pad_data.head()

Unnamed: 0,subject_id,text
0,23224,Admission Date: [**2823-9-29**] ...
1,23224,Admission Date: [**2830-3-12**] ...
2,19051,Admission Date: [**2714-3-12**] ...
3,14605,Admission Date: [**2678-10-4**] ...
4,9446,Admission Date: [**2936-5-6**] Discharge ...


In [29]:
final_nlp_system = MyPipe(sentence_rules, target_rules_pad, context_rules, feature_inference_rule, document_inference_rule)

In [31]:
output = []
counter = 0
for index , row in pad_data.sample(500).iterrows():    
    doc = Document(document_id=str(row.subject_id) + '_' + str(index), text=row.text)
    
    final_nlp_system.process(doc.text, doc)
    
    # print(doc.document_id)
    
    if(len(doc.annotations) > 0):
        i = 1
        for a in doc.annotations:
            if( a.type == 'pad_nlp'):
                neg_flag = 0
                # Switch the flag to 1 when the mention is negated
                if('definite_negated_existence' in a.attributes):
                    neg_flag=1
                ### Each row in the dictionary
                record_id  = str(row.subject_id) + '_' + str(index)+'_'+str(i)
                subject_id =  row.subject_id
                note_id = str(row.subject_id) + '_' + str(index)
                annotation_type = a.type
                snippet = doc.text[int(a.start_index): int(a.end_index)]
                out_list = [record_id, subject_id, note_id, annotation_type, \
                            a.start_index, a.end_index, \
                            snippet, neg_flag]
                output.append(out_list)
                i=i+1
                counter=counter+1
                # Print . after 10 identified records
                if counter%10 == 0:
                    print('.', end='')
        else:
            continue
        break
    

.........

In [32]:
output_abi = []
counter = 0
for index , row in pad_data.sample(500).iterrows():    
    doc = Document(document_id=str(row.subject_id) + '_' + str(index), text=row.text)
    final_nlp_system.abi_extraction(doc)
    if(len(doc.annotations) > 0):
        i = 1
        for a in doc.annotations:
            if( a.type == 'abi_annotation'):
                neg_flag = 0
                abi_output_value = 0.0
                # Switch the flag to 1 when the mention is negated
                if('ABI-Value' in a.attributes):
                    abi_output_value = a.attributes.get("ABI-Value")
                if('PAD positive' in a.attributes):
                    pad_positive = a.attributes.get("PAD positive")
                ### Each row in the dictionary
                record_id  = str(row.subject_id) + '_' + str(index)+'_'+str(i)
                subject_id =  row.subject_id
                note_id = str(row.subject_id) + '_' + str(index)
                annotation_type = a.type
                snippet = doc.text[int(a.start_index): int(a.end_index)]
                out_list = [record_id, subject_id, note_id, annotation_type, \
                            a.start_index, a.end_index, \
                            snippet, abi_output_value, pad_positive]
                
                output_abi.append(out_list)
                i=i+1
                counter=counter+1
                # Print . after 10 identified records
                if counter%10 == 0:
                    print('.', end='')
        else:
            continue
        break
    

In [34]:
output_doc_class = []
counter = 0
for index, row in pad_data.sample(500).iterrows():    
    doc = Document(document_id=str(row.subject_id) + '_' + str(index), text=row.text) 
    doc_class, b, c, d = final_nlp_system.process(doc.text, doc)
         
    if(len(doc.annotations) > 0):
        i = 1
        for a in doc.annotations:
            if( a.type == 'pad_doc'):
                neg_flag = 0
            else:
                neg_flag = 1
                ### Each row in the dictionary
                record_id  = str(row.subject_id) + '_' + str(index)+'_'+str(i)
                subject_id =  row.subject_id
                note_id = str(row.subject_id) + '_' + str(index)
                annotation_type = a.type
                snippet = doc.text[int(a.start_index): int(a.end_index)]
                out_list = [record_id, subject_id, note_id, annotation_type, \
                            a.start_index, a.end_index, \
                            snippet, neg_flag]
                output_doc_class.append(out_list)
                i=i+1
                counter=counter+1
                # Print . after 10 identified records
                if counter%10 == 0:
                    print('.', end='')
        else:
            continue
        break
    

............

### Output to CSV file

In [35]:
columns=['record_id','subject_id', 'note_id', 'annotation_type', 'span_start', 'span_end', 'PAD_snippet', 'neg_flag']
result_data_frame_pad = (pd.DataFrame(output, columns=columns))

result_data_frame_pad.describe()

Unnamed: 0,subject_id,span_start,span_end,neg_flag
count,99.0,99.0,99.0,99.0
mean,24219.656566,3445.828283,3459.979798,0.020202
std,23312.340745,3455.295209,3454.574211,0.141407
min,1136.0,199.0,226.0,0.0
25%,7666.0,950.0,975.5,0.0
50%,18764.0,2012.0,2039.0,0.0
75%,27869.0,5314.5,5317.5,0.0
max,92724.0,13025.0,13028.0,1.0


In [36]:
columns=['record_id','subject_id', 'note_id', 'annotation_type', 'span_start', 'span_end', 'ABI_snippet', 'abi_output_value', 'pad_positive']
result_data_frame_abi = (pd.DataFrame(output_abi, columns=columns))

result_data_frame_abi.describe()

Unnamed: 0,record_id,subject_id,note_id,annotation_type,span_start,span_end,ABI_snippet,abi_output_value,pad_positive
count,0,0,0,0,0,0,0,0,0
unique,0,0,0,0,0,0,0,0,0


In [37]:
columns=['record_id','subject_id', 'note_id', 'annotation_type', 'span_start', 'span_end', 'PAD_snippet', 'neg_flag']
result_data_frame_doc = (pd.DataFrame(output_doc_class, columns=columns))

result_data_frame_doc.describe()

Unnamed: 0,subject_id,span_start,span_end,neg_flag
count,129.0,129.0,129.0,129.0
mean,24485.434109,3418.682171,3430.922481,1.0
std,22739.559235,3418.519647,3418.006559,0.0
min,1136.0,199.0,226.0,1.0
25%,10634.0,982.0,988.0,1.0
50%,21449.0,2031.0,2039.0,1.0
75%,27869.0,5045.0,5048.0,1.0
max,92724.0,13025.0,13028.0,1.0


In [38]:
result_data_frame_pad.to_csv('out_table_pad.csv', index=False)
print('Done')

Done


In [39]:
result_data_frame_abi.to_csv('out_table_abi.csv', index=False)
print('Done')

Done


In [40]:
result_data_frame_doc.to_csv('out_table_doc.csv', index=False)
print('Done')

Done
