# Error Analysis (1)

In the previous notebook, you saw that our current NLP solution is not 100% correct. Although reaching that goal is unrealistic, we can definitely get closer to it.

This notebook will show you how to analyze errors related to named entity recognition, and guide you step by step through improving recall. We will talk about how to improve precision tomorrow.

## 1. Locate the errors

In [None]:
#first import libraries
import urllib.request
import os
import codecs
import zipfile
import pandas as pd
from IPython.display import display, HTML
import sklearn.metrics

Reuse the classes and functions that we created in the previous notebook.

In [None]:
class Annotation(object):
    """One annotation: a type, a pair of character offsets and the covered text.

    A fresh instance starts with offsets of -1 and empty strings; the fields
    are filled in by whoever parses the annotation.
    """

    def __init__(self):
        self.start_index = self.end_index = -1
        self.type = self.spanned_text = ''

    def getSpan(self):
        """Return ``(start_index, end_index)`` as a tuple.

        Present so that pyConText's HTML markup can work with this class.
        """
        span = (self.start_index, self.end_index)
        return span

    def getCategory(self):
        """Return the annotation type wrapped in a one-element list.

        pyConText graph objects expect a list here, not a bare string.
        """
        return [self.type]

class AnnotatedDocument(object):
    """A document's text together with its annotations and a document-level label.

    ``positive_label`` is initialised to -1; ``annotations`` starts as a new
    empty list for every instance.
    """

    def __init__(self):
        self.text, self.annotations, self.positive_label = '', [], -1
        
def read_brat_annotations(lines):
    """Parse the lines of a BRAT ``.ann`` file into ``Annotation`` objects.

    Each line is expected to look like::

        ID[TAB]TYPE[SPACE]START_INDEX[SPACE]END_INDEX[TAB]SPANNED_TEXT

    Args:
        lines: iterable of lines; each item is passed through ``str()``.
            Line terminators may still be attached.

    Returns:
        A list of ``Annotation`` objects, in input order.

    Raises:
        IndexError: a non-blank line has no tab-separated type/offset field,
            or that field has fewer than three tokens.
        ValueError: an offset is not an integer.
    """
    annotations = []
    for line in lines:
        # lines read from a file still end with their terminator; without
        # stripping it the terminator would end up inside spanned_text
        line = str(line).rstrip('\r\n')
        # blank lines (e.g. a trailing empty line) carry no annotation
        if not line.strip():
            continue
        # at most three fields, so a tab inside the covered text is kept
        tab_tokens = line.split('\t', 2)
        space_tokens = tab_tokens[1].split()
        anno = Annotation()
        anno.spanned_text = tab_tokens[2] if len(tab_tokens) > 2 else ''
        anno.type = space_tokens[0]
        anno.start_index = int(space_tokens[1])
        # a discontinuous span is written "s1 e1;s2 e2"; the last token is
        # always the final end offset, so the span covers first start to last
        # end. For an ordinary span this is the same token as before.
        anno.end_index = int(space_tokens[-1])
        annotations.append(anno)
    return annotations

def mark_text(txt,nodes,colors = None,default_color="black"):
    """Wrap the span of every node in ``txt`` in colour markup.

    Adapted from ``pyConTextNLP.display.html.mark_text`` so that
    ``default_color`` can be chosen by the caller.

    Args:
        txt: the text to mark up.
        nodes: sequence of objects exposing ``getSpan()`` and
            ``getCategory()``. They are processed from the last element to
            the first, so the sequence is presumably sorted by ascending span
            (verify against the caller). The sequence is not modified.
        colors: mapping from category name to colour. ``None`` selects
            ``{"name": "red", "pet": "blue"}``.
        default_color: colour for categories missing from ``colors``.

    Returns:
        ``txt`` with markup inserted, or ``txt`` unchanged when ``nodes`` is
        empty or ``None``.
    """
    if not nodes:
        return txt
    # imported only once there is something to colour
    from pyConTextNLP.display.html import __insert_color
    if colors is None:
        # built per call rather than shared as a mutable default argument
        colors = {"name":"red","pet":"blue"}
    # a loop instead of recursion: no stack frame per annotation, and the
    # caller's list is left intact instead of being drained with pop()
    for n in reversed(nodes):
        txt = __insert_color(txt,
                             n.getSpan(),
                             colors.get(n.getCategory()[0],default_color))
    return txt
    
def pneumonia_html_markup(anno_doc):
    """Return the document text as an HTML paragraph with annotations coloured.

    Mimics 'mark_document_with_html' from pyConTextNLP.display.html, using a
    colour table for the pneumonia annotation types; any other type is drawn
    in red.
    """
    from pyConTextNLP.display.html import __sort_by_span

    palette = {
        'DOCUMENT_PNEUMONIA_YES': 'red',
        'DOCUMENT_PNEUMONIA_NO': 'green',
        'SPAN_POSITIVE_PNEUMONIA_EVIDENCE': 'orange',
    }
    fallback = 'red'
    body_text = anno_doc.text
    ordered_annotations = __sort_by_span(anno_doc.annotations)
    marked = mark_text(body_text,
                       ordered_annotations,
                       colors=palette,
                       default_color=fallback)
    return "<p> {0} </p>".format(marked)

Before we get to the errors, we first need to tweak the function below: instead of returning a list of values, it needs to return the "*annotated_doc_map*", where the keys are the document names and the values are "*AnnotatedDocument*"s, because you may want to see in which documents our prediction made errors.

In [None]:
def read_annotations(archive_file, force_redownload = False):
    """Load annotated documents from a zip archive of ``.txt`` / ``.ann`` pairs.

    The local file name is the last ``/``-separated component of
    ``archive_file``. If no file of that name exists in the working directory,
    or ``force_redownload`` is true, ``archive_file`` is fetched with
    ``urllib.request.urlretrieve`` and saved under that name.

    Args:
        archive_file: URL of the zip archive.
        force_redownload: download again even if the local file exists.

    Returns:
        A dict mapping each entry name without its extension to an
        ``AnnotatedDocument``. ``positive_label`` is 1 when the document has a
        ``DOCUMENT_PNEUMONIA_YES`` annotation and 0 otherwise.
    """
    print('Reading annotations from file : ' + archive_file)
    filename = archive_file.split('/')[-1]

    if force_redownload or not os.path.isfile(filename):
        print('Downloading remote file : '+ archive_file)
        urllib.request.urlretrieve(archive_file, filename)

    annotated_doc_map = {}

    print('Opening local file : ' + filename)
    # the context manager closes the archive even when an entry fails to parse
    with zipfile.ZipFile(filename, "r") as z:
        for name in z.namelist():
            if not name.endswith(('.txt', '.ann')):
                continue
            # drop only the final extension; cutting at the first dot would
            # merge entries whose path contains other dots
            basename = os.path.splitext(name)[0]
            if basename not in annotated_doc_map:
                annotated_doc_map[basename] = AnnotatedDocument()
            anno_doc = annotated_doc_map[basename]
            # handle text and BRAT annotation files (.ann) differently
            with z.open(name) as f1:
                if name.endswith('.txt'):
                    anno_doc.text = f1.read().decode('utf8')
                else:
                    # handle this as utf8 or we get back byte arrays
                    anno_doc.annotations = read_brat_annotations(codecs.iterdecode(f1, 'utf8'))

    # now let's finally assign a 0 or 1 to each document based on whether we see our expected type for the pneumonia label
    for anno_doc in annotated_doc_map.values():
        is_positive = any(anno.type == 'DOCUMENT_PNEUMONIA_YES'
                          for anno in anno_doc.annotations)
        anno_doc.positive_label = 1 if is_positive else 0
#Note: here is the tweak we need to make
    return annotated_doc_map

Next, we tweak the function **calculate_prediction_metrics** so that it lists the differences (the errors) instead of calculating the metrics:

Question before we move on:

Why do we only care about *false negatives* for now?


In [None]:
def list_false_negatives(gold_docs, prediction_function):
    """Return the documents whose gold label is 1 but whose prediction is 0.

    ``prediction_function`` is called once with the text of every document in
    ``gold_docs``. The result maps document name to the gold document, in the
    iteration order of ``gold_docs``.
    """
    missed = {}
    for name, document in gold_docs.items():
        expected = document.positive_label
        predicted = prediction_function(document.text)
        is_false_negative = expected == 1 and predicted == 0
        if is_false_negative:
            missed[name] = document
    return missed


## 2. Display errors

Now we put everything together to display errors:

In [None]:
# Index of the next false-negative document to display; the display cell
# further down increments it after each document.
i=0
# Download the annotated test archive (unless a local copy exists) and parse it.
annotated_docs = read_annotations('https://github.com/burgersmoke/DeCART_2017_rulebased_NLP/raw/master/data/BRAT/BratTestArchive.zip')
print('Total Annotated Documents : {0}'.format(len(annotated_docs)))

# NOTE(review): keyword_classifier is not defined anywhere in this section --
# presumably it is the classifier built in the previous notebook; confirm it
# is in scope before running this cell.
fn=list_false_negatives(annotated_docs, keyword_classifier.predict)

# Names of the false-negative documents, as a list so they can be indexed by i.
docs=list(fn.keys());

Show one document at a time:

In [None]:
# Display the i-th false-negative document and then advance i, so running
# this cell again shows the next one; nothing happens once i reaches the end.
if i<len(docs):
    print (docs[i])

    anno_doc=fn[docs[i]]

    # newlines are replaced with <br> so line breaks show up in the HTML output
    display(HTML(pneumonia_html_markup(anno_doc).replace('\n', '<br>')))
    
    i+=1

## 3. More efficient review:
Not convenient to read? Let's try a snippet view instead. Now we need to write another function to replace "*pneumonia_html_markup*". 

Although we are measuring the document-level annotation, we will focus our error analysis on the mention level ("**SPAN_POSITIVE_PNEUMONIA_EVIDENCE**"), because the latter is where the errors originate.



In [None]:

def snippets_markup(annotated_doc_map):
    """Build an HTML table of annotation snippets for a set of documents.

    Args:
        annotated_doc_map: mapping from document name to annotated document;
            each pair is passed to ``snippet_markup`` to produce its rows.

    Returns:
        One HTML string: a two-column table ("document name", "Snippets")
        wrapped in ``<html>`` tags.
    """
    html = ["<html>",
            "<table width=100% >",
            "<col style=\"width:25%\"><col style=\"width:75%\">",
            # the header row is now closed with </tr>; it used to be left open
            "<tr><th style=\"text-align:center\">document name</th>"
            "<th style=\"text-align:center\">Snippets</th></tr>"]
    for doc_name, anno_doc in annotated_doc_map.items():
        html.extend(snippet_markup(doc_name,anno_doc))
    html.append("</table>")
    html.append("</html>")
    return ''.join(html)


def snippet_markup(doc_name,anno_doc,window_size=50,color='orange'):
    """Build the HTML table rows for one document.

    The first row holds the document name. Each
    ``SPAN_POSITIVE_PNEUMONIA_EVIDENCE`` annotation then gets a row with a
    snippet of the surrounding text in which the annotated span is coloured.

    Args:
        doc_name: name shown in the first column.
        anno_doc: object with ``text`` and ``annotations`` attributes.
        window_size: number of characters of context kept on each side of
            the annotation.
        color: colour used for the highlighted span.

    Returns:
        A list of HTML fragments (strings) to be joined by the caller.
    """
    html=[]
    html.append("<tr>")
    html.append("<td style=\"text-align:left\">{0}</td>".format(doc_name))
    html.append("<td></td>")
    html.append("</tr>")

    evidence = [anno for anno in anno_doc.annotations
                if anno.type == 'SPAN_POSITIVE_PNEUMONIA_EVIDENCE']
    if not evidence:
        return html

    # imported only once there is a snippet to highlight
    from pyConTextNLP.display.html import __insert_color
    text = anno_doc.text
    for anno in evidence:
        # keep the snippet window inside the text boundary
        begin = max(anno.start_index - window_size, 0)
        end = min(anno.end_index + window_size, len(text))
        # both highlight offsets are measured from the start of the snippet;
        # the end offset was previously measured from the snippet's end
        span = [anno.start_index - begin, anno.end_index - begin]
        cell = __insert_color(text[begin:end], span, color)
        # add the snippet into the table
        html.append("<tr>")
        html.append("<td></td>")
        html.append("<td style=\"text-align:left\">{0}</td>".format(cell))
        html.append("</tr>")
    return html

Let's try it out:

In [None]:
# Render the snippet table for all false-negative documents held in fn.
display(HTML(snippets_markup(fn)))

## 4. Now what?