In [None]:
import urllib.request
import os
import codecs
import zipfile
import pandas as pd
from IPython.display import display, HTML
import sklearn.metrics

# and also our utilities for this class
from nlp_pneumonia_utils import Annotation
from nlp_pneumonia_utils import AnnotatedDocument
from nlp_pneumonia_utils import read_brat_annotations
from nlp_pneumonia_utils import read_doc_annotations
from nlp_pneumonia_utils import read_annotations
from nlp_pneumonia_utils import calculate_prediction_metrics
from nlp_pneumonia_utils import mark_text
from nlp_pneumonia_utils import clearPyConTextRegularExpressions
from nlp_pneumonia_utils import pneumonia_annotation_html_markup
print('Loaded utilities...')

# First, we'll load our dataset. Throughout these notebooks we rely on a number of utility functions; feel free to look at them in nlp_pneumonia_utils.py

In [None]:
# Load the expert-annotated training documents from the zipped corpus.
# (Alternative archive: 'pneumonia_brat_full_set1.zip'.)
annotated_doc_map = read_doc_annotations('data/training.zip')
# Keep a plain list alongside the map for simple iteration.
annotated_docs = list(annotated_doc_map.values())
print('Total Annotated Documents : {0}'.format(len(annotated_docs)))

# Count the documents whose positive_label is truthy.
total_positives = sum(1 for doc in annotated_docs if doc.positive_label)

print('Total Positive Pneumonia Documents : {0}'.format(total_positives))

In [None]:
# Pick the document carrying the largest number of annotations.
# The first such document wins ties; the result is None when there are no documents.
most_annotated_doc = max(
    annotated_docs,
    key=lambda doc: len(doc.annotations),
    default=None,
)

# Next, let's look at some of the annotations created by our expert.  Note that there are 3 annotation types in this set:
1. DOCUMENT_PNEUMONIA_YES -> Document shows active or possible case of pneumonia
2. DOCUMENT_PNEUMONIA_NO -> Document shows no evidence of pneumonia
3. SPAN_POSITIVE_PNEUMONIA_EVIDENCE -> Spans of phrases/sentence which show positive or possible evidence of pneumonia which led the expert annotator to the final document-level conclusion

## Let's render one of our annotated documents in HTML.  When using the function 'pneumonia_annotation_html_markup' these show up as the colors:
1. DOCUMENT_PNEUMONIA_YES -> RED
2. DOCUMENT_PNEUMONIA_NO -> GREEN
3. SPAN_POSITIVE_PNEUMONIA_EVIDENCE -> RED

In [None]:
# Render the selected document as HTML; newlines are turned into <br> tags
# so the line breaks of the original text survive in the rendered output.
doc_markup = pneumonia_annotation_html_markup(most_annotated_doc)
display(HTML(doc_markup.replace('\n', '<br>')))

# Now let's establish some baselines.  These initial baselines will be fairly naive but they will help to illustrate the principles of Precision, Recall and F1 measure

In [None]:
# Naive baseline #1: always predict NO pneumonia (i.e. 0)
def naive_negative_pneumonia_prediction(text):
    """Return 0 (no pneumonia) for every document, ignoring ``text``."""
    return 0
    
# Evaluate the always-NO baseline on the annotated documents.
# The predictor function itself (not its result) is handed over; presumably
# calculate_prediction_metrics applies it to each document's text and reports
# precision/recall/F1 -- see nlp_pneumonia_utils.py to confirm.
print('Predicting and validating the naive baseline of always predicting NO')
calculate_prediction_metrics(annotated_docs, naive_negative_pneumonia_prediction)

In [None]:
# Naive baseline #2: the opposite extreme -- always predict YES pneumonia (i.e. 1)
def naive_positive_pneumonia_prediction(text):
    """Return 1 (pneumonia) for every document, ignoring ``text``."""
    return 1
    
# Evaluate the always-YES baseline on the annotated documents.
# The predictor function itself (not its result) is handed over; presumably
# calculate_prediction_metrics applies it to each document's text and reports
# precision/recall/F1 -- see nlp_pneumonia_utils.py to confirm.
print('Predicting and validating the naive baseline of always predicting YES')
calculate_prediction_metrics(annotated_docs, naive_positive_pneumonia_prediction)

In [None]:
# now let's try a very naive simulated baseline: assign positive Pneumonia any time the word "pneumonia" appears in a document
def naive_pneumonia_keyword_prediction(text):
    """Predict pneumonia from the mere presence of the keyword.

    Args:
        text: Raw document text. ``None`` or an empty string is treated as
            a document without the keyword.

    Returns:
        1 if "pneumonia" occurs anywhere in ``text`` (in any letter case),
        otherwise 0.
    """
    if not text:
        return 0
    # Lower-case first so that "Pneumonia" at the start of a sentence or an
    # upper-case "PNEUMONIA" is matched as well.
    return 1 if 'pneumonia' in text.lower() else 0
    
# Evaluate the keyword baseline on the annotated documents.
# The predictor function itself (not its result) is handed over; presumably
# calculate_prediction_metrics applies it to each document's text and reports
# precision/recall/F1 -- see nlp_pneumonia_utils.py to confirm.
print('Predicting and validating the naive PNEUMONIA keyword baseline')
calculate_prediction_metrics(annotated_docs, naive_pneumonia_keyword_prediction)