In [None]:
import urllib.request
import os
import codecs
import zipfile
import pandas as pd
from IPython.display import display, HTML
import sklearn.metrics

# and also our utilities for this class
from nlp_pneumonia_utils import Annotation
from nlp_pneumonia_utils import AnnotatedDocument
from nlp_pneumonia_utils import read_brat_annotations
from nlp_pneumonia_utils import read_doc_annotations
from nlp_pneumonia_utils import read_annotations
from nlp_pneumonia_utils import calculate_prediction_metrics
from nlp_pneumonia_utils import mark_text
from nlp_pneumonia_utils import clearPyConTextRegularExpressions
from nlp_pneumonia_utils import pneumonia_annotation_html_markup
print('Loaded utilities...')

## First, we'll load our dataset. Note that these notebooks rely on a number of utility functions.

### (OPTIONAL) Feel free to look at them here in the repository : <a href="https://github.com/UUDeCART/decart_rule_based_nlp/blob/master/nlp_pneumonia_utils.py">nlp_pneumonia_utils.py</a> 

In [None]:
# Read the annotated training documents from the zip archive.
# read_doc_annotations returns a mapping whose values are the annotated
# documents (presumably keyed by document name -- confirm in nlp_pneumonia_utils).
annotated_doc_map = read_doc_annotations('data/training_v2.zip')

# Keep a flat list of the documents alongside the map for easy iteration.
annotated_docs = list(annotated_doc_map.values())
print('Total Annotated Documents : {0}'.format(len(annotated_docs)))

# Count the documents whose positive_label is truthy.
total_positives = sum(1 for doc in annotated_docs if doc.positive_label)

print('Total Positive Pneumonia Documents : {0}'.format(total_positives))

In [None]:
# Pick the document carrying the largest number of annotations.
# max() keeps the first document on ties and, thanks to default=None,
# yields None when there are no documents at all.
most_annotated_doc = max(
    annotated_docs,
    key=lambda doc: len(doc.annotations),
    default=None,
)

## Next, recall the annotations created by our expert.  Note that there are 3 total annotation types in this set:
1. **DOCUMENT_PNEUMONIA_YES** -> Document shows **active** or **possible** case of pneumonia
2. **DOCUMENT_PNEUMONIA_NO** -> Document shows **no evidence** of pneumonia
3. **SPAN_POSITIVE_PNEUMONIA_EVIDENCE** -> Spans of phrases/sentence which show positive or possible evidence of pneumonia which led the expert annotator to the final document-level conclusion

## Let's render one of our annotated documents in HTML.  When using the function 'pneumonia_annotation_html_markup' these show up as the colors:
1. **DOCUMENT_PNEUMONIA_YES** -> RED
2. **DOCUMENT_PNEUMONIA_NO** -> GREEN
3. **SPAN_POSITIVE_PNEUMONIA_EVIDENCE** -> RED

In [None]:
# Build the HTML markup for the chosen document, then turn newlines into
# <br> tags so line breaks survive HTML rendering.
annotated_markup = pneumonia_annotation_html_markup(most_annotated_doc)
display(HTML(annotated_markup.replace('\n', '<br>')))

# In this course, we will work in groups to develop rule-based systems to correctly identify cases of pneumonia.

## Before we do that, let's establish some baselines.  These initial baselines will be fairly naive but they will help to illustrate the principles of Precision, Recall and F1 measure

## Let's recall those formulas from the previous notebook:

## Precision and Recall:
<img src=images/precision_recall.jpg>

## F1:
<img src=https://wikimedia.org/api/rest_v1/media/math/render/svg/7d63c1f5c659f95b5dfe5893213cc8ea7f8bea0a>
<p>Source: Wikipedia</p>

## Before we set up a few baselines, some explanation on calculate_prediction_metrics() from our utilities file.  Its arguments are:
1. A list of documents which have been annotated (AnnotatedDocument class in our scripts)
2. A function which takes a string (i.e. text) and returns either a 0 or 1 (0 for NO PNEUMONIA, 1 for DEFINITE or POSSIBLE)

From these predictions, it calculates Precision, Recall and F1 measure and writes a <a href="https://en.wikipedia.org/wiki/Confusion_matrix">confusion matrix</a>.

## This function will enable your group projects, but more on that later...

##  First, a naive baseline by always predicting NO pneumonia (i.e. 0)

In [None]:
def naive_negative_pneumonia_prediction(text):
    """Baseline predictor that labels every document as NO pneumonia.

    Args:
        text: The document text. It is accepted so the function fits the
            predictor interface, but it is never inspected.

    Returns:
        Always 0.
    """
    no_pneumonia_label = 0
    return no_pneumonia_label
    
# Announce which baseline is being scored, then evaluate it against the
# annotated documents. The call stays last so the cell shows its result.
status_message = 'Predicting and validating the naive baseline of always predicting NO'
print(status_message)
calculate_prediction_metrics(annotated_docs, naive_negative_pneumonia_prediction)

## Next, a naive baseline by always predicting YES pneumonia (i.e. 1)

In [None]:
def naive_positive_pneumonia_prediction(text):
    """Baseline predictor that labels every document as YES pneumonia.

    Args:
        text: The document text. It is accepted so the function fits the
            predictor interface, but it is never inspected.

    Returns:
        Always 1.
    """
    pneumonia_label = 1
    return pneumonia_label
    
# Announce which baseline is being scored, then evaluate it against the
# annotated documents. The call stays last so the cell shows its result.
status_message = 'Predicting and validating the naive baseline of always predicting YES'
print(status_message)
calculate_prediction_metrics(annotated_docs, naive_positive_pneumonia_prediction)

## Then let's try a slightly more intelligent baseline that predicts positive pneumonia any time the word "pneumonia" appears in a document

In [None]:
def naive_pneumonia_keyword_prediction(text):
    """Keyword baseline: predict pneumonia when the word appears in the text.

    The match is case-insensitive, so 'Pneumonia' at the start of a sentence
    and 'PNEUMONIA' in an upper-case heading count as well as 'pneumonia'.
    It is a plain substring match, so negated mentions such as
    'no pneumonia' still produce a positive prediction.

    Args:
        text: The document text as a string.

    Returns:
        1 if 'pneumonia' occurs anywhere in the text (ignoring case),
        otherwise 0.
    """
    # Normalise case first; a case-sensitive check would miss capitalised
    # and upper-case mentions.
    return 1 if 'pneumonia' in text.lower() else 0
    
# Announce which baseline is being scored, then evaluate it against the
# annotated documents. The call stays last so the cell shows its result.
status_message = 'Predicting and validating the naive PNEUMONIA keyword baseline'
print(status_message)
calculate_prediction_metrics(annotated_docs, naive_pneumonia_keyword_prediction)

<br/><br/>This material was presented as part of the DeCART Data Science for the Health Science Summer Program at the University of Utah in 2017.<br/>
Presenters : Dr. Wendy Chapman, Jianlin Shi and Kelly Peterson