In [5]:
#Install and import required packages
import sys
import spacy
import random
import time
import json 
import warnings
import numpy as np
from spacy import displacy
from spacy.util import minibatch, compounding
from os import path, mkdir
from itertools import chain


warnings.filterwarnings("ignore")
#load the model and the dataset
nlp = spacy.load("en_core_web_sm")
f = open ('/Users/l_parau/OntoNotes.json', "r")
TEST_DATA = json.loads(f.read())
# extract the sentences from [sentence, entity]
test_sentences = [x[0] for x in TEST_DATA[0:15]]
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ent": ["ORG"], "colors": colors}
for x in test_sentences:
    doc = nlp(x)
    displacy.render(doc, jupyter = True, style = "ent", options=options)
warnings.filterwarnings("default")
#Evaluation Metrics
#Model performance is assessed on the entirety of the test dataset (2,443 sentences) 
#based on the following metrics and their definitions.
#Precision: true positives / (true positives + false positives)
#Recall: true positives / (true positives + false negatives)
#F1-score: harmonic average of precision and recall
# true positives / total pred

def calc_precision(pred, true):
    precision = len([x for x in pred if x in true]) / (len(pred) + 1e-20) 
    return precision
# true positives / total test
def calc_recall(pred, true):
    recall = len([x for x in true if x in pred]) / (len(true) + 1e-20)
    return recall
def calc_f1(precision, recall):
    f1 = 2 * ((precision * recall) / (precision + recall + 1e-20))
    return f1
# run the predictions on each sentence in the test dataset, and return the spacy object
preds = [nlp(x[0]) for x in TEST_DATA]
precisions, recalls, f1s = [], [], []
# iterate over predictions and test data and calculate precision, recall, and F1-score
#x[2] = annotation, true[1] = (start, end, annot)
#i.label_ = annotation label, pred.ents = list of annotations
print('Precision metrics for OntoNotes subset')
for pred, true in zip(preds, TEST_DATA):
    true = [x[2] for x in list(chain.from_iterable(true[1].values()))]
    pred = [i.label_ for i in pred.ents]
    precision = calc_precision(true, pred)
    precisions.append(precision)
    recall = calc_recall(true, pred)
    recalls.append(recall)
    f1s.append(calc_f1(precision, recall))
print("Precision: {} \nRecall: {} \nF1-score: {}".format(np.around(np.mean(precisions), 3), np.around(np.mean(recalls), 3), np.around(np.mean(f1s), 3)))
print('Precision metrics for OntoNotes subset done!')

Precision metrics for OntoNotes subset
Precision: 0.963 
Recall: 0.957 
F1-score: 0.957
Precision metrics for OntoNotes subset done!
