# Evaluate spaCy NER on CoNLL 2003 data

1. report token-level performance (per class and total)
    - accuracy of correctly recognizing all tokens that belong to named entities (i.e. tag-level accuracy) 
2. report CoNLL chunk-level performance (per class and total);
    - precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total

## Import database

In [1]:
import zipfile
import itertools

#Read test corpus
zip = zipfile.ZipFile('../src/conll2003.zip')
file = zip.open('test.txt')
lines = file.readlines()
lines = [line[:-1].decode('utf-8') for line in lines[2:]]
# Divide into sentences
sentences = [list(g[1]) for g in itertools.groupby(lines, key= lambda x: x != '') if g[0]]
# Divide each word in 4-uples: word, POS_tag, CHUNK_tag, NE_tag
sentences = [[tuple(line.split()) for line in sentence] for sentence in sentences]

# Extract sentences to valuate
test = [[word for word, POS_tag, CHUNK_tag, NE_tag in sent] for sent in sentences]

# Extract references
refs = [[(word, NE_tag) for word, POS_tag, CHUNK_tag, NE_tag in sent] for sent in sentences]

# Evaluate model enforcing tokenization

In [2]:
import spacy

# Predict
from spacy.tokens import Doc

nlp = spacy.load('en_core_web_sm')

conll2003_ner_tags = {'PER', 'ORG', 'LOC', 'MISC'}

hyps = []
for sent in test:
    # Crate doc
    doc = Doc(nlp.vocab, sent)
    # Apply pipeline avoiding tokenization
    for name, proc in nlp.pipeline:
        doc = proc(doc)
    # Fix tagging
    for tok in doc:
        if tok.ent_type_ == 'PERSON':
            tok.ent_type_ = 'PER'
        elif tok.ent_type_ == 'GPE':
            tok.ent_type_ = 'LOC'
        elif tok.ent_type_ == 'NORP':
            tok.ent_type_ = 'ORG'
        elif tok.ent_type_ not in conll2003_ner_tags:
            tok.ent_type_ = 'MISC'
    
    hyps.append(([(t.text, t.ent_iob_+'-'+ t.ent_type_) if t.ent_iob != 2 else (t.text, t.ent_iob_) for t in doc]))

### Evaluate token-level accuracy

In [3]:
import pandas as pd

def accuracy (counts):
    accuracies = {}
    for key in counts:
        count = counts[key]
        accuracies[key] = (count['TP'] + count['TN'])/(count['TP'] + count['TN']+count['FP'] + count['FN'])
    return accuracies

def get_counts_class (refs, hyps, class_t):
    counts = {'TP': 0, 'TN': 0, 'FP': 0, 'FN':0}
    for i in range(len(refs)):
        for j in range(len(refs[i])):
            _, r = refs[i][j]
            _, h = hyps[i][j]
            if (r == class_t):
                if h == r:
                    counts['TP'] +=1
                else:
                    counts['FN'] +=1
            else:
                if h == class_t:
                    counts['FP'] += 1
                else:
                    counts['TN'] += 1
            
    return counts

def all_counts(refs, hyps):
    counts = {}
    classes = ['O', 'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'I-ORG', 'B-ORG', 'I-MISC', 'B-MISC']
    for cl in  classes:
        counts[cl] = get_counts_class(refs, hyps, cl)
    return counts

# Get counts of TruePositive, False Negative, ...

counts = all_counts(refs, hyps)

# Evaluate accuracy for each class

accuracies = accuracy(counts)

#Plot

pd_tbl = pd.DataFrame().from_dict(counts, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

pd_tbl = pd.DataFrame().from_dict({'Accuracy': accuracies}, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)





Unnamed: 0,TP,TN,FP,FN
O,33271,6173,1939,5282
I-PER,874,45311,198,282
B-PER,985,44780,268,632
I-LOC,160,46302,106,97
B-LOC,1174,44657,340,494
I-ORG,438,45221,609,397
B-ORG,523,44135,869,1138
I-MISC,80,44902,1547,136
B-MISC,60,42739,3224,642


Unnamed: 0,O,I-PER,B-PER,I-LOC,B-LOC,I-ORG,B-ORG,I-MISC,B-MISC
Accuracy,0.845259,0.989714,0.980714,0.99565,0.982128,0.978442,0.956991,0.963934,0.917154


### Evaluate CoNLL chunk-level performance

In [4]:
# import conll

import os
import sys
sys.path.insert(0, os.path.abspath('../src/'))
from conll import evaluate

# Evaluate with provided function

results = evaluate(refs, hyps)

#Plot

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

Unnamed: 0,p,r,f,s
MISC,0.016748,0.078348,0.027597,702
PER,0.761373,0.589981,0.664808,1617
LOC,0.766182,0.695444,0.729101,1668
ORG,0.336207,0.281758,0.306584,1661
total,0.354293,0.466891,0.402872,5648


# Evaluate model's full pipeline 

In [5]:
# Reconstruct sentences

sents = []
for sent in test:
    sents.append(' '.join(sent))

# Predict

hyps= []

for sent in sents:
    hyps.append(nlp(sent))

print(hyps[0])
print(refs[0])


SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT .
[('SOCCER', 'O'), ('-', 'O'), ('JAPAN', 'B-LOC'), ('GET', 'O'), ('LUCKY', 'O'), ('WIN', 'O'), (',', 'O'), ('CHINA', 'B-PER'), ('IN', 'O'), ('SURPRISE', 'O'), ('DEFEAT', 'O'), ('.', 'O')]


### Reallign ref and model tokenization

In [6]:
def align (hyps):
    new_hyps = []
    for sent in hyps:
        curr_hyp = [[sent[0].text,sent[0].ent_iob_,sent[0].ent_type_,sent[0].whitespace_]]
        for i in range(1, len(sent)):
            last_tok = curr_hyp[-1]
            # If whitespace not present, merge the two tokens
            if(last_tok[3] == ''):
                curr_hyp.remove(last_tok)
                new_tok = [last_tok[0]+sent[i].text,last_tok[1],last_tok[2],sent[i].whitespace_]
                curr_hyp.append(new_tok)
            # Simply append new token
            else:
                curr_hyp.append([sent[i].text,sent[i].ent_iob_,sent[i].ent_type_,sent[i].whitespace_])
        new_hyps.append(curr_hyp)
    return new_hyps

new_hyps = align(hyps)

print('OLD:')
print([(t.text, t.ent_iob_, t.ent_type_, t.whitespace_) for t in hyps[2]])
print('NEW:')
print(new_hyps[2])

OLD:
[('AL', 'B', 'ORG', ''), ('-', 'I', 'ORG', ''), ('AIN', 'I', 'ORG', ' '), (',', 'O', '', ' '), ('United', 'B', 'ORG', ' '), ('Arab', 'I', 'ORG', ' '), ('Emirates', 'I', 'ORG', ' '), ('1996', 'B', 'DATE', ''), ('-', 'I', 'DATE', ''), ('12', 'I', 'DATE', ''), ('-', 'I', 'DATE', ''), ('06', 'I', 'DATE', '')]
NEW:
[['AL-AIN', 'B', 'ORG', ' '], [',', 'O', '', ' '], ['United', 'B', 'ORG', ' '], ['Arab', 'I', 'ORG', ' '], ['Emirates', 'I', 'ORG', ' '], ['1996-12-06', 'B', 'DATE', '']]


### Reallign format of hyps and refs

In [7]:
print(refs[2])
print(new_hyps[2])

# Set format in tuples with retagging

for sent in new_hyps:
    for tok in sent:
        if tok[2] == 'PERSON':
            tok[2] = 'PER'
        elif tok[2] == 'GPE':
            tok[2] = 'LOC'
        elif tok[2] == 'NORP':
            tok[2] = 'ORG'
        elif tok[2] not in conll2003_ner_tags:
            tok[2] = 'MISC'

hyps_ = [[(tok[0], tok[1]+'-'+tok[2]) if tok[1] != 'O' else (tok[0], tok[1]) for tok in sent] for sent in new_hyps]
print(hyps_[2])

[('AL-AIN', 'B-LOC'), (',', 'O'), ('United', 'B-LOC'), ('Arab', 'I-LOC'), ('Emirates', 'I-LOC'), ('1996-12-06', 'O')]
[['AL-AIN', 'B', 'ORG', ' '], [',', 'O', '', ' '], ['United', 'B', 'ORG', ' '], ['Arab', 'I', 'ORG', ' '], ['Emirates', 'I', 'ORG', ' '], ['1996-12-06', 'B', 'DATE', '']]
[('AL-AIN', 'B-ORG'), (',', 'O'), ('United', 'B-ORG'), ('Arab', 'I-ORG'), ('Emirates', 'I-ORG'), ('1996-12-06', 'B-MISC')]


### Evaluate token level performance

In [8]:
# Count TruePositive, FalseNegative, ...

counts = all_counts(refs, hyps_)

# Evaluate accuracy for each tag

accuracies = accuracy(counts)

#Plot

pd_tbl = pd.DataFrame().from_dict(counts, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

pd_tbl = pd.DataFrame().from_dict({'Accuracy': accuracies}, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

Unnamed: 0,TP,TN,FP,FN
O,33088,6190,1922,5465
I-PER,911,45333,176,245
B-PER,1017,44794,254,600
I-LOC,137,46304,104,120
B-LOC,1127,44652,345,541
I-ORG,434,45220,610,401
B-ORG,529,44156,848,1132
I-MISC,80,44884,1565,136
B-MISC,62,42507,3456,640


Unnamed: 0,O,I-PER,B-PER,I-LOC,B-LOC,I-ORG,B-ORG,I-MISC,B-MISC
Accuracy,0.841701,0.990978,0.981699,0.9952,0.981014,0.978335,0.95757,0.963549,0.912225


### Evaluate CoNLL chunk-level performance

In [9]:
# Evaluate with provided function

results = evaluate(refs, hyps_)

#Plot

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

Unnamed: 0,p,r,f,s
MISC,0.015634,0.078348,0.026066,702
PER,0.774194,0.608534,0.68144,1617
LOC,0.755435,0.666667,0.70828,1668
ORG,0.344953,0.285972,0.312706,1661
total,0.343807,0.464943,0.395303,5648


# Grouping of Entities.
1. Write a function to group recognized named entities using `noun_chunks` method of [spaCy](https://spacy.io/usage/linguistic-features#noun-chunks).
2. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together). 

### Group recognized named entities

In [10]:
def group_ners(doc):
    entities = list(doc.ents)
    entities = [ent.label_ for ent in entities]
    noun_chunks = list(doc.noun_chunks)
    noun_chunks = [[ent.label_ for ent in span.ents] for span in noun_chunks]
    # Remove unlabeled entities
    noun_chunks = [elem for elem in noun_chunks if elem != []]
    final_list = []
    #print(entities)
    #print(noun_chunks)
    j=0
    ran = list(range(len(noun_chunks)))
    for i in ran:
        if(entities[j] == noun_chunks[i][0]):
            final_list.append(noun_chunks[i])
            j+=len(noun_chunks[i])
        else:
            final_list.append([entities[j]])
            ran.insert(0, i-1)
            j+=1
    # Once the noun_chunks are ended, insert the remaining single ners
    for ner in entities[j:]:
        final_list.append([ner])
    return(final_list)

sentence = "Apple's Steve Jobs died in 2011 in Palo Alto, California."

doc = nlp(sentence)
print(group_ners(doc))

sentence = "New York University was established in 1800."

doc = nlp(sentence)
print(group_ners(doc))

[['ORG', 'PERSON'], ['DATE'], ['GPE'], ['GPE']]
[['ORG'], ['DATE']]


### Analyze frequencies

In [11]:
# Reconstruct sentences

sents = []
for sent in test:
    sents.append(' '.join(sent))

# Predict

hyps= []
grouped = []

for sent in sents:
    doc = nlp(sent)
    grouped.append(group_ners(doc))
    hyps.append(doc)

In [20]:
from itertools import groupby

grouped_flat = []
for sent in grouped:
    for group in sent:
        grouped_flat.append(group)

counts = {}

for key, group in groupby(sorted(grouped_flat), lambda x: x):
    cnts = len(list(group))
    dict_key = tuple(key)
    counts[dict_key] = cnts
    #print(key, cnts)

def get_stats_tag (counts, tag):
    '''
    Extract stats for current tag: 
    - number of occurrencies of groups composed by (1, ..., 5) ners
    - most frequent combination of ners for each dimension
    '''
    stats = {}
    for i in range(1,5):
        act_stat = {}
        # Find keys for tag tag with len i
        keys = [key for key in counts.keys() if tag in key and len(key)==i]

        tot = 0
        max_so_far = -1
        max_key = None
        for key in keys:
            tot+=counts[key]
            if(counts[key]>max_so_far):
                max_so_far=counts[key]
                max_key = key
        act_stat['freq'] = tot
        if max_key is not None:
            act_stat['most_pres_comb'] = (max_key, counts[max_key])
        else:
            act_stat['most_pres_comb'] = (max_key, 0)
        stats[i] = act_stat
    return stats

def get_stats (counts, tags):
    stats = {}
    for tag in tags:
        stats[tag] = get_stats_tag(counts, tag)
    return stats
    
tags = {'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL', 'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART'} 

stats = get_stats(counts, tags)

for tag in sorted(tags):
    pd_tbl = pd.DataFrame().from_dict(stats[tag], orient='columns')
    display(pd_tbl)

Unnamed: 0,1,2,3,4
freq,1621,125,12,1
most_pres_comb,"((CARDINAL,), 1621)","((CARDINAL, PERSON), 51)","((CARDINAL, PERSON, CARDINAL), 3)","((MONEY, CARDINAL, CARDINAL, ORG), 1)"


Unnamed: 0,1,2,3,4
freq,997,51,6,0
most_pres_comb,"((DATE,), 997)","((DATE, EVENT), 8)","((DATE, NORP, PERSON), 2)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,58,14,0,0
most_pres_comb,"((EVENT,), 58)","((DATE, EVENT), 8)","(None, 0)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,22,5,0,0
most_pres_comb,"((FAC,), 22)","((GPE, FAC), 2)","(None, 0)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,1255,126,8,0
most_pres_comb,"((GPE,), 1255)","((GPE, PERSON), 34)","((GPE, DATE, ORG), 2)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,6,2,1,0
most_pres_comb,"((LANGUAGE,), 6)","((LANGUAGE, ORDINAL), 2)","((DATE, LANGUAGE, ORDINAL), 1)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,11,0,0,0
most_pres_comb,"((LAW,), 11)","(None, 0)","(None, 0)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,54,6,0,0
most_pres_comb,"((LOC,), 54)","((GPE, LOC), 2)","(None, 0)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,146,5,0,1
most_pres_comb,"((MONEY,), 146)","((MONEY, DATE), 1)","(None, 0)","((MONEY, CARDINAL, CARDINAL, ORG), 1)"


Unnamed: 0,1,2,3,4
freq,293,89,4,0
most_pres_comb,"((NORP,), 293)","((NORP, PERSON), 43)","((DATE, NORP, PERSON), 2)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,111,29,3,0
most_pres_comb,"((ORDINAL,), 111)","((NORP, ORDINAL), 5)","((DATE, LANGUAGE, ORDINAL), 1)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,875,102,5,1
most_pres_comb,"((ORG,), 875)","((ORG, PERSON), 21)","((CARDINAL, CARDINAL, ORG), 2)","((MONEY, CARDINAL, CARDINAL, ORG), 1)"


Unnamed: 0,1,2,3,4
freq,81,2,0,0
most_pres_comb,"((PERCENT,), 81)","((CARDINAL, PERCENT), 1)","(None, 0)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,1076,173,13,0
most_pres_comb,"((PERSON,), 1076)","((CARDINAL, PERSON), 51)","((CARDINAL, PERSON, CARDINAL), 3)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,22,11,0,0
most_pres_comb,"((PRODUCT,), 22)","((GPE, PRODUCT), 9)","(None, 0)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,51,3,0,0
most_pres_comb,"((QUANTITY,), 51)","((QUANTITY, QUANTITY), 2)","(None, 0)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,83,8,1,0
most_pres_comb,"((TIME,), 83)","((DATE, TIME), 7)","((CARDINAL, GPE, TIME), 1)","(None, 0)"


Unnamed: 0,1,2,3,4
freq,10,2,0,0
most_pres_comb,"((WORK_OF_ART,), 10)","((DATE, WORK_OF_ART), 1)","(None, 0)","(None, 0)"


# Fix segmentation errors
One of the possible post-processing steps is to fix segmentation errors.
Write a function that extends the entity span to cover the full noun-compounds. Make use of `compound` dependency relation.

In [13]:
sentence = "Apple's Steve Jobs died in 2011 in Palo Alto, California."
# sentence = "New York University was established in 1800"
sentence = "Tripp's lite USB 3.0 SATA Hard Drive Dock gives you great performances"

doc = nlp(sentence)

ents = [ent for ent in doc.ents]
print(ents)

noun_c = list(doc.noun_chunks)
print(noun_c)

tmp = [(tok.text, tok.dep_) for tok in doc]
print(tmp)
spacy.displacy.render(doc)

[Tripp]
[Tripp's lite USB, 3.0 SATA, Hard Drive Dock, you, great performances]
[('Tripp', 'poss'), ("'s", 'case'), ('lite', 'amod'), ('USB', 'nsubj'), ('3.0', 'nummod'), ('SATA', 'appos'), ('Hard', 'compound'), ('Drive', 'compound'), ('Dock', 'nsubj'), ('gives', 'ROOT'), ('you', 'dative'), ('great', 'amod'), ('performances', 'dobj')]
