# Evaluate spaCy NER on CoNLL 2003 data

1. report token-level performance (per class and total)
    - accuracy of correctly recognizing all tokens that belong to named entities (i.e. tag-level accuracy) 
2. report CoNLL chunk-level performance (per class and total);
    - precision, recall, f-measure of correctly recognizing all the named entities in a chunk per class and total

## Import database

In [1]:
import zipfile
import itertools

#Read test corpus
zip = zipfile.ZipFile('../src/conll2003.zip')
file = zip.open('test.txt')
lines = file.readlines()
lines = [line[:-1].decode('utf-8') for line in lines[2:]]
# Divide into sentences
sentences = [list(g[1]) for g in itertools.groupby(lines, key= lambda x: x != '') if g[0]]
# Divide each word in 4-uples: word, POS_tag, CHUNK_tag, NE_tag
sentences = [[tuple(line.split()) for line in sentence] for sentence in sentences]

# Extract sentences to valuate
test = [[word for word, POS_tag, CHUNK_tag, NE_tag in sent] for sent in sentences]

# Extract references
refs = [[(word, NE_tag) for word, POS_tag, CHUNK_tag, NE_tag in sent] for sent in sentences]

# Evaluate model enforcing tokenization

In [2]:
import spacy

# Predict
from spacy.tokens import Doc

nlp = spacy.load('en_core_web_sm')

conll2003_ner_tags = {'PER', 'ORG', 'LOC', 'MISC'}

To evaluate correctly the behaviour of the spaCy model with respect to the Conll2003 dataset, the 15 labels derived from Ontonotes15 have to be remapped into the four tags used in the dataset. In this case I used the mapping described in the following table. The decisions were made according to the descriptions of the tags.

|spaCy tag|description|conll2003 tag|
|:--------|:----------|:------------|
|PERSON        |People, including fictional                           | PER|
|NORP          |Nationalities or religious or political groups        | MISC|
|FAC           |Buildings, airports, highways, bridges, etc.          | MISC|
|ORG           |Companies, agencies, institutions, etc.               | ORG|
|GPE           |Countries, cities, states                             | LOC|
|LOC           |Non-GPE locations, mountain ranges, bodies of water   | LOC|
|PRODUCT       |Vehicles, weapons, foods, etc. (Not services)         | MISC|
|EVENT         |Named hurricanes, battles, wars, sports events, etc.  | MISC|
|WORK_OF_ART   |Titles of books, songs, etc.                          | MISC|
|LAW           |Named documents made into laws                        | MISC|
|LANGUAGE      |Any named language                                    | MISC|
|DATE          |Absolute or relative dates or periods                 | -|
|TIME          |Times smaller than a day                              | -|
|PERCENT       |Percentage (including “%”)                            | -|
|MONEY         |Monetary values, including unit                       | -|
|QUANTITY      |Measurements, as of weight or distance                | -|
|ORDINAL       |“first”, “second”                                     | -|
|CARDINAL      |Numerals that do not fall under another type          | -|


In [3]:
mapping = {}
mapping['PERSON'] = 'PER'
mapping['NORP'] = 'MISC'
mapping['FAC'] = 'MISC'
mapping['ORG'] = 'ORG'
mapping['GPE'] = 'LOC'
mapping['LOC'] = 'LOC'
mapping['PRODUCT'] = 'MISC'
mapping['EVENT'] = 'MISC'
mapping['WORK_OF_ART'] = 'MISC'
mapping['LAW'] = 'MISC'
mapping['LANGUAGE'] = 'MISC'
mapping['DATE'] = ''
mapping['TIME'] = ''
mapping['PERCENT'] = ''
mapping['MONEY'] = ''
mapping['QUANTITY'] = ''
mapping['ORDINAL'] = ''
mapping['CARDINAL'] = ''

In [4]:
hyps = []
for sent in test:
    # Crate doc
    doc = Doc(nlp.vocab, sent)
    # Apply pipeline avoiding tokenization
    for name, proc in nlp.pipeline:
        doc = proc(doc)
    # Fix tagging
    for tok in doc:
        if tok.ent_iob_ != 'O':
            tok.ent_type_ = mapping[tok.ent_type_]
    
    to_app = []
    for t in doc:
        if t.ent_iob_!='O' and t.ent_type_!='':
            to_app.append((t.text, t.ent_iob_+'-'+ t.ent_type_))
        elif t.ent_iob_=='O':
            to_app.append((t.text, t.ent_iob_))
        elif t.ent_type!='':
            to_app.append((t.text, 'O'))
    
    hyps.append(to_app)

### Evaluate token-level accuracy

In [5]:
import pandas as pd

def stats (counts):
    stats = {}
    stats['Precision'] = precision(counts)
    stats['Recall'] = recall(counts)
    stats['f1'] = f1(stats['Precision'], stats['Recall'])
    return stats

def accuracy (counts):
    accuracies = {}
    for key in counts:
        count = counts[key]
        accuracies[key] = (count['TP'] + count['TN'])/(count['TP'] + count['TN']+count['FP'] + count['FN'])
    return accuracies

def precision (counts):
    precisions = {}
    for key in counts:
        count = counts[key]
        precisions[key] = (count['TP'])/(count['TP'] + count['FP'])
    return precisions

def recall (counts):
    recalls = {}
    for key in counts:
        count = counts[key]
        recalls[key] = (count['TP'])/(count['TP'] + count['FN'])
    return recalls

def f1 (prec, rec):
    f1_scores = {}
    for key in prec:
        f1_scores[key] = 2*(prec[key]*rec[key])/(prec[key]+rec[key])
    return f1_scores        
        
def get_counts_class (refs, hyps, class_t):
    counts = {'TP': 0, 'TN': 0, 'FP': 0, 'FN':0}
    for i in range(len(refs)):
        for j in range(len(refs[i])):
            _, r = refs[i][j]
            _, h = hyps[i][j]
            if (r == class_t):
                if h == r:
                    counts['TP'] +=1
                else:
                    counts['FN'] +=1
            else:
                if h == class_t:
                    counts['FP'] += 1
                else:
                    counts['TN'] += 1
            
    return counts

def all_counts(refs, hyps):
    counts = {}
    classes = ['O', 'I-PER', 'B-PER', 'I-LOC', 'B-LOC', 'I-ORG', 'B-ORG', 'I-MISC', 'B-MISC']
    for cl in  classes:
        counts[cl] = get_counts_class(refs, hyps, cl)
    total = {'TP': 0, 'FN': 0, 'FP': 0, 'TN': 0}
    return counts

# Get counts of TruePositive, False Negative, ...

counts = all_counts(refs, hyps)

# Evaluate statistics for each class

statistics = stats(counts)

#Plot

pd_tbl = pd.DataFrame().from_dict(counts, orient='index')
pd_tbl = pd_tbl.sort_index()
display(pd_tbl)


pd_tbl = pd.DataFrame().from_dict(statistics, orient='columns')
pd_tbl = pd_tbl.sort_index()
display(pd_tbl)

Unnamed: 0,TP,TN,FP,FN
B-LOC,1174,44657,340,494
B-MISC,391,45802,161,311
B-ORG,504,44500,504,1157
B-PER,985,44780,268,632
I-LOC,160,46302,106,97
I-MISC,87,46338,111,129
I-ORG,434,45230,600,401
I-PER,874,45311,198,282
O,37783,6127,1985,770


Unnamed: 0,Precision,Recall,f1
B-LOC,0.775429,0.703837,0.737901
B-MISC,0.708333,0.55698,0.623604
B-ORG,0.5,0.303432,0.37767
B-PER,0.786113,0.609153,0.686411
I-LOC,0.601504,0.622568,0.611855
I-MISC,0.439394,0.402778,0.42029
I-ORG,0.419729,0.51976,0.464419
I-PER,0.815299,0.756055,0.78456
O,0.950085,0.980027,0.964824


### Using scikit-learn library

In [6]:
from sklearn.metrics import classification_report

skl_refs = [tok[-1] for sent in refs for tok in sent]
skl_hyps = [tok[-1] for sent in hyps for tok in sent]

report = classification_report(skl_refs, skl_hyps)
    
print(report)

              precision    recall  f1-score   support

       B-LOC       0.78      0.70      0.74      1668
      B-MISC       0.71      0.56      0.62       702
       B-ORG       0.50      0.30      0.38      1661
       B-PER       0.79      0.61      0.69      1617
       I-LOC       0.60      0.62      0.61       257
      I-MISC       0.44      0.40      0.42       216
       I-ORG       0.42      0.52      0.46       835
       I-PER       0.82      0.76      0.78      1156
           O       0.95      0.98      0.96     38553

    accuracy                           0.91     46665
   macro avg       0.67      0.61      0.63     46665
weighted avg       0.90      0.91      0.90     46665



### Evaluate CoNLL chunk-level performance

In [7]:
# import conll

import os
import sys
sys.path.insert(0, os.path.abspath('../src/'))
from conll import evaluate

# Evaluate with provided function

results = evaluate(refs, hyps)

#Plot

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

Unnamed: 0,p,r,f,s
LOC,0.766182,0.695444,0.729101,1668
PER,0.761373,0.589981,0.664808,1617
MISC,0.697464,0.548433,0.614035,702
ORG,0.448413,0.272125,0.338704,1661
total,0.681997,0.522486,0.591679,5648


# Evaluate model's full pipeline 

In [8]:
# Reconstruct sentences

sents = []
for sent in test:
    sents.append(' '.join(sent))

# Predict

hyps= []

for sent in sents:
    hyps.append(nlp(sent))

### Reallign ref and model tokenization

In [9]:
def align (hyps):
    new_hyps = []
    for sent in hyps:
        curr_hyp = [[sent[0].text,sent[0].ent_iob_,sent[0].ent_type_,sent[0].whitespace_]]
        for i in range(1, len(sent)):
            last_tok = curr_hyp[-1]
            # If whitespace not present, merge the two tokens
            if(last_tok[3] == ''):
                curr_hyp.remove(last_tok)
                new_tok = [last_tok[0]+sent[i].text,last_tok[1],last_tok[2],sent[i].whitespace_]
                curr_hyp.append(new_tok)
            # Simply append new token
            else:
                curr_hyp.append([sent[i].text,sent[i].ent_iob_,sent[i].ent_type_,sent[i].whitespace_])
        new_hyps.append(curr_hyp)
    return new_hyps

hyps = align(hyps)

### Reallign format of hyps and refs

In [10]:
# Set format in tuples with retagging

for sent in hyps:
    for tok in sent:
        if tok[1] != 'O':
            tok[2] = mapping[tok[2]]
        if tok[2] == '':
            tok[1] = 'O'

hyps_ = []
for sent in hyps:
    to_app = []
    for tok in sent:
        if tok[1] != 'O' and tok[1] != '':
            to_app.append((tok[0], tok[1]+'-'+tok[2]))
        elif tok[1] == 'O':
            to_app.append((tok[0], 'O'))
    hyps_.append(to_app)
hyps = hyps_ # to del?

### Evaluate token level performance

In [11]:
# Count TruePositive, FalseNegative, ...

counts = all_counts(refs, hyps)

# Evaluate statistics for each class

statistics = stats(counts)

#Plot

pd_tbl = pd.DataFrame().from_dict(counts, orient='index')
pd_tbl = pd_tbl.sort_index()
display(pd_tbl)

pd_tbl = pd.DataFrame().from_dict(statistics, orient='columns')
pd_tbl = pd_tbl.sort_index()
display(pd_tbl)

Unnamed: 0,TP,TN,FP,FN
B-LOC,1127,44652,345,541
B-MISC,393,45804,159,309
B-ORG,510,44525,479,1151
B-PER,1017,44794,254,600
I-LOC,137,46304,104,120
I-MISC,86,46344,105,130
I-ORG,430,45228,602,405
I-PER,911,45333,176,245
O,37826,6108,2004,727


Unnamed: 0,Precision,Recall,f1
B-LOC,0.765625,0.675659,0.717834
B-MISC,0.711957,0.559829,0.626794
B-ORG,0.515672,0.307044,0.384906
B-PER,0.800157,0.628942,0.704294
I-LOC,0.568465,0.533074,0.550201
I-MISC,0.450262,0.398148,0.422604
I-ORG,0.416667,0.51497,0.460632
I-PER,0.838086,0.788062,0.812305
O,0.949686,0.981143,0.965158


### Using scikit-learn library

In [12]:
skl_refs = [tok[-1] for sent in refs for tok in sent]
skl_hyps = [tok[-1] for sent in hyps for tok in sent]

report = classification_report(skl_refs, skl_hyps)
    
print(report)

              precision    recall  f1-score   support

       B-LOC       0.77      0.68      0.72      1668
      B-MISC       0.71      0.56      0.63       702
       B-ORG       0.52      0.31      0.38      1661
       B-PER       0.80      0.63      0.70      1617
       I-LOC       0.57      0.53      0.55       257
      I-MISC       0.45      0.40      0.42       216
       I-ORG       0.42      0.51      0.46       835
       I-PER       0.84      0.79      0.81      1156
           O       0.95      0.98      0.97     38553

    accuracy                           0.91     46665
   macro avg       0.67      0.60      0.63     46665
weighted avg       0.90      0.91      0.90     46665



### Evaluate CoNLL chunk-level performance

In [13]:
# Evaluate with provided function

results = evaluate(refs, hyps)

#Plot

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

Unnamed: 0,p,r,f,s
LOC,0.755435,0.666667,0.70828,1668
PER,0.774194,0.608534,0.68144,1617
MISC,0.702899,0.552707,0.61882,702
ORG,0.464105,0.27634,0.346415,1661
total,0.686975,0.521069,0.59263,5648


# Grouping of Entities.
1. Write a function to group recognized named entities using `noun_chunks` method of [spaCy](https://spacy.io/usage/linguistic-features#noun-chunks).
2. Analyze the groups in terms of most frequent combinations (i.e. NER types that go together). 

### Group recognized named entities

In [14]:
def group_ners(doc):
    entities = list(doc.ents)
    entities = [ent.label_ for ent in entities]
    noun_chunks = list(doc.noun_chunks)
    noun_chunks = [[ent.label_ for ent in span.ents] for span in noun_chunks]
    # Remove unlabeled entities
    noun_chunks = [elem for elem in noun_chunks if elem != []]
    
    final_list = []
    j=0
    ran = list(range(len(noun_chunks)))
    for i in ran:
        if(entities[j] == noun_chunks[i][0]):
            final_list.append(noun_chunks[i])
            j+=len(noun_chunks[i])
        else:
            final_list.append([entities[j]])
            ran.insert(0, i-1)
            j+=1
    # Once the noun_chunks are ended, insert the remaining single ners
    for ner in entities[j:]:
        final_list.append([ner])
    return(final_list)

sentence = "Apple's Steve Jobs died in 2011 in Palo Alto, California."

doc = nlp(sentence)
print(sentence)
print('Entities: ', [ent.label_ for ent in doc.ents])
print('Noun chunks: ', [[span.label_ for span in chunk.ents] for chunk in doc.noun_chunks])
print(group_ners(doc))
print()

sentence = "New York University was established in 1800."

doc = nlp(sentence)
print(sentence)
print('Entities: ', [ent.label_ for ent in doc.ents])
print('Noun chunks: ', [[span.label_ for span in chunk.ents] for chunk in doc.noun_chunks])
print(group_ners(doc))

Apple's Steve Jobs died in 2011 in Palo Alto, California.
Entities:  ['ORG', 'PERSON', 'DATE', 'GPE', 'GPE']
Noun chunks:  [['ORG', 'PERSON'], ['GPE'], ['GPE']]
[['ORG', 'PERSON'], ['DATE'], ['GPE'], ['GPE']]

New York University was established in 1800.
Entities:  ['ORG', 'DATE']
Noun chunks:  [['ORG']]
[['ORG'], ['DATE']]


### Analyze frequencies

In [15]:
# Reconstruct sentences

sents = []
for sent in test:
    sents.append(' '.join(sent))

# Predict

hyps= []
grouped = []

for sent in sents:
    doc = nlp(sent)
    grouped.append(group_ners(doc))
    hyps.append(doc)

In [16]:
from itertools import groupby

# Flatten -> grouped is a list of list of groups -> grouped_flat is a list of groups
grouped_flat = []
for sent in grouped:
    for group in sent:
        grouped_flat.append(group)

# Evaluate counts using groupby 
# counts will map each group to the number of times it appears in the corpus
counts = {}
for key, group in groupby(sorted(grouped_flat), lambda x: x):
    cnts = len(list(group))
    dict_key = tuple(key)
    counts[dict_key] = cnts
    


def get_stats_tag (counts, tag):
    '''
    Extract stats for current tag: 
    - number of occurrencies of groups composed by (1, ..., 4) ners containing the tag
    - most frequent combination of ners for each group dimensionality
    '''
    stats = {}
    for i in range(1,5):
        act_stat = {}
        if tag is not None:
            # Find keys containing tag with len i
            keys = [key for key in counts.keys() if tag in key and len(key)==i]
        else:
            keys = [key for key in counts.keys() if len(key)==i]
        # Total number of occurrencies
        tot = 0
        # Max number of occurrencies of single group
        max_so_far = -1
        # Group that occurs more times
        max_key = None
        for key in keys:
            tot+=counts[key]
            if(counts[key]>max_so_far):
                max_so_far=counts[key]
                max_key = key
        act_stat['freq'] = tot
        if max_key is not None:
            act_stat['most_pres_comb'] = (max_key, counts[max_key])
        else:
            act_stat['most_pres_comb'] = (max_key, 0)
        stats[i] = act_stat
    return stats

def get_stats (counts, tags):
    stats = {}
    for tag in tags:
        stats[tag] = get_stats_tag(counts, tag)
    stats['general'] = get_stats_tag(counts, None)
    return stats
    
tags = set(mapping.keys())

statistics = get_stats(counts, tags)

# To display also general stats
tags.add('general')

for tag in sorted(tags):
    print(tag+': ')
    pd_tbl = pd.DataFrame().from_dict(statistics[tag], orient='index')
    display(pd_tbl)
    print('\n\n')

# To maintain tags clean
tags.remove('general')

CARDINAL: 


Unnamed: 0,freq,most_pres_comb
1,1621,"((CARDINAL,), 1621)"
2,125,"((CARDINAL, PERSON), 51)"
3,12,"((CARDINAL, PERSON, CARDINAL), 3)"
4,1,"((MONEY, CARDINAL, CARDINAL, ORG), 1)"





DATE: 


Unnamed: 0,freq,most_pres_comb
1,997,"((DATE,), 997)"
2,51,"((DATE, EVENT), 8)"
3,6,"((DATE, NORP, PERSON), 2)"
4,0,"(None, 0)"





EVENT: 


Unnamed: 0,freq,most_pres_comb
1,58,"((EVENT,), 58)"
2,14,"((DATE, EVENT), 8)"
3,0,"(None, 0)"
4,0,"(None, 0)"





FAC: 


Unnamed: 0,freq,most_pres_comb
1,22,"((FAC,), 22)"
2,5,"((GPE, FAC), 2)"
3,0,"(None, 0)"
4,0,"(None, 0)"





GPE: 


Unnamed: 0,freq,most_pres_comb
1,1255,"((GPE,), 1255)"
2,126,"((GPE, PERSON), 34)"
3,8,"((GPE, DATE, ORG), 2)"
4,0,"(None, 0)"





LANGUAGE: 


Unnamed: 0,freq,most_pres_comb
1,6,"((LANGUAGE,), 6)"
2,2,"((LANGUAGE, ORDINAL), 2)"
3,1,"((DATE, LANGUAGE, ORDINAL), 1)"
4,0,"(None, 0)"





LAW: 


Unnamed: 0,freq,most_pres_comb
1,11,"((LAW,), 11)"
2,0,"(None, 0)"
3,0,"(None, 0)"
4,0,"(None, 0)"





LOC: 


Unnamed: 0,freq,most_pres_comb
1,54,"((LOC,), 54)"
2,6,"((GPE, LOC), 2)"
3,0,"(None, 0)"
4,0,"(None, 0)"





MONEY: 


Unnamed: 0,freq,most_pres_comb
1,146,"((MONEY,), 146)"
2,5,"((MONEY, DATE), 1)"
3,0,"(None, 0)"
4,1,"((MONEY, CARDINAL, CARDINAL, ORG), 1)"





NORP: 


Unnamed: 0,freq,most_pres_comb
1,293,"((NORP,), 293)"
2,89,"((NORP, PERSON), 43)"
3,4,"((DATE, NORP, PERSON), 2)"
4,0,"(None, 0)"





ORDINAL: 


Unnamed: 0,freq,most_pres_comb
1,111,"((ORDINAL,), 111)"
2,29,"((NORP, ORDINAL), 5)"
3,3,"((DATE, LANGUAGE, ORDINAL), 1)"
4,0,"(None, 0)"





ORG: 


Unnamed: 0,freq,most_pres_comb
1,875,"((ORG,), 875)"
2,102,"((ORG, PERSON), 21)"
3,5,"((CARDINAL, CARDINAL, ORG), 2)"
4,1,"((MONEY, CARDINAL, CARDINAL, ORG), 1)"





PERCENT: 


Unnamed: 0,freq,most_pres_comb
1,81,"((PERCENT,), 81)"
2,2,"((CARDINAL, PERCENT), 1)"
3,0,"(None, 0)"
4,0,"(None, 0)"





PERSON: 


Unnamed: 0,freq,most_pres_comb
1,1076,"((PERSON,), 1076)"
2,173,"((CARDINAL, PERSON), 51)"
3,13,"((CARDINAL, PERSON, CARDINAL), 3)"
4,0,"(None, 0)"





PRODUCT: 


Unnamed: 0,freq,most_pres_comb
1,22,"((PRODUCT,), 22)"
2,11,"((GPE, PRODUCT), 9)"
3,0,"(None, 0)"
4,0,"(None, 0)"





QUANTITY: 


Unnamed: 0,freq,most_pres_comb
1,51,"((QUANTITY,), 51)"
2,3,"((QUANTITY, QUANTITY), 2)"
3,0,"(None, 0)"
4,0,"(None, 0)"





TIME: 


Unnamed: 0,freq,most_pres_comb
1,83,"((TIME,), 83)"
2,8,"((DATE, TIME), 7)"
3,1,"((CARDINAL, GPE, TIME), 1)"
4,0,"(None, 0)"





WORK_OF_ART: 


Unnamed: 0,freq,most_pres_comb
1,10,"((WORK_OF_ART,), 10)"
2,2,"((DATE, WORK_OF_ART), 1)"
3,0,"(None, 0)"
4,0,"(None, 0)"





general: 


Unnamed: 0,freq,most_pres_comb
1,6772,"((CARDINAL,), 1621)"
2,403,"((CARDINAL, PERSON), 51)"
3,22,"((CARDINAL, PERSON, CARDINAL), 3)"
4,1,"((MONEY, CARDINAL, CARDINAL, ORG), 1)"







# Fix segmentation errors
One of the possible post-processing steps is to fix segmentation errors.
Write a function that extends the entity span to cover the full noun-compounds. Make use of `compound` dependency relation.

In [17]:
# Reconstruct sentences

sents = []
for sent in test:
    sents.append(' '.join(sent))

# Predict

hyps= []

for sent in sents:
    doc = nlp(sent)
    hyps.append(doc)

In [18]:
errors = []
hyps_ = []

# Create usual format plus the token itself
for sent in hyps:
    sent_hyp = []
    for tok in sent:
        if tok.ent_iob_ != 'O':
            sent_hyp.append((tok.text, tok.ent_iob_+'-'+tok.ent_type_, tok))
        else:
            sent_hyp.append((tok.text, tok.ent_iob_, tok))
    hyps_.append(sent_hyp)
    
hyps = hyps_

# Check if compound token can be merged with directly successive token

def merge_succ(hyps):
    modified = True
    i = 0
    while modified:
        modified = False
        for sent in hyps:
            for tup in sent:
                if tup[-1].dep_ == 'compound' and tup[1][0] == 'O':
                    succ_i = sent.index(tup)+1
                    if succ_i >=0 and succ_i < len(sent):
                        succ = sent[succ_i]
                        if succ[-1] == tup[-1].head:
                            if succ[1] != 'O':
                                new_tup = (tup[0], succ[1], tup[-1])
                                sent[sent.index(tup)] = new_tup
                                new_tup = (succ[0], 'I'+succ[1][1:], succ[-1])
                                sent[succ_i] = new_tup
                                modified = True
                                i+=1
        print('Total merges with succ token: '+str(i))
    return hyps



# Check if compound token can be merged with directly precedent token
def merge_prec(hyps):
    modified = True
    i = 0
    while modified:
        modified = False
        for sent in hyps:
            for tup in sent:
                if tup[-1].dep_ == 'compound' and tup[1][0] == 'O':
                    prec_i = sent.index(tup)-1
                    if prec_i >=0 and prec_i < len(sent):
                        prec = sent[prec_i]
                        if prec[-1] == tup[-1].head:
                            if prec[1] != 'O':
                                print(prec, tup)
                                new_tup = (tup[0], prec[1], tup[-1])
                                sent[sent.index(tup)] = new_tup
                                new_tup = (prec[0], 'I'+prec[1][1:], prec[-1])
                                sent[prec_i] = new_tup
                                modified = True
                                i+=1
        print('Total merges with prec token: '+str(i))
    return hyps

hyps = merge_succ(hyps)
hyps = merge_prec(hyps)


Total merges with succ token: 118
Total merges with succ token: 136
Total merges with succ token: 137
Total merges with succ token: 137
Total merges with prec token: 0


In [19]:
# Prepare tokens for align function
hyps_ = []
for sent in hyps:
    tmp_sent = []
    for tup in sent:
        w, e, t = tup
        tmp_sent.append((w, e[0], e[2:], t.whitespace_))
    hyps_.append(tmp_sent)
    
hyps = hyps_

# Align using slightly different version of previously defined function

def align_2 (hyps):
    new_hyps = []
    for sent in hyps:
        curr_hyp = [list(sent[0])]
        for i in range(1, len(sent)):
            last_tok = curr_hyp[-1]
            # If whitespace not present, merge the two tokens
            if(last_tok[3] == ''):
                curr_hyp.remove(last_tok)
                new_tok = [last_tok[0]+sent[i][0],last_tok[1],last_tok[2],sent[i][3]]
                curr_hyp.append(new_tok)
            # Simply append new token
            else:
                curr_hyp.append(list(sent[i]))
        new_hyps.append(curr_hyp)
    return new_hyps

hyps = align_2(hyps)

# Apply mapping

for sent in hyps:
    for tok in sent:
        if tok[1] != 'O':
            tok[2] = mapping[tok[2]]
        if tok[2] == '':
            tok[1] = 'O'

hyps_ = []
for sent in hyps:
    to_app = []
    for tok in sent:
        if tok[1] != 'O' and tok[1] != '':
            to_app.append((tok[0], tok[1]+'-'+tok[2]))
        elif tok[1] == 'O':
            to_app.append((tok[0], 'O'))
    hyps_.append(to_app)

hyps = hyps_

### Evaluate token level performance

In [20]:
# Count TruePositive, FalseNegative, ...

counts = all_counts(refs, hyps)

# Evaluate statistics for each class

statistics = stats(counts)

#Plot

pd_tbl = pd.DataFrame().from_dict(counts, orient='index')
pd_tbl = pd_tbl.sort_index()
display(pd_tbl)

pd_tbl = pd.DataFrame().from_dict(statistics, orient='columns')
pd_tbl = pd_tbl.sort_index()
display(pd_tbl)

Unnamed: 0,TP,TN,FP,FN
B-LOC,1097,44623,374,571
B-MISC,392,45803,160,310
B-ORG,506,44522,482,1155
B-PER,1008,44785,263,609
I-LOC,137,46260,148,120
I-MISC,86,46340,109,130
I-ORG,433,45203,627,402
I-PER,917,45304,205,239
O,37750,6141,1971,803


Unnamed: 0,Precision,Recall,f1
B-LOC,0.745751,0.657674,0.698949
B-MISC,0.710145,0.558405,0.625199
B-ORG,0.512146,0.304636,0.382031
B-PER,0.793076,0.623377,0.698061
I-LOC,0.480702,0.533074,0.505535
I-MISC,0.441026,0.398148,0.418491
I-ORG,0.408491,0.518563,0.456992
I-PER,0.817291,0.793253,0.805092
O,0.950379,0.979172,0.96456


### Using scikit-learn library

In [21]:
skl_refs = [tok[-1] for sent in refs for tok in sent]
skl_hyps = [tok[-1] for sent in hyps for tok in sent]

report = classification_report(skl_refs, skl_hyps)
    
print(report)

              precision    recall  f1-score   support

       B-LOC       0.75      0.66      0.70      1668
      B-MISC       0.71      0.56      0.63       702
       B-ORG       0.51      0.30      0.38      1661
       B-PER       0.79      0.62      0.70      1617
       I-LOC       0.48      0.53      0.51       257
      I-MISC       0.44      0.40      0.42       216
       I-ORG       0.41      0.52      0.46       835
       I-PER       0.82      0.79      0.81      1156
           O       0.95      0.98      0.96     38553

    accuracy                           0.91     46665
   macro avg       0.65      0.60      0.62     46665
weighted avg       0.90      0.91      0.90     46665



### Evaluate CoNLL chunk-level performance

In [22]:
# Evaluate with provided function

results = evaluate(refs, hyps)

#Plot

pd_tbl = pd.DataFrame().from_dict(results, orient='index')
pd_tbl.round(decimals=3)
display(pd_tbl)

Unnamed: 0,p,r,f,s
LOC,0.736413,0.64988,0.690446,1668
PER,0.766326,0.60235,0.674515,1617
MISC,0.701087,0.551282,0.617225,702
ORG,0.45905,0.273329,0.342642,1661
total,0.676704,0.513279,0.58377,5648
