In [1]:
import spacy
import conll
import pandas as pd
from sklearn.metrics import classification_report
import numpy as np

In [2]:
def remove_doc(path):
    file = conll.read_corpus_conll(path, ' ')
    for tst in file:
        if tst[0][0] == '-DOCSTART-':
            file.remove(tst)
    return file

In [3]:
#function that allows us to change the spacy representation, so that we can actually compare the results with the conll ones
def changeRep(token):
    if token.ent_iob_ != 'O':
        if token.ent_type_ == 'PERSON':
            return token.ent_iob_ + '-PER'
        if token.ent_type_ == 'ORG':
            return token.ent_iob_ + '-ORG'
        if (token.ent_type_ == 'GPE') or (token.ent_type_ == 'LOC') or (token.ent_type_ == 'FAC'):
            return token.ent_iob_ + '-LOC'
        if token.ent_type == '':
                return 'O'
        else:
            return token.ent_iob_ + '-MISC'
    else:
        return token.ent_iob_

In [4]:
def get_stats(refs, hyps):
    pred = []
    true = []
    for ent in refs:
        true.append(ent[0][1])
    for ent in hyps:
        pred.append(ent[0][1])
    res = classification_report(true, pred)
    return res

In [5]:
def get_sent(sent):
    res = []
    token = ''
    for t in sent:
        if t.whitespace_:
            token += t.text
            label = changeRep(t)
            res.append((token, label))
            token = ''
        else:
            token += t.text
    return res

In [6]:
#obtain the data from conll2003 dataset and remove the -DOCSTART- entries
def evaluate(path):
    nlp = spacy.load('en_core_web_sm')
    file = remove_doc(path)
    refs = [[(text, iob) for text, pos, _, iob in sent] for sent in file]
    hyps = []
    for row in file:
        text = ''.join(t[0]+' ' for t in row)
        doc = nlp(text)
        tokenized = get_sent(doc)
        hyps.append(tokenized)

    accuracies = get_stats(refs, hyps)
    print(accuracies)

    #run the conll evaluation function to obtain the chunk level accuracies
    results = conll.evaluate(refs, hyps)

    pd_tbl = pd.DataFrame().from_dict(results, orient='index')
    print('\t1.2 - report CoNLL chunk-level performance (per class and total)\n')
    print(pd_tbl.round(decimals=3))

In [7]:
def group_ents(sent):
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(sent)
    groups = []
    ne = []
    for chunk in doc.noun_chunks:
        temp = []
        for e in chunk.ents:
            if e not in ne:
                ne.append(e)
                temp.append(e.label_)
        if ne and len(temp) != 0:
            groups.append(temp)
    for ent in doc.ents:
        if ent not in ne:
            groups.append([ent.label_])        
    return groups

In [8]:
def get_frequencies(path):
    nlp = spacy.load('en_core_web_sm')
    file = remove_doc(path)
    groups = {}
    for row in file[:100]:
        sent = ''.join(t[0]+' ' for t in row)
        g = group_ents(sent)
        for group in g:
            st = ''
            for x in group:
                st += x + ' '
            if st not in groups:
                groups[st] = 1
            else:
                groups[st] += 1
    return groups

In [9]:
def get_head(dep, token):
    if token.dep_ == 'compound':
        dep.append(token)
        get_head(dep, token.head)
    else:
        dep.append(token)
    return dep

In [10]:
def compound_seg(sent):
    res = []
    compounds = []
    skip = []
    ent_c = []
    indexes = []
    for t in sent:
        if t.dep_ == 'compound':
            compounds.append(get_head([], t))
    for span in compounds:
        indexes.append([t.i for t in span])
        for t in span:            
            skip.append(t.i)
    for t in sent:
        if (not t.i in skip):
            indexes.append([t.i])
    indexes = sorted(indexes, key=lambda x: x[0])
    for ind in indexes:
        if len(ind) == 1:
            ent_c.append(sent[ind[0]:ind[0]+1])
        else:
            ent_c.append(sent[ind[0]:ind[-1]+1])
    return ent_c

In [11]:
print('-----------------------------#1-----------------------------\n')
print('Evaluate spaCy NER on CoNLL 2003 data (provided)')
print('\t1.1 - report token-level performance (per class and total)\n')
evaluate('src/conll2003/test.txt')
print('\n-----------------------------#2-----------------------------\n')
tst = "Apple's Steve Jobs died in 2011 in Palo Alto, California."
print("Test sentence: Apple's Steve Jobs died in 2011 in Palo Alto, California.\n", f'Result: {group_ents(tst)}')
frequencies = get_frequencies('src/conll2003/test.txt')
sort = {k: v for k, v in sorted(frequencies.items(), key=lambda item: item[1], reverse=True)}
print('Frequencies:\n')
for y in sort:
    print(y,': ',sort[y])
print('\n-----------------------------#3-----------------------------\n')
    print('Fix segmentation errors.\n')
print('Test sentence: "Apple\'s Steve Jobs died in 2011 in Palo Alto, California."')
nlp = spacy.load('en_core_web_sm')
doc = nlp(tst)
comp = compound_seg(doc)
for c in comp:
    for x in c:
        print(x.text, ': ', (f'{x.ent_iob_}-{x.ent_type_}' if x.ent_iob_ != 'O' else x.ent_iob_))
    

-----------------------------#1-----------------------------

Evaluate spaCy NER on CoNLL 2003 data (provided)
	1.1 - report token-level performance (per class and total)



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-LOC       0.60      0.47      0.52       407
      B-MISC       0.27      0.58      0.37       116
       B-ORG       0.53      0.22      0.31       747
       B-PER       0.62      0.55      0.58       298
      I-MISC       0.00      0.00      0.00         0
       I-ORG       0.00      0.00      0.00         0
       I-PER       0.00      0.00      0.00         0
           O       0.71      0.87      0.78      1885

    accuracy                           0.64      3453
   macro avg       0.34      0.34      0.32      3453
weighted avg       0.64      0.64      0.62      3453

	1.2 - report CoNLL chunk-level performance (per class and total)

           p      r      f     s
LOC    0.749  0.671  0.708  1668
MISC   0.111  0.546  0.184   702
PER    0.774  0.609  0.681  1617
ORG    0.464  0.276  0.346  1661
total  0.408  0.521  0.458  5648

-----------------------------#2-----------------------------

Test sentence: Apple'