In [1]:
import spacy
import json
from spacy.training.example import Example
from spacy.scorer import Scorer

### Evaluation functions

In [2]:

def Evaluate_ner(data, model):
    """Score a spaCy NER pipeline against gold-annotated texts.

    Parameters
    ----------
    data : iterable of (text, annots) pairs
        ``annots`` is handed to ``Example.from_dict`` unchanged, so it must
        already be in the NER annotation format, i.e.
        ``{'entities': [[start, end, label], ...]}``.
    model : spacy.Language
        The loaded pipeline to evaluate.

    Returns
    -------
    The value returned by ``model.evaluate`` for the built examples.
    """
    # Only tokenize here. model.evaluate() applies the pipeline components to
    # the predicted docs itself, so calling model(text) at this point would
    # pay for a full inference pass twice (expensive for transformer models).
    examples = [
        Example.from_dict(model.make_doc(text), annots)
        for text, annots in data
    ]
    return model.evaluate(examples)



def Evaluate_spancat(data, model, span_key='sc'):
    """Score a spaCy spancat pipeline against NER-style gold annotations.

    The test data follows the NER annotation format
    ``{'entities': [[start, end, label], [start, end, label]]}``, whereas a
    spancat example needs
    ``{'spans': {span_key: [[start, end, label], [start, end, label]]}}``.
    Each annotation is converted to the spancat format; after that the
    evaluation is the same as for NER.

    Parameters
    ----------
    data : iterable of (text, annots) pairs
        ``annots`` must contain an ``'entities'`` key (a ``KeyError`` is
        raised otherwise).
    model : spacy.Language
        The loaded pipeline to evaluate.
    span_key : str, optional
        Key under ``'spans'`` that the gold spans are stored in. Defaults to
        ``'sc'``; it should match the spans key the spancat component was
        trained with.

    Returns
    -------
    The value returned by ``model.evaluate`` for the built examples.
    """
    examples = []
    for text, annots in data:
        # Build a fresh dict so the caller's annotation dict is not modified.
        spancat_annots = {'spans': {span_key: annots['entities']}}
        # Only tokenize here: model.evaluate() applies the pipeline itself,
        # so model(text) would run inference twice.
        examples.append(Example.from_dict(model.make_doc(text), spancat_annots))
    return model.evaluate(examples)


# def Evaluate2(data, model):
#     scorer = Scorer()
#     examples = []
#     for text, annots in data:
#         doc = model.make_doc(text)
#         example = Example.from_dict(doc, annots)
#         example.predicted = model(str(example.predicted))
#         examples.append(example)
#     return scorer.score(examples)

In [3]:
# Load the gold-annotated test data.
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding (JSON files are expected to be UTF-8).
with open(r'E:\Work\Data_Science\Projects\Custom_NER\data\Train\test\test_data_json.json', 'r', encoding='utf-8') as f:
    test_data_json = json.load(f)

### Evaluating NER models

In [4]:
# Best NER checkpoint produced by the conventional (non-transformer) training run.
ner_model_dir = r"E:\Work\Data_Science\Projects\Custom_NER\models\model_ner_ef\model-best"
nlp_ner = spacy.load(ner_model_dir)

In [5]:
# Score the conventionally trained NER model on the test set.
ner_scores = Evaluate_ner(test_data_json, nlp_ner)
print(ner_scores)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.4558499952166842, 'ents_r': 0.4010605167915159, 'ents_f': 0.4267036804871496, 'ents_per_type': {'B-per': {'p': 0.9048780487804878, 'r': 0.30360065466448444, 'f': 0.45465686274509803}, 'I-per': {'p': 0.6119969627942293, 'r': 0.62, 'f': 0.6159724875811998}, 'B-tim': {'p': 0.6615776081424937, 'r': 0.18544935805991442, 'f': 0.28969359331476324}, 'B-org': {'p': 0.27367205542725176, 'r': 0.16481223922114047, 'f': 0.20572916666666669}, 'B-geo': {'p': 0.4578339991122947, 'r': 0.7512745812090313, 'f': 0.568946497517926}, 'I-org': {'p': 0.3824110671936759, 'r': 0.3073868149324861, 'f': 0.34081902245706736}, 'B-gpe': {'p': 0.33007209062821835, 'r': 0.5194489465153971, 'f': 0.40365239294710326}, 'I-geo': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-eve': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-tim': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-gpe': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-art': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'B-nat': {'p': 0.0, 'r'

In [6]:
# Best NER checkpoint produced by the transformer training run.
ner_trf_model_dir = r"E:\Work\Data_Science\Projects\Custom_NER\models\model_ner_trf_acc\model-best"
nlp_ner_trf = spacy.load(ner_trf_model_dir)

In [7]:
# Score the transformer-based NER model on the test set.
ner_trf_scores = Evaluate_ner(test_data_json, nlp_ner_trf)
print(ner_trf_scores)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'ents_p': 0.7268560953253895, 'ents_r': 0.6674522346603822, 'ents_f': 0.6958887280066692, 'ents_per_type': {'B-geo': {'p': 0.6183513990420973, 'r': 0.8932993445010925, 'f': 0.7308207954714733}, 'B-per': {'p': 0.8341511285574092, 'r': 0.6955810147299509, 'f': 0.7585899152164213}, 'I-per': {'p': 0.7091128545564273, 'r': 0.9038461538461539, 'f': 0.7947243828204261}, 'B-tim': {'p': 0.8788617886178862, 'r': 0.7710413694721826, 'f': 0.8214285714285714}, 'B-gpe': {'p': 0.9122807017543859, 'r': 0.6320907617504052, 'f': 0.7467687888942077}, 'B-org': {'p': 0.6935300794551645, 'r': 0.42489568845618914, 'f': 0.5269512721000431}, 'I-geo': {'p': 0.7926829268292683, 'r': 0.20733652312599682, 'f': 0.3286978508217447}, 'I-org': {'p': 0.7471264367816092, 'r': 0.56791104050834, 'f': 0.6453068592057762}, 'B-eve': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'I-tim': {'p': 0.75, 'r': 0.2777777777777778, 'f': 0.4054054054054055}, 'I-gpe': {'p': 0.0, 'r':

### Evaluating spancat models

In [8]:
# Best spancat checkpoint produced by the transformer training run.
spancat_trf_model_dir = r"E:\Work\Data_Science\Projects\Custom_NER\models\models_spancat_trf_acc\model-best"
nlp_spancat_trf = spacy.load(spancat_trf_model_dir)

In [9]:
# Score the transformer-based spancat model on the test set.
spancat_trf_scores = Evaluate_spancat(test_data_json, nlp_spancat_trf)
print(spancat_trf_scores)


{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'spans_sc_p': 0.0030941382904534445, 'spans_sc_r': 0.555677131554583, 'spans_sc_f': 0.006154009637428907, 'spans_sc_per_type': {'I-eve': {'p': 6.730562136549644e-05, 'r': 1.0, 'f': 0.00013460218324741225}, 'I-tim': {'p': 0.001865929891975394, 'r': 0.6934156378600823, 'f': 0.0037218445770957464}, 'B-art': {'p': 0.0004835452399237705, 'r': 0.3953488372093023, 'f': 0.000965909090909091}, 'B-geo': {'p': 0.012498693288730923, 'r': 0.6966496722505463, 'f': 0.02455680928357788}, 'B-gpe': {'p': 0.004147207847238501, 'r': 0.18638573743922204, 'f': 0.008113876492688693}, 'I-nat': {'p': 1.0417426270665569e-05, 'r': 0.6666666666666666, 'f': 2.083452697810812e-05}, 'I-org': {'p': 0.0037090199230636733, 'r': 0.2565528196981732, 'f': 0.007312324549488364}, 'I-art': {'p': 5.009442799677392e-05, 'r': 0.4, 'f': 0.00010017631030613881}, 'B-nat': {'p': 4.3273183608118046e-05, 'r': 0.5, 'f': 8.653887759075765e-05}, 'B-tim': {'p': 0.00858034