# NER Testing

This notebook loads the trained spaCy and Flair models and uses them to predict the named entities in given sentences, either read from a file or provided as a sequence of sentences.

## Initialization

In [27]:
import json

import numpy as np
import spacy
from flair.data import Sentence
from flair.models import SequenceTagger
from spacy import displacy

## spaCy

Here we test the spaCy model.

### Backend functions.

In [28]:
# These functions build data in the (text, annotations) format expected by spaCy
def _reformat_data(data):
    for counter, example_ in enumerate(data):
        index_ = 0
        annotations = {}
        sentence, ner_tag = example_
        for word, tag in zip(sentence, ner_tag):
            #-------------------------------------#
            # analysing the NER tag
            if '-' in tag:
                In, tag = tag.split('-')
                if tag not in annotations:
                    annotations[tag] = []
            else:
                In = tag
                
            #-------------------------------------#
            # creating the training data
            if In == 'B':
                annotations[tag].append([index_, index_+len(word)])
            elif In == 'I':
                annotations[tag][-1][1] = index_+len(word)
            elif In != 'O':
                print('=====!!!!!', In)
                
            index_ += len(word) + 1
        
        # fix the format
        ann = {'entities':[ (val[0],val[1],key) for key in annotations for val in annotations[key]]}
            
        ## update the training data to fit spacy format
        text = ' '.join(sentence)
        data[counter] = (text, ann)
    return data

def _create_training_data(raw_data):
    """Read a token-per-line annotated file and return spaCy-style examples.

    The file is expected to hold one ``word POS chunk tag`` line per token
    (four space-separated fields), with an empty line between sentences.
    Lines that do not split into exactly four fields are reported on stdout
    and skipped.

    Compared with a naive reader, the final sentence is kept even when the
    file does not end with a blank line, and runs of blank lines do not
    produce empty examples.

    Args:
        raw_data: path of the annotated text file.

    Returns:
        The list produced by ``_reformat_data``: one
        ``(text, {'entities': [...]})`` tuple per sentence.
    """
    TRAIN_DATA = []
    sentence = []
    ner_tag = []

    # ``with`` closes the file even if reading fails part-way through.
    with open(raw_data, 'r') as File_:
        for line in File_:
            line = line.rstrip('\n')

            if line == '':
                # Sentence boundary: flush only when something was collected.
                if sentence:
                    TRAIN_DATA.append([sentence, ner_tag])
                    sentence = []
                    ner_tag = []
                continue

            try:
                word, POS1, CNK2, tag = line.split(' ')
            except ValueError:
                # Wrong number of fields on this line.
                print('you have a bad line..', line)
                continue

            sentence.append(word)
            ner_tag.append(tag)

    # The file may end without a trailing blank line.
    if sentence:
        TRAIN_DATA.append([sentence, ner_tag])

    return _reformat_data(TRAIN_DATA)

In [29]:
def predict_on_texts(texts):
    """Run the spaCy pipeline on each text and show the entities it finds.

    Uses the module-level ``nlp`` pipeline. A text with at least one entity is
    rendered with ``displacy`` (entity style, notebook output); a text with
    none is reported with a plain message. A separator line and a blank line
    are printed after every text.

    Args:
        texts: iterable of strings to analyse.
    """
    palette = {
        'ORG': 'orange',
        'PER': '#aa9cfc',
        'LOC': 'green',
        'MISC': 'yellow',
    }
    render_options = {'ents': ['ORG', 'PER', 'LOC', 'MISC'], 'colors': palette}

    for raw_text in texts:
        parsed = nlp(raw_text)
        if not parsed.ents:
            print('no entities detected: ', raw_text)
        else:
            displacy.render(parsed, style='ent', jupyter=True, options=render_options)
        print('--------------------------')
        print()

In [30]:
def predict_on_test_set(filepath):
    """Evaluate the spaCy model on an annotated file and print the scores.

    The file type is taken from the extension (case-insensitive):
      * ``.json`` - loaded with ``json.load``; expected to yield
        ``(text, {'entities': [[start, end, label], ...]})`` pairs.
      * ``.txt``  - parsed with ``_create_training_data``.
    Any other extension is reported and evaluated as an empty dataset.

    A predicted entity counts as a true positive only if its text, character
    offsets and label all match a ground-truth entity exactly. Precision,
    recall and F1 are printed; each is reported as 0 when its denominator is
    zero (for example when there is no data or no predictions).

    Uses the module-level ``nlp`` pipeline.

    Args:
        filepath: path of the annotated file.
    """
    ext = filepath.rsplit('.', 1)[-1].lower()
    if ext == 'json':
        # Context manager so the handle is not leaked.
        with open(filepath, 'r') as handle:
            VAL_DATA = json.load(handle)
    elif ext == 'txt':
        VAL_DATA = _create_training_data(filepath)
    else:
        print("  -unsupported file '%s': expected a .json or .txt file" % filepath)
        VAL_DATA = []

    TP, FN, FP = 0, 0, 0  # True positives, False negatives, False Positives
    for text, ann in VAL_DATA:
        doc = nlp(text)
        GT = sorted(ann['entities'], key=lambda tup: tup[0])
        Entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        Ground_Truth = [(text[a[0]:a[1]], a[0], a[1], a[2]) for a in GT]

        matched = sum(1 for value in Entities if value in Ground_Truth)
        TP += matched
        FP += len(Entities) - matched
        FN += sum(1 for value in Ground_Truth if value not in Entities)

    # Guard every ratio: an empty dataset or zero predictions must not crash.
    Pr = TP / (TP + FP) if (TP + FP) else 0.0
    Re = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (Pr * Re) / (Pr + Re) if (Pr + Re) else 0.0
    print('  -Validation: -precision=%.3f -recall=%.3f -f1 score=%.3f' % (Pr, Re, F1))

### Loading the model

In [31]:
## load the spaCy model
model = 'Spacy/'

# Load the trained pipeline from disk (no training happens here). The
# resulting `nlp` object is read as a global by the spaCy prediction helpers.
try:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
except OSError:
    # spaCy raises OSError when the model cannot be found. Re-raise after the
    # hint so the notebook stops here instead of failing later with a
    # confusing NameError on `nlp`.
    print("Could not find the model, please check you have the model in your directory")
    raise

Loaded model 'Spacy/'


### Reading from a text file.

In [32]:
# To test the model on a txt file use:
# (prints the precision, recall and F1 score of the spaCy model on that file)
predict_on_test_set('Dataset/ner_dataset_test.txt')

  -Validation: -precision=0.973 -recall=0.978 -f1 score=0.976


### Sequence of sentences.


In [40]:
# To test the model with a sequence of sentences use:
# (each text is rendered with its entities highlighted, or reported as having none)
predict_on_texts(['New York city', 'Muhannad Alomari is going to the city of New York.'])

--------------------------



--------------------------



## Flair model

Here we test the Flair model.

### Backend functions.

In [34]:
def predict_on_texts_flair(texts):
    """Tag each text with the Flair model and print it with inline tags.

    Uses the module-level ``flair_model``. Every text is wrapped in a Flair
    ``Sentence``, tagged, and printed via ``to_tagged_string()``.

    Args:
        texts: iterable of strings to tag.
    """
    for raw_text in texts:
        tagged = Sentence(raw_text)
        flair_model.predict(tagged)  # annotates `tagged` in place
        print(tagged.to_tagged_string())


In [35]:
def predict_on_test_set_flair(filepath):
    """Evaluate the Flair model on an annotated ``.txt`` file and print the scores.

    The file is parsed with ``_create_training_data``; any extension other
    than ``txt`` is reported and evaluated as an empty dataset. A predicted
    span counts as a true positive only if its ``(start, end, label)`` triple
    matches a ground-truth entity exactly. Progress (the sentence index) is
    printed every 20 sentences. Precision, recall and F1 are printed at the
    end; each is reported as 0 when its denominator is zero.

    Uses the module-level ``flair_model``.

    Args:
        filepath: path of the annotated file.
    """
    print('  -reading data..')
    ext = filepath.split('.')[-1]
    if ext == 'txt':
        VAL_DATA = _create_training_data(filepath)
    else:
        print("  -unsupported file '%s': expected a .txt file" % filepath)
        VAL_DATA = []

    print('  -predicting tags..')
    TP, FN, FP = 0, 0, 0  # True positives, False negatives, False Positives
    for counter, (text, ann) in enumerate(VAL_DATA):
        sentence = Sentence(text)

        # predict NER tags (annotates `sentence` in place)
        flair_model.predict(sentence)

        Entities = [(entity.start_pos, entity.end_pos, entity.tag) for entity in sentence.get_spans('ner')]
        Ground_Truth = sorted(ann['entities'], key=lambda tup: tup[0])

        matched = sum(1 for value in Entities if value in Ground_Truth)
        TP += matched
        FP += len(Entities) - matched
        FN += sum(1 for value in Ground_Truth if value not in Entities)

        # lightweight progress indicator
        if counter % 20 == 0:
            print(counter)

    # Guard every ratio: an empty dataset or zero predictions must not crash.
    Pr = TP / (TP + FP) if (TP + FP) else 0.0
    Re = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (Pr * Re) / (Pr + Re) if (Pr + Re) else 0.0
    print('  -Validation: -precision=%.3f -recall=%.3f -f1 score=%.3f' % (Pr, Re, F1))

### Load the Flair model

In [36]:
# loading the model
# Loads the trained Flair sequence tagger from disk; `flair_model` is read as
# a global by the Flair prediction helpers.
# NOTE(review): `load_from_file` may be unavailable in newer Flair releases
# (which expose `SequenceTagger.load`) - confirm against the installed version.
flair_model: SequenceTagger = SequenceTagger.load_from_file('Flair/final-model.pt')

### Reading from a text file.

In [37]:
# Evaluate the Flair model on the test file: prints a progress counter every
# 20 sentences, then the precision, recall and F1 score.
# NOTE(review): this cell previously carried a commented-out alternative that
# replaced the model first - presumably Flair's stock tagger; confirm before use:
#     flair_model = SequenceTagger.load('ner')
predict_on_test_set_flair('Dataset/ner_dataset_test.txt')

  -reading data..
  -predicting tags..
0
[(116, 123, 'LOC'), (126, 134, 'ORG')]
[(97, 105, 'LOC'), (116, 123, 'LOC'), (126, 134, 'ORG')]
[]
[(0, 10, 'ORG')]
20
[(7, 22, 'MISC')]
[(0, 4, 'LOC'), (7, 22, 'MISC')]
[(0, 8, 'PER'), (44, 50, 'LOC'), (86, 97, 'PER')]
[(0, 8, 'PER'), (44, 50, 'LOC'), (86, 97, 'ORG')]
40
[(0, 6, 'LOC'), (7, 17, 'MISC')]
[(0, 6, 'LOC'), (7, 32, 'MISC')]
[]
[(0, 8, 'MISC')]
60
[(0, 8, 'PER')]
[(0, 8, 'PER'), (104, 114, 'LOC'), (145, 153, 'PER')]
[(4, 7, 'ORG')]
[(4, 7, 'ORG'), (30, 37, 'ORG'), (42, 49, 'ORG')]
[]
[(0, 7, 'MISC')]
[(0, 13, 'MISC'), (30, 36, 'PER'), (70, 84, 'ORG'), (97, 109, 'PER')]
[(0, 13, 'MISC'), (30, 36, 'ORG'), (70, 84, 'ORG'), (97, 109, 'PER')]
[(4, 10, 'MISC')]
[(0, 10, 'ORG')]
80
[(4, 25, 'ORG'), (35, 48, 'PER'), (108, 111, 'ORG')]
[(4, 25, 'ORG'), (35, 48, 'PER')]
[(4, 9, 'LOC')]
[(4, 9, 'ORG')]
100
[(5, 38, 'ORG')]
[(17, 38, 'ORG')]
120
[(52, 59, 'LOC'), (75, 77, 'LOC'), (95, 102, 'LOC'), (134, 143, 'LOC')]
[(52, 59, 'LOC'), (75, 77, 'L

[(0, 5, 'MISC'), (133, 140, 'LOC'), (177, 181, 'LOC')]
[(0, 5, 'MISC'), (133, 135, 'LOC'), (177, 181, 'LOC')]
[(4, 11, 'MISC'), (144, 151, 'PER'), (169, 174, 'PER')]
[(4, 11, 'MISC'), (144, 151, 'PER'), (165, 174, 'PER')]
820
[(0, 11, 'LOC'), (95, 105, 'LOC'), (129, 134, 'LOC'), (174, 190, 'ORG')]
[(0, 11, 'LOC'), (95, 105, 'LOC'), (129, 134, 'LOC')]
840
[]
[(21, 27, 'ORG')]
[(77, 83, 'MISC'), (183, 189, 'LOC')]
[(77, 83, 'MISC'), (153, 168, 'MISC'), (183, 189, 'LOC')]
860
[(0, 8, 'ORG'), (111, 124, 'LOC')]
[(0, 8, 'ORG'), (97, 103, 'MISC'), (111, 124, 'LOC')]
[(101, 108, 'MISC'), (119, 124, 'LOC')]
[(101, 115, 'LOC'), (119, 124, 'LOC')]
880
[(39, 60, 'MISC')]
[(39, 49, 'MISC')]
900
[]
[(31, 39, 'PER')]
[(0, 10, 'ORG')]
[(0, 3, 'ORG')]
[(12, 14, 'ORG')]
[]
[(70, 86, 'ORG')]
[]
920
[(46, 62, 'ORG')]
[]
[(0, 7, 'PER'), (45, 52, 'LOC'), (69, 76, 'MISC'), (98, 105, 'PER')]
[(0, 7, 'MISC'), (45, 52, 'LOC'), (69, 76, 'MISC'), (98, 105, 'PER')]
940
[(73, 79, 'LOC')]
[(0, 15, 'ORG'), (73, 79, 

1820
[(50, 56, 'LOC'), (60, 65, 'LOC')]
[(30, 40, 'MISC'), (50, 56, 'LOC'), (60, 65, 'LOC')]
[(18, 30, 'LOC'), (99, 108, 'LOC')]
[(18, 30, 'LOC'), (99, 108, 'LOC'), (143, 159, 'MISC')]
[]
[(24, 33, 'ORG')]
1840
  -Validation: -precision=0.952 -recall=0.933 -f1 score=0.943


### Sequences of text.

In [39]:
# Tag a couple of sample sentences with the Flair model and print each one
# with its predicted tags inline.
predict_on_texts_flair(['New York city.','Muhannad Alomari is going to the city of New York.'])

Muhannad <B-PER> Alomari <I-PER> is going to the city of New <B-LOC> York. <I-LOC>
