# NER Testing

This notebook loads the trained spaCy and Flair models and uses them to predict the named entities in given sentences, which are either read from a file or provided as a sequence of sentences.

## Initialization

In [11]:
import json

from flair.data import Sentence
from flair.models import SequenceTagger
import spacy
from spacy import displacy

## spaCy

Here we test the spaCy model.

### Backend functions.

In [19]:
# These functions create training data suitable for the spaCy tool
def _reformat_data(data):
    for counter, example_ in enumerate(data):
        index_ = 0
        annotations = {}
        sentence, ner_tag = example_
        for word, tag in zip(sentence, ner_tag):
            #-------------------------------------#
            # analysing the NER tag
            if '-' in tag:
                In, tag = tag.split('-')
                if tag not in annotations:
                    annotations[tag] = []
            else:
                In = tag
                
            #-------------------------------------#
            # creating the training data
            if In == 'B':
                annotations[tag].append([index_, index_+len(word)])
            elif In == 'I':
                annotations[tag][-1][1] = index_+len(word)
            elif In != 'O':
                print('=====!!!!!', In)
                
            index_ += len(word) + 1
        
        # fix the format
        ann = {'entities':[ (val[0],val[1],key) for key in annotations for val in annotations[key]]}
            
        ## update the training data to fit spacy format
        text = ' '.join(sentence)
        data[counter] = (text, ann)
    return data

def _create_training_data(raw_data):
    """Read a column-formatted annotation file and return spaCy-style examples.

    The file is expected to hold one token per line as exactly four
    space-separated fields; the first field is the word and the last one the
    NER tag (the two middle columns are read but ignored). An empty line ends
    the current sentence.

    Args:
        raw_data: path of the text file to read.

    Returns:
        The result of ``_reformat_data`` applied to the collected
        ``[words, tags]`` sentences.
    """
    TRAIN_DATA = []
    sentence = []
    ner_tag = []

    # `with` guarantees the handle is closed even if reading fails part-way
    with open(raw_data, 'r') as File_:
        for line in File_:
            line = line.rstrip('\n')

            if line == '':
                # sentence boundary; ignore repeated blank lines so that no
                # empty sentence is emitted
                if sentence:
                    TRAIN_DATA.append([sentence, ner_tag])
                    sentence = []
                    ner_tag = []
                continue

            try:
                word, _pos, _chunk, tag = line.split(' ')
            except ValueError:
                # wrong number of columns: report the line and keep going
                print('you have a bad line..', line)
                continue

            sentence.append(word)
            ner_tag.append(tag)

    # keep the last sentence when the file does not end with a blank line
    if sentence:
        TRAIN_DATA.append([sentence, ner_tag])

    return _reformat_data(TRAIN_DATA)

In [20]:
def predict_on_texts(texts):
    """Run the global spaCy pipeline ``nlp`` on each text and show its entities.

    Texts with at least one detected entity are rendered inline with displaCy;
    for the others a plain message is printed. A separator line and a blank
    line follow every text.

    Args:
        texts: iterable of strings to analyse.
    """
    palette = {
        'ORG': 'orange',
        'PER': '#aa9cfc',
        'LOC': 'green',
        'MISC': 'yellow',
    }
    render_options = {'ents': ['ORG', 'PER', 'LOC', 'MISC'], 'colors': palette}

    for text in texts:
        doc = nlp(text)
        if doc.ents:
            displacy.render(doc, style='ent', jupyter=True, options=render_options)
        else:
            print('no entities detected: ', text)
        print('--------------------------')
        print()

In [21]:
def predict_on_test_set(filepath):
    """Score the global spaCy pipeline ``nlp`` on an annotated file.

    ``.json`` files are loaded as-is and must already hold
    ``(text, {'entities': [(start, end, label), ...]})`` items; ``.txt`` files
    are parsed with ``_create_training_data``. Any other extension yields an
    empty evaluation set.

    A predicted entity counts as a true positive only when its text, character
    offsets and label all equal a ground-truth entity. Precision, recall and F1
    are printed; a metric whose denominator is zero is reported as 0.

    Args:
        filepath: path of the annotated file.
    """
    ext = filepath.rsplit('.', 1)[-1].lower()
    if ext == 'json':
        # context manager so the handle is closed after parsing
        with open(filepath, 'r') as handle:
            VAL_DATA = json.load(handle)
    elif ext == 'txt':
        VAL_DATA = _create_training_data(filepath)
    else:
        VAL_DATA = []

    TP, FN, FP = 0, 0, 0 # True positives, False negatives, False Positives
    for text, ann in VAL_DATA:
        doc = nlp(text)
        GT = sorted(ann['entities'], key=lambda tup: tup[0])
        Entities = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        Ground_Truth = [(text[a[0]:a[1]], a[0], a[1], a[2]) for a in GT]

        matched = sum(1 for value in Entities if value in Ground_Truth)
        TP += matched
        FP += len(Entities) - matched
        FN += sum(1 for value in Ground_Truth if value not in Entities)

    ## computing Precision and Recall, guarding against empty denominators
    Pr = TP / (TP + FP) if (TP + FP) else 0.0
    Re = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (Pr * Re) / (Pr + Re) if (Pr + Re) else 0.0
    print('  -Validation: -precision=%.3f -recall=%.3f -f1 score=%.3f'  % (Pr, Re, F1))

### Loading the model

In [28]:
## load the spaCy model
model = 'Spacy/'

# Load the trained pipeline stored under `model`. The prediction and evaluation
# functions defined above read the global `nlp` created here.
try:
    nlp = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
except OSError:
    # spaCy reports a missing model directory/package as OSError. Re-raise so the
    # notebook stops here instead of failing later with an undefined `nlp`.
    print("Could not find the model, please check you have the model in your directory")
    raise

Loaded model 'Spacy/'


### Reading from a text file.

In [27]:
# Evaluate the spaCy model on an annotated .txt file; the call prints
# precision, recall and F1 for the entities found in that file.
predict_on_test_set('Dataset/ner_dataset_test.txt')

  -Validation: -precision=0.987 -recall=0.989 -f1 score=0.988


### Sequence of sentences.

In [24]:
# Run the spaCy model on a list of raw sentences; each sentence is rendered
# with its entities highlighted, or reported as having no entities.
predict_on_texts(['New York city', 'My name is Muhannad, and I live in the US. I work in Rolls Royce.'])

--------------------------



--------------------------



## Flair model

Here we test the Flair model.

### Backend functions.

In [29]:
def predict_on_texts_flair(texts):
    """Tag every text with the global ``flair_model`` and print the result.

    Args:
        texts: iterable of strings; each one is wrapped in a Flair ``Sentence``,
            passed to ``flair_model.predict`` and printed via
            ``to_tagged_string()``.
    """
    for raw_text in texts:
        tagged = Sentence(raw_text)
        # predict() is called for its effect on `tagged`; its return value is unused
        flair_model.predict(tagged)
        print(tagged.to_tagged_string())


In [None]:
def predict_on_test_set_flair(filepath):
    """Score the global Flair tagger ``flair_model`` on an annotated file.

    ``.json`` files are loaded as-is and must already hold
    ``(text, {'entities': [(start, end, label), ...]})`` items; ``.txt`` files
    are parsed with ``_create_training_data``. Any other extension yields an
    empty evaluation set.

    A predicted span counts as a true positive only when its character offsets
    and label equal a ground-truth entity. Precision, recall and F1 are
    printed; a metric whose denominator is zero is reported as 0.

    Args:
        filepath: path of the annotated file.
    """
    ext = filepath.rsplit('.', 1)[-1].lower()
    if ext == 'json':
        # context manager so the handle is closed after parsing
        with open(filepath, 'r') as handle:
            VAL_DATA = json.load(handle)
    elif ext == 'txt':
        VAL_DATA = _create_training_data(filepath)
    else:
        VAL_DATA = []

    TP, FN, FP = 0, 0, 0 # True positives, False negatives, False Positives
    for text, ann in VAL_DATA:
        GT = sorted(ann['entities'], key=lambda tup: tup[0])
        Ground_Truth = [(text[a[0]:a[1]], a[0], a[1], a[2]) for a in GT]

        if text.strip():
            # predict with the Flair tagger (not the spaCy `nlp` pipeline)
            sentence = Sentence(text)
            flair_model.predict(sentence)
            # NOTE(review): `start_pos` / `end_pos` / `tag` are the span attributes of
            # the Flair release that offers `SequenceTagger.load_from_file`; confirm
            # against the installed version.
            Entities = [(text[span.start_pos:span.end_pos], span.start_pos, span.end_pos, span.tag)
                        for span in sentence.get_spans('ner')]
        else:
            # nothing to tag in an empty text
            Entities = []

        matched = sum(1 for value in Entities if value in Ground_Truth)
        TP += matched
        FP += len(Entities) - matched
        FN += sum(1 for value in Ground_Truth if value not in Entities)

    ## computing Precision and Recall, guarding against empty denominators
    Pr = TP / (TP + FP) if (TP + FP) else 0.0
    Re = TP / (TP + FN) if (TP + FN) else 0.0
    F1 = 2 * (Pr * Re) / (Pr + Re) if (Pr + Re) else 0.0
    print('  -Validation: -precision=%.3f -recall=%.3f -f1 score=%.3f'  % (Pr, Re, F1))

### Load the Flair model

In [25]:
# Load the trained Flair sequence tagger from disk. The Flair helper functions
# above read the global `flair_model` created here.
# NOTE(review): `load_from_file` may not exist in every Flair release — confirm
# against the installed version before re-running.
flair_model: SequenceTagger = SequenceTagger.load_from_file('Flair/final-model.pt')

NameError: name 'flair_folder' is not defined

### Reading from a text file.

### Sequences of text.