In [1]:
import numpy as np
import pandas as pd
import spacy
import random
from pathlib import Path
from tqdm import tqdm,trange
import pickle
from sklearn.metrics import f1_score
from spacy.training.example import Example

In [2]:
f = open('NER_data_spacy.json','rb')
data = pickle.load(f)

In [None]:
model = None
output_dir=Path("model1/")
n_iter=200

In [None]:
if model is not None:
    nlp1 = spacy.load(model)  # load existing spaCy model
    print("Loaded model '%s'" % model)
else:
    nlp1 = spacy.blank('ar')  # create blank Language class
    print("Created blank 'ar' model")

In [None]:
@Language.component('ner')

In [None]:
if 'ner' not in nlp1.pipe_names:
    ner = nlp1.add_pipe('ner')
    nlp1.add_pipe(ner, last=True)
else:
    ner = nlp1.get_pipe('ner')

In [None]:
for _, annotations in data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])
other_pipes = [pipe for pipe in nlp1.pipe_names if pipe != 'ner']

In [None]:
with nlp1.disable_pipes(*other_pipes):  # only train NER
    optimizer = nlp1.begin_training()
    for itn in trange(n_iter):
        random.shuffle(data)
        losses = {}
        for batch in spacy.util.minibatch(data, size=1):
            for text, annotations in batch:
                doc = nlp1.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp1.update([example], losses=losses, drop=0.5)
        print(losses)

In [None]:
if output_dir is not None:
    output_dir = Path(output_dir)
if not output_dir.exists():
    output_dir.mkdir()
nlp1.to_disk(output_dir)
print("Saved model to", output_dir)

In [3]:
output_dir=Path("spacy_model/")

In [4]:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc = nlp2("مسبار ناسا البعيد يكشف همهمة غريبة من الفضاء بين النجوم")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Loading from spacy_model
Entities []


In [5]:
doc = nlp2("جوزيه توماس هو دبلوماسي و نقابي و و سياسي أسترالي ، ولد في 28 أبريل 1863 في المملكة المتحدة ، وتوفي في 5 فبراير 1933 أستراليا .")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('جوزيه', 'person'), ('توماس', 'person'), ('نقابي', 'person'), ('سياسي', 'artwork'), ('28', 'time'), ('أبريل', 'time'), ('1863', 'time'), ('المملكة', 'location'), ('المتحدة ،', 'location'), ('5', 'time'), ('فبراير', 'time'), ('1933', 'time'), ('أستراليا', 'location')]


In [6]:
doc = nlp2("جون يحب البيت الأزرق في نهاية الشارع")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('جون', 'person')]


In [7]:
k = []
l = []
for z in range(len(data)):
    a = [(tok.text, tok.idx, tok.ent_type_) for tok in nlp2(data[z][0])]
    b = data[z][1]['entities']
    d = []
    e = []
    f = []
    g = []
    h = ['']*len(a)
    for i in a:
        d.append(i[2])
        e.append(i[1])
    for i in b:
        f.append(i[0])
        g.append(i[2])
    for i,j in zip(f,g):
        h[e.index(i)] = j
    assert len(d) == len(h)
    k.extend(d)
    l.extend(h)

In [8]:
f1_score(l,k,average='macro')

0.7100269953063391

In [9]:
(sum(np.array(k) == np.array(l))) / len(k)

0.9352114344709695