In [1]:
from __future__ import unicode_literals,print_function

import json
import plac
import random
import spacy
import warnings
from pathlib import Path
from spacy.util import minibatch,compounding
# Load the pretrained German spaCy pipelines (small and medium packages).
# nlp_de1 is referenced inside training() further down.
nlp_de1 = spacy.load("de_core_news_sm")
# NOTE(review): nlp_de2 is not referenced in the cells shown here — confirm it is still needed.
nlp_de2 = spacy.load("de_core_news_md")

### Create train data

In [29]:
# Read the raw annotated training records (UTF-8 JSON) and confirm the container type.
train_data = json.loads(Path("novo_train_de.json").read_text(encoding="utf-8"))
print(type(train_data))

<class 'list'>


In [30]:
# Look up spaCy's description of the MISC label, which spacy_format_for_train
# uses as the catch-all for source labels other than pers/org/loc.
spacy.explain('MISC')

'Miscellaneous entities, e.g. events, nationalities, products or works of art'

In [73]:
def spacy_format_for_train(data):
    """Convert raw annotated records into spaCy's ``(text, annotations)`` format.

    Each record is a sequence ``[text, entity, entity, ...]``. For every entity
    the items at index 1 and 2 are emitted unchanged as the first two span
    fields, and the last item is the source label, normalised as:

    * ``'pers'`` -> ``'PER'``
    * ``'org'``  -> ``'ORG'``
    * ``'loc'``  -> ``'LOC'``
    * anything else -> ``'MISC'``

    The input is NOT modified. Rewriting the labels in place would make the
    conversion non-idempotent: on a second call the already-normalised labels
    ('PER', 'ORG', 'LOC') are not among the source labels and would all be
    turned into 'MISC'.

    Parameters
    ----------
    data : list
        Raw records as described above.

    Returns
    -------
    list of tuple
        ``(text, {"entities": [(field1, field2, label), ...]})`` per record.
    """
    label_map = {'pers': "PER", 'org': 'ORG', 'loc': 'LOC'}

    DATA = []
    for record in data:
        entities = []
        for ent in record[1:]:
            # Work on a copy so the caller's record keeps its original label.
            normalised = list(ent)
            normalised[-1] = label_map.get(ent[-1], 'MISC')
            entities.append((normalised[1], normalised[2], normalised[3]))
        DATA.append((record[0], {"entities": entities}))
    return DATA

### Train the NER model

In [43]:
import random
from random import shuffle

def training(TRAIN_DATA,model,output_dir,n_iter):
    """Fine-tune the NER component of a spaCy pipeline and save the pipeline.

    Parameters
    ----------
    TRAIN_DATA : list
        Examples of the form ``(text, {"entities": [(start, end, label), ...]})``.
        The list is not modified; a shuffled copy is used for every epoch.
    model : str, Path or None
        Name or path of the pipeline to load with ``spacy.load``. When ``None``
        the notebook-level ``nlp_de1`` pipeline is used instead.
    output_dir : str or Path
        Directory the updated pipeline is written to (created if missing).
    n_iter : int
        Number of passes over the training data.
    """
    # Honour the `model` argument; fall back to the notebook-level pipeline
    # only when no model was requested.
    nlp = spacy.load(model) if model is not None else nlp_de1

    # get the ner pipe of this model so that we can modify its labels
    ner = nlp.get_pipe("ner")

    # add every label that occurs in the training data
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # train ner but not the other pipes
    pipe_exceptions = set(["ner","trf_wordpiecer","trf_tok2vec"])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy so the caller's list keeps its order.
    examples = list(TRAIN_DATA)

    # Both context managers must be entered, hence the comma: `a and b`
    # evaluates to a single object, so only one of them would be entered and
    # the disabled pipes would not be restored before the pipeline is saved.
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once",category=UserWarning,module="spacy")
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # batch up the examples using spaCy's minibatch (speeds up training)
            batches = minibatch(examples,size=compounding(4.0,32.0,1.001))
            for batch in batches:
                texts,annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses,",losses)

    # Make sure the target directory (including parents) exists before saving.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir(parents=True)
    nlp.to_disk(output_dir)
    
# Build the spaCy-format training set in this cell so it also runs on a fresh
# kernel (no cell shown in this notebook defines TRAIN_DATA). The JSON
# round-trip hands the converter a deep copy of train_data, so re-running this
# cell always starts from the raw labels.
TRAIN_DATA = spacy_format_for_train(json.loads(json.dumps(train_data)))
training(TRAIN_DATA,"de_core_news_sm","/home/zijian/ZijianStageNER/RetrainModels/",100)

### Test this new model with dev data

### Gold annotations (dev set)

In [77]:
import json
# Read the raw annotated dev records (UTF-8 JSON).
dev = json.loads(Path("novo_dev_de.json").read_text(encoding="utf-8"))

# Convert to spaCy format, then flatten each example to (text, entities),
# dropping the {"entities": ...} wrapper.
DEV_DATA = [
    (text, annotations["entities"])
    for text, annotations in spacy_format_for_train(dev)
]

In [94]:
# Inspect one converted dev example: a (text, entities) tuple.
DEV_DATA[1]

('(Condeer.] Petersburg den 18. Dec. Se.russisch- kaiserl. Majestät haben dem Prinzen vonCondé bey dessen Ankunft in Petersburg den St.Andreas-Orden u. den Maltheser Ritterorden in Polen zu ertheilen, und ihn mit einem prächtigen,völlig meublirten Palais in Petersburg zu beschenken geruhet. Das aus 3 Infanterie- und 2 Kavallerie-Regimentern bestehende Corps des Prinzen vonCondé, welches in kaiserliche Dienste genommenworden, ist nun nach Wladimir, Luzk und Kowelin Quartier verlegt. Das ganze Corps wird unterbestandiger Inspection des Prinzen von Condéstehen. Se. kaiserl. Majestät haben ihn zum Chef desadelichen Infanterie-Regiments, und den Duc deBerry zum Chef des adelichen Kavallerie-Regiments ernannt. Als der Prinz in seinen Pallasttrat, fand er daselbst bereits Leute mit seiner Libréevor, auch Carossen mit seinem Wappen. Der Prinzwar in Verlegenheit an welcher Stelle er eigentlichdas Zeichen des St. Andreas-Ordens tragen sollte.Der Kaiser antwortete ihm: Er möchte es mit denInsigni

### Evaluate on the dev set

In [95]:
from spacy.scorer import Scorer
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model,data):
    """Score a pipeline's predictions against gold entity annotations.

    Parameters
    ----------
    ner_model : spaCy pipeline
        Called on each text to produce the predicted document; its
        ``make_doc`` provides the tokenised document for the gold parse.
    data : iterable of (text, entities)
        ``entities`` is passed to ``GoldParse`` as the gold entity annotation.

    Returns
    -------
    dict
        The ``scores`` of the ``Scorer`` accumulated over all examples.
    """
    scorer = Scorer()
    for raw_text, gold_entities in data:
        # Gold side: tokenise only, then attach the reference entities.
        reference = GoldParse(ner_model.make_doc(raw_text),entities=gold_entities)
        # Predicted side: run the full pipeline on the same text.
        prediction = ner_model(raw_text)
        scorer.score(prediction,reference)
    return scorer.scores


# Reload the pipeline from the directory passed to training() as output_dir
# and score it on the dev set.
# NOTE(review): absolute, user-specific path — consider a configurable model directory.
path = "/home/zijian/ZijianStageNER/RetrainModels/"
nlp2 = spacy.load(path)
results = evaluate(nlp2,DEV_DATA)

In [96]:
# Print the scorer metrics; ents_p / ents_r / ents_f are the entity
# precision, recall and F-score.
print(results)

{'uas': 0.0, 'las': 0.0, 'ents_p': 70.9090909090909, 'ents_r': 59.09090909090909, 'ents_f': 64.46280991735537, 'tags_acc': 0.0, 'token_acc': 100.0}
