In [1]:
from __future__ import unicode_literals,print_function

import json
import plac
import random
import spacy
import warnings
from pathlib import Path
from spacy.util import minibatch,compounding
# Pretrained German spaCy pipelines, loaded once for the whole notebook.
# nlp_de1 is the pipeline that training() further down starts from;
# nlp_de2 is not referenced in the cells visible here.
nlp_de1 = spacy.load("de_core_news_sm")
nlp_de2 = spacy.load("de_core_news_md")

### Create train data

In [2]:
# Read the annotated training corpus (UTF-8 JSON) and show its first record.
# Each record is a list: the text first, then one [text, start, end, label]
# list per entity.
train_data = json.loads(Path("novo_train_de.json").read_text(encoding="utf-8"))
print(train_data[0])

['Frankreich.', ['Frankreich', 0, 10, 'loc']]


In [3]:
def spacy_format_for_train(data):
    """Convert the JSON corpus into spaCy's ``(text, {"entities": [...]})`` tuples.

    Each record of ``data`` is a list whose first item is the text and whose
    remaining items are entity lists ``[surface, start, end, label]``.

    The label (last item of every entity list) is normalised IN PLACE:
    'pers' -> 'PER', 'org' -> 'ORG', 'loc' -> 'LOC', anything else -> 'MISC'.
    The in-place rewrite is kept on purpose because later cells read the
    normalised labels back from the input list.

    Labels that are already 'PER', 'ORG', 'LOC' or 'MISC' are left alone, so
    calling the function again on the same data (e.g. re-running the cell)
    no longer collapses every label to 'MISC'.

    Args:
        data: list of records as described above; mutated in place.

    Returns:
        list of ``(text, {"entities": [(start, end, label), ...]})`` tuples.
    """
    raw_to_spacy = {'pers': 'PER', 'org': 'ORG', 'loc': 'LOC'}
    already_normalised = ('PER', 'ORG', 'LOC', 'MISC')

    DATA = []
    for record in data:
        entities = record[1:]
        for entity in entities:
            label = entity[-1]
            if label in already_normalised:
                # Second pass over the same data: nothing to do.
                continue
            if isinstance(label, str):
                entity[-1] = raw_to_spacy.get(label, 'MISC')
            else:
                # Non-string labels fall into the catch-all class, as before.
                entity[-1] = 'MISC'
        values = [(x[1],x[2],x[3]) for x in entities]
        DATA.append((record[0],{"entities":values}))
    return DATA
# Convert the loaded corpus into (text, {"entities": [...]}) training tuples.
# Note: spacy_format_for_train also rewrites the labels inside train_data in place.
TRAIN_DATA = spacy_format_for_train(train_data)
TRAIN_DATA[0]

('Frankreich.', {'entities': [(0, 10, 'LOC')]})

### Train a model from scratch on the training data

In [8]:
import random
from random import shuffle

def create_model(output_dir,n_iter,TRAIN_DATA):
    """Train an NER model on top of a blank German pipeline and save it.

    Args:
        output_dir: directory handed to ``nlp.to_disk``.
        n_iter: number of passes over the training examples.
        TRAIN_DATA: list of ``(text, {"entities": [(start, end, label), ...]})``
            tuples. The list itself is not reordered; a copy is shuffled.

    Returns:
        None. The summed losses of every pass are printed.
    """
    nlp = spacy.blank("de")

    # A blank pipeline has no NER component: create one and append it.
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner,last=True)

    # Register every label that occurs in the annotations.
    for _text, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Train the NER component only; every other pipe is disabled meanwhile.
    pipe_exceptions = set(["ner","trf_wordpiecer","trf_tok2vec"])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy so the caller's list keeps its order.
    examples = list(TRAIN_DATA)

    # The two context managers must be separated by a comma so that both are
    # entered and both are exited. Written as `a and b`, only one of the two
    # objects is handed to `with`.
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once",category=UserWarning,module="spacy")
        nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # Batch the examples with spaCy's minibatch helper (batch size
            # grows from 4 towards 32).
            batches = minibatch(examples,size=compounding(4.0,32.0,1.001))
            for batch in batches:
                texts,annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses,",losses)

    nlp.to_disk(output_dir)
    
# Train the from-scratch model for 101 passes and save it.
# NOTE(review): the output directory is an absolute, machine-specific path.
create_model("/home/zijian/ZijianStageNER/Novo_Models/Create/",101,TRAIN_DATA)

Losses, {'ner': 21901.252704107843}
Losses, {'ner': 5575.999694520142}
Losses, {'ner': 5339.034751672356}
Losses, {'ner': 4942.349053597078}
Losses, {'ner': 4680.652463650156}
Losses, {'ner': 4660.858366197004}
Losses, {'ner': 4230.358784696698}
Losses, {'ner': 4125.303089174908}
Losses, {'ner': 3803.2817659351276}
Losses, {'ner': 3809.411183443386}
Losses, {'ner': 3493.343972873814}
Losses, {'ner': 3603.288161922901}
Losses, {'ner': 3420.027510996748}
Losses, {'ner': 3299.4164641096067}
Losses, {'ner': 3260.965101563066}
Losses, {'ner': 3103.8086719831554}
Losses, {'ner': 3102.76538397102}
Losses, {'ner': 2945.934652855336}
Losses, {'ner': 2729.9249583415403}
Losses, {'ner': 3000.407493845226}
Losses, {'ner': 2719.1636139813672}
Losses, {'ner': 2567.027094748677}
Losses, {'ner': 2579.592337316328}
Losses, {'ner': 2529.518233144954}
Losses, {'ner': 2421.1188388117816}
Losses, {'ner': 2635.7878836361465}
Losses, {'ner': 2338.217444127071}
Losses, {'ner': 2348.42946402892}
Losses, {'ner'

In [None]:
def training(TRAIN_DATA,model,output_dir,n_iter):
    """Continue training the NER component of a pretrained pipeline and save it.

    Args:
        TRAIN_DATA: list of ``(text, {"entities": [(start, end, label), ...]})``
            tuples. The list itself is not reordered; a copy is shuffled.
        model: name or path of the pipeline to start from (passed to
            ``spacy.load``). ``None`` falls back to the preloaded ``nlp_de1``.
        output_dir: directory handed to ``nlp.to_disk``.
        n_iter: number of passes over the training examples.

    Returns:
        None. The summed losses of every pass are printed.
    """
    # Honour the `model` argument: a fresh pipeline is loaded, so the
    # notebook-wide nlp_de1 object is not modified by training.
    nlp = spacy.load(model) if model is not None else nlp_de1

    # The pretrained pipeline already has an NER component; fetch it so the
    # labels can be extended.
    ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations.
    for _text, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Train the NER component only; every other pipe is disabled meanwhile.
    pipe_exceptions = set(["ner","trf_wordpiecer","trf_tok2vec"])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy so the caller's list keeps its order.
    examples = list(TRAIN_DATA)

    # The two context managers must be separated by a comma so that both are
    # entered and both are exited. Written as `a and b`, the disabled pipes
    # are never restored and are therefore missing from the saved model.
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once",category=UserWarning,module="spacy")
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # Batch the examples with spaCy's minibatch helper (batch size
            # grows from 4 towards 32).
            batches = minibatch(examples,size=compounding(4.0,32.0,1.001))
            for batch in batches:
                texts,annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses,",losses)

    nlp.to_disk(output_dir)
    
# Continue training de_core_news_sm's NER on TRAIN_DATA for 100 passes and save it.
# NOTE(review): this cell (definition and call) is repeated verbatim further
# down in the notebook; the output directory is an absolute local path.
training(TRAIN_DATA,"de_core_news_sm","/home/zijian/ZijianStageNER/RetrainModels/",100)

### Test this new model with dev data

### Gold 

In [4]:
import json
# Read the dev corpus; it has the same record layout as the training file.
with open("novo_dev_de.json", "r", encoding="utf-8") as dev_file:
    dev = json.load(dev_file)

# The scorer only needs (text, entity list) pairs, so the {"entities": ...}
# wrapper produced by spacy_format_for_train is unwrapped here.
# spacy_format_for_train also normalises the labels inside `dev` in place.
DEV_DATA = [
    (text, annotations["entities"])
    for text, annotations in spacy_format_for_train(dev)
]

In [5]:
# Peek at the second dev example: a (text, entity list) pair.
DEV_DATA[1]

('(Condeer.] Petersburg den 18. Dec. Se.russisch- kaiserl. Majestät haben dem Prinzen vonCondé bey dessen Ankunft in Petersburg den St.Andreas-Orden u. den Maltheser Ritterorden in Polen zu ertheilen, und ihn mit einem prächtigen,völlig meublirten Palais in Petersburg zu beschenken geruhet. Das aus 3 Infanterie- und 2 Kavallerie-Regimentern bestehende Corps des Prinzen vonCondé, welches in kaiserliche Dienste genommenworden, ist nun nach Wladimir, Luzk und Kowelin Quartier verlegt. Das ganze Corps wird unterbestandiger Inspection des Prinzen von Condéstehen. Se. kaiserl. Majestät haben ihn zum Chef desadelichen Infanterie-Regiments, und den Duc deBerry zum Chef des adelichen Kavallerie-Regiments ernannt. Als der Prinz in seinen Pallasttrat, fand er daselbst bereits Leute mit seiner Libréevor, auch Carossen mit seinem Wappen. Der Prinzwar in Verlegenheit an welcher Stelle er eigentlichdas Zeichen des St. Andreas-Ordens tragen sollte.Der Kaiser antwortete ihm: Er möchte es mit denInsigni

### Test

In [6]:
from spacy.scorer import Scorer
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model,data):
    """Score a pipeline's entity predictions against gold annotations.

    Args:
        ner_model: loaded spaCy pipeline; used both to tokenise the gold text
            (``make_doc``) and to produce the predictions.
        data: iterable of ``(text, entities)`` pairs, where ``entities`` is
            passed as-is to ``GoldParse(..., entities=...)``.

    Returns:
        The ``scores`` attribute of the ``Scorer`` after all pairs were scored.
    """
    scorer = Scorer()
    for text, gold_entities in data:
        # Gold side: tokenise only, then attach the reference entities.
        gold = GoldParse(ner_model.make_doc(text), entities=gold_entities)
        # Predicted side: run the full pipeline on the same text.
        scorer.score(ner_model(text), gold)
    return scorer.scores


# Score the model saved by create_model on the dev set.
# NOTE(review): absolute, machine-specific path.
path = "/home/zijian/ZijianStageNER/Novo_Models/Create"
nlp2 = spacy.load(path)
results = evaluate(nlp2,DEV_DATA)

### Train by updating the spaCy model de_core_news_sm

In [None]:
def training(TRAIN_DATA,model,output_dir,n_iter):
    """Continue training the NER component of a pretrained pipeline and save it.

    Args:
        TRAIN_DATA: list of ``(text, {"entities": [(start, end, label), ...]})``
            tuples. The list itself is not reordered; a copy is shuffled.
        model: name or path of the pipeline to start from (passed to
            ``spacy.load``). ``None`` falls back to the preloaded ``nlp_de1``.
        output_dir: directory handed to ``nlp.to_disk``.
        n_iter: number of passes over the training examples.

    Returns:
        None. The summed losses of every pass are printed.
    """
    # Honour the `model` argument: a fresh pipeline is loaded, so the
    # notebook-wide nlp_de1 object is not modified by training.
    nlp = spacy.load(model) if model is not None else nlp_de1

    # The pretrained pipeline already has an NER component; fetch it so the
    # labels can be extended.
    ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations.
    for _text, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Train the NER component only; every other pipe is disabled meanwhile.
    pipe_exceptions = set(["ner","trf_wordpiecer","trf_tok2vec"])
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    # Shuffle a private copy so the caller's list keeps its order.
    examples = list(TRAIN_DATA)

    # The two context managers must be separated by a comma so that both are
    # entered and both are exited. Written as `a and b`, the disabled pipes
    # are never restored and are therefore missing from the saved model.
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once",category=UserWarning,module="spacy")
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # Batch the examples with spaCy's minibatch helper (batch size
            # grows from 4 towards 32).
            batches = minibatch(examples,size=compounding(4.0,32.0,1.001))
            for batch in batches:
                texts,annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses,",losses)

    nlp.to_disk(output_dir)
    
# Continue training de_core_news_sm's NER on TRAIN_DATA for 100 passes and save it.
# NOTE(review): the output directory is an absolute, machine-specific path.
training(TRAIN_DATA,"de_core_news_sm","/home/zijian/ZijianStageNER/RetrainModels/",100)

In [14]:
# pandas is otherwise only imported in a later cell, so a top-to-bottom run
# would fail on `pd` here without this import.
import pandas as pd

# Score the retrained model on the same dev set as the from-scratch one.
path = "/home/zijian/ZijianStageNER/RetrainModels/"
nlp2 = spacy.load(path)
results2 = evaluate(nlp2,DEV_DATA)

# One row per model, one column per entity-level score; `results` comes from
# the evaluation of the from-scratch model in an earlier cell.
res = pd.DataFrame(
    {
        "precison": [results['ents_p'], results2['ents_p']],
        "recall": [results['ents_r'], results2['ents_r']],
        "f1": [results['ents_f'], results2['ents_f']],
    },
    index=[
        "Model only with my train_data",
        "Model with data TigerCorpus and my train data",
    ],
)
res

Unnamed: 0,precison,recall,f1
Model only with my train_data,64.814815,53.030303,58.333333
Model with data TigerCorpus and my train data,70.909091,59.090909,64.46281


### So what is the difference? What can Model 1 detect? What can Model 2 detect?

In [7]:
# Reload both saved models: the one trained from a blank pipeline and the
# retrained one. Only SM_plus_Train_DATA_Model is used in the comparison cell
# that follows. NOTE(review): absolute, machine-specific paths.
Train_Data_Model = spacy.load("/home/zijian/ZijianStageNER/Novo_Models/Create/")
SM_plus_Train_DATA_Model = spacy.load("/home/zijian/ZijianStageNER/RetrainModels/")

In [9]:
# Result buckets, accumulated over all dev documents:
#   trouvés_corrects        - predicted and in the gold annotations
#   trouvés_incorrect       - predicted but not in the gold annotations
#   non_trouvé_mais_correct - in the gold annotations but not predicted
trouvés_corrects = []
trouvés_incorrect = []
non_trouvé_mais_correct = []
res1 = []          # per document: predicted [text, start, end, label] lists
correction1 = []   # per document: gold entity lists taken from `dev`

# Pass 1: run the model on every dev text and keep predictions next to gold.
for record in dev:
    doc = SM_plus_Train_DATA_Model(record[0])
    correction1.append(record[1:])
    res1.append(
        [[ent.text, ent.start_char, ent.end_char, ent.label_] for ent in doc.ents]
    )

# Pass 2: compare per document. An entity counts as matched only when text,
# both offsets and the label are all equal.
for predicted_ents, gold_ents in zip(res1, correction1):
    l1 = [(x[0],x[1],x[2],x[3]) for x in predicted_ents]
    l2 = [(x[0],x[1],x[2],x[3]) for x in gold_ents]
    predicted, gold = set(l1), set(l2)
    trouvés_corrects.extend(predicted & gold)
    trouvés_incorrect.extend(predicted - gold)
    non_trouvé_mais_correct.extend(gold - predicted)

In [8]:
def context(text,i,j,window=5):
    """Return the span ``text[i:j]`` together with a few surrounding words.

    The window is measured in space characters: starting at the span, the
    scan walks outward on each side until it has passed ``window`` spaces or
    reached the edge of ``text``. When an edge is reached the context simply
    extends to that edge (no IndexError, no dropped first character).

    Args:
        text: the full document text.
        i: start offset of the span (inclusive).
        j: end offset of the span (exclusive).
        window: number of spaces to pass on each side; defaults to 5.

    Returns:
        The substring of ``text`` covering the span and its context.
    """
    # Walk left from the span start, counting spaces.
    left = i
    spaces = 0
    while spaces < window and left > 0:
        left -= 1
        if text[left] == ' ':
            spaces += 1
    # If the scan moved and stopped on a space, start right after it.
    # Otherwise it hit the beginning of the text (or never moved), and that
    # character belongs to the result.
    if left < i and text[left] == ' ':
        start = left + 1
    else:
        start = left

    # Walk right from the span end, never stepping past the last character.
    right = j
    spaces = 0
    last = len(text) - 1
    while spaces < window and right < last:
        right += 1
        if text[right] == ' ':
            spaces += 1
    # Stopped on the closing space -> exclude it; ran out of text -> take
    # everything up to the end.
    end = right if spaces == window else len(text)

    return text[start:end]

In [10]:
# Peek at the first correctly found entity: (text, start, end, label).
trouvés_corrects[0]

('Rußland', 0, 7, 'LOC')

In [57]:
import pandas as pd

# The three result lists have different lengths, but a DataFrame needs equal
# columns. Padding is applied to the derived columns only, so the result
# lists themselves are not altered and stay usable in later cells.
n_rows = max(len(trouvés_corrects),len(trouvés_incorrect),len(non_trouvé_mais_correct))
report_columns = {
    "Trouvé corrects": trouvés_corrects,
    "Trouvés incorrects": trouvés_incorrect,
    "non trouvé mais correct": non_trouvé_mais_correct,
}

# Each cell shows (entity text, label); missing rows are filled with ("", "").
d = pd.DataFrame(
    {
        title: [(x[0],x[3]) for x in entities] + [("","")] * (n_rows - len(entities))
        for title, entities in report_columns.items()
    },
    index=list(range(1,n_rows+1)),
)

# Well-formed page: <meta> lives inside <head>, nothing follows </html>, and
# the title matches the model this report is about (see the file name below).
html_string = '''
<html>
  <head>
    <meta charset="UTF-8">
    <title>Model with train data and spacy data</title>
  </head>
  <body>
    {table}
  </body>
</html>
'''
with open("Model with train data and spacy data.html","w",encoding='utf-8') as f:
    f.write(html_string.format(table=d.to_html()))

### FP and their context

In [None]:
# Rebuild the false positives per document so that each one can be paired with
# its surrounding words. The flat trouvés_incorrect list no longer knows which
# document an entity came from, so the per-document lists res1 / correction1
# (and the texts in `dev`) from the comparison cell are used instead.
# Each item is (text, start, end, label, context), in document order.
trouvés_incorrect = [
    (ent[0], ent[1], ent[2], ent[3], context(dev[k][0], ent[1], ent[2]))
    for k in range(len(res1))
    for ent in sorted(
        {(p[0], p[1], p[2], p[3]) for p in res1[k]}
        - {(g[0], g[1], g[2], g[3]) for g in correction1[k]},
        key=lambda found: found[1],
    )
]

In [None]:
import pandas as pd

# The three result lists have different lengths, but a DataFrame needs equal
# columns. Padding is applied to the derived columns only, so the result
# lists themselves are not altered.
n_rows = max(len(trouvés_corrects),len(trouvés_incorrect),len(non_trouvé_mais_correct))
report_columns = {
    "Trouvé corrects": trouvés_corrects,
    "Trouvés incorrects": trouvés_incorrect,
    "non trouvé mais correct": non_trouvé_mais_correct,
}

# Each cell shows (entity text, label); missing rows are filled with ("", "").
d = pd.DataFrame(
    {
        title: [(x[0],x[3]) for x in entities] + [("","")] * (n_rows - len(entities))
        for title, entities in report_columns.items()
    },
    index=list(range(1,n_rows+1)),
)

# Well-formed page: <meta> lives inside <head> and nothing follows </html>.
# NOTE(review): the title still reads "Model only with train data" - confirm
# it matches the model this report is written for.
html_string = '''
<html>
  <head>
    <meta charset="UTF-8">
    <title>Model only with train data</title>
  </head>
  <body>
    {table}
  </body>
</html>
'''