In [101]:
from __future__ import unicode_literals,print_function

import json
import plac
import random
import spacy
import warnings
from pathlib import Path
from spacy.util import minibatch,compounding
# Load the two pretrained German spaCy packages by name.
nlp_de1 = spacy.load("de_core_news_sm")
nlp_de2 = spacy.load("de_core_news_md")

### Create train data

In [122]:
# Read the annotated training records and print the first one.
train_path = Path("novo_train_de.json")
train_data = json.loads(train_path.read_text(encoding="utf-8"))
print(train_data[0])

['Frankreich.', ['Frankreich', 0, 10, 'loc']]


In [129]:
def spacy_format_for_train(data):
    """Convert annotated records into spaCy ``(text, {"entities": [...]})`` tuples.

    Each record is a list whose first item is the text and whose remaining
    items are annotations of the form ``[surface, start, end, label]``.

    The records are normalised IN PLACE: the last item of every annotation is
    rewritten to one of ``PER`` / ``ORG`` / ``LOC`` / ``MISC`` and newlines and
    tabs in the text are replaced by spaces. Later cells read the rewritten
    labels from the same records, so this side effect is kept on purpose.

    The conversion is idempotent: labels that are already normalised are left
    alone, so calling the function again on the same records (e.g. re-running
    the cell) no longer turns every label into ``MISC``.

    Args:
        data: list of records as described above.

    Returns:
        A new list of ``(text, {"entities": [(start, end, label), ...]})``.
    """
    label_map = {"pers": "PER", "org": "ORG", "loc": "LOC"}
    normalised_labels = ("PER", "ORG", "LOC", "MISC")

    formatted = []
    for record in data:
        for annotation in record[1:]:
            raw_label = annotation[-1]
            # Already converted by an earlier call: keep as is.
            if raw_label in normalised_labels:
                continue
            # Anything that is not pers/org/loc is treated as MISC.
            annotation[-1] = label_map.get(raw_label, "MISC")

        record[0] = record[0].replace("\n", " ").replace("\t", " ")
        entities = [(ann[1], ann[2], ann[3]) for ann in record[1:]]
        formatted.append((record[0], {"entities": entities}))
    return formatted
# Convert the loaded records to spaCy training tuples and preview the first two.
TRAIN_DATA = spacy_format_for_train(train_data)
TRAIN_DATA[0:2]

[('Frankreich.', {'entities': [(0, 10, 'LOC')]}),
 ("Gesetzgeber. Den 19. Niv. (8. Jän.) warddie Staatskleidung der Sekretär-Redacteurs, derStaatsbothen und Thorwächter für beyde Räthebestimmt. Auf Talot's Vorschlag beschloß manüber den constitutionellen Umkreis, welchen dasgesetzgebende Corps in Zukunft inne haben soll, folgendes: Vom Tage an, da der Rath der 500 in seinen neuen Pallast installirt seyn wird, sind dieäußerlichen Bezirke für beyde Räthe folgendermassen firir: Rarh der Alten: Der Umfang des Nationalpallastes der Alten, in den Tuillerien situirt,enhält gegen Westen die Straße und den Platz desCarrousel bis zum Eintritt in die Straße Nicaise,am Hause Coigny vorbey bis zur Straße des Orties, die Passage de Marigny mit einbegriffen: gegen Suden den Theil des Quai der Gallerien desLouvre's von der Passage de Marigny am rechtenUfer der Seine hinab bis zum Quai der Tuilerien,vom rechten Winkel des Parapets bey der Nationalbrucke bis an den Eingang zum Platze de la Coneorde, die

### Training data

In [8]:
import random
from random import shuffle

def create_model(output_dir,n_iter,TRAIN_DATA):
    """Train a blank German pipeline holding only an NER component, then save it.

    Args:
        output_dir: directory passed to ``nlp.to_disk``.
        n_iter: number of passes over the training examples.
        TRAIN_DATA: sequence of ``(text, {"entities": [(start, end, label), ...]})``
            pairs. It is only read; shuffling is done on a private copy so the
            caller's list keeps its order.

    Prints the accumulated losses after every pass.
    """
    nlp = spacy.blank("de")

    # Create an NER component and append it to the pipeline.
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner,last=True)

    # Register every label that occurs in the annotations.
    for _text,annotation in TRAIN_DATA:
        for ent in annotation.get("entities", ()):
            ner.add_label(ent[2])

    # Components listed here stay enabled; all others are disabled while training.
    pipe_exceptions = {"ner","trf_wordpiecer","trf_tok2vec"}
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    examples = list(TRAIN_DATA)

    # The comma enters BOTH context managers. `a and b` evaluates to a single
    # object, so only one of the two would be entered (and exited).
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once",category=UserWarning,module="spacy")
        nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # Batch the examples with spaCy's minibatch helper; batch sizes are
            # produced by compounding(4.0,32.0,1.001).
            batches = minibatch(examples,size=compounding(4.0,32.0,1.001))
            for batch in batches:
                texts,annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses,",losses)

    nlp.to_disk(output_dir)
    
# Train for 101 iterations on TRAIN_DATA and save the pipeline to the given directory.
# NOTE(review): the output directory is an absolute, machine-specific path.
create_model("/home/zijian/ZijianStageNER/Novo_Models/Create/",101,TRAIN_DATA)

Losses, {'ner': 21901.252704107843}
Losses, {'ner': 5575.999694520142}
Losses, {'ner': 5339.034751672356}
Losses, {'ner': 4942.349053597078}
Losses, {'ner': 4680.652463650156}
Losses, {'ner': 4660.858366197004}
Losses, {'ner': 4230.358784696698}
Losses, {'ner': 4125.303089174908}
Losses, {'ner': 3803.2817659351276}
Losses, {'ner': 3809.411183443386}
Losses, {'ner': 3493.343972873814}
Losses, {'ner': 3603.288161922901}
Losses, {'ner': 3420.027510996748}
Losses, {'ner': 3299.4164641096067}
Losses, {'ner': 3260.965101563066}
Losses, {'ner': 3103.8086719831554}
Losses, {'ner': 3102.76538397102}
Losses, {'ner': 2945.934652855336}
Losses, {'ner': 2729.9249583415403}
Losses, {'ner': 3000.407493845226}
Losses, {'ner': 2719.1636139813672}
Losses, {'ner': 2567.027094748677}
Losses, {'ner': 2579.592337316328}
Losses, {'ner': 2529.518233144954}
Losses, {'ner': 2421.1188388117816}
Losses, {'ner': 2635.7878836361465}
Losses, {'ner': 2338.217444127071}
Losses, {'ner': 2348.42946402892}
Losses, {'ner'

In [None]:
def training(TRAIN_DATA,model,output_dir,n_iter):
    """Continue training the NER component of an existing pipeline, then save it.

    Args:
        TRAIN_DATA: sequence of ``(text, {"entities": [(start, end, label), ...]})``
            pairs. It is only read; shuffling is done on a private copy.
        model: name or path of the pipeline to load with ``spacy.load``. When it
            is not a string, the preloaded global ``nlp_de1`` is used, which is
            what this function always did before.
        output_dir: directory passed to ``nlp.to_disk``.
        n_iter: number of passes over the training examples.

    Prints the accumulated losses after every pass.
    """
    # Load a fresh pipeline from `model` instead of training the shared global
    # object in place; re-running the cell then starts from the same weights.
    nlp = spacy.load(model) if isinstance(model, str) else nlp_de1

    # The loaded pipeline already has an NER component; fetch it to add labels.
    ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations.
    for _text,annotation in TRAIN_DATA:
        for ent in annotation.get("entities", ()):
            ner.add_label(ent[2])

    # Components listed here stay enabled; all others are disabled while training.
    pipe_exceptions = {"ner","trf_wordpiecer","trf_tok2vec"}
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    examples = list(TRAIN_DATA)

    # The comma enters BOTH context managers. `a and b` evaluates to a single
    # object, so only one of the two would be entered (and exited).
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once",category=UserWarning,module="spacy")
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # Batch the examples with spaCy's minibatch helper; batch sizes are
            # produced by compounding(4.0,32.0,1.001).
            batches = minibatch(examples,size=compounding(4.0,32.0,1.001))
            for batch in batches:
                texts,annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses,",losses)

    nlp.to_disk(output_dir)
    
# Run 100 training iterations on TRAIN_DATA and save the pipeline to the given directory.
# NOTE(review): the output directory is an absolute, machine-specific path.
training(TRAIN_DATA,"de_core_news_sm","/home/zijian/ZijianStageNER/RetrainModels/",100)

### Test this new model with dev data

### Gold 

In [133]:
import json
# Load the dev records.
with open("novo_dev_de.json", "r", encoding="utf-8") as dev_file:
    dev = json.load(dev_file)

# spacy_format_for_train yields (text, {"entities": [...]}) pairs; keep only
# (text, entities) for scoring.
DEV_DATA = [
    (text, annotation["entities"])
    for text, annotation in spacy_format_for_train(dev)
]

### Test

In [98]:
from spacy.scorer import Scorer
from spacy.gold import GoldParse
from spacy.scorer import Scorer

def evaluate(ner_model,data):
    """Score `ner_model` on `(text, entities)` pairs and return `Scorer.scores`.

    For every pair a GoldParse is built from the tokenised text and the gold
    entity offsets, the model is run on the same text, and both are handed to
    a single Scorer that accumulates over all pairs.
    """
    scorer = Scorer()
    for text, gold_entities in data:
        reference = GoldParse(ner_model.make_doc(text), entities=gold_entities)
        prediction = ner_model(text)
        scorer.score(prediction, reference)
    return scorer.scores


#path = "/home/zijian/ZijianStageNER/Novo_Models/Create"
#nlp2 = spacy.load(path)
#results = evaluate(nlp2,DEV_DATA)

### Train by updating spaCy's pretrained `de_core_news_sm` model

In [None]:
def training(TRAIN_DATA,model,output_dir,n_iter):
    """Continue training the NER component of an existing pipeline, then save it.

    Args:
        TRAIN_DATA: sequence of ``(text, {"entities": [(start, end, label), ...]})``
            pairs. It is only read; shuffling is done on a private copy.
        model: name or path of the pipeline to load with ``spacy.load``. When it
            is not a string, the preloaded global ``nlp_de1`` is used, which is
            what this function always did before.
        output_dir: directory passed to ``nlp.to_disk``.
        n_iter: number of passes over the training examples.

    Prints the accumulated losses after every pass.
    """
    # Load a fresh pipeline from `model` instead of training the shared global
    # object in place; re-running the cell then starts from the same weights.
    nlp = spacy.load(model) if isinstance(model, str) else nlp_de1

    # The loaded pipeline already has an NER component; fetch it to add labels.
    ner = nlp.get_pipe("ner")

    # Register every label that occurs in the annotations.
    for _text,annotation in TRAIN_DATA:
        for ent in annotation.get("entities", ()):
            ner.add_label(ent[2])

    # Components listed here stay enabled; all others are disabled while training.
    pipe_exceptions = {"ner","trf_wordpiecer","trf_tok2vec"}
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

    examples = list(TRAIN_DATA)

    # The comma enters BOTH context managers. `a and b` evaluates to a single
    # object, so only one of the two would be entered (and exited).
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        warnings.filterwarnings("once",category=UserWarning,module="spacy")
        for itn in range(n_iter):
            random.shuffle(examples)
            losses = {}

            # Batch the examples with spaCy's minibatch helper; batch sizes are
            # produced by compounding(4.0,32.0,1.001).
            batches = minibatch(examples,size=compounding(4.0,32.0,1.001))
            for batch in batches:
                texts,annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses,",losses)

    nlp.to_disk(output_dir)
    
# Run 100 training iterations on TRAIN_DATA and save the pipeline to the given directory.
# NOTE(review): the same call, with the same arguments, also appears in an earlier cell.
training(TRAIN_DATA,"de_core_news_sm","/home/zijian/ZijianStageNER/RetrainModels/",100)

In [14]:
import pandas as pd

# Scores of the model trained from scratch. `results` was previously produced
# only by commented-out code, so this cell failed with NameError on a fresh kernel.
path_created = "/home/zijian/ZijianStageNER/Novo_Models/Create"
results = evaluate(spacy.load(path_created),DEV_DATA)

# Scores of the updated pretrained model.
path = "/home/zijian/ZijianStageNER/RetrainModels/"
nlp2 = spacy.load(path)
results2 = evaluate(nlp2,DEV_DATA)

# One row per model, one column per entity-level metric.
scores_per_model = [results, results2]
res = pd.DataFrame(
    {
        "precision": [scores['ents_p'] for scores in scores_per_model],
        "recall": [scores['ents_r'] for scores in scores_per_model],
        "f1": [scores['ents_f'] for scores in scores_per_model],
    },
    index=["Model only with my train_data","Model with data TigerCorpus and my train data"],
)
res

Unnamed: 0,precison,recall,f1
Model only with my train_data,64.814815,53.030303,58.333333
Model with data TigerCorpus and my train data,70.909091,59.090909,64.46281


### So what is the difference? What can Model 1 detect, and what can Model 2 detect?

In [144]:
# Load the two saved pipelines that are compared below (paths relative to the working directory).
Train_Data_Model = spacy.load("ModelsRetrained/Novo_Models/Create/")
SM_plus_Train_DATA_Model = spacy.load("ModelsRetrained/Novo_Models/UpdateModel1/RetrainModels/")

In [145]:
def context(text,i,j):
    """Return the slice of `text` around the span ``[i, j)``, widened by up to five spaces per side.

    The left edge moves backwards until five spaces have been passed or the
    start of the text is reached. The right edge moves forwards until five
    spaces have been passed or the last index of the text is reached; the
    character at the final right edge is not part of the returned slice.
    """
    start = i
    spaces_seen = 0
    while start > 0 and spaces_seen < 5:
        start -= 1
        if text[start] == ' ':
            spaces_seen += 1

    end = j
    spaces_seen = 0
    last_index = len(text) - 1
    while end < last_index and spaces_seen < 5:
        end += 1
        if text[end] == ' ':
            spaces_seen += 1

    return text[start:end]

def consulter(model,dev):
    """Run `model` on the text of every record in `dev` and list its entities.

    Returns one list per record; each entity is
    ``[text, start_char, end_char, label, context]`` where the context comes
    from `context()` applied to the record text and the entity offsets.
    """
    results = []
    for record in dev:
        text = record[0]
        doc = model(text)
        results.append([
            [ent.text, ent.start_char, ent.end_char, ent.label_,
             context(text, ent.start_char, ent.end_char)]
            for ent in doc.ents
        ])
    return results

# Predictions of the model loaded as Train_Data_Model on the dev records.
res1 = consulter(Train_Data_Model,dev)

# Predictions of the model loaded as SM_plus_Train_DATA_Model on the dev records.
res2 = consulter(SM_plus_Train_DATA_Model,dev)

# Both lists hold one entry per dev record, so the two lengths should match.
print(len(res1)," ",len(res2))

40   40


In [114]:
def correction():
    """Build the gold entity rows for every record of the global `dev`.

    Returns one list per record; each gold annotation becomes
    ``[x[0], x[1], x[2], x[3], context]`` with the context computed by
    `context()` from the record text and the annotation offsets.
    """
    gold_rows = []
    for record in dev:
        text = record[0]
        gold_rows.append([
            [ann[0], ann[1], ann[2], ann[3], context(text, ann[1], ann[2])]
            for ann in record[1:]
        ])
    return gold_rows
# NOTE(review): this rebinds `correction` from the function to the list it returns;
# from here on the name refers to the gold rows, not to the function.
correction = correction()
# Show the gold rows of the second dev record.
print(correction[1])

[['Petersburg', 11, 21, 'LOC', '(Condeer.] Petersburg den 18. Dec. Se.russisch- kaiserl.'], ['Prinzen', 76, 83, 'PER', ' kaiserl. Majestät haben dem Prinzen vonCondé bey dessen Ankunft in'], ['Petersburg', 115, 125, 'LOC', ' bey dessen Ankunft in Petersburg den St.Andreas-Orden u. den Maltheser'], ['Polen', 179, 184, 'LOC', ' den Maltheser Ritterorden in Polen zu ertheilen, und ihn mit'], ['Petersburg', 256, 266, 'LOC', ' prächtigen,völlig meublirten Palais in Petersburg zu beschenken geruhet. Das aus'], ['Prinzen', 362, 369, 'PER', ' Kavallerie-Regimentern bestehende Corps des Prinzen vonCondé, welches in kaiserliche Dienste'], ['Wladimir', 440, 448, 'LOC', ' genommenworden, ist nun nach Wladimir, Luzk und Kowelin Quartier'], ['Luzk', 450, 454, 'LOC', ' ist nun nach Wladimir, Luzk und Kowelin Quartier verlegt. Das'], ['Kowelin', 459, 466, 'LOC', ' nach Wladimir, Luzk und Kowelin Quartier verlegt. Das ganze Corps'], ['Prinzen', 538, 545, 'PER', ' wird unterbestandiger Inspection des Pr

In [198]:
def evaluate(res,correction):
    """Align predicted entities with gold entities and sort them into three buckets.

    Args:
        res: per-document lists of predictions
            ``[text, start, end, label, context]``.
        correction: per-document lists of gold rows with the same layout.
            Both lists are walked in parallel and are expected to be ordered
            by offset within each document.

    Returns:
        ``[VP, FP, VN]``:
        * VP - predictions matched to a gold row with the same label;
        * FP - predictions with no gold counterpart, plus matched predictions
          whose label differs (stored as a copy whose label reads
          ``"<predicted> but correct : (<gold>)"``);
        * VN - gold rows that no prediction reached.

    The input rows are never modified: a mislabelled prediction is copied
    before its label is annotated, so evaluating the same predictions twice
    gives the same result instead of stacking the suffix.

    Note: when a prediction and a gold row overlap, both are consumed. A gold
    row consumed by a rejected or mislabelled prediction is therefore not
    added to VN.
    """
    def _with_expected_label(pred, gold):
        # Copy of `pred` whose label also records the gold label.
        flagged = list(pred)
        flagged[3] = pred[3] + " but correct : ({})".format(gold[3])
        return flagged

    VP = []
    FP = []
    VN = []
    for doc_index, a in enumerate(res):
        b = correction[doc_index]
        i,j = 0,0
        while i<len(a) and j<len(b):
            pred, gold = a[i], b[j]
            l1,r1 = pred[1],pred[2]
            l2,r2 = gold[1],gold[2]
            if l1>r2:
                # Prediction starts after this gold row: the gold row was missed.
                VN.append(gold)
                j+=1
            elif r1<l2:
                # Prediction ends before this gold row: nothing to match it with.
                FP.append(pred)
                i+=1
            else:
                # Spans touch or overlap. Identical spans always match; otherwise
                # the predicted text has to occur in the gold row's context.
                same_span = l1==l2 and r1==r2
                if same_span or pred[0] in gold[4]:
                    if pred[3]!=gold[3]:
                        FP.append(_with_expected_label(pred, gold))
                    else:
                        VP.append(pred)
                else:
                    FP.append(pred)
                i+=1
                j+=1
        # Leftovers on either side have no partner.
        FP.extend(a[i:])
        VN.extend(b[j:])
    return [VP,FP,VN]

# Display the three buckets returned for the second model's predictions.
evaluate(res2,correction)

[[['Rußland', 0, 7, 'LOC', 'Rußland'],
  ['Petersburg',
   11,
   21,
   'LOC',
   '(Condeer.] Petersburg den 18. Dec. Se.russisch- kaiserl.'],
  ['Petersburg',
   115,
   125,
   'LOC',
   ' bey dessen Ankunft in Petersburg den St.Andreas-Orden u. den Maltheser'],
  ['Polen',
   179,
   184,
   'LOC',
   ' den Maltheser Ritterorden in Polen zu ertheilen, und ihn mit'],
  ['Petersburg',
   256,
   266,
   'LOC',
   ' prächtigen,völlig meublirten Palais in Petersburg zu beschenken geruhet. Das aus'],
  ['Wladimir',
   440,
   448,
   'LOC',
   ' genommenworden, ist nun nach Wladimir, Luzk und Kowelin Quartier'],
  ['Luzk',
   450,
   454,
   'LOC',
   ' ist nun nach Wladimir, Luzk und Kowelin Quartier verlegt. Das'],
  ['Marquis',
   2205,
   2212,
   'PER',
   ' Darnleyund Craven dafür, der Marquis Lansdowne undder junge Lord Holland'],
  ['Lord',
   2236,
   2240,
   'PER',
   ' Marquis Lansdowne undder junge Lord Holland (Neveu von For) dagegen.Im'],
  ['Francis',
   2354,
   2361,
 

In [199]:
# Bucket the predictions of each model against the gold rows in `correction`.
result1 = evaluate(res1,correction)
result2 = evaluate(res2,correction)

In [200]:
import pandas as pd

# One table per (model, bucket): index 1 of a result holds the FP rows,
# index 2 the VN rows. Rows are numbered from 1.
error_tables = []
for rows in (result1[1], result1[2], result2[1], result2[2]):
    error_tables.append(
        pd.DataFrame(
            {
                "NER": [row[0] for row in rows],
                "Type": [row[3] for row in rows],
                "Context": [row[4] for row in rows],
            },
            index=list(range(1, len(rows) + 1)),
        )
    )
d1, d2, d3, d4 = error_tables


html_string1 = '''
<html>
  <meta charset="UTF-8"></meta>
  <head><title>Faux Postif</title></head>
  <body>
    {table}
  </body>
</html>.
'''

html_string2 = '''
<html>
  <meta charset="UTF-8"></meta>
  <head><title>Vrai Négatif</title></head>
  <body>
    {table}
  </body>
</html>.
'''

# (target file, page template, table) for each of the four reports.
reports = [
    ("résultat/Model only with train/Faux Positif.html", html_string1, d1),
    ("résultat/Model only with train/Vrai Négatif.html", html_string2, d2),
    ("résultat/Model with train + Spacy Model/Faux Positif.html", html_string1, d3),
    ("résultat/Model with train + Spacy Model/Vrai Négatif.html", html_string2, d4),
]
for report_path, template, table in reports:
    with open(report_path, "w", encoding='utf-8') as f:
        f.write(template.format(table=table.to_html()))

### Statistics for the results


#### Type of NER

In [201]:
def stats(result2,typeLabel):
    """Precision, recall and F-measure (in percent, 3 decimals) for one label.

    Args:
        result2: the three buckets ``[found, incorrect, not_found]``; every row
            carries its label at index 3.
        typeLabel: label to score. Only rows whose label equals it exactly are
            counted.

    Returns:
        ``[precision, recall, f_measure]``. A ratio whose denominator is zero
        (no prediction, or no gold row, for this label) is reported as ``0.0``
        instead of raising ZeroDivisionError.
    """
    def _count(bucket):
        return sum(1 for row in bucket if row[3] == typeLabel)

    found = _count(result2[0])
    incorrect = _count(result2[1])
    not_found = _count(result2[2])

    predicted = found + incorrect
    expected = found + not_found
    p = round(found / predicted * 100, 3) if predicted else 0.0
    r = round(found / expected * 100, 3) if expected else 0.0
    f = round(2 * p * r / (p + r), 3) if p + r != 0 else 0.0
    return [p,r,f]

##### 1. LOC

In [202]:
# [precision, recall, F-measure] for the label "LOC" on result2.
loc = stats(result2,"LOC")
print(loc)

[76.432, 55.878, 64.558]


##### 2. PER

In [203]:
# [precision, recall, F-measure] for the label "PER" on result2.
pers = stats(result2,"PER")
print(pers)

[56.727, 40.31, 47.13]


##### 3. ORG

In [204]:
# [precision, recall, F-measure] for the label "ORG" on result2.
orgs = stats(result2,"ORG")
print(orgs)

[77.083, 25.17, 37.949]


##### 4. MISC

In [205]:
# [precision, recall, F-measure] for the label "MISC" on result2.
misc = stats(result2,"MISC")
print(misc)

[61.702, 23.77, 34.319]


In [206]:
import pandas as pd
# Gather the four per-label score triples into one table (rows: label, columns: metric).
res = pd.DataFrame({"Précision":[loc[0],pers[0],orgs[0],misc[0]],"Rappel":[loc[1],pers[1],orgs[1],misc[1]],'F-Mesure':[loc[2],pers[2],orgs[2],misc[2]]},index = ["Location","Person","Organisation","Miscellaneous entities"])
res

Unnamed: 0,Précision,Rappel,F-Mesure
Location,76.432,55.878,64.558
Person,56.727,40.31,47.13
Organisation,77.083,25.17,37.949
Miscellaneous entities,61.702,23.77,34.319


In [207]:
# Page template; `{table}` is filled in through str.format below.
# NOTE(review): the title still reads "Faux Postif" although this cell uses the
# template for the per-type score table - confirm the intended title.
html_string1 = '''
<html>
  <meta charset="UTF-8"></meta>
  <head><title>Faux Postif</title></head>
  <body>
    {table}
  </body>
</html>.
'''

# NOTE(review): html_string2 is defined here but not used in this cell.
html_string2 = '''
<html>
  <meta charset="UTF-8"></meta>
  <head><title>Vrai Négatif</title></head>
  <body>
    {table}
  </body>
</html>.
'''
# Write the per-type score table `res` as a standalone HTML page.
with open("résultat/Model with train + Spacy Model/Chaque_type.html","w",encoding='utf-8') as f:
    f.write(html_string1.format(table=res.to_html()))

### Inspect every entity that was not found, or was found but is incorrect

##### Incorrect

In [208]:
import collections
from collections import Counter
# Surface forms in the FP bucket of result2 as (form, count) pairs, most frequent first.
fp = Counter(row[0] for row in result2[1]).most_common()
print(fp)

[('Zürich', 4), ('von', 3), ('Paris', 3), ('Preußen', 3), ('Nationalrath', 3), ('Baden', 3), ('Bundesrath', 3), ('Sängervereins', 3), ('Schweiz', 3), ('Ferdinand', 3), ('Se', 2), ('Lord', 2), ('vom', 2), ('Asselt', 2), ('am', 2), ('Kompagnien', 2), ('Bucharest', 2), ('Baron', 2), ('Jedermann', 2), ('Nationalgarde', 2), ('Herren', 2), ('seit', 2), ('Papier', 2), ('Sündfluth', 2), ('XVI', 2), ('Noven', 2), ('Graf', 2), ('Bürgerschaft', 2), ('Chöre', 2), ('Händler', 2), ('Basel', 2), ('Nokraschi', 2), ('Ministeriums', 2), ('Condeer', 1), ('St', 1), ('Kowelin', 1), ('Duc', 1), ('Königreichs', 1), ('FortsetzungIhrer', 1), ('Gefahrenund', 1), ('Unglük', 1), ('Sinclair', 1), ('consolidirten3', 1), ('Landein', 1), ('Courier', 1), ('Belgische', 1), ('vereinigten', 1), ('Sittard', 1), ('Lüttich', 1), ('Betagerungsstand', 1), ('Roumiroir', 1), ('Kühnheitgehabt', 1), ('Löwen', 1), ('Batavische', 1), ('Schuldenscheine', 1), ('Sevilla', 1), ('Hauptstraßenach', 1), ('Saragossa', 1), ('Gegenwart', 1),

##### Non_trouvé

In [209]:
import collections
from collections import Counter
# Surface forms in the VN bucket of result2 as (form, count) pairs, most frequent first.
fn = Counter(row[0] for row in result2[2]).most_common()
print(fn)

[('Hr', 22), ('Hrn', 9), ('Zürich', 9), ('Dr', 9), ('von', 8), ('Guyer', 6), ('Kirchgemeinde', 6), ('St', 5), ('Schweiz', 5), ('1937', 5), ('Deutschland', 4), ('Prof', 4), ('im', 4), ('Bürgliterrasse', 4), ('Bundesgericht', 4), ('Küßnacht', 4), ('Volksrecht', 4), ('Prinzen', 3), ('Havre', 3), ('alt', 3), ('Solothurner', 3), ('Oesterreich', 3), ('A', 3), ('L', 3), ('Köln', 3), ('Wafd', 3), ('Nokraschi', 3), ('junge', 2), ('das', 2), ('Gerona', 2), ('Wohlen', 2), ('Bundeszeitung', 2), ('Journal', 2), ('New', 2), ('hannoversche', 2), ('Preußenverein', 2), ('Berlin', 2), ('Winterthur', 2), ('Liestal', 2), ('Basellandschaftliche', 2), ('Wallis', 2), ('Oberrichter', 2), ('Clement', 2), ('Madrid', 2), ('dem', 2), ('Toledo', 2), ('Andrassy', 2), ('Freiburg', 2), ('Lausanner', 2), ('Bundesversammlung', 2), ('Kellersberger', 2), ('KantonsZürich', 2), ('Gobat', 2), ('am', 2), ('Frey', 2), ('Schweizerische', 2), ('Interlaken', 2), ('vom', 2), ('Lauterbrunnen', 2), ('Männerchor', 2), ('Bürgerlichen

In [212]:
import pandas as pd
# Frequency tables built from the (form, count) pairs in `fp` and `fn`.
# Both tables use the same column name "Frequency" (one used to read "Frequecy").
tf_incorrect = pd.DataFrame({"NER":[x[0] for x in fp],"Frequency":[x[1] for x in fp]})
tf_non_trouvé = pd.DataFrame({"NER":[x[0] for x in fn],"Frequency":[x[1] for x in fn]})


html_string1 = '''
<html>
  <meta charset="UTF-8"></meta>
  <head><title>Term Frequency for incorrect NER</title></head>
  <body>
    {table}
  </body>
</html>.
'''

html_string2 = '''
<html>
  <meta charset="UTF-8"></meta>
  <head><title>Term Frequency for non correct NER</title></head>
  <body>
    {table}
  </body>
</html>.
'''
with open("résultat/Model with train + Spacy Model/tfFP.html","w",encoding='utf-8') as f:
    f.write(html_string1.format(table=tf_incorrect.to_html()))

# The not-found table gets its own template; it was previously written with
# html_string1 and so carried the title of the other page.
with open("résultat/Model with train + Spacy Model/tfFN.html","w",encoding='utf-8') as f:
    f.write(html_string2.format(table=tf_non_trouvé.to_html()))

In [210]:
"Prinzen" in set([x[0] for x in result2[0]])

False

In [211]:
"Zürich" in set([x[0] for x in result2[0]])

True

### Zürich appears both among the true positives (VP) and among the entities that were not found

In [214]:
# Rows whose surface form is exactly "Zürich", taken from each of the three buckets.
Zürich_vp = [row for row in result2[0] if row[0] == "Zürich"]
Zürich_fp = [row for row in result2[1] if row[0] == "Zürich"]
Zürich_fn = [row for row in result2[2] if row[0] == "Zürich"]

# First example and number of rows per bucket.
for bucket in (Zürich_vp, Zürich_fp, Zürich_fn):
    print(bucket[0], len(bucket))

['Zürich', 3, 9, 'LOC', 'In Zürich starb am 2. April Hr.'] 30
['Zürich', 785, 791, 'LOC', ' gesetzt wurden.— Der Stadtverein Zürich hat in seiner Versammlung vom'] 4
['Zürich', 2111, 2117, 'LOC', ' in Zürich, Nationalrath Fierz inZürich, Oberstl. v. Muralt in'] 9


In [215]:
html_string = '''
<html>
  <meta charset="UTF-8"></meta>
  <head><title>Zürich</title></head>
  <body>
    {table}
  </body>
</html>.
'''
# Contexts of the found and of the not-found "Zürich" rows.
tmp1 = [row[4] for row in Zürich_vp]
tmp2 = [row[4] for row in Zürich_fn]
a,b = len(tmp1),len(tmp2)

# Pad the shorter column with empty strings so both have max(a, b) entries.
tmp1 += [''] * max(b - a, 0)
tmp2 += [""] * max(a - b, 0)

l1 = pd.DataFrame(
    {"Find": tmp1, "Not Find": tmp2},
    index=list(range(1, max(a, b) + 1)),
)
with open("résultat/Model with train + Spacy Model/Zürich.html","w",encoding='utf-8') as f:
    f.write(html_string.format(table=l1.to_html()))