# Binder for custom-ner-de

## Create training labels from XML files to train custom spaCy model

Enter the URL of the PAGE XML ZIP file exported from Transkribus (for example, a public link to a file on SWITCHdrive: https://drive.switch.ch/index.php/s/FILE/download):

In [4]:
# Read the URL of the PAGE XML ZIP export from the user.
ZIP_URL = input()

Enter the words to be removed from the list of entities (false positives):

In [5]:
# Read the false-positive words from the user; input() returns them as one string.
WORD_REMOVE = input()

Extract entities:

In [None]:
# Run the entity extraction with the URL and the word list entered above.
# NOTE(review): WORD_REMOVE is passed as the raw input() string -- confirm the
# format extract_entities expects for `word_remove`.
from custom_ner_de.extract import extract_entities
extract_entities(zip_path=ZIP_URL,
                 word_remove=WORD_REMOVE)

## **Part 2 :: Custom training with spaCy**

In [6]:
# Show the installed spaCy version and its package metadata.
!pip show spacy

Name: spacy
Version: 3.3.1
Summary: Industrial-strength Natural Language Processing (NLP) in Python
Home-page: https://spacy.io
Author: Explosion
Author-email: contact@explosion.ai
License: MIT
Location: c:\users\hinder0000\appdata\local\programs\python\python39\lib\site-packages
Requires: requests, srsly, preshed, setuptools, blis, jinja2, typer, numpy, spacy-legacy, wasabi, pydantic, cymem, murmurhash, tqdm, thinc, catalogue, pathy, langcodes, spacy-loggers, packaging
Required-by: 


In [7]:
#More packages to install
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path
import spacy
from tqdm import tqdm
from spacy.util import minibatch, compounding
from spacy.training import Example
from spacy.pipeline import EntityRuler

ModuleNotFoundError: No module named 'plac'

In [None]:
# Create the folder for the trained model, relative to the current working directory.
!mkdir de_spacy_custom_v2

In [None]:
# Training configuration.

# Name or path of an existing spaCy pipeline to load; None creates a blank
# German pipeline instead.
model = None

# Output folder in which the trained model will be stored. Kept relative so
# that it is the same folder that `!mkdir de_spacy_custom_v2`,
# `!zip -r ... de_spacy_custom_v2/` and the Part 5 model loader refer to.
output_dir = Path("de_spacy_custom_v2")

# Number of training epochs (increase for better performance or decrease for
# shorter run time - rule of thumb: minimum 40 epochs required).
n_iter = 100

In [None]:
# Load the pipeline named by `model`, or start from a blank German pipeline.
if model is not None:
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)
else:
    nlp = spacy.blank('de')
    print("Created blank 'de' model")

#set up the pipeline

# `nlp.add_pipe` creates the component and returns it. Keeping that return
# value guarantees that `ner` is the component that is actually part of the
# pipeline; a separate `nlp.create_pipe('ner')` call would build a second,
# detached component whose labels never reach the trained model.
if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe('ner')
else:
    ner = nlp.get_pipe('ner')

**Add words to the lists below so that the entity ruler takes them into account**

In [None]:
# Person names that the entity ruler should tag as PERSON during training.
person_names = ['Gustav Gottheil', 'Max Mustermann']  # add names manually here

# One ruler pattern per name, in the same order as the list above.
person_patterns = [
    {"label": "PERSON", "pattern": name}
    for name in person_names
]

In [None]:
# Place names that the entity ruler should tag as LOC during training.
location_names = ['Boston', 'New-York']  # add places/locations/cities manually here

# One ruler pattern per place, in the same order as the list above.
location_patterns = [
    {"label": "LOC", "pattern": place}
    for place in location_names
]

In [None]:
# All entity ruler patterns: person patterns first, then location patterns.
patterns = person_patterns + location_patterns

**Creating Entity Ruler with custom patterns**

In [None]:
# Entity ruler for the manual entries; it is placed before 'ner' and configured
# with overwrite_ents=True.
cfg = {"overwrite_ents": True}
ruler = nlp.add_pipe('entity_ruler', before='ner', config=cfg)
ruler.add_patterns(patterns)

**The cell below contains the spaCy training code**

**Training for the 100 epochs can take up to 3 hours.**

In [None]:
# Register every entity label that occurs in the training data with the NER
# component. Each entity is indexed as ent[2] for its label.
# NOTE(review): `final_all_ents_tuple` is not defined in any cell shown in this
# notebook -- presumably the (text, annotations) training data; confirm where
# it is created.
training_labels = [
    entity[2]
    for _text, gold in final_all_ents_tuple
    for entity in gold.get('entities')
]
# dict.fromkeys keeps the first-seen order while dropping repeated labels.
for label in dict.fromkeys(training_labels):
    ner.add_label(label)

In [None]:
# Names of all pipeline components except 'ner'.
# NOTE(review): this list is not referenced by any other cell shown here --
# confirm whether it is still needed.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

In [None]:
# `nlp.initialize()` is the spaCy v3 name for the deprecated
# `nlp.begin_training()`; it initializes the pipeline and returns the optimizer.
# NOTE(review): initialization also runs on the entity ruler -- confirm that the
# ruler still holds `patterns` afterwards and re-add them if it does not.
# NOTE(review): if `model` points to an already trained pipeline, initializing
# resets its weights; spaCy offers `nlp.resume_training()` for that case.
optimizer = nlp.initialize()
for itn in range(n_iter):
    # New example order in every epoch.
    random.shuffle(final_all_ents_tuple)
    losses = {}
    # batch up the examples using spaCy's minibatch
    batches = minibatch(final_all_ents_tuple, size=compounding(4.0, 32.0, 1.001))
    for batch in batches:
        # Turn every (text, annotations) pair of the batch into an Example.
        example = [
            Example.from_dict(nlp.make_doc(text), annotation)
            for text, annotation in batch
        ]

        # Update the model
        nlp.update(example, drop=0.5, losses=losses)
        print("Losses", losses)

In [None]:
#saving trained model in directory
if output_dir is not None:
    output_dir = Path(output_dir)
    # parents=True creates missing parent folders as well; exist_ok=True makes
    # the call a no-op when the folder is already there (for example after
    # `!mkdir` or a previous run).
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [None]:
# Sample inference: run the trained pipeline on the first five training texts
# and print the entities it finds.
for sample_text, _ in final_all_ents_tuple[:5]:
    doc = nlp(sample_text)
    found_entities = [(ent.text, ent.label_) for ent in doc.ents]
    print('Entities', found_entities)

## **Part 3 :: Inference of Custom trained model on test data**

In [None]:
# Add the words that need to be removed from the detected entities to this list.
word_remove = ['Händeklatschen'] # separate words with commas (,)

In [None]:
import pandas as pd

# Text file (".txt") used to test the model -> rename if necessary.
# The file is read line by line instead of through
# `pd.read_csv(..., delimiter="\n")`, because current pandas versions reject
# "\n" as a delimiter. Empty lines are skipped and every remaining line becomes
# one row of the "text" column.
with open('03_Protokoll-Zionistenkongress-Basel_1899.txt', encoding='utf-8') as test_file:
    test_lines = [line.rstrip('\r\n') for line in test_file if line.strip()]

test_df = pd.DataFrame({"text": test_lines})

In [None]:
# Summary of the loaded test data (columns, non-null counts, dtypes).
test_df.info()

In [None]:
# Preview the first 30 rows of the test data.
test_df.head(30)

In [None]:
# Per-row results: one list of person names and one list of locations for
# every row of test_df.
all_persons = []
all_locations = []

for row_text in test_df['text']:
    doc = nlp(row_text)
    # Drop the entities whose text is listed in word_remove before splitting
    # them by label.
    kept_entities = [ent for ent in doc.ents if ent.text not in word_remove]
    all_persons.append([ent.text for ent in kept_entities if ent.label_ == 'PERSON'])
    all_locations.append([ent.text for ent in kept_entities if ent.label_ == 'LOC'])

In [None]:
# Store the person and location lists of the custom model as new columns.
test_df['v2_Custom-trained_Spacy_Person'] = pd.Series(all_persons)
test_df['v2_Custom-trained_Spacy_Location'] = pd.Series(all_locations)

In [None]:
# Preview the first 30 rows including the new entity columns.
test_df.head(30)

In [None]:
# Preview the last 30 rows.
test_df.tail(30)

In [None]:
# Summary of the test data after the entity columns were added.
test_df.info()

In [None]:
# Preview the first 40 rows.
test_df.head(40)

In [None]:
## **These are the results of the custom-trained model - saved as "v2_Custom_NER_inference_results.csv" - change the directory and/or name if needed**

In [None]:
# Write the results to CSV without the index column.
test_df.to_csv('v2_Custom_NER_inference_results.csv',index=False) #saving inference results of custom trained model

## **Part 4 :: Using pre-trained German spaCy large model to detect entities**

In [None]:
# Download the pre-trained German pipeline de_core_news_lg.
!python -m spacy download de_core_news_lg

In [None]:
import pandas as pd

In [None]:
# Rebinds `nlp`: from here on it refers to the pre-trained pipeline, no longer
# to the custom model built in the cells above.
nlp = spacy.load("de_core_news_lg") #loading the large pre-trained spacy model for german language

In [None]:
# Read back the CSV written by the custom-model inference step.
df = pd.read_csv("v2_Custom_NER_inference_results.csv") #loads the csv of custom trained results -> change here if you renamed this file earlier

In [None]:
# Summary of the loaded results (columns, non-null counts, dtypes).
df.info()

In [None]:
# Per-row results of the pre-trained pipeline: one list of person names and one
# list of locations for every row of df.
all_persons = []
all_locations = []

for row_text in df['text']:
    doc = nlp(row_text)
    # Persons are selected with the label 'PER' here, locations with 'LOC'.
    all_persons.append([ent.text for ent in doc.ents if ent.label_ == 'PER'])
    all_locations.append([ent.text for ent in doc.ents if ent.label_ == 'LOC'])

In [None]:
# Store the person and location lists of the pre-trained model as new columns.
df['Pre-trained_Spacy_Person'] = pd.Series(all_persons)
df['Pre-trained_Spacy_Location'] = pd.Series(all_locations)

In [None]:
# Summary after the pre-trained model's columns were added.
df.info()

In [None]:
# Write the combined results to CSV without the index column.
df.to_csv("v2_Custom_NER_All_Inference_results.csv", index=False) #saving final results which has results of custom model and pre-trained spacy large model.

In [None]:
# Pack the custom model folder into de_spacy_custom_v2.zip.
!zip -r de_spacy_custom_v2.zip de_spacy_custom_v2/

In [None]:
# Preview the last 50 rows of the combined results.
df.tail(50)

## **Part 5 :: Calculating accuracy scores of the custom model and the spaCy large model**

To run the code below, make sure the following files are in the same folder as this notebook; otherwise, adjust the paths accordingly.

- This Notebook
- extracted_entities.txt
- custom trained model folder

In [None]:
# List the files in the current working directory to check that the required files are present.
!ls -lh

In [None]:
import warnings
warnings.filterwarnings('ignore')
from spacy.training import Example
import spacy

In [None]:
def load_data():
    """
    function to load entity data

    input ::
        - none; the data is read from 'extracted_entities.txt' in the current
          working directory

    output ::
        - list with one parsed Python literal per non-blank line of the file
          (presumably (text, annotations) tuples -- confirm against the code
          that writes the file)

    raises ::
        - ValueError / SyntaxError if a line is not a plain Python literal
    """
    # Local import so that this cell does not depend on another import cell.
    import ast

    print("loading data...")

    # The file is opened with the platform default encoding, as before.
    # NOTE(review): confirm which encoding the file is written with.
    # `with` closes the file even if parsing fails.
    with open('extracted_entities.txt') as entity_file:
        # ast.literal_eval only accepts literals (strings, numbers, tuples,
        # lists, dicts, ...), so file contents can no longer execute code the
        # way they could with eval(). Blank lines are skipped.
        return [
            ast.literal_eval(line.strip())
            for line in entity_file
            if line.strip()
        ]

In [None]:
def load_custom_spacy_model(model_path):
    """
    function to load a spaCy model from disk

    input ::
        - model_path : value handed to spacy.load (the folder which contains
          the model)

    output ::
        - the loaded model
    """

    print(f"Loading model from {model_path}\n")
    return spacy.load(model_path)

In [None]:
def calculate_custom_model_accuracy(data):
    """
    function to calculate accuracy of custom trained entity model

    input ::
        - data : list of (text, annotations) pairs containing entity data

    output ::
        - nothing is returned; precision, recall and F1 score are printed for
          all entities together and separately for 'PERSON' and 'LOC'
    """

    nlp = load_custom_spacy_model("de_spacy_custom_v2")

    print("Calculating score...")
    # One Example per (text, annotations) pair, built on an unprocessed doc.
    new_test_data = [
        Example.from_dict(nlp.make_doc(text), annots)
        for text, annots in data
    ]

    scores_model = nlp.evaluate(new_test_data)

    #print scores that you want
    precision_model = scores_model["ents_p"]
    recall_model = scores_model["ents_r"]
    f_score_model = scores_model["ents_f"]
    scores_entities = scores_model["ents_per_type"]

    print("================ Accuracy scores using custom trained model =================\n")

    print("================= Overall scores =================\n")
    print("Precision : ",precision_model)
    print("Recall : ",recall_model)
    print("F1 Score : ",f_score_model)

    print("\n================= Entity wise score =================\n")

    # The F1 lines read the 'f' entry; they previously printed the recall
    # ('r') a second time.
    print("============= Person Entity score =================\n")
    print("Precision : ",scores_entities['PERSON']['p'])
    print("Recall : ",scores_entities['PERSON']['r'])
    print("F1 Score : ",scores_entities['PERSON']['f'])

    print("\n============= Location Entity score =================\n")
    print("Precision : ",scores_entities['LOC']['p'])
    print("Recall : ",scores_entities['LOC']['r'])
    print("F1 Score : ",scores_entities['LOC']['f'])


def calculate_pre_trained_model_score(data):
    """
    function to calculate accuracy of the pre-trained large German spaCy model

    input ::
        - data : list of (text, annotations) pairs containing entity data;
          person entities are expected to carry the custom label 'PERSON'

    output ::
        - nothing is returned; precision, recall and F1 score are printed for
          all entities together and separately for persons and locations
    """

    # using spaCy large german model
    nlp = spacy.load("de_core_news_lg")

    print("\n\nCalculating score...")

    # The pre-trained model is queried with 'PER' for persons elsewhere in this
    # notebook, while the custom data and model use 'PERSON'. Without renaming
    # the gold label, no prediction can ever match a gold person entity and the
    # person scores are always 0.
    label_map = {'PERSON': 'PER'}

    new_test_data = []
    for text, annots in data:
        gold = dict(annots)  # shallow copy: the caller's data stays untouched
        if 'entities' in gold:
            gold['entities'] = [
                (start, end, label_map.get(label, label))
                for start, end, label in gold['entities']
            ]
        new_test_data.append(Example.from_dict(nlp.make_doc(text), gold))

    scores_model = nlp.evaluate(new_test_data)

    #print scores that you want
    # NOTE(review): the overall scores cover every label the pre-trained model
    # predicts, not only persons and locations -- confirm this is intended.
    precision_model = scores_model["ents_p"]
    recall_model = scores_model["ents_r"]
    f_score_model = scores_model["ents_f"]
    scores_entities = scores_model["ents_per_type"]

    print("\n================ Accuracy scores using Pre-trained large model =================\n")

    print("================= Overall scores =================\n")
    print("Precision : ",precision_model)
    print("Recall : ",recall_model)
    print("F1 Score : ",f_score_model)

    print("\n================= Entity wise score =================\n")

    # The F1 lines read the 'f' entry; they previously printed the recall
    # ('r') a second time.
    print("============= Person Entity score =================\n")
    print("Precision : ",scores_entities['PER']['p'])
    print("Recall : ",scores_entities['PER']['r'])
    print("F1 Score : ",scores_entities['PER']['f'])

    print("\n============= Location Entity score =================\n")
    print("Precision : ",scores_entities['LOC']['p'])
    print("Recall : ",scores_entities['LOC']['r'])
    print("F1 Score : ",scores_entities['LOC']['f'])

In [None]:
# Load the entity data from extracted_entities.txt for the evaluation below.
data = load_data()

In [None]:
# Print the accuracy scores of the custom trained model.
calculate_custom_model_accuracy(data)

In [None]:
# Print the accuracy scores of the pre-trained spaCy large model.
calculate_pre_trained_model_score(data)