## `tei2spacy` pipeline

If we want to replace spaCy's entity mentions with our own, we need to intervene in the spaCy `Doc` objects and inject our mentions.
This is doable, but it requires a mapping between each token in the spaCy document and the TEI tags in the input TEI file. With that mapping, we can tell whether a given token is part of a `placeName` or `persName` tag and label it accordingly. At that point we can call the entity-fishing service on our custom mentions.

In [None]:
import xml.etree.ElementTree as etree
from pathlib import Path
from typing import Optional

from bs4 import BeautifulSoup as bs
from spacy.tokens import Doc, Span, DocBin
from tqdm import tqdm

from textentlib.utils import nlp_model_fr, tei_element_to_ner_label, extract_metadata_from_tei, sample_files, print_corpus_summary

def get_tag_from_char_index(char_start: int, char_end: int, entities: dict) -> Optional[str]:
    """Return the tag of the first entity span that fully contains a character range.

    Args:
        char_start: Start character offset of the range to look up.
        char_end: End character offset of the range to look up.
        entities: Mapping of ``(start, end)`` character-offset pairs to tags.

    Returns:
        The tag of the first span (in the mapping's iteration order) for which
        ``start <= char_start`` and ``end >= char_end``, or ``None`` when no
        span contains the range.
    """
    return next(
        (
            tag
            for (start, end), tag in entities.items()
            if start <= char_start and char_end <= end
        ),
        None,
    )

def tei2spacy_simple(tei_file_path: Path) -> Doc:
    """Convert a TEI file into a spaCy ``Doc`` whose entities come from the TEI markup.

    The text of every ``reg`` element is concatenated (one space appended after
    each element) and run through ``nlp_model_fr``. The character ranges covered
    by ``persName`` / ``placeName`` children of ``reg`` are then projected onto
    the tokens, and the resulting spans replace ``doc.ents``.

    Args:
        tei_file_path: Path of the TEI XML file to convert.

    Returns:
        The spaCy ``Doc``, with ``user_data`` holding ``author``, ``title``,
        ``publication_date``, ``path``, ``filename`` and ``entity_linking``
        (initialised to ``None``).
    """
    soup = bs(tei_file_path.read_text(), 'xml')

    # Assemble the plain text piece by piece while tracking its running length,
    # so entity character offsets are known without repeatedly re-building
    # (and re-measuring) one ever-growing string.
    text_parts = []
    text_length = 0
    chunks = {}  # (char_start, char_end) -> NER label

    for elem in soup.find_all('reg'):
        for node in elem.contents:
            node_text = node.text
            # Only direct persName/placeName children of <reg> are recorded;
            # plain strings and any other element contribute text only.
            if not isinstance(node, str) and node.name in ('persName', 'placeName'):
                span_key = (text_length, text_length + len(node_text))
                chunks[span_key] = tei_element_to_ner_label(node.name)
            text_parts.append(node_text)
            text_length += len(node_text)
        # A single space separates the content of consecutive <reg> elements.
        text_parts.append(" ")
        text_length += 1

    output_text = "".join(text_parts)

    # extract metadata from TEI
    metadata = extract_metadata_from_tei(etree.parse(tei_file_path))

    # create a spacy doc object
    doc = nlp_model_fr(output_text)
    doc.user_data['author'] = metadata['author']
    doc.user_data['title'] = metadata['title']
    doc.user_data['publication_date'] = metadata['date']
    doc.user_data['path'] = str(tei_file_path)
    doc.user_data['filename'] = str(tei_file_path.name)
    doc.user_data['entity_linking'] = None

    # Project the TEI entities onto the tokens: a token gets a label when its
    # character range lies entirely inside one recorded chunk. Runs of
    # consecutive tokens sharing a label become one span.
    # NB: Span start and end are token indices, not character offsets.
    # NOTE(review): grouping is by label only, so two adjacent TEI elements
    # with the same label and no unlabelled token between them are merged into
    # a single entity — confirm this is intended.
    spans = []
    current_label = None
    current_start = None
    for token in doc:
        ner_label = get_tag_from_char_index(token.idx, token.idx + len(token.text), chunks)
        if ner_label == current_label:
            continue
        if current_label is not None:
            # The run ended on the previous token; the end index is exclusive.
            spans.append(Span(doc, current_start, token.i, label=current_label))
        current_label = ner_label
        current_start = token.i

    # Flush an entity that extends to the very last token; without this the
    # final entity of the document would be silently dropped.
    if current_label is not None:
        spans.append(Span(doc, current_start, len(doc), label=current_label))

    # Replace the entities predicted by the model with the ones from the TEI file
    doc.ents = spans
    return doc

In [2]:
# File the serialized spaCy corpus (a DocBin) is loaded from and written to.
SPACY_CORPUS_SERIALIZED_PATH = "./data/corpus_new.spacy"
# Base directory of the downloaded corpus; the TEI files are read from its 'NER' subdirectory.
CORPUS_PATH = "../TheatreLFSV2-downloaded/"

In [3]:
# Start from an empty corpus unless a serialized one already exists on disk,
# in which case processing resumes from it. User data is stored with the
# documents in both cases.
if not Path(SPACY_CORPUS_SERIALIZED_PATH).exists():
    spacy_corpus = DocBin(store_user_data=True)
else:
    spacy_corpus = DocBin(store_user_data=True).from_disk(SPACY_CORPUS_SERIALIZED_PATH)
    print(f"Loaded serialize spacy corpus from {SPACY_CORPUS_SERIALIZED_PATH}")

Loaded serialize spacy corpus from ./data/corpus_new.spacy


In [4]:
# Print document, entity and token counts for the corpus as loaded.
print_corpus_summary(spacy_corpus, nlp_model_fr)

Number of documents in the corpus: 594
Number of entities in the corpus: 287389
Number of tokens in the corpus: 12885306


In [6]:
# Paths of the TEI files whose documents are already stored in the corpus,
# read back from the 'path' entry saved in each document's user data.
already_processed_files = {
    Path(doc.user_data['path'])
    for doc in spacy_corpus.get_docs(nlp_model_fr.vocab)
}

In [7]:
# Display how many distinct files are already in the corpus.
len(already_processed_files)

100

In [11]:
# Draw a sample of TEI files from the corpus' 'NER' directory, passing the
# already processed files so they can be excluded.
sample_size = 100
corpus_basedir = Path(CORPUS_PATH)
sampled_files = sample_files(corpus_basedir / 'NER', sample_size, already_processed_files)

Found 594 files in ../TheatreLFSV2-downloaded/NER
Excluded 100 files: kept 494 files


In [16]:
# No sampling here: take every entry of the 'NER' directory that has not been
# processed yet. This rebinds sampled_files to a set.
sampled_files = set((corpus_basedir / 'NER').iterdir()) - already_processed_files

In [22]:
# Display how many files are selected for processing.
len(sampled_files)

494

In [18]:
# Sanity check: the selection must not contain any file that has already been
# processed, i.e. removing the processed files must not shrink it.
assert len(set(sampled_files) - already_processed_files) == len(sampled_files) 

In [10]:
#!head -n 200 {str(sampled_files[0])}

In [23]:
# Convert every selected TEI file into a spaCy Doc; tqdm shows a progress bar.
docs = [tei2spacy_simple(file) for file in tqdm(sampled_files)]

100%|██████████| 494/494 [24:16<00:00,  2.95s/it]  


In [24]:
# Append the newly converted documents to the corpus.
for doc in docs:
    spacy_corpus.add(doc)
    
# Print the corpus statistics again, now including the added documents.
print_corpus_summary(spacy_corpus, nlp_model_fr)

Number of documents in the corpus: 594
Number of entities in the corpus: 287389
Number of tokens in the corpus: 12885306


In [25]:
# Display how many documents were converted in this run.
len(docs)

494

In [26]:
# Write the corpus to the same file it is loaded from at the start of the notebook.
spacy_corpus.to_disk(SPACY_CORPUS_SERIALIZED_PATH)