## `tei2spacy` pipeline

If we want to replace spaCy's entity mentions with our own, we need to intervene in the spaCy docs and inject our mentions.
This is doable, but it requires a mapping between each token in the spaCy document and the TEI tags in the input TEI file. That way, we can tell whether a given token is part of a `placeName` or `persName` tag and label it accordingly. At that point, we can call the entity-fishing service on our custom mentions.

In [None]:
import sys
import dask.bag as db
from pathlib import Path
sys.path.append("../")
from spacy.tokens import DocBin
from lib.utils import tei2spacy, nlp_model_fr, sample_files, print_corpus_summary
from dask.distributed import Client, LocalCluster

In [None]:
# File holding the serialized spaCy corpus (a DocBin): it is loaded at start-up
# when it exists and written back once new documents have been added.
SPACY_CORPUS_SERIALIZED_PATH = "./data/corpus.spacy"
# Root directory of the downloaded corpus; input files are sampled from its
# 'NER' subdirectory.
CORPUS_PATH = "../TheatreLFSV2-downloaded/"

In [None]:
# Resume from a previously serialized corpus when one exists, otherwise start
# with an empty DocBin. store_user_data=True is needed in both branches because
# each doc's user_data['path'] is read later to find already-processed files.
if Path(SPACY_CORPUS_SERIALIZED_PATH).exists():
    spacy_corpus = DocBin(store_user_data=True).from_disk(SPACY_CORPUS_SERIALIZED_PATH)
    print(f"Loaded serialized spacy corpus from {SPACY_CORPUS_SERIALIZED_PATH}")
else:
    spacy_corpus = DocBin(store_user_data=True)

In [None]:
# Summarize the corpus as it stands before new files are processed
# (an empty DocBin if nothing had been serialized yet).
print_corpus_summary(spacy_corpus, nlp_model_fr)

In [None]:
# Paths of the documents already in the corpus, taken from each doc's
# user_data['path']; used to avoid sampling the same file twice.
already_processed_files = {
    Path(doc.user_data['path'])
    for doc in spacy_corpus.get_docs(nlp_model_fr.vocab)
}

In [None]:
# Number of files to draw from the corpus for this run.
sample_size = 10
corpus_basedir = Path(CORPUS_PATH)
# Pass `sample_size` (not a repeated literal) so that changing it above takes
# effect. The last argument is presumably the set of files to exclude from the
# sample -- verify against lib.utils.sample_files.
sampled_files = sample_files(corpus_basedir / 'NER', sample_size, already_processed_files)

In [None]:
# Sanity check: no sampled file may already be part of the corpus. Dropping the
# already-processed paths from the sample must therefore leave its size
# unchanged (note that duplicates within the sample would also trip this check).
assert len(sampled_files) == len(set(sampled_files).difference(already_processed_files))

In [None]:
#sampled_files

In [None]:
# Set up Dask for parallel processing: start a local cluster with its default
# settings and connect a client to it.
cluster = LocalCluster()
dask_client = Client(cluster)

In [None]:
# Bare expression: in a notebook this shows the client's summary as cell output.
dask_client

In [None]:
# Extra positional arguments forwarded to tei2spacy for every file.
# Presumably: project the TEI entity tags onto the spaCy tokens, and disable
# the progress bar -- verify against lib.utils.tei2spacy.
entity_projection = True
disable_pb = True

# Convert the sampled files in parallel: one bag element per file, mapped
# through tei2spacy, then gathered back into a local collection of docs.
docs = (
    db.from_sequence(sampled_files)
    .map(tei2spacy, entity_projection, disable_pb)
    .compute()
)

In [None]:
# Append the newly converted documents to the (possibly pre-loaded) corpus.
for doc in docs:
    spacy_corpus.add(doc)

# Summarize again so the effect of this run is visible.
print_corpus_summary(spacy_corpus, nlp_model_fr)

In [None]:
# Make sure the target directory exists before writing, so that a missing
# ./data folder on a fresh checkout does not abort the save and discard the
# documents processed in this run.
Path(SPACY_CORPUS_SERIALIZED_PATH).parent.mkdir(parents=True, exist_ok=True)
# Persist the updated corpus so the next run can resume from it.
spacy_corpus.to_disk(SPACY_CORPUS_SERIALIZED_PATH)