## Imports

In [1]:
from pathlib import Path
from spacy.tokens import DocBin
from tqdm import tqdm
from textentlib.utils import nlp_model_fr, print_corpus_summary, SalientSentenceSelector

In [22]:
SPACY_CORPUS_SERIALIZED_PATH = "../data/corpus.spacy"

## Loading pre-processed data

In [3]:
entity_fishing_pipe = nlp_model_fr.add_pipe(
    "entityfishing", config={
        "api_ef_base": "http://nerd.huma-num.fr/nerd/service"
        #"api_ef_base": "http://localhost:8090/service/"
    }
)

In [23]:
if Path(SPACY_CORPUS_SERIALIZED_PATH).exists():
    spacy_corpus = DocBin(store_user_data=True).from_disk(SPACY_CORPUS_SERIALIZED_PATH)
    print(f"Loaded serialize spacy corpus from {SPACY_CORPUS_SERIALIZED_PATH}")
    print_corpus_summary(spacy_corpus, nlp_model_fr)
else:
    spacy_corpus = DocBin(store_user_data=True)

Loaded serialize spacy corpus from ../data/corpus.spacy
Number of documents in the corpus: 594
Number of entities in the corpus: 287389
Number of tokens in the corpus: 12885306


In [24]:
docs = spacy_corpus.get_docs(nlp_model_fr.vocab)

## Adding entity linking information to docs in corpus

In [26]:
nlp_model_fr.remove_pipe('entityfishing')

('entityfishing',
 <spacyfishing.entity_fishing_linker.EntityFishing at 0x12331f450>)

In [27]:
entity_fishing_pipe = nlp_model_fr.add_pipe(
    "entityfishing", config={
        "api_ef_base": "http://nerd.huma-num.fr/nerd/service"
        #"api_ef_base": "http://localhost:8090/service/"
    }
)

In [28]:
nlp_model_fr.pipe_names

['tok2vec',
 'morphologizer',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'entityfishing']

In [29]:
spacy_corpus = DocBin(store_user_data=True)

In [30]:
for spacy_doc in tqdm(list(docs)):
    el_doc = entity_fishing_pipe(spacy_doc)
    assert spacy_doc.user_data['document_id'] is not None
    el_doc.user_data['entity_linking'] = 'spacy_entity_fishing'
    spacy_corpus.add(el_doc)

100%|██████████| 594/594 [10:46<00:00,  1.09s/it]


In [31]:
spacy_corpus.to_disk('../data/corpus_24022025.spacy')

In [11]:
print_corpus_summary(spacy_corpus, nlp_model_fr)

Number of documents in the corpus: 594
Number of entities in the corpus: 287389
Number of tokens in the corpus: 12885306


## Selection of salient sentences

In [15]:
import random, operator, json
from textentlib.utils import SalientSentenceSelector

In [16]:
spacy_docs = list(docs)

In [17]:
sample_doc = random.choice(spacy_docs)

In [18]:
sample_doc.user_data

{'author': 'Bordelon, Laurent',
 'title': 'La baguette de Vulcain Comedie',
 'publication_date': '1693',
 'path': '../TheatreLFSV2-downloaded/NER/bpt6k859543c.tar.gz.tei_segmented_ner.xml',
 'filename': 'bpt6k859543c.tar.gz.tei_segmented_ner.xml',
 'entity_linking': 'spacy_entity_fishing',
 ('._.',
  'annotations',
  None,
  None): {'disambiguation_text_service': {'software': 'entity-fishing',
   'version': '0.0.6',
   'date': '2025-01-10T16:49:39.018589Z',
   'runtime': 314,
   'nbest': False,
   'text': 'Chez A\' ROUEN, EAN DUMESNIL, dans la Cour du Palais. BONAVENTURE LE BRUN, sous la Porte du Palais. AVIS AULECIEUN. À Baguette de Vulcain Comédie a eu tant de succès, représentée sur le Théâtre dans cette ville par les Comédients, que plusieurs Personnes qui l\'ont vu joüer ont souhaité de l\'avoir Imprimée, ce qui m\'a obligé d\'en demander une Copie, sur laquelle aai fin cette Impression le plus fidèlement qu\'il a été possible, et où je n\'ai rien épargné pour sa perfection, je d

## Pre-generate document summaries

In [53]:
# the summary should contain:
# document metadata: author, title, publication date
# top 5 person mentions
# top 5 place mentions
# most salient person entity + top 5 sentences
# most salient place entity + top 5 sentences
def build_JSON_document_summary(spacy_doc):
    sass = SalientSentenceSelector(spacy_doc)

    # extract the top 5 sentences for the most frequent geographical place in the document
    # this entity context made of the k sentences can then be fed to an LLM
    top_place, place_rel_sentences = sass.select(top_k_sentences=5, entity_type='place')

    # extract the top 5 sentences for the most frequent person in the document
    # this entity context made of the k sentences can then be fed to an LLM
    top_person, person_rel_sentences = sass.select(top_k_sentences=5, entity_type='person')

    top_5_persons = sorted(
        list(sass.person_entities.values()),
        key=operator.attrgetter('mention_frequency'),
        reverse=True
    )[:5]

    top_5_places = sorted(
        list(sass.place_entities.values()),
        key=operator.attrgetter('mention_frequency'),
        reverse=True
    )[:5]



    summary = {
        "metadata": {
            "author": spacy_doc.user_data['author'],
            "title": spacy_doc.user_data['title'],
            "publication_date": spacy_doc.user_data['publication_date'],
            "document_id": spacy_doc.user_data['filename'].split('.')[0]
        },
        "context": {
            "people": {
                "top_1_person": {
                    "entity": {
                        "label": top_person.unique_surface_forms[0],
                        "frequency": top_place.mention_frequency
                    },
                    "related_sentences": [str(sent) for sent in person_rel_sentences]
                },
                "top_5_persons": [e.unique_surface_forms[0] for e in top_5_persons]
            },
            "places":{
                "top_1_place": {
                    "entity": {
                        "label": top_place.unique_surface_forms[0],
                        "frequency": top_place.mention_frequency
                    },
                    "related_sentences": [str(sent) for sent in place_rel_sentences]
                },
                "top_5_places": [e.unique_surface_forms[0] for e in top_5_places]
            }
        }
    }
    return summary
    

In [47]:
doc_summary = build_JSON_document_summary(sample_doc)

Document btv1b86221195.tar.gz.tei_segmented_ner.xml contains 353 PER entities; 199 linked and 154 non-linked
Document btv1b86221195.tar.gz.tei_segmented_ner.xml contains 37 PER unique entities
Document btv1b86221195.tar.gz.tei_segmented_ner.xml contains 19 LOC entities; 9 linked and 10 non-linked
Document btv1b86221195.tar.gz.tei_segmented_ner.xml contains 3 LOC unique entities


In [48]:
print(json.dumps(doc_summary, indent=2, ensure_ascii=False))

{
  "metadata": {
    "author": "Rotrou, Jean de",
    "title": "\"Dom Bernard de Cabrère, tragi-comédie de Rotrou [avec une \"\"Élégie\"\" de l'auteur à Mazarin]\"",
    "publication_date": "1648",
    "document_id": "btv1b86221195"
  },
  "context": {
    "people": {
      "top_1_person": {
        "entity": {
          "label": "LOPE",
          "frequency": 6
        },
        "related_sentences": [
          "D. BERNARD embrassant D \nLope\n DLût au ciel, cher de Lune, et ainsi le dis sans T feinte, Que le sort qui vous livre toujours si rude atteinte, Et contre qui pour vous, touts mes souhaits sont vains, Suivist son inconstance, et nous changeast de mains La disgrâce du Roi, me serait moins sensible, Que le mépris qu'il fait de ce bras invincible;",
          "Et nous faire tribut de votre liberté: D. \nLOPE\n Il paraît par l'acçuil que m'a fait Violante, Que cette qualité me serait messeante, Et l'on redoute peu la rigueur d'un Amant, Anon ne daigne nonoreru un tegaru seuleme

In [56]:
import os

# create a code block that does the following:
# 1) it iterates over document in `spacy_docs`
# 2) for each document, it builds a JSON summary using the `build_JSON_document_summary` function
# 3) it writes the JSON summary to a file in the `data/summaries` directory (if it does not exist, create it);  
# Ensure the summaries directory exists
os.makedirs('data/summaries', exist_ok=True)

# Iterate over documents in spacy_docs
for i, spacy_doc in enumerate(spacy_docs):
    try:
        # Build JSON summary
        doc_summary = build_JSON_document_summary(spacy_doc)
        
        # Define the file path
        file_path = f"data/summaries/{doc_summary['metadata']['document_id']}_summary.json"
        
        # Write the JSON summary to a file
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(doc_summary, f, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f"Error processing document {i}: {e}")

Document bpt6k1090127n.tar.gz.tei_segmented_ner.xml contains 268 PER entities; 73 linked and 195 non-linked
Document bpt6k1090127n.tar.gz.tei_segmented_ner.xml contains 37 PER unique entities
Document bpt6k1090127n.tar.gz.tei_segmented_ner.xml contains 46 LOC entities; 24 linked and 22 non-linked
Document bpt6k1090127n.tar.gz.tei_segmented_ner.xml contains 11 LOC unique entities
Document bpt6k5626255h.tar.gz.tei_segmented_ner.xml contains 364 PER entities; 150 linked and 214 non-linked
Document bpt6k5626255h.tar.gz.tei_segmented_ner.xml contains 37 PER unique entities
Document bpt6k5626255h.tar.gz.tei_segmented_ner.xml contains 76 LOC entities; 68 linked and 8 non-linked
Document bpt6k5626255h.tar.gz.tei_segmented_ner.xml contains 11 LOC unique entities
Document bpt6k1280401n.tar.gz.tei_segmented_ner.xml contains 257 PER entities; 75 linked and 182 non-linked
Document bpt6k1280401n.tar.gz.tei_segmented_ner.xml contains 32 PER unique entities
Document bpt6k1280401n.tar.gz.tei_segmented_

In [1]:
spacy_doc

NameError: name 'spacy_doc' is not defined