## Imports

In [82]:
import sys
import pandas as pd
from pathlib import Path
sys.path.append("../")
from spacy.tokens import DocBin, Doc
from typing import List, Tuple
from lib.utils import tei2spacy, nlp_model_fr, sample_files, print_corpus_summary

In [2]:
# Path of the serialized spaCy corpus (a DocBin file) that the loading cell below reads from disk
SPACY_CORPUS_SERIALIZED_PATH = "./data/corpus.spacy"

## Loading pre-processed data

In [3]:
# Reload the serialized corpus when it is available on disk; otherwise fall back
# to an empty DocBin. The fallback is reported explicitly, because every later
# cell (entity linking, `el_docs[9]`, ...) needs a non-empty corpus and would
# otherwise fail far away from the actual cause.
corpus_path = Path(SPACY_CORPUS_SERIALIZED_PATH)
if corpus_path.exists():
    spacy_corpus = DocBin(store_user_data=True).from_disk(corpus_path)
    print(f"Loaded serialize spacy corpus from {SPACY_CORPUS_SERIALIZED_PATH}")
    print_corpus_summary(spacy_corpus, nlp_model_fr)
else:
    spacy_corpus = DocBin(store_user_data=True)
    print(
        f"WARNING: no serialized corpus found at {SPACY_CORPUS_SERIALIZED_PATH}; "
        "continuing with an empty corpus"
    )

Loaded serialize spacy corpus from ./data/corpus.spacy
Number of documents in the corpus: 12
Number of entities in the corpus: 3204
Number of tokens in the corpus: 184314


In [4]:
# `DocBin.get_docs` yields the documents lazily; materialise them in a list so
# that `docs` can be iterated again when a later cell is re-run (a bare
# generator would be exhausted after its first use).
docs = list(spacy_corpus.get_docs(nlp_model_fr.vocab))

## Adding entity linking information to docs in corpus

In [5]:
# Register the "entityfishing" component on the French pipeline.
# `add_pipe` refuses to add a component whose name is already in the pipeline,
# so the component is reused when this cell is executed again on the same kernel.
if "entityfishing" in nlp_model_fr.pipe_names:
    entity_fishing_pipe = nlp_model_fr.get_pipe("entityfishing")
else:
    entity_fishing_pipe = nlp_model_fr.add_pipe(
        "entityfishing", config={
            "api_ef_base": "http://nerd.huma-num.fr/nerd/service"
        }
    )

In [7]:
# Uncomment to check which components are currently in the pipeline:
# nlp_model_fr.pipe_names

In [8]:
# Apply the entity-fishing component to each document of the corpus, in order,
# and keep the result of every call.
el_docs = list(map(entity_fishing_pipe, docs))

## Selection of salient sentences

In [None]:
# TODO: add documentation and move to lib/utils.py

import operator
import pandas as pd
from dataclasses import dataclass

@dataclass
class Entity:
    """A linked entity, aggregated over all of its mentions in one document.

    Attributes:
        qid: knowledge-base identifier shared by the grouped mentions
            (taken from the ``kb_qid`` extension of the spaCy entity spans).
        ner_labels: distinct NER tags observed on the entity's mentions.
        mention_frequency: number of mentions of the entity in the document.
        unique_surface_forms: distinct mention texts used for the entity.
        short_desc: presumably a short textual description of the entity;
            the selector below always sets it to an empty string (TODO confirm).
    """
    qid: str
    ner_labels: List[str]  # NER tags of the entity mentions
    mention_frequency: int  # document-level mention frequency
    unique_surface_forms: List[str]
    short_desc: str

class SalientSentenceSelector(object):
    """Select the sentences that mention the most frequent linked entity of a document.

    The selector expects a spaCy ``Doc`` whose entity spans expose the
    ``kb_qid``, ``url_wikidata`` and ``nerd_score`` extension attributes and
    whose sentence boundaries are set. Mentions sharing the same QID are
    grouped into one ``Entity``; mentions without a QID are kept in
    ``_mentions_df`` but ignored when ranking entities.

    Attributes:
        doc: the document being analysed.
        entities: mapping ``qid -> Entity`` for all linked entities.
        sentences: mapping ``sentence id -> sentence``; ids are 1-based
            positions of the sentences in the document.
        sent2ent_idx: mapping ``sentence id -> set of QIDs`` mentioned in it
            (only sentences with at least one linked mention are present).
        _mentions_df: one row per entity mention, linked or not.
    """

    # Declared up-front so that the mention table always has these columns,
    # even when the document contains no entity at all.
    _MENTION_COLUMNS = ('mention', 'ner_label', 'qid', 'url_wikidata', 'nerd_score')

    def __init__(self, spacy_doc: Doc):
        """Index the entities and sentences of `spacy_doc`.

        Prints a two-line summary of the document's entities as a side effect.
        """
        self.doc = spacy_doc
        self.entities = self._mentions2entities()
        self.sentences = dict(enumerate(self.doc.sents, start=1))
        self.sent2ent_idx = self._build_sentence2entity_index()

    def _doc_name(self) -> str:
        """Return the document's file name, or a placeholder if it was not stored."""
        return self.doc.user_data.get("filename", "<unknown>")

    def _mentions2entities(self) -> dict:
        """Group the linked entity mentions of the document by QID.

        Also stores the full mention table in ``self._mentions_df`` and prints
        how many mentions are linked / non-linked and how many unique entities
        they correspond to.

        Returns:
            A dict mapping each QID to its ``Entity``, in order of first mention.
        """
        # transform the entity mentions from spacy into a dataframe for easier manipulation
        self._mentions_df = pd.DataFrame(
            [
                {
                    'mention': ent.text,
                    'ner_label': ent.label_,
                    'qid': ent._.kb_qid,
                    'url_wikidata': ent._.url_wikidata,
                    'nerd_score': ent._.nerd_score
                }
                for ent in self.doc.ents
            ],
            columns=list(self._MENTION_COLUMNS)
        )
        is_linked = self._mentions_df.qid.notna()
        linked_entities_df = self._mentions_df[is_linked]
        n_linked_entities = int(is_linked.sum())
        n_nonlinked_entities = len(self._mentions_df) - n_linked_entities
        doc_name = self._doc_name()
        print(
            f'Document {doc_name} contains {self._mentions_df.shape[0]} entities;',
            f'{n_linked_entities} linked and {n_nonlinked_entities} non-linked'
        )

        # unique entities: a single pass over the groups instead of re-filtering
        # the whole frame for every QID; sort=False keeps first-mention order
        entities = {}
        for qid, qid_mentions in linked_entities_df.groupby('qid', sort=False):
            entities[qid] = Entity(
                qid=qid,
                ner_labels=qid_mentions.ner_label.unique().tolist(),
                mention_frequency=len(qid_mentions),
                unique_surface_forms=qid_mentions.mention.unique().tolist(),
                short_desc=''
            )
        print(f'Document {doc_name} contains {len(entities)} unique entities')
        # TODO: top_person
        # TODO: top_place
        return entities

    def _build_sentence2entity_index(self) -> dict:
        """Map each sentence id to the set of QIDs of the linked entities it mentions.

        Sentences without any linked mention get no entry in the index.
        """
        sentence2entity_index = {}
        for sent_i, sent in self.sentences.items():
            for ent in sent.ents:
                if ent._.kb_qid:
                    sentence2entity_index.setdefault(sent_i, set()).add(ent._.kb_qid)
        return sentence2entity_index

    # TODO: select sentences for people and places separately
    def _find_sentences_for_entity(self, entity: Entity) -> list:
        """Return the sentences mentioning `entity`, longest first.

        The returned items are the sentence objects stored in
        ``self.sentences`` (spaCy spans, not plain strings). Sentences of equal
        length keep their document order.
        """
        matching_sentences = [
            self.sentences[sentence_id]
            for sentence_id, entity_qids in self.sent2ent_idx.items()
            if entity.qid in entity_qids
        ]
        return sorted(matching_sentences, key=len, reverse=True)

    def select(self, top_k_sentences: int = 5) -> Tuple[Entity, list]:
        """Return the most frequent linked entity and its top sentences.

        Very simplistic first implementation: take the most frequent entity
        (no difference between people and places) and return the first
        `top_k_sentences` sentences where it appears, ranked by sentence length
        (rationale: the longer, the more informative).

        Args:
            top_k_sentences: maximum number of sentences to return.

        Returns:
            A tuple ``(entity, sentences)``; on equal frequency the entity
            mentioned first in the document wins.

        Raises:
            IndexError: if the document contains no linked entity.
        """
        if not self.entities:
            raise IndexError(
                f'Document {self._doc_name()} contains no linked entity to select sentences for'
            )
        # max() returns the first maximal element, i.e. the same entity a stable
        # descending sort would put in first position
        top_entity = max(self.entities.values(), key=operator.attrgetter('mention_frequency'))
        return (top_entity, self._find_sentences_for_entity(top_entity)[:top_k_sentences])


In [None]:
# take a sample document: the one at index 9 of the entity-linked corpus
# (presumably an arbitrary pick for demonstration purposes — TODO confirm)
sample_doc = el_docs[9]

In [None]:
# build the selector for the sample document; the constructor prints a short
# summary of the linked / non-linked entities found in the document
sss = SalientSentenceSelector(sample_doc)

Document bpt6k5458862p.tar.gz.tei_segmented_ner.xml contains 471 entities; 143 linked and 328 non-linked
Document bpt6k5458862p.tar.gz.tei_segmented_ner.xml contains 56 unique entities


In [None]:
# use the SalientSentenceSelector to extract the top 5 sentences for the most frequent entity in the document;
# this entity context, made of the k sentences, can then be fed to an LLM (or similar) to characterise the
# spatio-temporal dimension of the document
sss.select(top_k_sentences=5)

(Entity(qid='Q18190448', ner_labels=['PER'], mention_frequency=52, unique_surface_forms=['Guillaume'], short_desc=''),
 [Guillaume Que c'est un grapd trésor que notre liberté, Qu'on la compare mal au bien de la clartète, Que l'usage en est doux, et qu'au temps où nous sommes, Un bien qui vient du Ciel se vend mal à des hommes, Le libéral arbitre est un don précieux Par où nous éprouvons la clémence des Dieux, Un esprit franc et libre où la raison abonde Enne possédant rien, possède tout le monde, Il ne dépend jamais de tant d'esprits divers, Au lieu d'une maison il a tour d'Unissers, Sa richesse le suit, et l'âme, sans contrainte, Sans auoir ai vertu sans con visage empreinte, LES PÂTONS,
  Quand elle aura connu mon âme par ta bouche, Regarde dans son teint comment cela la touche, Prends garde si l'amour retient ensevelis Sous un peu de rougeur, ses roses et ses lis, Compte tous ses soupirs, et te donne la gloire D'âpporter mon salut dans ta belle mémoire, Guillaume Tout ce que l'arti

In [None]:
# Guillaume comes out as the most frequent entity in the document only because
# Calyante, the most frequent mention, was not linked to a Wikidata QID
# (the selector ranks linked entities only).
# To be dealt with and fixed in the next iteration.
sss._mentions_df.mention.value_counts()

mention
Calyante     70
Guillaume    54
Filandre     39
Agaritte     37
Aronthe      37
             ..
Alexandre     1
Nectar        1
Vulcain       1
Jupiter       1
Ciela         1
Name: count, Length: 133, dtype: int64