<a href="https://colab.research.google.com/github/OdysseusPolymetis/colabs_for_nlp/blob/main/4_ner_with_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flair
!pip install stanza

In [None]:
import stanza
import numpy as np
from tqdm import tqdm
from flair.data import Sentence
from flair.models import SequenceTagger

# Fetch the French stanza models.
stanza.download("fr")

# French has multi-word tokens (e.g. "du" = "de" + "le"); the MWT expander is
# listed explicitly between tokenization and the word-level processors, which
# is the processor chain stanza documents for pos/lemma in such languages.
nlp = stanza.Pipeline(lang="fr", processors="tokenize,mwt,pos,lemma", use_gpu=True)

# flair sequence tagger for French NER; later cells keep only its PER spans.
tagger = SequenceTagger.load("flair/ner-french")

In [3]:
from google.colab import files
from pathlib import Path

def upload_one_txt_utf8(dest_dir="/content", rename_to="input.txt", allow_utf8_bom=True):
    """Ask for exactly one ``.txt`` upload and store it as UTF-8 text.

    Args:
        dest_dir: Directory the file is written into. It is created when it
            does not exist yet.
        rename_to: File name used for the stored copy.
        allow_utf8_bom: When true and the payload starts with a UTF-8 BOM,
            the BOM is stripped while decoding. When false the payload is
            decoded as plain UTF-8, so a leading BOM stays in the text.

    Returns:
        The path of the written file, as a string.

    Raises:
        ValueError: If the number of uploaded files is not exactly one, if the
            file name does not end in ``.txt`` (case-insensitive), or if the
            bytes are not valid UTF-8.
    """
    uploaded = files.upload()
    if len(uploaded) != 1:
        raise ValueError(f"Merci d'uploader exactement 1 fichier .txt (reçu: {len(uploaded)}).")

    filename, data = next(iter(uploaded.items()))
    if Path(filename).suffix.lower() != ".txt":
        raise ValueError("Merci d'uploader un fichier avec l'extension .txt")

    has_bom = data.startswith(b"\xef\xbb\xbf")
    encoding = "utf-8-sig" if (allow_utf8_bom and has_bom) else "utf-8"
    try:
        text = data.decode(encoding)
    except UnicodeDecodeError as e:
        raise ValueError("Encodage invalide : merci de fournir un fichier UTF-8.") from e

    # Make sure the target directory exists; without this, a custom dest_dir
    # that has not been created yet makes write_text fail.
    out_dir = Path(dest_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    out_path = out_dir / rename_to
    out_path.write_text(text, encoding="utf-8")
    return str(out_path)

# Run the upload prompt; the text is stored under the name texte.txt and the
# resulting path is kept in `filepath` for the cell that reads it back.
filepath = upload_one_txt_utf8(rename_to="texte.txt")
print("OK ->", filepath)

Saving Les_trois_mousquetaires.txt to Les_trois_mousquetaires.txt
OK -> /content/texte.txt


In [None]:
from collections import defaultdict
import numpy as np

# Maximum distance (in token positions) between the centers of two PER
# mentions for them to be counted as co-occurring.
window = 20

# cooccurrence[a][b] -> number of times label b was seen near label a.
cooccurrence = defaultdict(lambda: defaultdict(int))

# Memo for lemmatize_entity: stripped surface form -> lemmatized label.
entity_lemma_cache = {}

# Read the stored text back and run the stanza pipeline over it once; the
# resulting document provides the sentences processed further down.
with open(filepath, "r", encoding="utf-8") as source_file:
    text = source_file.read()

doc = nlp(text)

In [None]:
def lemmatize_entity(ent_text: str) -> str:
    """Return a lowercase, lemmatized label for an entity surface form.

    The stripped text is run through the module-level ``nlp`` pipeline and the
    non-empty lemmas of all its words are lowercased and joined with single
    spaces. If no lemma is produced, the lowercased stripped text is used
    instead. Results are memoized in ``entity_lemma_cache`` under the stripped
    text.
    """
    key = ent_text.strip()

    # Serve repeated mentions from the cache to avoid re-running the pipeline.
    try:
        return entity_lemma_cache[key]
    except KeyError:
        pass

    parsed = nlp(key)
    lemmas = [
        word.lemma.lower()
        for sentence in parsed.sentences
        for word in sentence.words
        if word.lemma
    ]

    if lemmas:
        label = " ".join(lemmas)
    else:
        label = key.lower()

    entity_lemma_cache[key] = label
    return label

In [None]:
# Start from empty counts: the dict is created in an earlier cell and only
# incremented here, so without this reset a second run of this cell would add
# a full extra pass on top of the previous counts.
cooccurrence.clear()

# One NER prediction per sentence; this is the slow part of the notebook, so
# show a progress bar.
for sent in tqdm(doc.sentences, desc="NER"):
    s_text = sent.text.strip()
    if not s_text:
        continue

    ner_sent = Sentence(s_text)
    tagger.predict(ner_sent)

    # Keep only the spans tagged as persons.
    pers = [span for span in ner_sent.get_spans("ner") if span.get_label("ner").value == "PER"]

    # For each person span: (lemmatized label, rounded mean of its token indices).
    pers_items = []
    for span in pers:
        tok_idxs = [t.idx for t in span.tokens]
        center = int(round(sum(tok_idxs) / len(tok_idxs)))
        label = lemmatize_entity(span.text)
        pers_items.append((label, center))

    # Count every ordered pair of distinct mentions of this sentence whose
    # centers are at most `window` apart. Both (a, b) and (b, a) are visited,
    # so the counts are symmetric. Two mentions sharing the same label
    # increment that label's own entry.
    for i, (li, ci) in enumerate(pers_items):
        for j, (lj, cj) in enumerate(pers_items):
            if i != j and abs(ci - cj) <= window:
                cooccurrence[li][lj] += 1

# Fix an alphabetical order for the labels and map each label to its row/column.
entities = sorted(cooccurrence)
idx = dict(zip(entities, range(len(entities))))

# Square integer matrix: M[a, b] holds the count stored in cooccurrence[a][b].
n_entities = len(entities)
M = np.zeros((n_entities, n_entities), dtype=int)

for source, neighbours in cooccurrence.items():
    row = idx[source]
    for target, count in neighbours.items():
        M[row, idx[target]] = count

print("Nb entités PER:", n_entities)
print(M)

In [None]:
import networkx as nx

G = nx.Graph()

# One node per entity, keyed by its matrix index, with the entity name stored
# in the "label" attribute.
G.add_nodes_from((node_id, {"label": name}) for node_id, name in enumerate(entities))

# Walk the matrix row by row and add a weighted edge for every positive
# off-diagonal cell.
for src, row in enumerate(M):
    for dst, count in enumerate(row):
        weight = int(count)
        if src == dst or weight <= 0:
            continue
        G.add_edge(src, dst, weight=weight)

# Write the graph as a GEXF file.
out_gexf = "/content/network_fr.gexf"
nx.write_gexf(G, out_gexf)
print("Export ->", out_gexf)

In [None]:
from google.colab import files
# Pass the GEXF file written by the export cell to Colab's download helper.
files.download("/content/network_fr.gexf")