<a href="https://colab.research.google.com/github/OdysseusPolymetis/digital_classics_course/blob/main/6_network_analysis_and_geolocalisation_for_greek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flair
!pip install stanza

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

# Identifier of the pretrained NER model handed to Flair's SequenceTagger.load.
NER_MODEL_NAME = "UGARIT/flair_grc_bert_ner"

# Load the tagger once; every later cell reuses this `tagger` object.
tagger = SequenceTagger.load(NER_MODEL_NAME)

2024-03-28 22:35:41,777 SequenceTagger predicts: Dictionary with 15 tags: O, S-PER, B-PER, E-PER, I-PER, S-MISC, B-MISC, E-MISC, I-MISC, S-LOC, B-LOC, E-LOC, I-LOC, <START>, <STOP>


# **Test sur une phrase**

In [None]:
# Smoke test: tag one sentence and print every entity span the model finds.
sample_text = 'ταῦτα εἴπας ὁ Ἀλέξανδρος παρίζει Πέρσῃ ἀνδρὶ ἄνδρα Μακεδόνα ὡς γυναῖκα τῷ λόγῳ · οἳ δέ , ἐπείτε σφέων οἱ Πέρσαι ψαύειν ἐπειρῶντο , διεργάζοντο αὐτούς .'
sentence = Sentence(sample_text)
tagger.predict(sentence)

for span in sentence.get_spans('ner'):
    print(span)

# **Test sur un TXT**

In [None]:
import stanza
import numpy as np
from tqdm import tqdm

In [None]:
# Read the whole source text into memory as a single UTF-8 string.
TEXT_PATH = '/content/odyssee_integrale.txt'
with open(TEXT_PATH, mode='r', encoding='utf-8') as file:
    text = file.read()

In [None]:
# Fetch the Stanza models for the 'grc' language code, then build a pipeline
# that only tokenises and lemmatises (no other processors are requested).
STANZA_LANG = 'grc'
stanza.download(STANZA_LANG)
nlp = stanza.Pipeline(processors='tokenize,lemma', lang=STANZA_LANG)

In [None]:
from collections import defaultdict
import numpy as np

# Tokenise and lemmatise the full text with the Stanza pipeline.
doc = nlp(text)

# Maximum distance, counted in words, between two entities of the same
# sentence for them to be recorded as co-occurring.
WINDOW_SIZE = 20

# Nested counter: cooccurrence_dict[lemma_a][lemma_b] -> number of co-occurrences.
cooccurrence_dict = defaultdict(lambda: defaultdict(int))

for sentence in doc.sentences:
    if not sentence.tokens:
        continue

    # NER prediction with Flair on the raw text of the sentence.
    ner_sentence = Sentence(sentence.text)
    tagger.predict(ner_sentence)

    # Flair span offsets are relative to the sentence text, whereas Stanza
    # token offsets are relative to the whole document; this shift aligns them.
    # NOTE(review): assumes Flair keeps the character offsets of the string it
    # was given — confirm if the text contains unusual control characters.
    sentence_offset = sentence.tokens[0].start_char

    # One (lemma, word position) pair per PER entity. The lemma is taken from
    # the Stanza tokens that overlap the entity's character range, so
    # multi-word names are kept whole and a name can no longer match a mere
    # substring of an unrelated word.
    mentions = []
    for span in ner_sentence.get_spans('ner'):
        if span.get_label('ner').value != 'PER':
            continue
        covered = [
            (position, token)
            for position, token in enumerate(sentence.tokens)
            if token.start_char - sentence_offset < span.end_position
            and token.end_char - sentence_offset > span.start_position
        ]
        if not covered:
            continue
        # Fall back to the surface form when the lemmatiser returned nothing.
        lemma = ' '.join(
            word.lemma or word.text
            for _, token in covered
            for word in token.words
        )
        mentions.append((lemma, covered[0][0]))

    # Count every ordered pair of distinct mentions lying within the window.
    for i, (lemma_i, position_i) in enumerate(mentions):
        for j, (lemma_j, position_j) in enumerate(mentions):
            if i != j and abs(position_i - position_j) <= WINDOW_SIZE:
                cooccurrence_dict[lemma_i][lemma_j] += 1

# Convert the nested dictionary into a square matrix (rows/columns follow `entities`).
entities = list(cooccurrence_dict.keys())
matrix_size = len(entities)
cooccurrence_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

for i, entity_i in enumerate(entities):
    for j, entity_j in enumerate(entities):
        # .get avoids inserting zero entries into the defaultdict while reading it.
        cooccurrence_matrix[i, j] = cooccurrence_dict[entity_i].get(entity_j, 0)

# Display the matrix
print(cooccurrence_matrix)


In [None]:
import networkx as nx

# Undirected graph: one node per entity, one weighted edge per co-occurring pair.
G = nx.Graph()

# Nodes are keyed by their index in `entities`; the lemma is stored as the label.
G.add_nodes_from((index, {"label": name}) for index, name in enumerate(entities))

# Walk the whole matrix and connect every off-diagonal pair with a positive count.
for source in range(len(cooccurrence_matrix)):
    for target in range(len(cooccurrence_matrix[source])):
        count = cooccurrence_matrix[source, target]
        if source == target or count <= 0:
            continue
        G.add_edge(source, target, weight=count)

# Export to GEXF
nx.write_gexf(G, "network.gexf")

## Si vous voulez être plus tolérants sur les NER, et inclure les incertitudes (MISC)

In [None]:
def build_cooccurrence(doc, tagger, labels, window=20):
    """Count within-sentence co-occurrences of named entities.

    Args:
        doc: Stanza document whose sentences are scanned.
        tagger: Flair sequence tagger used to predict the 'ner' spans.
        labels: container of NER label values to keep (e.g. ('PER', 'MISC')).
        window: maximum distance, in words, between two entities of the same
            sentence for the pair to be counted.

    Returns:
        A nested defaultdict where result[lemma_a][lemma_b] is the number of
        times the two lemmatised entities were found within the window.
    """
    counts = defaultdict(lambda: defaultdict(int))

    for sentence in doc.sentences:
        if not sentence.tokens:
            continue

        # NER prediction with Flair on the raw text of the sentence.
        ner_sentence = Sentence(sentence.text)
        tagger.predict(ner_sentence)

        # Flair span offsets are sentence-relative, Stanza token offsets are
        # document-relative; this shift aligns the two.
        # NOTE(review): assumes Flair preserves the character offsets of the
        # string it receives — confirm for texts with unusual control characters.
        sentence_offset = sentence.tokens[0].start_char

        # One (lemma, word position) pair per retained entity, built from the
        # Stanza tokens overlapping the entity's character range. This keeps
        # multi-word names whole and avoids substring false matches.
        mentions = []
        for span in ner_sentence.get_spans('ner'):
            if span.get_label('ner').value not in labels:
                continue
            covered = [
                (position, token)
                for position, token in enumerate(sentence.tokens)
                if token.start_char - sentence_offset < span.end_position
                and token.end_char - sentence_offset > span.start_position
            ]
            if not covered:
                continue
            # Fall back to the surface form when the lemmatiser returned nothing.
            lemma = ' '.join(
                word.lemma or word.text
                for _, token in covered
                for word in token.words
            )
            mentions.append((lemma, covered[0][0]))

        # Count every ordered pair of distinct mentions lying within the window.
        for i, (lemma_i, position_i) in enumerate(mentions):
            for j, (lemma_j, position_j) in enumerate(mentions):
                if i != j and abs(position_i - position_j) <= window:
                    counts[lemma_i][lemma_j] += 1

    return counts


# More tolerant variant: keep both PER and MISC entities.
cooccurrence_dict = build_cooccurrence(doc, tagger, labels=('PER', 'MISC'), window=20)

# Convert the nested dictionary into a square matrix (rows/columns follow `entities`).
entities = list(cooccurrence_dict.keys())
matrix_size = len(entities)
cooccurrence_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

for i, entity_i in enumerate(entities):
    for j, entity_j in enumerate(entities):
        # .get avoids inserting zero entries into the defaultdict while reading it.
        cooccurrence_matrix[i, j] = cooccurrence_dict[entity_i].get(entity_j, 0)

# Display the matrix
print(cooccurrence_matrix)

# **Géolocalisation avec Pleiades**

In [66]:
# Run the Stanza pipeline over the full text; the resulting `doc` is what the
# geolocation cells below iterate over.
doc = nlp(text)

In [72]:
import requests
import folium
from flair.data import Sentence
from flair.models import SequenceTagger
import pandas as pd
import re
from collections import Counter
from collections import defaultdict
import unicodedata

In [74]:
def normalize_name(name):
    """Return `name` lower-cased with all combining marks removed.

    The string is decomposed (NFD) so that accents and breathings become
    separate combining characters (Unicode category 'Mn'), which are then
    dropped before lower-casing.
    """
    decomposed = unicodedata.normalize('NFD', name)
    base_chars = [char for char in decomposed if unicodedata.category(char) != 'Mn']
    return ''.join(base_chars).lower()

def find_lemma_by_text(text, sentence):
    """Return the lower-cased lemma of the first occurrence of `text` in `sentence`.

    `text` is split on whitespace and matched, after `normalize_name`, against
    a run of consecutive words of the sentence, so multi-word entities are
    found as well as single-word ones.

    Args:
        text: surface form of the entity (one or several space-separated words).
        sentence: object exposing `.words`, each word having `.text` and `.lemma`.

    Returns:
        The lemmas of the matched words joined by single spaces and
        lower-cased, or None when `text` is empty or does not occur. A word
        whose lemma is missing contributes its surface form instead.
    """
    targets = [normalize_name(part) for part in text.split()]
    if not targets:
        return None

    words = sentence.words
    normalized_words = [normalize_name(word.text) for word in words]
    span = len(targets)

    # Slide a window of the entity's length over the sentence.
    for start in range(len(words) - span + 1):
        if normalized_words[start:start + span] == targets:
            matched = words[start:start + span]
            # `lemma` may be None when the lemmatiser produced nothing.
            return ' '.join((word.lemma or word.text).lower() for word in matched)
    return None

# Load the Pleiades names table from the CSV file.
names_df = pd.read_csv('/content/names.csv')
# Lower-case and NFC-normalise the attested forms so that visually identical
# Greek strings encoded differently (e.g. tonos vs. oxia) compare equal.
names_df['attested_form_lower'] = names_df['attested_form'].str.lower().str.normalize('NFC')

# Initialise the Folium map
m = folium.Map(location=[37.9838, 23.7275], zoom_start=5)

# Maximum distance between a LOC and a PER mention of the same sentence.
# Flair's `start_position` is a character offset, so this is in characters.
MAX_CHAR_DISTANCE = 100
# Seconds to wait for a Pleiades response before giving up on that place.
PLEIADES_TIMEOUT = 30

# Place lemma -> set of person lemmas mentioned close to it.
loc_to_pers = defaultdict(set)

for sentence in tqdm(doc.sentences, desc="Processing sentences"):
    flair_sentence = Sentence(sentence.text)
    tagger.predict(flair_sentence)

    # Extract LOC and PER entities together with their start offset in the sentence.
    spans = flair_sentence.get_spans('ner')
    loc_entities = [(ent.text, ent.start_position) for ent in spans if ent.tag == 'LOC']
    per_entities = [(ent.text, ent.start_position) for ent in spans if ent.tag == 'PER']
    if not loc_entities or not per_entities:
        continue

    # Lemmatise every mention once instead of once per LOC/PER pair.
    loc_mentions = [(find_lemma_by_text(loc_text, sentence), loc_pos) for loc_text, loc_pos in loc_entities]
    per_mentions = [(find_lemma_by_text(per_text, sentence), per_pos) for per_text, per_pos in per_entities]

    # Attach each person to the places mentioned near it; mentions whose lemma
    # could not be found are ignored.
    for loc_lemma, loc_pos in loc_mentions:
        if not loc_lemma:
            continue
        for per_lemma, per_pos in per_mentions:
            if per_lemma and abs(loc_pos - per_pos) <= MAX_CHAR_DISTANCE:
                loc_to_pers[loc_lemma].add(per_lemma)

# Add one marker per place that can be resolved in Pleiades, listing its people.
for loc_lemma, pers in loc_to_pers.items():
    match = names_df[names_df['attested_form_lower'] == unicodedata.normalize('NFC', loc_lemma)]
    if match.empty:
        continue

    pid = match.iloc[0]['place_id']
    try:
        response = requests.get(f"http://pleiades.stoa.org/places/{pid}/json", timeout=PLEIADES_TIMEOUT)
        response.raise_for_status()
        pleiades_data = response.json()
    except (requests.RequestException, ValueError) as error:
        # A single unreachable or malformed record must not abort the whole map.
        print(f"Skipping {loc_lemma} (Pleiades place {pid}): {error}")
        continue

    repr_point = pleiades_data.get('reprPoint')
    if repr_point:
        # The two coordinates are swapped here, exactly as in the original cell.
        # NOTE(review): presumably reprPoint is [lon, lat] and Folium wants [lat, lon] — confirm.
        folium.Marker(
            location=[repr_point[1], repr_point[0]],
            # Sorted so the popup text is stable from one run to the next.
            popup=f"{loc_lemma}: {', '.join(sorted(pers))}",
            icon=folium.Icon(color='red')
        ).add_to(m)

# Save the map
m.save("/content/map_ancient_places.html")


Processing sentences:   0%|          | 0/7476 [00:00<?, ?it/s]