<a href="https://colab.research.google.com/github/OdysseusPolymetis/digital_classics_course/blob/main/network_analysis_for_greek.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install flair
!pip install stanza

In [None]:
from flair.data import Sentence
from flair.models import SequenceTagger

tagger = SequenceTagger.load("UGARIT/flair_grc_bert_ner")

# **Test sur une phrase**

In [None]:
sentence = Sentence('ταῦτα εἴπας ὁ Ἀλέξανδρος παρίζει Πέρσῃ ἀνδρὶ ἄνδρα Μακεδόνα ὡς γυναῖκα τῷ λόγῳ · οἳ δέ , ἐπείτε σφέων οἱ Πέρσαι ψαύειν ἐπειρῶντο , διεργάζοντο αὐτούς .')
tagger.predict(sentence)
for entity in sentence.get_spans('ner'):
    print(entity)

# **Test sur un TXT**

In [2]:
import stanza
import numpy as np
from tqdm import tqdm

In [5]:
with open('/content/odyssee_integrale.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [None]:
stanza.download('grc')
nlp = stanza.Pipeline(lang='grc', processors='tokenize,lemma')

In [None]:
doc=nlp(text)

from collections import defaultdict
import numpy as np

# Initialisation du dictionnaire pour la matrice de cooccurrence
cooccurrence_dict = defaultdict(lambda: defaultdict(int))

for sentence in doc.sentences:
    sentence_text = sentence.text

    # Prédiction NER avec Flair
    ner_sentence = Sentence(sentence_text)
    tagger.predict(ner_sentence)

    # Extraction des entités NER de type PER
    ner_entities = [(entity.text, entity.start_position, entity.end_position) for entity in ner_sentence.get_spans('ner') if entity.get_label('ner').value == 'PER']

    # Pour chaque entité NER PER, trouver les cooccurrences dans la fenêtre de 20 mots
    for i, (entity_text_i, start_i, end_i) in enumerate(ner_entities):
        lemma_i = ' '.join([token.lemma for token in sentence.words if entity_text_i in token.text])

        for j, (entity_text_j, start_j, end_j) in enumerate(ner_entities):
            if i != j and abs(start_i - start_j) <= 20:
                lemma_j = ' '.join([token.lemma for token in sentence.words if entity_text_j in token.text])
                cooccurrence_dict[lemma_i][lemma_j] += 1

# Conversion du dictionnaire en matrice
entities = list(cooccurrence_dict.keys())
matrix_size = len(entities)
cooccurrence_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

for i, entity_i in enumerate(entities):
    for j, entity_j in enumerate(entities):
        cooccurrence_matrix[i, j] = cooccurrence_dict[entity_i][entity_j]

# Affichage de la matrice
print(cooccurrence_matrix)


In [15]:
import networkx as nx

G = nx.Graph()

# Ajout des nœuds
for i, entity in enumerate(entities):
    G.add_node(i, label=entity)

# Ajout des arêtes
for i, row in enumerate(cooccurrence_matrix):
    for j, weight in enumerate(row):
        if weight > 0 and i != j:
            G.add_edge(i, j, weight=weight)

# Exportation en GEXF
nx.write_gexf(G, "network.gexf")

## Si vous voulez être plus tolérants sur les NER, et inclure les incertitudes (MISC)

In [None]:
cooccurrence_dict = defaultdict(lambda: defaultdict(int))

for sentence in doc.sentences:
    sentence_text = sentence.text

    # Prédiction NER avec Flair
    ner_sentence = Sentence(sentence_text)
    tagger.predict(ner_sentence)

    # Extraction des entités NER de type PER et MISC
    ner_entities = [(entity.text, entity.start_position, entity.end_position) for entity in ner_sentence.get_spans('ner') if entity.get_label('ner').value in ['PER', 'MISC']]

    # Pour chaque entité NER (PER et MISC), trouver les cooccurrences dans la fenêtre de 20 mots
    for i, (entity_text_i, start_i, end_i) in enumerate(ner_entities):
        lemma_i = ' '.join([token.lemma for token in sentence.words if entity_text_i in token.text])

        for j, (entity_text_j, start_j, end_j) in enumerate(ner_entities):
            if i != j and abs(start_i - start_j) <= 20:
                lemma_j = ' '.join([token.lemma for token in sentence.words if entity_text_j in token.text])
                cooccurrence_dict[lemma_i][lemma_j] += 1

# Conversion du dictionnaire en matrice
entities = list(cooccurrence_dict.keys())
matrix_size = len(entities)
cooccurrence_matrix = np.zeros((matrix_size, matrix_size), dtype=int)

for i, entity_i in enumerate(entities):
    for j, entity_j in enumerate(entities):
        cooccurrence_matrix[i, j] = cooccurrence_dict[entity_i][entity_j]

# Affichage de la matrice
print(cooccurrence_matrix)

# **Géolocalisation avec Pleiades**

In [7]:
doc=nlp(text)

In [8]:
import requests
import folium
from flair.data import Sentence
from flair.models import SequenceTagger
import pandas as pd
import re
from collections import Counter
from collections import defaultdict

names_df = pd.read_csv('/content/names.csv')

# Préparation du DataFrame pour la recherche de lemmes (case-insensitive)
names_df['attested_form_lower'] = names_df['attested_form'].str.lower()

m = folium.Map(location=[37.9838, 23.7275], zoom_start=5)

# Initialiser le compteur pour les entités PER
per_counter = Counter()

# Traitement de chaque phrase dans le document
for sentence in tqdm(doc.sentences, desc="Processing sentences for ner counting"):
    sentence_text = sentence.text
    ner_sentence = Sentence(sentence_text)
    tagger.predict(ner_sentence)
    tokens = sentence.words

    # Compter les entités PER
    for entity in ner_sentence.get_spans('ner'):
        if entity.get_label('ner').value == 'PER':
            per_counter[entity.text] += 1

# Sélectionner les 20 noms de personnages les plus fréquents
top_per_entities = {per[0] for per in per_counter.most_common(20)}

# Analyse NER et extraction des lemmes, et ajout des marqueurs LOC
loc_entities = []
per_entities = []

for sentence in tqdm(doc.sentences, desc="Processing sentences for loc pointing"):
    sentence_text = sentence.text
    ner_sentence = Sentence(sentence_text)
    tagger.predict(ner_sentence)
    tokens = sentence.words

    for entity in ner_sentence.get_spans('ner'):
        if entity.get_label('ner').value == 'LOC':
            for token in tokens:
                lemma_lower = token.lemma.lower()
                lemma_lower_escaped = re.escape(lemma_lower)
                match = names_df[names_df['attested_form_lower'].str.contains(lemma_lower_escaped, na=False, regex=False)]
                if not match.empty:
                    pid = match.iloc[0]['place_id']
                    response = requests.get(f"http://pleiades.stoa.org/places/{pid}/json")
                    if response.status_code == 200:
                        data = response.json()
                        if data and 'reprPoint' in data and data['reprPoint'] is not None:
                            loc_entities.append((data, entity))
                            folium.Marker(
                                location=[data['reprPoint'][1], data['reprPoint'][0]],
                                popup=entity.text,
                                icon=folium.Icon(color='red')
                            ).add_to(m)
                break
        elif entity.get_label('ner').value == 'PER' and entity.text in top_per_entities:
            per_entities.append((entity.text, entity.start_position, entity.end_position))

# Ajout des marqueurs PER à proximité des marqueurs LOC pour les entités les plus fréquentes uniquement
for loc_data, loc_entity in loc_entities:
    loc_start, loc_end = loc_entity.start_position, loc_entity.end_position
    for per_text, per_start, per_end in per_entities:
        if abs(loc_start - per_start) <= 10 or abs(loc_end - per_end) <= 10:
            folium.Marker(
                location=[loc_data['reprPoint'][1], loc_data['reprPoint'][0]],
                popup=per_text,
                icon=folium.Icon(color='blue')
            ).add_to(m)

# Sauvegarde de la carte avec moins de marqueurs
m.save("/content/map_ancient_places.html")


Processing sentences for ner counting: 100%|██████████| 7476/7476 [02:20<00:00, 53.37it/s]
Processing sentences for loc pointing: 100%|██████████| 7476/7476 [04:10<00:00, 29.84it/s]


In [None]:
import requests
import folium
from flair.data import Sentence
from flair.models import SequenceTagger
import pandas as pd
import re
from collections import Counter
from collections import defaultdict

names_df = pd.read_csv('/content/names.csv')

# Préparation du DataFrame pour la recherche de lemmes (case-insensitive)
names_df['attested_form_lower'] = names_df['attested_form'].str.lower()

m = folium.Map(location=[37.9838, 23.7275], zoom_start=5)

# Initialiser le compteur pour les entités PER
per_counter = Counter()

# Traitement de chaque phrase dans le document
for sentence in tqdm(doc.sentences, desc="Processing sentences for ner counting"):
    sentence_text = sentence.text
    ner_sentence = Sentence(sentence_text)
    tagger.predict(ner_sentence)
    tokens = sentence.words

    # Compter les entités PER
    for entity in ner_sentence.get_spans('ner'):
        if entity.get_label('ner').value == 'PER':
            per_counter[entity.text] += 1

# Sélectionner les 20 noms de personnages les plus fréquents
top_per_entities = {per[0] for per in per_counter.most_common(20)}

# Analyse NER et extraction des lemmes, et ajout des marqueurs LOC
loc_entities = []
per_entities = []

for sentence in tqdm(doc.sentences, desc="Processing sentences for loc pointing"):
    sentence_text = sentence.text
    ner_sentence = Sentence(sentence_text)
    tagger.predict(ner_sentence)
    tokens = sentence.words

    for entity in ner_sentence.get_spans('ner'):
        if entity.get_label('ner').value == 'LOC':
            for token in tokens:
                lemma_lower = token.lemma.lower()
                lemma_lower_escaped = re.escape(lemma_lower)
                match = names_df[names_df['attested_form_lower'].str.contains(lemma_lower_escaped, na=False, regex=False)]
                if not match.empty:
                    pid = match.iloc[0]['place_id']
                    response = requests.get(f"http://pleiades.stoa.org/places/{pid}/json")
                    if response.status_code == 200:
                        data = response.json()
                        if data and 'reprPoint' in data and data['reprPoint'] is not None:
                            loc_entities.append((data, entity))
                            folium.Marker(
                                location=[data['reprPoint'][1], data['reprPoint'][0]],
                                popup=entity.text,
                                icon=folium.Icon(color='red')
                            ).add_to(m)
                break
        elif entity.get_label('ner').value == 'PER' and entity.text in top_per_entities:
            per_entities.append((entity.text, entity.start_position, entity.end_position))

# Ajout des marqueurs PER à proximité des marqueurs LOC pour les entités les plus fréquentes uniquement
loc_to_pers = defaultdict(list)

for loc_data, loc_entity in loc_entities:
    loc_start, loc_end = loc_entity.start_position, loc_entity.end_position
    for per_text, per_start, per_end in per_entities:
        if abs(loc_start - per_start) <= 50 or abs(loc_end - per_end) <= 50:
            # Ajouter le texte de l'entité PER à l'ensemble associé au lieu LOC si le PER est parmi les 20 plus fréquents
            if per_text in top_per_entities:
                loc_to_pers[loc_entity.text].add(per_text)

# Pour chaque lieu, trier les personnages par fréquence et conserver les trois premiers
for loc, pers_set in loc_to_pers.items():
    # Trier les personnages par fréquence décroissante et conserver les trois premiers
    loc_to_pers[loc] = sorted(pers_set, key=lambda x: per_counter[x], reverse=True)[:3]

# Création des marqueurs sur la carte avec les noms de personnages dans le popup
for loc_data, loc_entity in loc_entities:
    if 'reprPoint' in loc_data and loc_data['reprPoint'] is not None:
        # Préparer le texte du popup pour inclure le lieu et les trois personnages les plus fréquents associés (sans répétition)
        popup_text = f"{loc_entity.text}: {', '.join(loc_to_pers[loc_entity.text])}"
        folium.Marker(
            location=[loc_data['reprPoint'][1], loc_data['reprPoint'][0]],
            popup=popup_text,
            icon=folium.Icon(color='red')
        ).add_to(m)

# Sauvegarde de la carte
m.save("/content/map_ancient_places3.html")


Processing sentences for ner counting: 100%|██████████| 7476/7476 [02:21<00:00, 53.02it/s]
Processing sentences for loc pointing:  32%|███▏      | 2387/7476 [01:37<10:03,  8.44it/s]