<a href="https://colab.research.google.com/github/OdysseusPolymetis/ganeshs_ia_02_04_24/blob/main/sphilberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import torch
import numpy as np

In [None]:
# Choose the compute device: CUDA when PyTorch reports a GPU, CPU otherwise.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")

# Load the model published as "bowphs/SPhilBerta" and move it to that device.
model = SentenceTransformer("bowphs/SPhilBerta")
model.to(device)

In [None]:
# One Ancient Greek sentence and one Latin sentence to compare.
greek_sentence = '''περὶ μὲν οὖν τούτων δεδήλωται καὶ πρότερον·'''
latin_sentence = '''id quod iam supera tibi saepe ostendimus ante.'''

# Encode each sentence (Greek first, then Latin).
greek_embedding, latin_embedding = (
    model.encode(sentence) for sentence in (greek_sentence, latin_sentence)
)

# If the Greek embedding is a torch tensor, bring it to the CPU as a NumPy array;
# otherwise use it as returned.
if isinstance(greek_embedding, torch.Tensor):
    embedding_array = greek_embedding.cpu().numpy()
else:
    embedding_array = greek_embedding

# Print a compact summary of the embedding: the sum of its values.
print(f"Résumé de l'embedding: Somme des valeurs = {np.sum(embedding_array)}")

# Cosine similarity between two embedding vectors
def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Args:
        a: First vector (torch tensor, NumPy array, or sequence of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        float: the cosine similarity computed by
        ``torch.nn.functional.cosine_similarity`` on the two vectors.
    """
    # as_tensor reuses an existing tensor/array instead of copy-constructing it;
    # torch.tensor(existing_tensor) copies the data and emits a UserWarning.
    a = torch.as_tensor(a)
    # Put b on the same device as a so mixed CPU/GPU inputs can be compared.
    b = torch.as_tensor(b, device=a.device)

    # Integer inputs (e.g. plain lists of ints) are promoted to float so the
    # normalisation is done in floating point.
    if not a.is_floating_point():
        a = a.float()
    if not b.is_floating_point():
        b = b.float()

    # unsqueeze(0) adds a leading batch axis of size 1 to each vector;
    # .item() extracts the single resulting score.
    return torch.nn.functional.cosine_similarity(a.unsqueeze(0), b.unsqueeze(0)).item()

# Score the Greek/Latin sentence pair and report the result.
similarity_score = cosine_similarity(a=greek_embedding, b=latin_embedding)
print("Score de similarité :", similarity_score)

Résumé de l'embedding: Somme des valeurs = 5.299886703491211
Score de similarité : 0.9692979454994202


# Démonstration à plus grande échelle

In [None]:
!git clone https://github.com/PerseusDL/canonical-greekLit.git

Cloning into 'canonical-greekLit'...
remote: Enumerating objects: 63292, done.[K
remote: Total 63292 (delta 0), reused 0 (delta 0), pack-reused 63292[K
Receiving objects: 100% (63292/63292), 557.61 MiB | 15.04 MiB/s, done.
Resolving deltas: 100% (41862/41862), done.
Updating files: 100% (2650/2650), done.


In [None]:
!git clone https://github.com/PerseusDL/canonical-latinLit.git

Cloning into 'canonical-latinLit'...
remote: Enumerating objects: 25788, done.[K
remote: Counting objects: 100% (1026/1026), done.[K
remote: Compressing objects: 100% (480/480), done.[K
remote: Total 25788 (delta 559), reused 998 (delta 539), pack-reused 24762[K
Receiving objects: 100% (25788/25788), 285.59 MiB | 14.18 MiB/s, done.
Resolving deltas: 100% (16253/16253), done.
Updating files: 100% (1355/1355), done.


In [None]:
import glob
import pickle
from collections import defaultdict
import osgeo
import re
from lxml import etree
from tqdm import tqdm
import unicodedata
from nltk.tokenize import sent_tokenize
import torch
import numpy as np
import os
import pickle
import heapq
import h5py

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
!pip install cltk

In [None]:
from cltk.sentence.lat import LatinPunktSentenceTokenizer
from cltk.sentence.grc import GreekRegexSentenceTokenizer
from cltk.data.fetch import FetchCorpus

# Fetch the CLTK model corpora, Latin first and then Ancient Greek.
for language, corpus_name in (('lat', 'lat_models_cltk'), ('grc', 'grc_models_cltk')):
    corpus_downloader = FetchCorpus(language=language)
    corpus_downloader.import_corpus(corpus_name)

In [None]:
def extract_sentences_from_descendants(element, nsmap, file_path):
    """Collect verse lines and prose sentences found under ``element``.

    Each ``<l>`` element contributes its text as a single entry. Each ``<p>``
    element is split into sentences with the CLTK tokenizer for the file's
    language. An ``<l>``/``<p>`` nested inside another ``<l>``/``<p>`` is not
    emitted separately, because the outer element's text already contains it.

    Args:
        element: lxml element whose descendants are scanned.
        nsmap: Namespace map of the document. Unused; kept so existing callers
            keep working.
        file_path: Path of the source file. The language marker ('lat' or
            'grc') is looked up in the file name first, then in the whole path.

    Returns:
        list[str]: non-empty lines and sentences, in document order.

    Raises:
        ValueError: if neither 'lat' nor 'grc' occurs in ``file_path``.
    """
    # Look at the file name before the directories: a parent directory such as
    # 'canonical-latinLit' contains 'lat' and would otherwise label every file
    # below it as Latin, including '...-grc1.xml' files. The whole path is only
    # a fallback for file names that carry no marker.
    for candidate in (os.path.basename(file_path), file_path):
        if 'lat' in candidate:
            sentence_tokenizer = LatinPunktSentenceTokenizer()
            break
        if 'grc' in candidate:
            sentence_tokenizer = GreekRegexSentenceTokenizer()
            break
    else:
        raise ValueError("Langue non supportée.")

    text_tags = ('l', 'p')

    def local_name(node):
        # Comments and processing instructions have a non-string tag.
        tag = node.tag
        return tag.split('}')[-1] if isinstance(tag, str) else None

    def nested_in_text_element(node):
        # True when an ancestor between ``node`` and ``element`` is itself <l>/<p>.
        for ancestor in node.iterancestors():
            if ancestor is element:
                return False
            if local_name(ancestor) in text_tags:
                return True
        return False

    sentences = []
    for descendant in element.iterdescendants():
        tag_without_ns = local_name(descendant)
        if tag_without_ns not in text_tags:
            continue
        # Avoid emitting the same text twice (once via the outer element, once here).
        if nested_in_text_element(descendant):
            continue

        # with_tail=False: keep only the element's own content, not the text that
        # follows its closing tag (tostring includes the tail by default).
        child_text = etree.tostring(
            descendant, method="text", encoding="unicode", with_tail=False
        ).strip()
        if not child_text:
            continue

        if tag_without_ns == 'l':
            # Verse line: keep as a single entry.
            sentences.append(child_text)
        else:
            # Paragraph: split into sentences and drop blank results.
            child_sentences = sentence_tokenizer.tokenize(child_text)
            sentences.extend(sentence for sentence in child_sentences if sentence.strip())

    return sentences

def extract_body_content_from_xml(file_path):
    """Parse an XML file and return the sentences found under its <body>.

    The file is parsed with a recovering parser. The <body> element is looked
    up in the root's default namespace when one is declared, and without a
    namespace otherwise.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        The list produced by ``extract_sentences_from_descendants`` for <body>.

    Raises:
        ValueError: if the document has no <body> element.
    """
    document = etree.parse(file_path, etree.XMLParser(recover=True))
    namespaces = document.getroot().nsmap
    default_namespace = namespaces.get(None)

    # Qualify the tag as {namespace}body only when a default namespace exists.
    body_path = f".//{{{default_namespace}}}body" if default_namespace else ".//body"
    body_element = document.find(body_path)

    if body_element is None:
        raise ValueError(f"No <body> element found in {file_path}")

    return extract_sentences_from_descendants(body_element, namespaces, file_path)

In [None]:
def extract_texts_from_directory(directory_path, target_authors):
    """Walk a directory tree and extract sentences from the selected XML files.

    A file is selected when its name ends with '.xml', contains 'lat' or 'grc',
    and contains at least one of the ids in ``target_authors``.

    Args:
        directory_path: Root directory to walk.
        target_authors: Iterable of author id substrings to look for in file names.

    Returns:
        dict mapping each selected file name (without directory) to the value
        returned by ``extract_body_content_from_xml`` for that file.
    """
    def is_selected(filename):
        if not filename.endswith('.xml'):
            return False
        if 'lat' not in filename and 'grc' not in filename:
            return False
        return any(author in filename for author in target_authors)

    collected = {}
    for current_dir, _subdirs, filenames in os.walk(directory_path):
        for filename in filter(is_selected, filenames):
            collected[filename] = extract_body_content_from_xml(os.path.join(current_dir, filename))
    return collected

In [None]:
# Author ids to keep; a file is selected when its name contains one of these ids.
grc_target_authors = [
    "tlg0059",
    "tlg0086",
    "tlg1325",
    "tlg0626",
    "tlg1304",
    "tlg0632",
    "tlg0591",
    "tlg0593",
    "tlg1562",
    "tlg1705",
    "tlg0014",
    "tlg0610",
]
# NOTE(review): tlg0557, tlg0628 and tlg0562 carry the 'tlg' prefix used throughout
# the Greek list but are placed in the Latin list — confirm this is intended.
lat_target_authors = [
    "phi0474",
    "phi1017",
    "stoa0255",
    "phi1014",
    "tlg0557",
    "phi0550",
    "tlg0628",
    "tlg0562",
    "phi1254",
    "phi1002",
    "stoa0058",
    "phi0684",
    "phi1212",
]

In [None]:
# Extract the sentences of the selected Greek authors, keyed by file name.
greek_corpus = extract_texts_from_directory(
    directory_path='/content/canonical-greekLit',
    target_authors=grc_target_authors,
)

In [None]:
# Extract the sentences of the selected Latin-list authors, keyed by file name.
latin_corpus = extract_texts_from_directory(
    directory_path='/content/canonical-latinLit',
    target_authors=lat_target_authors,
)

In [None]:
# Count the extracted sentences per corpus, summed over all files.
total_phrases_grecques = sum(map(len, greek_corpus.values()))
total_phrases_latines = sum(map(len, latin_corpus.values()))

# Display both totals as the cell output.
total_phrases_grecques, total_phrases_latines

(84080, 147698)

In [None]:
# Flatten each corpus into (sentence, file_id) pairs, dropping blank sentences up front.
greek_sentences = [
    (sentence, file_id)
    for file_id, sentences in greek_corpus.items()
    for sentence in sentences
    if sentence.strip()
]
latin_sentences = [
    (sentence, file_id)
    for file_id, sentences in latin_corpus.items()
    for sentence in sentences
    if sentence.strip()
]

# Sentence-only lists, in the same order as the filtered pair lists.
greek_texts = [pair[0] for pair in greek_sentences]
latin_texts = [pair[0] for pair in latin_sentences]

## **Encodage**

In [None]:
from tqdm import tqdm

def check_data_and_encode(model, texts_with_file_id, batch_size=1024):
    """Encode sentences batch by batch, keeping each sentence's file id.

    Args:
        model: Object whose ``encode(list_of_texts)`` returns one embedding per text.
        texts_with_file_id: Sequence of ``(text, file_id)`` pairs.
        batch_size: Number of pairs passed to ``model.encode`` per call.

    Returns:
        list of ``(text, embedding, file_id)`` tuples in input order. A batch
        whose encoding raises is reported on stdout and left out of the result.
    """
    encoded_data = []
    for start_index in tqdm(range(0, len(texts_with_file_id), batch_size), desc="Encoding"):
        # Slice out the current batch and separate texts from file ids.
        batch = texts_with_file_id[start_index:start_index + batch_size]
        # The final batch may be shorter than batch_size; report the real
        # (exclusive) end index rather than one past the end of the data.
        end_index = start_index + len(batch)
        batch_texts = [text for text, _ in batch]
        batch_file_ids = [file_id for _, file_id in batch]

        try:
            batch_embeddings = model.encode(batch_texts)
            # Re-attach every embedding to its sentence and file id.
            encoded_data.extend(zip(batch_texts, batch_embeddings, batch_file_ids))
        except Exception as e:
            # Best effort: report the failing batch and carry on with the next one.
            print(f"Erreur lors de l'encodage du lot de textes de l'indice {start_index} à {end_index}\nErreur: {e}")

    return encoded_data


In [None]:
# Encode every Greek sentence; each result keeps its sentence and file id.
greek_encoded_data = check_data_and_encode(model=model, texts_with_file_id=greek_sentences)
# Disabled: pickle the encoded data to a Google Drive path.
#with open('/content/drive/MyDrive/embeddings_save/greek_embeddings_sphilberta.pkl', 'wb') as file:
    #pickle.dump(greek_encoded_data, file)

In [None]:
# Encode every Latin sentence; each result keeps its sentence and file id.
latin_encoded_data = check_data_and_encode(model=model, texts_with_file_id=latin_sentences)
# Disabled: pickle the encoded data to a Google Drive path.
#with open('/content/drive/MyDrive/embeddings_save/latin_embeddings_sphilberta.pkl', 'wb') as file:
    #pickle.dump(latin_encoded_data, file)

In [None]:
import torch

# Convert encoded records to a PyTorch tensor, moved to the GPU when one is available
def to_tensor(data):
    """Split encoded records into phrases, an embedding matrix and file ids.

    Args:
        data: Non-empty iterable of ``(phrase, embedding, file_id)`` tuples whose
            embeddings all have the same shape.

    Returns:
        tuple ``(phrases, embeddings_tensor, file_ids)``: ``phrases`` and
        ``file_ids`` are tuples; ``embeddings_tensor`` is a float32 tensor with
        one row per record, on CUDA when ``torch.cuda.is_available()``.

    Raises:
        ValueError: if ``data`` is empty.
    """
    records = list(data)
    if not records:
        # zip(*[]) yields nothing, which would otherwise fail with an opaque unpacking error.
        raise ValueError("to_tensor() requires at least one (phrase, embedding, file_id) record")

    phrases, embeddings, file_ids = zip(*records)

    # Stack into one contiguous array before handing it to torch: torch.tensor()
    # on a sequence of ndarrays converts element by element, which is very slow
    # and triggers a UserWarning.
    stacked = np.stack([np.asarray(embedding) for embedding in embeddings])
    embeddings_tensor = torch.as_tensor(stacked).float()

    if torch.cuda.is_available():
        embeddings_tensor = embeddings_tensor.cuda()
    return phrases, embeddings_tensor, file_ids

# Unpack each encoded corpus into phrases, an embedding tensor and file ids.
greek_phrases, greek_embeddings_tensor, greek_file_ids = to_tensor(data=greek_encoded_data)
latin_phrases, latin_embeddings_tensor, latin_file_ids = to_tensor(data=latin_encoded_data)

In [None]:
# torch's cosine_similarity; from here on this rebinds the name of the
# two-vector helper defined earlier in the notebook.
from torch.nn.functional import cosine_similarity

# Number of Latin embeddings compared against all Greek embeddings per step.
latin_group_size = 20

# Per-group results, concatenated once the loop is done.
greek_indices_list, latin_indices_list, similarity_scores_list = [], [], []

with torch.no_grad():
    # Loop-invariant views and sizes.
    greek_expanded = greek_embeddings_tensor.unsqueeze(1)
    latin_total = latin_embeddings_tensor.size(0)

    for start_idx in tqdm(range(0, latin_total, latin_group_size), desc="Group Progress"):
        # Slicing stops at the end of the tensor, so the last group may be shorter.
        latin_group = latin_embeddings_tensor[start_idx:start_idx + latin_group_size]

        # Similarity of every Greek embedding with every Latin embedding of the
        # group (broadcast over the two unsqueezed axes, reduced over dim 2).
        similarity_matrix = cosine_similarity(greek_expanded, latin_group.unsqueeze(0), dim=2)

        # Keep only the pairs whose similarity exceeds 0.7.
        greek_hits, latin_hits = torch.where(similarity_matrix > 0.7)

        greek_indices_list.append(greek_hits)
        # Shift group-local Latin positions to positions in the full Latin tensor.
        latin_indices_list.append(latin_hits + start_idx)
        similarity_scores_list.append(similarity_matrix[greek_hits, latin_hits])

# Merge the per-group results into single tensors.
greek_indices = torch.cat(greek_indices_list)
latin_indices = torch.cat(latin_indices_list)
similarity_scores = torch.cat(similarity_scores_list)

# Bring everything back to the CPU as NumPy arrays.
greek_indices_np, latin_indices_np, similarity_scores_np = (
    merged.cpu().numpy() for merged in (greek_indices, latin_indices, similarity_scores)
)

Group Progress: 100%|██████████| 7385/7385 [03:31<00:00, 34.89it/s]


In [None]:
# Keep a match only when both phrases have more than five whitespace-separated
# tokens and its similarity score is strictly below 1.
filtered_indices = []
for idx in tqdm(range(len(similarity_scores_np)), desc="Filtrage des indices"):
    if len(greek_phrases[greek_indices_np[idx]].split()) <= 5:
        continue
    if len(latin_phrases[latin_indices_np[idx]].split()) <= 5:
        continue
    if similarity_scores_np[idx] < 1:
        filtered_indices.append(idx)

In [None]:
from collections import defaultdict
from math import log

# Group the retained similarity scores by (Greek file, Latin file).
file_pair_similarities = defaultdict(list)

for idx in tqdm(filtered_indices, desc="Accumulation des similarités"):
    greek_file = greek_file_ids[greek_indices_np[idx]]
    latin_file = latin_file_ids[latin_indices_np[idx]]
    file_pair_similarities[(greek_file, latin_file)].append(similarity_scores_np[idx])

# Composite metric per file pair: mean similarity multiplied by the natural log
# of the number of matches (so a pair with a single match scores 0).
file_pair_metrics = {}
for file_pair, similarities in tqdm(file_pair_similarities.items(), desc="Calcul des métriques composites"):
    match_count = len(similarities)
    mean_similarity = sum(similarities) / match_count
    file_pair_metrics[file_pair] = mean_similarity * log(match_count)

In [None]:
# Rank the file pairs from highest to lowest composite metric.
sorted_file_pairs = list(file_pair_metrics.items())
sorted_file_pairs.sort(key=lambda pair: pair[1], reverse=True)

# Keep the five best-ranked pairs for display.
top_file_pairs = sorted_file_pairs[:5]

In [None]:
for file_pair, _ in top_file_pairs:
    g_file, l_file = file_pair
    print(f"Greek File: {g_file}, Latin File: {l_file}")

    # Matches belonging to this file pair, best similarity first.
    specific_indices = sorted(
        (
            idx for idx in filtered_indices
            if greek_file_ids[greek_indices_np[idx]] == g_file
            and latin_file_ids[latin_indices_np[idx]] == l_file
        ),
        key=lambda match: similarity_scores_np[match],
        reverse=True,
    )

    # Show the ten best phrase pairs of the file pair.
    for idx in specific_indices[:10]:
        greek_phrase = greek_phrases[greek_indices_np[idx]]
        latin_phrase = latin_phrases[latin_indices_np[idx]]
        print(f"  Greek Phrase: {greek_phrase}")
        print(f"  Latin Phrase: {latin_phrase}")
        print(f"  Similarity Score: {similarity_scores_np[idx]}\n")

    print("--------------------------------------------------\n")

Greek File: tlg0086.tlg025.perseus-grc1.xml, Latin File: phi0550.phi001.perseus-lat1.xml
  Greek Phrase: δῆλον δʼ ἔσται τὸ λεγόμενον ἐκ τῶν ὕστερον μᾶλλον.
  Latin Phrase: id licet hinc quamvis hebeti cognoscere corde.
  Similarity Score: 0.9741785526275635

  Greek Phrase: δῆλον δʼ ἔσται τὸ λεγόμενον ἐκ τῶν ὕστερον μᾶλλον.
  Latin Phrase: id licet hinc quamvis hebeti cognoscere corde.
  Similarity Score: 0.9741785526275635

  Greek Phrase: περὶ μὲν οὖν τούτων δεδήλωται καὶ πρότερον·
  Latin Phrase: id quod iam supra tibi paulo ostendimus ante.
  Similarity Score: 0.9737507104873657

  Greek Phrase: δῆλον δʼ ἔσται τὸ λεγόμενον ἐκ τῶν ὕστερον μᾶλλον.
  Latin Phrase: quae tibi posterius largo sermone probabo.
  Similarity Score: 0.9700868129730225

  Greek Phrase: περὶ μὲν οὖν τούτων δεδήλωται καὶ πρότερον·
  Latin Phrase: id quod iam supera tibi saepe ostendimus ante.
  Similarity Score: 0.9692978858947754

  Greek Phrase: περὶ μὲν οὖν τούτων δεδήλωται καὶ πρότερον·
  Latin Phrase: id q

In [None]:
# Author table: one (display name, author id, first year, second year) tuple per id.
# The ids are the same ones listed in grc_target_authors and lat_target_authors.
# NOTE(review): the two integers look like birth and death years, with negative
# values for years BCE — confirm.
# Three different ids share the display name "Sénèque".
auteurs = [
    ("Thalès", "tlg1705", -624, -546),
    ("Pythagore", "tlg0632", -570, -495),
    ("Héraclite", "tlg0626", -535, -475),
    ("Parménide", "tlg1562", -515, -450),
    ("Alcidamas", "tlg0610", -450, -400),
    ("Antisthène", "tlg0591", -445, -365),
    ("Platon", "tlg0059", -427, -347),
    ("Démocrite", "tlg1304", -460, -370),
    ("Aristote", "tlg0086", -384, -322),
    ("Diogène de Sinope", "tlg1325", -413, -324),
    ("Gorgias de Léontium", "tlg0593", -480, -375),
    ("Démosthène", "tlg0014", -384, -322),
    ("Cicéron", "phi0474", -106, -43),
    ("Lucrèce", "phi0550", -99, -55),
    ("Varron", "phi0684", -116, -27),
    ("Sénèque", "phi1017", -4, 65),
    ("Sénèque", "stoa0255", -4, 65),
    ("Sénèque", "phi1014", -4, 65),
    ("Épictète", "tlg0557", 50, 135),
    ("Musonius Rufus", "tlg0628", 20, 101),
    ("Quintilien", "phi1002", 35, 100),
    ("Marc Aurèle", "tlg0562", 121, 180),
    ("Apulée", "phi1212", 124, 170),
    ("Aulu Gelle", "phi1254", 125, 180),
    ("Boèce", "stoa0058", 477, 524)
]