<a href="https://colab.research.google.com/github/OdysseusPolymetis/digital_studies/blob/main/BERT_for_multilingual_similarity_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# <center>**Mesures de similarités entre le grec et les traductions latines avec Multilingual BERT**</center>

Requirements

In [None]:
!pip install torch transformers numpy stanza

Nous entreposons nos pickles et notre modèle dans notre drive, vous pouvez changer les chemins.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Téléchargement des données sur le dépôt Perseus

In [None]:
!git clone https://github.com/PerseusDL/canonical-greekLit.git

In [None]:
!git clone https://github.com/PerseusDL/canonical-latinLit.git

Imports nécessaires

In [None]:
import torch
import glob
from joblib import Parallel, delayed
import pickle
from collections import defaultdict
import os
import numpy as np
import re
from lxml import etree
import stanza
from tqdm import tqdm
import unicodedata
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

## Récupération et paramétrage du modèle affiné

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Directory on the mounted Google Drive that holds the fine-tuned model checkpoint.
model_directory = "/content/drive/MyDrive/MBERT_Models/finetuned_mbert_model_best"

# The encoder weights are loaded from the checkpoint directory, the tokenizer from
# the vocab.txt file stored inside it.
model = BertModel.from_pretrained(model_directory)
# NOTE(review): the tokenizer is built from the vocabulary file alone; tokenizer
# settings saved next to it (casing, accent stripping) are presumably not read this
# way — confirm this matches how the model was fine-tuned.
tokenizer = BertTokenizer.from_pretrained(model_directory + "/vocab.txt")
model.eval()  # Evaluation mode

Paramétrage du modèle sur GPU

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

## Extraction des éléments XML

In [None]:
def extract_body_content_from_xml(file_path):
    """Return the text content of the first ``<body>`` element of an XML file.

    The file is parsed with a recovering lxml parser. When the root element
    declares a default namespace, ``<body>`` is looked up inside that namespace;
    otherwise it is looked up without any namespace.

    Args:
        file_path: path of the XML file to read.

    Returns:
        The body serialised with ``method="text"`` as a unicode string.

    Raises:
        ValueError: if no ``<body>`` element is found.
    """
    document = etree.parse(file_path, etree.XMLParser(recover=True))
    namespace = document.getroot().nsmap.get(None)

    body = (
        document.find(".//ns:body", namespaces={"ns": namespace})
        if namespace
        else document.find(".//body")
    )
    if body is None:
        raise ValueError(f"No <body> element found in {file_path}")

    return etree.tostring(body, method="text", encoding="unicode")

def is_latin(filename):
    """Tell whether ``filename`` ends with ``lat<digits>.xml``."""
    match = re.search(r'lat\d+\.xml$', filename)
    return match is not None

def is_greek(filename):
    """Tell whether ``filename`` ends with ``grc<digits>.xml``."""
    match = re.search(r'grc\d+\.xml$', filename)
    return match is not None

def extract_texts_from_directory(directory_path):
    """
    Walk ``directory_path`` recursively and collect the body text of every
    file recognised by ``is_latin`` or ``is_greek``.

    Returns:
        A dict mapping the bare file name (without its directory) to the text
        returned by ``extract_body_content_from_xml``. Two files with the same
        name in different folders therefore share one key; the last one read wins.
    """
    texts = {}
    for current_dir, _subdirs, names in os.walk(directory_path):
        wanted = (name for name in names if is_latin(name) or is_greek(name))
        for name in wanted:
            texts[name] = extract_body_content_from_xml(os.path.join(current_dir, name))
    return texts

In [None]:
def tokenize_and_encode_texts(texts, tokenizer):
    """Tokenize each text and convert the first 512 tokens to vocabulary ids.

    Args:
        texts: mapping from a text key to its raw string.
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A dict with the same keys as ``texts``; each value is a dict holding the
        truncated token list under ``'tokens'`` and the matching ids under
        ``'input_ids'``.
    """
    encoded = {}
    for name, raw_text in texts.items():
        # Everything past the 512th token is discarded.
        pieces = tokenizer.tokenize(raw_text)[:512]
        encoded[name] = {
            'tokens': pieces,
            'input_ids': tokenizer.convert_tokens_to_ids(pieces),
        }
    return encoded

In [None]:
greek_corpus = extract_texts_from_directory('/content/canonical-greekLit')

In [None]:
latin_corpus = extract_texts_from_directory('/content/canonical-latinLit')

## Lemmatisation avec Stanza
<br>Sauvegardes en pickle

In [None]:
stanza.download('grc')

In [None]:
stanza.download('la')

In [None]:
def process_text(text, nlp):
    """
    Run the pipeline ``nlp`` on ``text`` and collect, sentence by sentence,
    both the surface forms and the lemmas of every word.

    Returns:
        A dict with two keys, ``"lemmas"`` and ``"forms"``; each maps to a list
        with one inner list per sentence, aligned word for word.
    """
    sentences = list(nlp(text).sentences)
    return {
        "lemmas": [[word.lemma for word in sentence.words] for sentence in sentences],
        "forms": [[word.text for word in sentence.words] for sentence in sentences],
    }

In [None]:
nlp = stanza.Pipeline('grc', processors='tokenize,pos,lemma', use_gpu=True)

In [None]:
nlp_latin = stanza.Pipeline('la', processors='tokenize,pos,lemma', use_gpu=True)

In [None]:
filenames = list(greek_corpus.keys())

# 2. Effectuer la lemmatisation sur cette portion
greek_corpus_processed = {}
for filename in tqdm(filenames, desc="Processing"):
    greek_corpus_processed[filename] = process_text(greek_corpus[filename], nlp)

In [None]:
filenames = list(latin_corpus.keys())

# 2. Effectuer la lemmatisation sur cette portion
latin_corpus_processed = {}
for filename in tqdm(filenames, desc="Processing"):
    latin_corpus_processed[filename] = process_text(latin_corpus[filename], nlp_latin)

Sauvegarde

In [None]:
# Save the lemmatised Greek corpus under the file name that the reload cell reads
# ('greek_corpus_processed.pkl'), mirroring the naming used for the Latin corpus.
# The previous name ('lemmatized_greek_perseus.pkl') was never read back anywhere.
with open('/content/drive/MyDrive/embeddings_save/greek_corpus_processed.pkl', 'wb') as file:
    pickle.dump(greek_corpus_processed, file)

In [None]:
with open('/content/drive/MyDrive/embeddings_save/greek_corpus_processed.pkl', 'rb') as file:
    greek_corpus_processed = pickle.load(file)

In [None]:
with open('/content/drive/MyDrive/embeddings_save/latin_corpus_processed.pkl', 'wb') as file:
    pickle.dump(latin_corpus_processed, file)

In [None]:
with open('/content/drive/MyDrive/embeddings_save/latin_corpus_processed.pkl', 'rb') as file:
    latin_corpus_processed = pickle.load(file)

Vérification des données

In [None]:
for key, value in list(latin_corpus_processed.items())[:5]:
    lemmas = value["lemmas"]
    forms = value["forms"]
    print(f"Filename: {key}")
    print(f"Lemmatized Content: {' '.join(lemmas[0])[:200]}...")  # Print only first 200 chars of the lemmatized content of the first sentence
    print(f"Forms Content: {' '.join(forms[0])[:200]}...\n")  # Print only first 200 chars of the forms content of the first sentence

Redécoupage des phrases de plus de 510 tokens (BERT accepte 512 tokens par séquence, dont 2 sont réservés aux tokens spéciaux `[CLS]` et `[SEP]`)

In [None]:
def split_long_sentences(corpus, tokenizer, max_length=510):
    """Cut every sentence into chunks of at most ``max_length`` word pieces.

    Words are never split: a chunk is closed as soon as adding the next word
    would push its word-piece count above ``max_length``. Forms and lemmas are
    cut at the same word boundaries so they stay aligned.

    Args:
        corpus: mapping ``filename -> {"forms": [[word, ...], ...],
            "lemmas": [[lemma, ...], ...]}``.
        tokenizer: object whose ``tokenize(word)`` returns a list of word pieces.
        max_length: maximum number of word pieces per chunk.

    Returns:
        ``{"forms": {filename: [chunk, ...]}, "lemmas": {filename: [chunk, ...]}}``.
        A single word that alone exceeds ``max_length`` is kept as its own
        (oversized) chunk.
    """
    split_corpus = {"forms": {}, "lemmas": {}}

    for filename, text in tqdm(corpus.items(), desc="Splitting long sentences"):
        split_forms, split_lemmas = [], []

        for forms_sentence, lemmas_sentence in zip(text["forms"], text["lemmas"]):
            chunk_forms, chunk_lemmas, chunk_size = [], [], 0

            for word, lemma in zip(forms_sentence, lemmas_sentence):
                word_size = len(tokenizer.tokenize(word))

                # Close the current chunk when this word would overflow it. The
                # emptiness check prevents emitting an empty chunk when the very
                # first word of a chunk is itself longer than max_length.
                if chunk_forms and chunk_size + word_size > max_length:
                    split_forms.append(chunk_forms)
                    split_lemmas.append(chunk_lemmas)
                    chunk_forms, chunk_lemmas, chunk_size = [], [], 0

                chunk_forms.append(word)
                chunk_lemmas.append(lemma)
                chunk_size += word_size

            # Flush whatever is left of the sentence.
            if chunk_forms:
                split_forms.append(chunk_forms)
                split_lemmas.append(chunk_lemmas)

        split_corpus["forms"][filename] = split_forms
        split_corpus["lemmas"][filename] = split_lemmas

    return split_corpus

In [None]:
def tokenize_sentence(sentence, tokenizer):
    """Return the tokens that ``tokenizer.tokenize`` produces for ``sentence``."""
    pieces = tokenizer.tokenize(sentence)
    return pieces

In [None]:
split_greek_corpus = split_long_sentences(greek_corpus_processed, tokenizer, 510)

In [None]:
split_latin_corpus = split_long_sentences(latin_corpus_processed, tokenizer, 510)

Sauvegarde du split

In [None]:
with open('/content/drive/MyDrive/embeddings_save/split_latin_corpus.pkl', 'wb') as file:
    pickle.dump(split_latin_corpus, file)
with open('/content/drive/MyDrive/embeddings_save/split_greek_corpus.pkl', 'wb') as file:
    pickle.dump(split_greek_corpus, file)

In [None]:
with open('/content/drive/MyDrive/embeddings_save/split_latin_corpus.pkl', 'rb') as file:
    split_latin_corpus = pickle.load(file)
with open('/content/drive/MyDrive/embeddings_save/split_greek_corpus.pkl', 'rb') as file:
    split_greek_corpus = pickle.load(file)

Vérification

In [None]:
for key, value in list(split_greek_corpus["forms"].items())[:5]:
    print(f"Filename: {key}")
    print(f"Forms Content (first sentence): {' '.join(value[0])[:200]}...\n")  # Print only first 200 chars of the forms content of the first sentence

print("\n---\n")

for key, value in list(split_greek_corpus["lemmas"].items())[:5]:
    print(f"Filename: {key}")
    print(f"Lemmatized Content (first sentence): {' '.join(value[0])[:200]}...\n")  # Print only first 200 chars of the lemmatized content of the first sentence

In [None]:
def verify_split_corpus(split_corpus, tokenizer, max_length=510):
    """Print a diagnostic for every inconsistency found in a split corpus.

    For each file the function checks that forms and lemmas have the same
    number of sentences (the file is skipped otherwise), then, sentence by
    sentence, that the re-tokenized forms do not exceed ``max_length`` tokens
    and that forms and lemmas have the same number of words.

    Nothing is returned and nothing is raised for these problems; they are
    only printed.
    """
    # tqdm only wraps the outer iteration over files.
    for filename, forms_sentences in tqdm(split_corpus["forms"].items(), desc="Verifying files"):
        lemmas_sentences = split_corpus["lemmas"][filename]

        if len(forms_sentences) != len(lemmas_sentences):
            print(f"Error: Mismatched number of sentences for file {filename}")
            continue

        for idx, forms_sentence in enumerate(forms_sentences):
            lemmas_sentence = lemmas_sentences[idx]
            piece_count = len(tokenizer.tokenize(" ".join(forms_sentence)))

            if piece_count > max_length:
                print(f"Error: Sentence {idx} in file {filename} exceeds {max_length} tokens.")

            if len(forms_sentence) != len(lemmas_sentence):
                print(f"Error: Mismatched number of words in sentence {idx} of file {filename}")

In [None]:
print("Starting verification...")
# Check both split corpora: the Latin one goes through the same tokenization and
# embedding steps as the Greek one, so it needs the same sanity check.
verify_split_corpus(split_greek_corpus, tokenizer)
verify_split_corpus(split_latin_corpus, tokenizer)
print("\nVerification complete!")

## Tri sur les auteurs

Le tri des auteurs se fait sur les identifiants des auteurs présents dans le titre des fichiers.

In [None]:
# All available author identifiers; files are selected when one of these
# identifiers occurs in their name.
# NOTE: the two following cells ("slices") assign the same two names again, so
# when the cells are run in order the lists defined here are overwritten.
grc_target_authors = ["tlg0059","tlg0086","tlg1325", "tlg0626","tlg1304","tlg0632","tlg0591","tlg0593","tlg1562","tlg1705","tlg0014","tlg0610"]
lat_target_authors = ["phi0474", "phi1017","stoa0255","phi1014","tlg0557","phi0550","tlg0628","tlg0562","phi1254","phi1002","stoa0058","phi0684","phi1212"]

In [None]:
# First Latin slice: same Greek list, first part of the Latin identifiers.
# NOTE: the next cell reassigns both names; run only the cell for the slice you want.
grc_target_authors = ["tlg0059","tlg0086","tlg1325", "tlg0626","tlg1304","tlg0632","tlg0591","tlg0593","tlg1562","tlg1705","tlg0014","tlg0610"]
lat_target_authors = ["phi0474", "phi1017","stoa0255","phi1014","tlg0557","phi0684"]

In [None]:
# Second Latin slice: same Greek list, remaining Latin identifiers.
# NOTE: when every cell is run in order, these are the lists that stay in effect.
grc_target_authors = ["tlg0059","tlg0086","tlg1325", "tlg0626","tlg1304","tlg0632","tlg0591","tlg0593","tlg1562","tlg1705","tlg0014","tlg0610"]
lat_target_authors = ["phi0550", "tlg0628", "tlg0562", "phi1254", "phi1002", "stoa0058", "phi1212"]

In [None]:
# Keep, in every category of each split corpus ("forms" and "lemmas"), only the
# files whose name contains one of the target author identifiers.
filtered_lat_corpus = {
    categorie: {
        fichier: phrases
        for fichier, phrases in contenu.items()
        if any(auteur in fichier for auteur in lat_target_authors)
    }
    for categorie, contenu in split_latin_corpus.items()
}
filtered_grc_corpus = {
    categorie: {
        fichier: phrases
        for fichier, phrases in contenu.items()
        if any(auteur in fichier for auteur in grc_target_authors)
    }
    for categorie, contenu in split_greek_corpus.items()
}

Tokénisation avec le modèle BERT, en conservant les indices des subtokens des formes associées aux lemmes

In [None]:
import multiprocessing

def tokenize_batch(args):
    """Encode one batch of sentences and align every word piece with its lemma.

    Args:
        args: a 3-tuple ``(batch_sentences_forms, batch_sentences_lemmas,
            tokenizer)``; the first two are lists of sentences, each sentence a
            list of words (resp. lemmas). A single tuple is taken so the
            function can be mapped over a list of batches.

    Returns:
        A 6-tuple: input ids and attention masks of the batch (as nested
        lists), the sentences joined into strings, and — per sentence — the
        word pieces obtained word by word, the index of the word each piece
        comes from, and the lemma of that word. The per-word piece lists do not
        contain whatever extra tokens the batch encoding adds.
    """
    batch_sentences_forms, batch_sentences_lemmas, tokenizer = args

    # The tokenizer call expects plain strings, so the words are glued back together.
    batch_sentences = [" ".join(words) for words in batch_sentences_forms]
    encoding = tokenizer(batch_sentences, truncation=True, padding='longest', return_tensors="pt", max_length=512)

    batch_tokens, batch_origin_indices, batch_subtoken_lemmas = [], [], []
    for words, lemmas in zip(batch_sentences_forms, batch_sentences_lemmas):
        pieces, word_positions, piece_lemmas = [], [], []
        for position, (form, lemma) in enumerate(zip(words, lemmas)):
            word_pieces = tokenizer.tokenize(form)
            pieces += word_pieces
            # Every piece of a word inherits the word's index and lemma.
            word_positions += [position] * len(word_pieces)
            piece_lemmas += [lemma] * len(word_pieces)
        batch_tokens.append(pieces)
        batch_origin_indices.append(word_positions)
        batch_subtoken_lemmas.append(piece_lemmas)

    return (
        encoding["input_ids"].tolist(),
        encoding["attention_mask"].tolist(),
        batch_sentences,
        batch_tokens,
        batch_origin_indices,
        batch_subtoken_lemmas,
    )

def batched_tokenization(corpus, tokenizer, batch_size):
    """Tokenize a whole corpus in parallel batches with ``tokenize_batch``.

    Args:
        corpus: ``{"forms": {filename: [sentence, ...]}, "lemmas": {...}}`` where
            each sentence is a list of words (resp. lemmas).
        tokenizer: tokenizer handed to every worker together with its batch.
        batch_size: number of sentences per batch.

    Returns:
        Five lists with one entry per sentence, in corpus order: input ids,
        attention masks, word pieces, origin word indices and per-piece lemmas.
        Sequences are padded per batch, so lengths may differ between batches.

    Raises:
        AssertionError: if forms and lemmas do not hold the same number of sentences.
    """
    all_input_ids = []
    all_attention_masks = []
    all_sentence_tokens = []
    all_origin_indices = []
    all_subtoken_lemmas = []

    # Flatten the corpus into one list of sentences for forms and one for lemmas.
    sentences_forms = [sentence for text in corpus["forms"].values() for sentence in text]
    sentences_lemmas = [sentence for text in corpus["lemmas"].values() for sentence in text]

    # The original statement ended with a dangling comma, which is a SyntaxError.
    assert len(sentences_forms) == len(sentences_lemmas), (
        f"Forms/lemmas sentence count mismatch: {len(sentences_forms)} != {len(sentences_lemmas)}"
    )

    batches = [
        (sentences_forms[i:i + batch_size], sentences_lemmas[i:i + batch_size], tokenizer)
        for i in range(0, len(sentences_forms), batch_size)
    ]

    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        # imap (ordered) instead of imap_unordered: results come back in batch
        # order, so the output lists follow the corpus order and are identical
        # from one run to the next.
        results = pool.imap(tokenize_batch, batches)
        for input_ids, attention_masks, _batch_sents, batch_tokens, batch_origin_inds, batch_lemmas in tqdm(results, total=len(batches), desc="Tokenizing sentences"):
            all_input_ids.extend(input_ids)
            all_attention_masks.extend(attention_masks)
            all_sentence_tokens.extend(batch_tokens)
            all_origin_indices.extend(batch_origin_inds)
            all_subtoken_lemmas.extend(batch_lemmas)

    return all_input_ids, all_attention_masks, all_sentence_tokens, all_origin_indices, all_subtoken_lemmas

In [None]:
greek_all_input_ids, greek_all_attention_masks, greek_all_sentences_tokens, greek_all_origin_indices, greek_all_subtoken_lemmas = batched_tokenization(filtered_grc_corpus, tokenizer, batch_size=512)

In [None]:
latin_all_input_ids, latin_all_attention_masks, latin_all_sentences_tokens, latin_all_origin_indices, latin_all_subtoken_lemmas = batched_tokenization(filtered_lat_corpus, tokenizer, batch_size=512)

Vérification de la tokénisation

In [None]:
# Show the first five entries of each tokenization output.
print("Sample input IDs:", greek_all_input_ids[:5])
print("Sample attention masks:", greek_all_attention_masks[:5])
print("Sample sentence tokens:", greek_all_sentences_tokens[:5])
# This line prints the per-subtoken lemmas; its label used to repeat "sentence tokens".
print("Sample subtoken lemmas:", greek_all_subtoken_lemmas[:5])

Sample input IDs: [[101, 479, 10781, 12649, 58317, 468, 15233, 17198, 31712, 480, 13140, 17762, 484, 31712, 472, 21263, 31712, 10649, 465, 14669, 29613, 467, 99509, 23788, 12649, 465, 31625, 53428, 19491, 44306, 10484, 483, 14669, 87276, 58281, 12649, 469, 29223, 12526, 10487, 480, 19582, 29223, 14669, 27393, 19038, 12649, 10356, 475, 15233, 70076, 15860, 117, 468, 12526, 110568, 16099, 479, 14669, 63444, 485, 34359, 16146, 35790, 51650, 480, 22360, 15751, 484, 31712, 465, 14669, 70076, 117, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 10487, 484, 21263, 465, 43140, 62913, 33947, 10358, 10487, 474, 13140, 20660, 12649, 10358, 10487, 471, 84236, 14669, 117, 480, 60846, 15233, 27835, 10484, 102, 0, 0, 

Sauvegarde

In [None]:
with open("/content/drive/MyDrive/embeddings_save/greek_all_input_ids_512.pkl", "wb") as f:
    pickle.dump(greek_all_input_ids, f)
with open("/content/drive/MyDrive/embeddings_save/greek_attention_masks_512.pkl", "wb") as f:
    pickle.dump(greek_all_attention_masks, f)
with open("/content/drive/MyDrive/embeddings_save/greek_all_sentences_tokens_512.pkl", "wb") as f:
    pickle.dump(greek_all_sentences_tokens, f)
with open("/content/drive/MyDrive/embeddings_save/greek_all_origin_indices_512.pkl", "wb") as f:
    pickle.dump(greek_all_origin_indices, f)
with open("/content/drive/MyDrive/embeddings_save/greek_all_subtoken_lemmas_512.pkl", "wb") as f:
    pickle.dump(greek_all_subtoken_lemmas, f)

In [None]:
with open("/content/drive/MyDrive/embeddings_save/latin_all_input_ids_512.pkl", "wb") as f:
    pickle.dump(latin_all_input_ids, f)
with open("/content/drive/MyDrive/embeddings_save/latin_attention_masks_512.pkl", "wb") as f:
    pickle.dump(latin_all_attention_masks, f)
with open("/content/drive/MyDrive/embeddings_save/latin_all_sentences_tokens_512.pkl", "wb") as f:
    pickle.dump(latin_all_sentences_tokens, f)
with open("/content/drive/MyDrive/embeddings_save/latin_all_origin_indices_512.pkl", "wb") as f:
    pickle.dump(latin_all_origin_indices, f)
with open("/content/drive/MyDrive/embeddings_save/latin_all_subtoken_lemmas_512.pkl", "wb") as f:
    pickle.dump(latin_all_subtoken_lemmas, f)

In [None]:
# Reload every Greek tokenization output saved above. The origin indices are
# included because the embedding cell passes `greek_all_origin_indices`; without
# this reload a fresh kernel fails with NameError.
with open("/content/drive/MyDrive/embeddings_save/greek_all_input_ids_512.pkl", "rb") as f:
    greek_all_input_ids = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/greek_attention_masks_512.pkl", "rb") as f:
    greek_all_attention_masks = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/greek_all_sentences_tokens_512.pkl", "rb") as f:
    greek_all_sentences_tokens = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/greek_all_origin_indices_512.pkl", "rb") as f:
    greek_all_origin_indices = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/greek_all_subtoken_lemmas_512.pkl", "rb") as f:
    greek_all_subtoken_lemmas = pickle.load(f)

In [None]:
# Reload every Latin tokenization output saved above. The origin indices are
# included because the embedding cell passes `latin_all_origin_indices`; without
# this reload a fresh kernel fails with NameError.
with open("/content/drive/MyDrive/embeddings_save/latin_all_input_ids_512.pkl", "rb") as f:
    latin_all_input_ids = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/latin_attention_masks_512.pkl", "rb") as f:
    latin_all_attention_masks = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/latin_all_sentences_tokens_512.pkl", "rb") as f:
    latin_all_sentences_tokens = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/latin_all_origin_indices_512.pkl", "rb") as f:
    latin_all_origin_indices = pickle.load(f)
with open("/content/drive/MyDrive/embeddings_save/latin_all_subtoken_lemmas_512.pkl", "rb") as f:
    latin_all_subtoken_lemmas = pickle.load(f)

## Calcul des vecteurs

Fonction de pondération gaussienne utilisée lors du moyennage des subtokens : les positions centrales reçoivent plus de poids que les positions périphériques (dans le code ci-dessous, la position est calculée sur l'ensemble des subtokens de la phrase).
<br>Pour calculer les vecteurs sans cette pondération, il faut appeler la fonction `get_contextual_embeddings_without_weights`.

In [None]:
def compute_weights(length):
    """Return Gaussian weights over ``length`` positions, peaking in the middle.

    Args:
        length: number of positions (0, 1, ..., length - 1).

    Returns:
        A 1-D float array of ``length`` weights, symmetric around the centre of
        the index range and normalised to sum to 1.
    """
    positions = np.arange(length)

    # Centre of the index range 0 .. length-1. Using length / 2 instead shifts
    # the peak half a position to the right and makes the weights asymmetric
    # (for length 2 almost all the weight landed on the second position).
    mid_position = (length - 1) / 2.0

    # Scale controlling the width of the Gaussian.
    sigma = length / 6.0

    weights = np.exp(-(positions - mid_position)**2 / (2 * sigma**2))

    # Normalise so the weights sum to 1.
    weights /= weights.sum()

    return weights

In [None]:
def get_contextual_embeddings_without_weights(processed_texts, model, tokenizer, device, all_input_ids, all_attention_masks, all_origin_indices, all_sentences_tokens, all_subtoken_lemmas, batch_size=512):
    """Average the contextual embeddings of every lemma, one sentence at a time.

    Sentences are visited in the order of ``processed_texts`` and the i-th
    sentence is paired with the i-th entry of the ``all_*`` lists.

    Args:
        processed_texts: ``{"forms": {filename: [sentence, ...]}, "lemmas": {...}}``.
        model: callable returning an object with ``last_hidden_state``.
        tokenizer: unused, kept for interface compatibility.
        device: device the input tensors are moved to.
        all_input_ids, all_attention_masks: per-sentence encodings; position 0
            is expected to be ``[CLS]`` and the last attended position ``[SEP]``,
            as produced by ``tokenize_batch``.
        all_origin_indices: unused, kept for interface compatibility.
        all_sentences_tokens, all_subtoken_lemmas: per-sentence word pieces
            (without special tokens) and the lemma attached to each piece.
        batch_size: unused, kept for interface compatibility.

    Returns:
        ``{lemma: mean embedding (numpy array)}``.
    """
    model.eval()
    lemma_embeddings_sum = {}
    lemma_token_counts = {}

    current_idx = 0
    for filename, forms_sentences in tqdm(processed_texts["forms"].items(), desc="Processing files"):
        lemmas_sentences = processed_texts["lemmas"][filename]

        for forms_sentence, lemmas_sentence in zip(forms_sentences, lemmas_sentences):
            assert len(forms_sentence) == len(lemmas_sentence), f"Forms and lemmas length mismatch in file {filename}."

            input_ids = all_input_ids[current_idx]
            attention_masks = all_attention_masks[current_idx]
            sentence_tokens = all_sentences_tokens[current_idx]
            subtoken_lemmas = all_subtoken_lemmas[current_idx]  # lemma of each word piece

            input_tensor = torch.tensor(input_ids).unsqueeze(0).to(device)
            attention_masks_tensor = torch.tensor(attention_masks).unsqueeze(0).to(device)

            with torch.no_grad():
                outputs = model(input_tensor, attention_mask=attention_masks_tensor)
            # One device-to-host transfer per sentence instead of one per token.
            hidden_states = outputs.last_hidden_state[0].cpu().numpy()

            # Number of real word pieces in the encoded sequence: attended
            # positions minus [CLS] and [SEP]. Pieces beyond that were truncated
            # by the tokenizer and have no hidden state of their own.
            usable = max(int(sum(attention_masks)) - 2, 0)

            for token_idx, (token, lemma) in enumerate(zip(sentence_tokens[:usable], subtoken_lemmas)):
                if lemma not in lemma_embeddings_sum:
                    lemma_embeddings_sum[lemma] = np.zeros(hidden_states.shape[1], dtype=np.float32)
                    lemma_token_counts[lemma] = 0

                # +1 skips [CLS]: the token list has no special tokens, whereas
                # the hidden states start with the [CLS] position.
                lemma_embeddings_sum[lemma] += hidden_states[token_idx + 1]
                lemma_token_counts[lemma] += 1

            current_idx += 1

    averaged_embeddings = {lemma: lemma_embeddings_sum[lemma] / lemma_token_counts[lemma] for lemma in lemma_embeddings_sum.keys()}
    return averaged_embeddings

In [None]:
from torch.nn.utils.rnn import pad_sequence

def get_contextual_embeddings_with_weights(processed_texts, model, tokenizer, device, all_input_ids, all_attention_masks, all_origin_indices, all_sentences_tokens, all_subtoken_lemmas, batch_size=256):
    """Compute a weighted average embedding for every lemma, batch by batch.

    Each word piece contributes its hidden state multiplied by the weight that
    ``compute_weights`` assigns to its position among the word pieces of the
    sentence; the sum is divided by the accumulated weights of the lemma.

    Args:
        processed_texts: unused, kept for interface compatibility.
        model: callable returning an object with ``last_hidden_state``.
        tokenizer: unused, kept for interface compatibility.
        device: device the input tensors are moved to.
        all_input_ids, all_attention_masks: per-sentence encodings; position 0
            is expected to be ``[CLS]`` and the last attended position ``[SEP]``,
            as produced by ``tokenize_batch``. Sequences of different lengths
            are zero-padded inside each batch.
        all_origin_indices: unused, kept for interface compatibility.
        all_sentences_tokens, all_subtoken_lemmas: per-sentence word pieces
            (without special tokens) and the lemma attached to each piece.
        batch_size: number of sentences sent through the model at once.

    Returns:
        ``{lemma: weighted mean embedding (numpy array)}``.
    """
    model.eval()
    lemma_embeddings_sum = {}
    lemma_token_counts = {}

    total_sentences = len(all_input_ids)

    for start_idx in tqdm(range(0, total_sentences, batch_size), desc="Processing batches"):
        end_idx = min(start_idx + batch_size, total_sentences)

        batch_input_ids = all_input_ids[start_idx:end_idx]
        batch_attention_masks = all_attention_masks[start_idx:end_idx]
        batch_sentences_tokens = all_sentences_tokens[start_idx:end_idx]
        batch_subtoken_lemmas = all_subtoken_lemmas[start_idx:end_idx]

        # Pad the sequences of the batch to a common length (padding value 0).
        input_tensor = pad_sequence([torch.tensor(seq) for seq in batch_input_ids], batch_first=True).to(device)
        attention_masks_tensor = pad_sequence([torch.tensor(mask) for mask in batch_attention_masks], batch_first=True).to(device)

        with torch.no_grad():
            outputs = model(input_tensor, attention_mask=attention_masks_tensor)
            batch_hidden_states = outputs.last_hidden_state

        for idx, sentence_hidden in enumerate(batch_hidden_states):
            # One device-to-host transfer per sentence instead of one per token.
            hidden_states = sentence_hidden.cpu().numpy()
            sentence_tokens = batch_sentences_tokens[idx]
            weights = compute_weights(len(sentence_tokens))

            # Number of real word pieces in the encoded sequence: attended
            # positions minus [CLS] and [SEP]. Pieces beyond that were truncated
            # by the tokenizer and have no hidden state of their own.
            usable = max(int(sum(batch_attention_masks[idx])) - 2, 0)

            for token_idx, (token, lemma) in enumerate(zip(sentence_tokens[:usable], batch_subtoken_lemmas[idx])):
                if lemma not in lemma_embeddings_sum:
                    lemma_embeddings_sum[lemma] = np.zeros(hidden_states.shape[1], dtype=np.float32)
                    lemma_token_counts[lemma] = 0

                # +1 skips [CLS]: the token list has no special tokens, whereas
                # the hidden states start with the [CLS] position.
                lemma_embeddings_sum[lemma] += weights[token_idx] * hidden_states[token_idx + 1]
                lemma_token_counts[lemma] += weights[token_idx]

    averaged_embeddings = {lemma: lemma_embeddings_sum[lemma] / lemma_token_counts[lemma] for lemma in lemma_embeddings_sum.keys()}
    return averaged_embeddings


In [None]:
philo_greek_embeddings = get_contextual_embeddings_with_weights(
    processed_texts=filtered_grc_corpus,
    model=model,
    tokenizer=tokenizer,
    device=device,
    all_input_ids=greek_all_input_ids,
    all_attention_masks=greek_all_attention_masks,
    all_sentences_tokens = greek_all_sentences_tokens,
    all_origin_indices=greek_all_origin_indices,
    all_subtoken_lemmas=greek_all_subtoken_lemmas
)

In [None]:
philo_latin_embeddings = get_contextual_embeddings_with_weights(
    processed_texts=filtered_lat_corpus,
    model=model,
    tokenizer=tokenizer,
    device=device,
    all_input_ids=latin_all_input_ids,
    all_attention_masks=latin_all_attention_masks,
    all_sentences_tokens = latin_all_sentences_tokens,
    all_origin_indices=latin_all_origin_indices,
    all_subtoken_lemmas=latin_all_subtoken_lemmas
)

Vérification

In [None]:
import random

random_lemmas = random.sample(list(philo_greek_embeddings.keys()), 10)
for lemma in random_lemmas:
    print(lemma, philo_greek_embeddings[lemma][:5])

Sauvegarde

In [None]:
with open('/content/drive/MyDrive/embeddings_save/philo_greek_embeddings.pkl', 'wb') as file:
    pickle.dump(philo_greek_embeddings, file)

In [None]:
with open('/content/drive/MyDrive/embeddings_save/philo_latin_embeddings.pkl', 'wb') as file:
    pickle.dump(philo_latin_embeddings, file)

## Postprocessing sur les embeddings

Téléchargement des stopwords. Il s'agit ici d'un dépôt public, mais les listes sont celles proposées par CLTK.

In [None]:
!gdown --id 1MZ4ld8j30ye3YGYy-T7V3Cyy1c4dlGks

In [None]:
!gdown --id 161g7Kdv4PCFp2iYJAlMbHkb-CGstsNcu

In [None]:
with open("/content/stopwords_gk.txt", encoding = "utf8") as stop_file:
  stopwords_diacritics=stop_file.read().split("\n")
stopwords_greek=set(stopwords_diacritics)

In [None]:
with open("/content/stopwords_lat.txt", encoding = "utf8") as stop_file:
  stopwords_diacritics=stop_file.read().split("\n")
stopwords_latin=set(stopwords_diacritics)

Suppression des embeddings de mots outils

In [None]:
no_stop_greek_embeddings = {k: v for k, v in philo_greek_embeddings.items() if k not in stopwords_greek}

In [None]:
no_stop_latin_embeddings = {k: v for k, v in philo_latin_embeddings.items() if k not in stopwords_latin}

Conservation des embeddings des 10000 mots les plus fréquents (pour la représentation graphique --> n'impacte pas le calcul).

In [None]:
from collections import Counter

# 1. Count how often each lemma occurs in the whole split Greek corpus.
frequency_counter = Counter(
    lemma
    for lemmatized_sentences in split_greek_corpus["lemmas"].values()
    for lemmatized_text in lemmatized_sentences
    for lemma in lemmatized_text
)

# 2. Lemmas ordered from most to least frequent.
sorted_lemmas = [lemma for lemma, _count in frequency_counter.most_common()]

# 3. The 10000 most frequent lemmas (kept as a set for membership tests).
top_greek_lemmas = set(sorted_lemmas[:10000])

# 4. Keep the embeddings of those lemmas that survived stop-word removal.
#    Iterating over the frequency-ordered list rather than the set gives the
#    resulting dict — and the TSV rows written from it — the same order on every
#    run; set iteration order for strings changes between interpreter sessions.
greek_filtered_embeddings = {
    lemma: no_stop_greek_embeddings[lemma]
    for lemma in sorted_lemmas[:10000]
    if lemma in no_stop_greek_embeddings
}

In [None]:
# 1. Count how often each lemma occurs in the whole split Latin corpus.
frequency_counter = Counter(
    lemma
    for lemmatized_sentences in split_latin_corpus["lemmas"].values()
    for lemmatized_text in lemmatized_sentences
    for lemma in lemmatized_text
)

# 2. Lemmas ordered from most to least frequent.
sorted_lemmas = [lemma for lemma, _count in frequency_counter.most_common()]

# 3. The 10000 most frequent lemmas (kept as a set for membership tests).
top_latin_lemmas = set(sorted_lemmas[:10000])

# 4. Keep the embeddings of those lemmas that survived stop-word removal.
#    Iterating over the frequency-ordered list rather than the set gives the
#    resulting dict — and the TSV rows written from it — the same order on every
#    run; set iteration order for strings changes between interpreter sessions.
latin_filtered_embeddings = {
    lemma: no_stop_latin_embeddings[lemma]
    for lemma in sorted_lemmas[:10000]
    if lemma in no_stop_latin_embeddings
}

Sauvegarde dans un fichier de vecteurs compatible tensorflow (embeddings projector)

In [None]:
# Extract the lemmas and their embeddings
# Merge the Latin and Greek embeddings into one table. If the same lemma string
# exists in both dictionaries, the Greek vector replaces the Latin one.
all_embeddings = {**latin_filtered_embeddings, **greek_filtered_embeddings}

# One tab-separated vector per line. The encoding is given explicitly so the
# output does not depend on the platform's default.
with open('vectors.tsv', 'w', encoding='utf-8') as f_vectors:
    for embedding in all_embeddings.values():
        f_vectors.write('\t'.join(str(value) for value in embedding) + '\n')

# One lemma per line, in the same order as the vectors. The lemmas contain Greek
# characters, which a non-UTF-8 default encoding could fail to write.
with open('metadata.tsv', 'w', encoding='utf-8') as f_metadata:
    for lemma in all_embeddings:
        f_metadata.write(lemma + '\n')

Calcul de la similarité cosinus entre les embeddings grecs et latins

In [None]:
# NOTE(review): `reduced_latin_dict` and `reduced_greek_dict` are not assigned
# anywhere in the visible notebook, so this cell raised NameError on a fresh
# kernel. They are bound here to the filtered embeddings built above —
# presumably what was intended; confirm (a dimensionality-reduction step may
# have been meant to produce them).
reduced_latin_dict = latin_filtered_embeddings
reduced_greek_dict = greek_filtered_embeddings

similarities = np.zeros((len(reduced_latin_dict), len(reduced_greek_dict)))

def find_most_similar(target_embedding, embeddings_dict, top_n=10):
    """Rank the entries of ``embeddings_dict`` by cosine similarity to a vector.

    Args:
        target_embedding: 1-D vector to compare against.
        embeddings_dict: mapping ``word -> embedding`` of the candidates.
        top_n: number of best matches to return (default 10, the value that
            used to be hard-coded).

    Returns:
        A list of at most ``top_n`` ``(word, similarity)`` pairs, most similar
        first; an empty list when ``embeddings_dict`` is empty.
    """
    scores = {
        word: cosine_similarity([target_embedding], [embedding])[0][0]
        for word, embedding in embeddings_dict.items()
    }
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return ranked[:top_n]

In [None]:
greek_word = "δόξα"
greek_embedding = reduced_greek_dict[greek_word]

# Rank the Latin lemmas against the Greek query word. The call previously
# searched `reduced_greek_dict`, which returns Greek neighbours (the query word
# itself first) instead of the Latin ones this result is named after.
top_latin_words_with_scores = find_most_similar(greek_embedding, reduced_latin_dict)

for word, score in top_latin_words_with_scores:
    print(f"{word}: {score:.4f}")