In [247]:
import pprint
import os
from conll3 import *

In [248]:
def convert_misc_to_dict(misc):
    """
    Parse a "|"-separated list of "Key=Value" items into a dictionary.

    The callers in this notebook pass the "misc" field of a token.

    Parameters:
    misc (str): raw string such as "Syl1=de|Syl1SlopeGlo=Rise". An empty
        string or None yields an empty dictionary.

    Returns:
    dict: each key mapped to its value (both str). Items that contain no
    "=" (an empty item, or the "_" placeholder) are skipped. When a key is
    repeated, the last value wins.
    """
    result_dict = {}
    if not misc:
        return result_dict
    for pair in misc.split("|"):
        # Split on the first "=" only, so a value may itself contain "=".
        key, separator, value = pair.partition("=")
        if not separator:
            # Not a Key=Value item; nothing to record.
            continue
        result_dict[key] = value
    return result_dict

In [249]:
def extract_occ_homophone_mono(conllu_path:str):
    """
    Count adjacent pairs of monosyllabic homophone tokens in a CoNLL-U file.

    Two consecutive tokens are counted when all of the following hold:
    - both have the same "Syl1" value in their "misc" field,
    - their "tag" values differ and neither is "PUNCT",
    - neither raw "misc" string contains "Syl2" (plain substring test),
    - the shared "Syl1" value is non-empty and is neither "_" nor "FUSED".

    Parameters:
    conllu_path (str): path of the CoNLL-U file to scan.

    Returns:
    dict: maps "form1-form2" to a dict that maps "/syl/-/syl/" to the number
    of times that pair of forms was seen with that pronunciation.
    """
    occurences = {}
    for tree in conllFile2trees(conllu_path):
        # The original code stringified every tree before reading tree.words.
        # The call is kept in case conll3 fills tree.words as a side effect of
        # str() -- TODO confirm against conll3, then drop it.
        str(tree)

        token_count = len(tree.words)
        # NOTE(review): if tokens are keyed 1..len(tree.words), these bounds
        # never use the last token as the second element of a pair -- confirm
        # against conll3 before changing.
        for index in range(1, token_count):
            first = tree[index]
            form_T1 = first.get("t", "_")
            pos_T1 = first.get("tag", "_")
            misc_T1 = first.get("misc", "")
            syl_T1 = convert_misc_to_dict(misc_T1).get("Syl1", "_")

            idx = index + 1
            if idx >= token_count:
                continue

            second = tree[idx]
            form_T2 = second.get("t", "_")
            pos_T2 = second.get("tag", "_")
            misc_T2 = second.get("misc", "")
            syl_T2 = convert_misc_to_dict(misc_T2).get("Syl1", "_")

            # Homophones: same first syllable, different part of speech.
            if syl_T1 != syl_T2 or pos_T1 == pos_T2:
                continue
            # Monosyllabic only: no second syllable recorded on either token.
            if "Syl2" in misc_T1 or "Syl2" in misc_T2:
                continue
            # syl_T1 == syl_T2 here, so validating one value covers both.
            if not syl_T1 or syl_T1 in ("_", "FUSED"):
                continue
            if pos_T1 == "PUNCT" or pos_T2 == "PUNCT":
                continue

            key = f"{form_T1}-{form_T2}"
            sub_key = f"/{syl_T1}/-/{syl_T2}/"
            per_pair = occurences.setdefault(key, {})
            per_pair[sub_key] = per_pair.get(sub_key, 0) + 1

    return occurences


In [250]:
def extract_occ_homophone_bi(conllu_path:str):
    """
    Count adjacent pairs of disyllabic homophone tokens in a CoNLL-U file.

    The pronunciation of a token is its "Syl1" value immediately followed by
    its "Syl2" value, both read from the "misc" field. Two consecutive tokens
    are counted when all of the following hold:
    - their pronunciations are identical,
    - their "tag" values differ and neither is "PUNCT",
    - both raw "misc" strings contain "Syl2" and neither contains "Syl3"
      (plain substring tests),
    - the shared pronunciation contains neither "_" nor "FUSED".

    Parameters:
    conllu_path (str): path of the CoNLL-U file to scan.

    Returns:
    dict: maps "form1-form2" to a dict that maps "/pron/-/pron/" to the number
    of times that pair of forms was seen with that pronunciation.
    """
    occurences = {}
    for tree in conllFile2trees(conllu_path):
        # The original code stringified every tree before reading tree.words.
        # The call is kept in case conll3 fills tree.words as a side effect of
        # str() -- TODO confirm against conll3, then drop it.
        str(tree)

        token_count = len(tree.words)
        # NOTE(review): if tokens are keyed 1..len(tree.words), these bounds
        # never use the last token as the second element of a pair -- confirm
        # against conll3 before changing.
        for index in range(1, token_count):
            first = tree[index]
            form_T1 = first.get("t", "_")
            pos_T1 = first.get("tag", "_")
            misc_T1 = first.get("misc", "")
            misc_T1_dict = convert_misc_to_dict(misc_T1)
            syl1_T1 = misc_T1_dict.get("Syl1", "_")
            syl2_T1 = misc_T1_dict.get("Syl2", "_")
            prononciation = f"{syl1_T1}{syl2_T1}"

            idx = index + 1
            if idx >= token_count:
                continue

            second = tree[idx]
            form_T2 = second.get("t", "_")
            pos_T2 = second.get("tag", "_")
            misc_T2 = second.get("misc", "")
            misc_T2_dict = convert_misc_to_dict(misc_T2)
            syl1_T2 = misc_T2_dict.get("Syl1", "_")
            syl2_T2 = misc_T2_dict.get("Syl2", "_")
            prononciation2 = f"{syl1_T2}{syl2_T2}"

            # Homophones: same two-syllable string, different part of speech.
            if prononciation != prononciation2 or pos_T1 == pos_T2:
                continue
            # Exactly two syllables: Syl2 present on both, Syl3 on neither.
            if "Syl2" not in misc_T1 or "Syl2" not in misc_T2:
                continue
            if "Syl3" in misc_T1 or "Syl3" in misc_T2:
                continue
            # Both pronunciations are equal here, so checking one covers both.
            if "_" in prononciation or "FUSED" in prononciation:
                continue
            if pos_T1 == "PUNCT" or pos_T2 == "PUNCT":
                continue

            key = f"{form_T1}-{form_T2}"
            sub_key = f"/{prononciation}/-/{prononciation2}/"
            per_pair = occurences.setdefault(key, {})
            per_pair[sub_key] = per_pair.get(sub_key, 0) + 1

    return occurences


In [251]:
def extract_all_occurrences_obj(directory_path, min_count=2):
    """
    Aggregate homophone-pair counts over every matching file under a directory.

    The directory is walked recursively, without descending into any
    sub-directory named "non_gold". Each file whose name ends with ".conllu"
    and contains "MG" is passed to extract_occ_homophone_mono and then to
    extract_occ_homophone_bi; the counts they return are summed per pair and
    per pronunciation.

    Parameters:
    directory_path (str): root directory to walk.
    min_count (int): smallest total a pronunciation needs to be kept. The
        default of 2 drops pronunciations seen only once.

    Returns:
    dict: maps "form1-form2" to a dict of pronunciation -> total count,
    restricted to totals >= min_count and ordered by decreasing count. Pairs
    left with no pronunciation are omitted.
    """
    occurrences = {}
    for root, dirs, files in os.walk(directory_path):
        # Removing the entry in place stops os.walk from descending into it.
        if "non_gold" in dirs:
            dirs.remove("non_gold")
        for file in files:
            if not (file.endswith(".conllu") and "MG" in file):
                continue
            file_path = os.path.join(root, file)
            # Mono first, then bi: same merge order as running them one after
            # the other on each file.
            for extractor in (extract_occ_homophone_mono, extract_occ_homophone_bi):
                for key, sub_dict in extractor(file_path).items():
                    # Merge into a dict owned by the accumulator rather than
                    # storing (and later mutating) the extractor's own dict.
                    merged = occurrences.setdefault(key, {})
                    for sub_key, count in sub_dict.items():
                        merged[sub_key] = merged.get(sub_key, 0) + count

    filtered_sorted_occurrences = {}
    for key, sub_dict in occurrences.items():
        kept = [item for item in sub_dict.items() if item[1] >= min_count]
        if kept:  # skip pairs with nothing left after filtering
            # sorted() is stable, so equal counts keep their first-seen order.
            kept.sort(key=lambda item: item[1], reverse=True)
            filtered_sorted_occurrences[key] = dict(kept)
    return filtered_sorted_occurrences

In [252]:
# Corpus root. Set the NAIJA_CORPUS_DIR environment variable to run this
# notebook on another machine; the original local path remains the default.
directory_path = os.environ.get(
    "NAIJA_CORPUS_DIR",
    "/Users/perrine/Desktop/Stage_2023-2024/SUD_Naija-NSC-master/",
)

# os.walk yields nothing for a missing directory, which would print an empty
# result followed by the success message; fail loudly instead.
if not os.path.isdir(directory_path):
    raise FileNotFoundError(f"Corpus directory not found: {directory_path}")

# Collect the homophone-pair counts from every matching file under the root
homophones = extract_all_occurrences_obj(directory_path)
pprint.pp(homophones)
print("\nhomophones extraits !")

{'de-dey': {'/de/-/de/': 67, '/dE/-/dE/': 3},
 'no-know': {'/no/-/no/': 65},
 'for-four': {'/fO/-/fO/': 2},
 'go-go': {'/go/-/go/': 65},
 'dey-dere': {'/de/-/de/': 2},
 'dere-dey': {'/de/-/de/': 2},
 'way-wey': {'/we/-/we/': 13},
 'dey-dey': {'/de/-/de/': 12},
 'con-come': {'/kO~/-/kO~/': 6},
 'ah-I': {'/a/-/a/': 3},
 'fall-for': {'/fO/-/fO/': 7},
 'dem-dey': {'/de/-/de/': 5},
 'now-na': {'/na/-/na/': 2},
 'come-con': {'/kO~/-/kO~/': 2},
 'none-of': {'/nO/-/nO/': 2},
 'we-will': {'/wi/-/wi/': 2}}

homophones extraits !
