# Setup

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Project root. The default is the original author's machine; set the
# WOLOF_PROJECT_DIR environment variable to run the notebook elsewhere
# without editing this cell.
path = os.environ.get(
    "WOLOF_PROJECT_DIR",
    r"C:\Users\maron\OneDrive\02-Documents\03.PROJETS\00.INFORMATIQUE\02.AI\WOLOF",
)
# The CSV paths read later in the notebook are relative, so the working
# directory has to be the project root.
os.chdir(path)

# Functions

In [44]:
def compare_two_dataframes(df1, df2, d1, d2):
    """Summarise how the 'transcription' columns of two dataframes overlap.

    Each column is reduced to the set of its distinct, non-missing values, so
    repeated sentences inside one dataframe count once and missing
    transcriptions (NaN/None) are never reported as a shared sentence.

    Args:
        df1: First dataframe; must contain a 'transcription' column.
        df2: Second dataframe; must contain a 'transcription' column.
        d1: Label for ``df1``, embedded in the result keys.
        d2: Label for ``df2``, embedded in the result keys.

    Returns:
        dict with the two labels, the number of shared sentences, the number
        of sentences unique to each side, and two subset flags.
    """
    # dropna() keeps missing values out of the sets: a NaN/None present in
    # both columns would otherwise be counted as a common sentence.
    set1 = set(df1['transcription'].dropna())
    set2 = set(df2['transcription'].dropna())

    shared_sentences = set1 & set2
    unique_to_set1 = set1 - set2
    unique_to_set2 = set2 - set1

    return {
        'df1': d1,
        'df2': d2,
        'shared_sentences': len(shared_sentences),
        f'unique to {d1}': len(unique_to_set1),
        f'unique to {d2}': len(unique_to_set2),
        f'is {d1} in {d2}': set1.issubset(set2),
        f'is {d2} in {d1}': set2.issubset(set1),
    }

In [45]:
def dataset_name(dfs, i):
    """Return the dataset label that corresponds to position ``i``.

    ``dfs`` is accepted for signature compatibility but is not used.
    Positions 0 through 7 map to a fixed label; any other value yields
    ``None``.
    """
    labels_by_position = {
        0: 'zenodo',
        1: 'wol_alf',
        2: 'waxal_git',
        3: 'waxal',
        4: 'serge',
        5: 'google_fleurs',
        6: 'alffa_git',
        7: 'alffa',
    }
    return labels_by_position.get(i)


In [46]:
def compare_dataframes(dfs):
    """Compare every unordered pair of dataframes in ``dfs``.

    Pairs are visited in index order ((0, 1), (0, 2), ..., (1, 2), ...).
    Each pair is summarised by ``compare_two_dataframes`` and labelled with
    the names ``dataset_name`` gives for the two positions.

    Returns:
        list of per-pair result dicts, one per pair, in visiting order.
    """
    total = len(dfs)
    return [
        compare_two_dataframes(
            dfs[first], dfs[second],
            dataset_name(dfs, first), dataset_name(dfs, second),
        )
        for first in range(total)
        for second in range(first + 1, total)
    ]

# Loading datasets with sentences

In [20]:
# Root folder of the cleaned audio/transcription CSVs, relative to the working
# directory. os.path.join is used instead of hard-coded backslashes so the
# same cell also resolves on non-Windows systems.
CLEANED_DIR = os.path.join("SPEECH_TO_TEXT", "DATA", "CLEANED", "WOLOF_AUDIO_TRANS")

alffa = pd.read_csv(os.path.join(CLEANED_DIR, "alffa", "alffa_clean_df.csv"))
alffa_git = pd.read_csv(os.path.join(CLEANED_DIR, "alffa_git", "alffa_git_clean.csv"))
google_fleurs = pd.read_csv(os.path.join(CLEANED_DIR, "google_fleurs", "google_fleurs_clean.csv"))
serge = pd.read_csv(os.path.join(CLEANED_DIR, "serge", "serge_clean_df.csv"))
# NOTE(review): the double underscore in this file name is kept as-is; confirm it matches the file on disk.
waxal = pd.read_csv(os.path.join(CLEANED_DIR, "waxal", "waxal__clean_df.csv"))
waxal_git = pd.read_csv(os.path.join(CLEANED_DIR, "waxal_git", "waxal_git_clean_df.csv"))
wol_alf = pd.read_csv(os.path.join(CLEANED_DIR, "wol_alf", "wol_alf_clean.csv"))
zenodo = pd.read_csv(os.path.join(CLEANED_DIR, "zenodo", "zenodo_cleaned.csv"))

# Detecting duplicate sentences across datasets

In [40]:
# The order matters: dataset_name() labels each dataframe by its index in this list.
dfs = [
    zenodo,
    wol_alf,
    waxal_git,
    waxal,
    serge,
    google_fleurs,
    alffa_git,
    alffa,
]

In [47]:
# One summary dict per pair of dataframes in `dfs`.
result = compare_dataframes(dfs)

In [48]:
# Raw dump of the list of per-pair summary dicts.
print(result)

[{'df1': 'zenodo', 'df2': 'wol_alf', 'shared_sentences': 0, 'unique to zenodo': 33919, 'unique to wol_alf': 992, 'is zenodo in wol_alf': False, 'is wol_alf in zenodo': False}, {'df1': 'zenodo', 'df2': 'waxal_git', 'shared_sentences': 0, 'unique to zenodo': 33919, 'unique to waxal_git': 999, 'is zenodo in waxal_git': False, 'is waxal_git in zenodo': False}, {'df1': 'zenodo', 'df2': 'waxal', 'shared_sentences': 0, 'unique to zenodo': 33919, 'unique to waxal': 2376, 'is zenodo in waxal': False, 'is waxal in zenodo': False}, {'df1': 'zenodo', 'df2': 'serge', 'shared_sentences': 0, 'unique to zenodo': 33919, 'unique to serge': 4989, 'is zenodo in serge': False, 'is serge in zenodo': False}, {'df1': 'zenodo', 'df2': 'google_fleurs', 'shared_sentences': 0, 'unique to zenodo': 33919, 'unique to google_fleurs': 1656, 'is zenodo in google_fleurs': False, 'is google_fleurs in zenodo': False}, {'df1': 'zenodo', 'df2': 'alffa_git', 'shared_sentences': 0, 'unique to zenodo': 33919, 'unique to alffa_

In [51]:
from tabulate import tabulate

# Show each pairwise comparison as its own one-row table, numbered from 1.
for position, comparison in enumerate(result, start=1):
    # Wrapping the dict in a single-row DataFrame lets tabulate use the
    # dict keys as column headers.
    row_frame = pd.DataFrame([comparison])
    rendered = tabulate(row_frame, headers='keys', tablefmt='pretty')

    print(f"Result {position}:")
    print(rendered)

Result 1:
+---+--------+---------+------------------+------------------+-------------------+----------------------+----------------------+
|   |  df1   |   df2   | shared_sentences | unique to zenodo | unique to wol_alf | is zenodo in wol_alf | is wol_alf in zenodo |
+---+--------+---------+------------------+------------------+-------------------+----------------------+----------------------+
| 0 | zenodo | wol_alf |        0         |      33919       |        992        |        False         |        False         |
+---+--------+---------+------------------+------------------+-------------------+----------------------+----------------------+
Result 2:
+---+--------+-----------+------------------+------------------+---------------------+------------------------+------------------------+
|   |  df1   |    df2    | shared_sentences | unique to zenodo | unique to waxal_git | is zenodo in waxal_git | is waxal_git in zenodo |
+---+--------+-----------+------------------+----------------