In [1]:
import json
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List

In [2]:
@dataclass
class Word:
    """A dictionary entry: a written word paired with its phonetic transcription."""

    # Spelling of the word (the key of the entry in the source dictionary).
    word: str
    # Transcription of the pronunciation; presumably IPA wrapped in slashes
    # (e.g. "/azi/") -- TODO confirm against the data file.
    phonetic: str

In [3]:
# https://stackoverflow.com/a/51593236/9655481
def print_full(dataframe):
    """
    Display `dataframe` without truncating rows, columns or cell contents.

    The display options are changed only for the duration of the call:
    `pd.option_context` puts back whatever values were active before (not the
    library defaults), and does so even when rendering raises.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to render.

    Notes
    -----
    Relies on a `display` callable being available in the namespace, as is the
    case inside Jupyter/IPython.
    """
    with pd.option_context(
        "display.max_rows", None,
        "display.max_columns", None,
        "display.width", 2000,
        "display.float_format", "{:20,.2f}".format,
        "display.max_colwidth", None,
    ):
        display(dataframe)

def show_words(words: List[Word]):
    """
    Render `words` as an untruncated table.

    Each word contributes one row; the columns are the attributes of the
    word objects as reported by `vars`.
    """
    rows = list(map(vars, words))
    print_full(pd.DataFrame(rows))

### 🧑‍💻 **Dataset**

In [6]:
def get_all_words(path="../data/ipa/fr_FR.json", locale="fr_FR"):
    """
    Load every entry of a pronunciation dictionary.

    The JSON document is expected to map `locale` to a list whose first
    element is a mapping of spelling -> phonetic transcription.

    Parameters
    ----------
    path : str
        Location of the JSON dictionary. Defaults to the French dictionary.
    locale : str
        Top-level key under which the entries are stored in the file.

    Returns
    -------
    numpy.ndarray
        Array of `Word` objects, one per dictionary entry, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    KeyError, IndexError
        If the document does not have the expected layout.
    """
    # JSON is UTF-8 encoded; do not depend on the platform's default encoding,
    # which would garble the non-ASCII phonetic symbols on some systems.
    with open(path, "r", encoding="utf-8") as f:
        document = json.load(f)
    entries = document[locale][0]
    return np.array(
        [Word(spelling, phonetic) for spelling, phonetic in entries.items()]
    )

def get_random_words(n, seed=None):
    """
    Draw `n` distinct words at random from the full dictionary.

    Parameters
    ----------
    n : int
        Number of words to draw (sampling is without replacement).
    seed : int, optional
        Seed for the random generator. Pass a fixed value to get the same
        sample on every run; the default `None` draws a fresh sample each
        time, as before.

    Returns
    -------
    numpy.ndarray
        Array holding the `n` sampled words.

    Raises
    ------
    ValueError
        If `n` is larger than the number of available words.
    """
    words = get_all_words()
    rng = np.random.default_rng(seed)
    return rng.choice(words, n, replace=False)

In [7]:
# Draw a random sample of 20 dictionary entries and render it as a table.
words = get_random_words(n=20)
show_words(words)

Unnamed: 0,word,phonetic
0,Asie,/azi/
1,désapprennes,/dezapʁɛn/
2,glougloutaient,/gluglutɛ/
3,pénétrant,/penetʁɑ̃/
4,recréèrent,/ʁəkʁeɛʁ/
5,outsiders,/awtsajdœʁ/
6,pondèrera,/pɔ̃dɛʁəʁa/
7,influençâtes,/ɛ̃flyɑ̃sɑt/
8,étêtées,/etete/
9,implora,/ɛ̃plɔʁa/


In [None]:
def distance(w1: Word, w2: Word):
    """
    Defines a metric to determine the distance between two words based on
    their phonetic representation.

    Planned approach: employ the Needleman-Wunsch algorithm to align the two
    phonetic sequences globally and assign a score to the alignment, using a
    customized phonetic similarity matrix to punish or reward certain phonetic
    substitutions.

    NOTE(review): the implementation is unfinished. The body currently only
    reads the two phonetic strings and their lengths and then falls off the
    end, so the function returns None.
    """
    phoen1, phoen2 = w1.phonetic, w2.phonetic
    # NOTE(review): len() counts code points, so a nasal vowel written as a
    # base vowel plus a combining tilde occupies two positions -- confirm
    # this is what the alignment should operate on.
    n, m = len(phoen1), len(phoen2)

    # Initialize the similarity matrix
    # TODO: not implemented yet.
    


In [9]:
def find_unique_phonetic_symbols():
    """
    Collect every distinct character used in the dataset's transcriptions.

    Each word's `phonetic` string is split into its individual characters and
    all of them are merged into one set. Every character counts, so anything
    that is part of the stored string (such as the surrounding slashes) ends
    up in the result as well.
    """
    transcriptions = (entry.phonetic for entry in get_all_words())
    return set().union(*transcriptions)

# Gather the symbol inventory once and print it, two spaces between entries.
unique = find_unique_phonetic_symbols()
print(*unique, sep="  ")

m  ,  ɥ  p  œ  d  ̃     ŋ  ɪ  w  l  z  e  y  j  ʃ  s  ː  ɛ  ø  r  ɔ  t  u  o  ʊ  x  k  ə  i  .  /  g  ʼ  ɲ  b  ɑ  a  v  ʁ  f  ʒ  ɡ  n


In [19]:
# Construct a blank phonetic similarity matrix that can be printed out and
# filled in by hand to assign similarity scores to pairs of phonetic symbols.
# The `unique` variable contains all unique phonetic symbols in the dataset.
#
# `unique` is a set, whose iteration order is not stable between kernel
# sessions; sorting gives the same row/column layout on every run. Every cell
# is filled with a single space directly instead of creating NaNs first and
# replacing them afterwards.
similarity_matrix = pd.DataFrame(" ", index=sorted(unique), columns=sorted(unique))

# Store as HTML (open the file in a browser to print it).
similarity_matrix.to_html("similarity_matrix.html")

In [24]:
# https://easypronunciation.com/en/french-letters-pronunciation-ipa-chart#french_consonants
# https://en.wikipedia.org/wiki/Help:IPA/French

# Symbol inventory for the similarity matrix that is filled out by hand,
# grouped by sound class. The literals are the actual IPA characters; the
# affricates ("dʒ", "tʃ") and the nasal vowels (base vowel followed by the
# combining tilde U+0303) are entries made of two code points.
consonants = [
    "b", "d", "f", "ɡ", "k", "l", "m", "n", "p", "s",
    "t", "v", "z", "ɲ", "ʁ", "ʃ", "ʒ", "dʒ", "tʃ", "ŋ",
]
semi_vowels = ["j", "w", "ɥ"]
oral_vowels = ["a", "e", "i", "o", "u", "y", "ø", "œ", "ɔ", "ə", "ɛ"]
nasal_vowels = ["ɑ̃", "ɔ̃", "ɛ̃", "œ̃"]

all_symbols = consonants + semi_vowels + oral_vowels + nasal_vowels
print(all_symbols)

# Blank square grid with one row and one column per symbol, exported as HTML
# so that it can be printed and filled in.
similarity_matrix = pd.DataFrame("", index=all_symbols, columns=all_symbols)
similarity_matrix.to_html("similarity_matrix.html")

['b', 'd', 'f', 'ɡ', 'k', 'l', 'm', 'n', 'p', 's', 't', 'v', 'z', 'ɲ', 'ʁ', 'ʃ', 'ʒ', 'dʒ', 'tʃ', 'ŋ', 'j', 'w', 'ɥ', 'a', 'e', 'i', 'o', 'u', 'y', 'ø', 'œ', 'ɔ', 'ə', 'ɛ', 'ɑ̃', 'ɔ̃', 'ɛ̃', 'œ̃']
