In [6]:
# https://stackoverflow.com/a/76024851/9655481
# %pip install --upgrade jupyter ipywidgets

In [7]:
import json
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List

In [8]:
@dataclass
class Word:
    word: str
    phonetic: str

In [9]:
# https://stackoverflow.com/a/51593236/9655481
def print_full(dataframe):
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 2000)
    pd.set_option("display.float_format", "{:20,.2f}".format)
    pd.set_option("display.max_colwidth", None)
    display(dataframe)
    pd.reset_option("display.max_rows")
    pd.reset_option("display.max_columns")
    pd.reset_option("display.width")
    pd.reset_option("display.float_format")
    pd.reset_option("display.max_colwidth")

def show_words(words: List[Word]):
    df = pd.DataFrame([vars(word) for word in words])
    print_full(df)

---

### üßë‚Äçüíª **Dataset**

In [10]:
def get_all_words():
    with open("../data/ipa/fr_FR.json", "r") as f:
        data = json.load(f)
    data = data["fr_FR"][0]
    return np.array([Word(key, value) for key, value in data.items()])

def get_random_words(n):
    words = get_all_words()
    rng = np.random.default_rng()
    return rng.choice(words, n, replace=False)

In [11]:
words = get_random_words(20)
show_words(words)

Unnamed: 0,word,phonetic
0,animera,/anim…ô Åa/
1,opticiennes,/…îptisj…õn/
2,calories,/kal…î Åi/
3,impropres,/…õÃÉp Å…îp Å/
4,mus√¢mes,/myzam/
5,t√©l√©phon√©es,/telef…îne/
6,poissent,/pwas/
7,abordiez,/ab…î Ådje/
8,ruisselet,/ Å…•isl…õ/
9,encrassait,/…ëÃÉk Åas…õ/


In [12]:
def find_unique_phonetic_symbols():
    """
    Returns the set of all unique phonetic letters in the dataset.

    Note that one word consists of a sequence of phonetic symbols.
    """
    data = get_all_words()
    phonetic_symbols = set()
    for word in data:
        phonetic_symbols.update(set(word.phonetic))
    return phonetic_symbols

unique = find_unique_phonetic_symbols()
print("  ".join(unique))

n  g  …™  e  s  o   É  x  t  ≈ã  v  Àê  z  √∏  u     y  .  r  l  j  …•  …≤  …ô  b  d  f  /  ÃÉ   Å  …î  p  …õ  k   ä  i   º  ≈ì  m   í  w  ,  a  …ë


In [13]:
# Construct a phonetic similarity matrix that I can print out and fill out by hand
# to assign similarity scores to phonetic symbols. Note that the `unique` variable
# contains all unique phonetic symbols in the dataset.
similarity_matrix = pd.DataFrame(index=list(unique), columns=list(unique))
similarity_matrix = similarity_matrix.fillna(" ")

# store as pdf
similarity_matrix.to_html("similarity_matrix.html")

In [14]:
# https://easypronunciation.com/en/french-letters-pronunciation-ipa-chart#french_consonants
# https://en.wikipedia.org/wiki/Help:IPA/French

# Fill out the similarity matrix by hand
consonants = [
    "b",
    "d",
    "f",
    "g",
    "k",
    "l",
    "m",
    "n",
    "p",
    "s",
    "t",
    "v",
    "z",
    "…≤",
    " Å",
    " É",
    " í",
    "d í",
    "t É",
    "≈ã",
]
semi_vowels = ["j", "w", "…•"]
oral_vowels = ["a", "…ë", "e", "i", "o", "u", "y", "√∏", "≈ì", "…î", "…ô", "…õ"]
nasal_vowels = ["…ëÃÉ", "…îÃÉ", "…õÃÉ", "≈ìÃÉ"]

all_symbols = consonants + semi_vowels + oral_vowels + nasal_vowels
print(all_symbols)

similarity_matrix = pd.DataFrame(index=list(all_symbols), columns=list(all_symbols))
similarity_matrix = similarity_matrix.fillna("")
similarity_matrix.to_html("similarity_matrix.html")

['b', 'd', 'f', 'g', 'k', 'l', 'm', 'n', 'p', 's', 't', 'v', 'z', '…≤', ' Å', ' É', ' í', 'd í', 't É', '≈ã', 'j', 'w', '…•', 'a', '…ë', 'e', 'i', 'o', 'u', 'y', '√∏', '≈ì', '…î', '…ô', '…õ', '…ëÃÉ', '…îÃÉ', '…õÃÉ', '≈ìÃÉ']


---

In [15]:
class SimilarityMatrix:
    """
    Similarity matrix between phonetic symbols.
    Can be indexed by two phonetic symbols to get the similarity score between them.
    """

    def __init__(self, similarity_data):
        all_symbols = self._data_to_symbols(similarity_data)
        self.matrix = pd.DataFrame(index=list(all_symbols), columns=list(all_symbols))

        for (symbol1, symbol2), score in similarity_data.items():
            self.matrix.at[symbol1, symbol2] = score
            self.matrix.at[symbol2, symbol1] = score  # symmetric

        # Error if matrix contains NaN values
        if self.matrix.isnull().values.any():
            raise ValueError(
                "Similarity matrix contains NaN values."
                + " Make sure to fill out all values."
            )

    def _data_to_symbols(self, data):
        symbols = set()
        for (symbol1, symbol2), _score in data.items():
            symbols.add(symbol1)
            symbols.add(symbol2)
        return symbols

    def __getitem__(self, key):
        return self.matrix.at[key[0], key[1]]
    
    def symbols(self):
        return list(self.matrix.columns)


class NeedlemanWunsch:
    """
    Implementation of the Needleman-Wunsch algorithm for global sequence alignment
    score calculation. The algorithm uses a similarity matrix and a linear gap penalty.
    """

    def __init__(self, similarity_matrix, gap_penalty):
        self.similarity_matrix = similarity_matrix
        self.gap_penalty = gap_penalty

    def calculate_score(self, a, b):
        """
        Calculates the alignment score between two sequences a and b.
        """
        matrix = np.zeros((len(a) + 1, len(b) + 1))
        for i in range(len(a) + 1):
            matrix[i, 0] = self.gap_penalty * i
        for j in range(len(b) + 1):
            matrix[0, j] = self.gap_penalty * j
        for i in range(1, len(a) + 1):
            for j in range(1, len(b) + 1):
                match = matrix[i - 1, j - 1] + self.similarity_matrix[a[i-1], b[j-1]]
                delete = matrix[i - 1, j] + self.gap_penalty
                insert = matrix[i, j - 1] + self.gap_penalty
                matrix[i, j] = max(match, delete, insert)
        return matrix[-1, -1]

In [16]:
similarity_data = {
    ("A", "A"): 10,
    ("A", "G"): -1,
    ("A", "C"): -3,
    ("A", "T"): -4,
    ("G", "G"): 7,
    ("G", "C"): -5,
    ("G", "T"): -3,
    ("C", "C"): 9,
    ("C", "T"): 0,
    ("T", "T"): 8,
}

# similarity_data = {
#     ("A", "A"): 1,
#     ("A", "G"): -1,
#     ("A", "C"): -1,
#     ("A", "T"): -1,
#     ("G", "G"): 1,
#     ("G", "C"): -1,
#     ("G", "T"): -1,
#     ("C", "C"): 1,
#     ("C", "T"): -1,
#     ("T", "T"): 1,
# }

similarity_matrix = SimilarityMatrix(similarity_data)
allowed_symbols = similarity_matrix.symbols()
score = NeedlemanWunsch(similarity_matrix, -1).calculate_score("GCATGCG", "GCATGCA")
print(f"Allowed symbols: {allowed_symbols}")
print(f"üåü Score: {score}")

Allowed symbols: ['A', 'G', 'T', 'C']
üåü Score: 49.0


---

In [17]:
allowed_symbols = [
    "b",
    "d",
    "f",
    "g",
    "k",
    "l",
    "m",
    "n",
    "p",
    "s",
    "t",
    "v",
    "z",
    "…≤",
    " Å",
    " É",
    " í",
    "d í",
    "t É",
    "≈ã",
    "j",
    "w",
    "…•",
    "a",
    "…ë",
    "e",
    "i",
    "o",
    "u",
    "y",
    "√∏",
    "≈ì",
    "…î",
    "…ô",
    "…õ",
    "…ëÃÉ",
    "…îÃÉ",
    "…õÃÉ",
    "≈ìÃÉ",
]

---

In [34]:
class Phonetics:

    @classmethod
    def preprocess(cls, string) -> List[str]:
        """
        Preprocesses a word by converting it to lowercase and removing whitespace.
        """
        return [cls._preprocess(individual) for individual in string.split(",")]

    @classmethod
    def _preprocess(cls, string) -> str:
        res = string.strip()
        if res[0] == "/":
            res = res[1:]
        if res[-1] == "/":
            res = res[:-1]
        return res

    @classmethod
    def split(cls, string, allowed_symbols):
        """
        Extracts the phonetic symbols from a word by means of a small 1-look-ahead parser.
        """
        symbols = []

        i = 0
        while i < len(string):
            letter = string[i]

            # Ignore these symbols
            if letter in ["Àê", ".", " º", " "]:
                i += 1
                continue

            if letter not in allowed_symbols:
                raise ValueError(f"Invalid symbol '{letter}' in word '{string}'.")

            if letter == "\u0303":
                raise ValueError("Unicode 'Combining tilde' character should have been subsumed into the previous character.")

            if i + 1 < len(string):
                next_letter = string[i + 1]
                if letter in ["…ë", "…õ", "…î", "≈ì"] and next_letter == "\u0303":
                    symbols.append(letter + u"\u0303")
                    i += 2
                    continue

                if letter == "d" and next_letter == " í":
                    symbols.append("d í")
                    i += 2
                    continue

                if letter == "t" and next_letter == " É":
                    symbols.append("t É")
                    i += 2
                    continue

            symbols.append(string[i])
            i += 1

        return symbols

In [35]:
phonetic = "aok≈ìÃÉm…îm…ëÃÉ"
phonetics_processed = Phonetics.preprocess(phonetic)
print(phonetics_processed)
Phonetics.split(phonetics_processed[0], allowed_symbols)

['aok≈ìÃÉm…îm…ëÃÉ']


['a', 'o', 'k', '≈ìÃÉ', 'm', '…î', 'm', '…ëÃÉ']

In [38]:
words = get_all_words()
for i, word in enumerate(words):
    words_processed = Phonetics.preprocess(word.phonetic)
    for word_processed in words_processed:
        Phonetics.split(word_processed, allowed_symbols)

In [None]:
# We use a customized phonetic similarity matrix to punish or reward
# certain phonetic substitutions.
similarity_data = {
    ("A", "A"): 10,
    ("A", "G"): -1,
    ("A", "C"): -3,
    ("A", "T"): -4,
    ("G", "G"): 7,
    ("G", "C"): -5,
    ("G", "T"): -3,
    ("C", "C"): 9,
    ("C", "T"): 0,
    ("T", "T"): 8,
}

score = NeedlemanWunsch(similarity_data, -1).calculate_score("GCATGCG", "GCATGCA")
print(f"üåü Score: {score}")