In [6]:
# https://stackoverflow.com/a/76024851/9655481
# %pip install --upgrade jupyter ipywidgets

In [41]:
import json
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import List
import pickle

In [40]:
@dataclass
class Word:
    word: str
    phonetic: str

@dataclass
class WordIpaChars:
    word: str
    ipa: str
    ipa_chars: List[str]

In [9]:
# https://stackoverflow.com/a/51593236/9655481
def print_full(dataframe):
    pd.set_option("display.max_rows", None)
    pd.set_option("display.max_columns", None)
    pd.set_option("display.width", 2000)
    pd.set_option("display.float_format", "{:20,.2f}".format)
    pd.set_option("display.max_colwidth", None)
    display(dataframe)
    pd.reset_option("display.max_rows")
    pd.reset_option("display.max_columns")
    pd.reset_option("display.width")
    pd.reset_option("display.float_format")
    pd.reset_option("display.max_colwidth")

def show_words(words: List[Word]):
    df = pd.DataFrame([vars(word) for word in words])
    print_full(df)

---

### 🧑‍💻 **Dataset**

In [10]:
def get_all_words():
    with open("../data/ipa/fr_FR.json", "r") as f:
        data = json.load(f)
    data = data["fr_FR"][0]
    return np.array([Word(key, value) for key, value in data.items()])

def get_random_words(n):
    words = get_all_words()
    rng = np.random.default_rng()
    return rng.choice(words, n, replace=False)

In [11]:
words = get_random_words(20)
show_words(words)

Unnamed: 0,word,phonetic
0,animera,/animəʁa/
1,opticiennes,/ɔptisjɛn/
2,calories,/kalɔʁi/
3,impropres,/ɛ̃pʁɔpʁ/
4,musâmes,/myzam/
5,téléphonées,/telefɔne/
6,poissent,/pwas/
7,abordiez,/abɔʁdje/
8,ruisselet,/ʁɥislɛ/
9,encrassait,/ɑ̃kʁasɛ/


In [12]:
def find_unique_phonetic_symbols():
    """
    Returns the set of all unique phonetic letters in the dataset.

    Note that one word consists of a sequence of phonetic symbols.
    """
    data = get_all_words()
    phonetic_symbols = set()
    for word in data:
        phonetic_symbols.update(set(word.phonetic))
    return phonetic_symbols

unique = find_unique_phonetic_symbols()
print("  ".join(unique))

n  g  ɪ  e  s  o  ʃ  x  t  ŋ  v  ː  z  ø  u     y  .  r  l  j  ɥ  ɲ  ə  b  d  f  /  ̃  ʁ  ɔ  p  ɛ  k  ʊ  i  ʼ  œ  m  ʒ  w  ,  a  ɑ


In [13]:
# Construct a phonetic similarity matrix that I can print out and fill out by hand
# to assign similarity scores to phonetic symbols. Note that the `unique` variable
# contains all unique phonetic symbols in the dataset.
similarity_matrix = pd.DataFrame(index=list(unique), columns=list(unique))
similarity_matrix = similarity_matrix.fillna(" ")

# store as pdf
similarity_matrix.to_html("similarity_matrix.html")

In [14]:
# https://easypronunciation.com/en/french-letters-pronunciation-ipa-chart#french_consonants
# https://en.wikipedia.org/wiki/Help:IPA/French

# Fill out the similarity matrix by hand
consonants = [
    "b",
    "d",
    "f",
    "g",
    "k",
    "l",
    "m",
    "n",
    "p",
    "s",
    "t",
    "v",
    "z",
    "ɲ",
    "ʁ",
    "ʃ",
    "ʒ",
    "dʒ",
    "tʃ",
    "ŋ",
]
semi_vowels = ["j", "w", "ɥ"]
oral_vowels = ["a", "ɑ", "e", "i", "o", "u", "y", "ø", "œ", "ɔ", "ə", "ɛ"]
nasal_vowels = ["ɑ̃", "ɔ̃", "ɛ̃", "œ̃"]

all_symbols = consonants + semi_vowels + oral_vowels + nasal_vowels
print(all_symbols)

similarity_matrix = pd.DataFrame(index=list(all_symbols), columns=list(all_symbols))
similarity_matrix = similarity_matrix.fillna("")
similarity_matrix.to_html("similarity_matrix.html")

['b', 'd', 'f', 'g', 'k', 'l', 'm', 'n', 'p', 's', 't', 'v', 'z', 'ɲ', 'ʁ', 'ʃ', 'ʒ', 'dʒ', 'tʃ', 'ŋ', 'j', 'w', 'ɥ', 'a', 'ɑ', 'e', 'i', 'o', 'u', 'y', 'ø', 'œ', 'ɔ', 'ə', 'ɛ', 'ɑ̃', 'ɔ̃', 'ɛ̃', 'œ̃']


---

### 💨 **Algorithm**

In [70]:
import time

class SimilarityMatrix:
    """
    Similarity matrix between phonetic symbols.
    Can be indexed by two phonetic symbols to get the similarity score between them.
    """

    def __init__(self, similarity_data):
        all_symbols = self._data_to_symbols(similarity_data)
        self.matrix = pd.DataFrame(index=list(all_symbols), columns=list(all_symbols))

        for (symbol1, symbol2), score in similarity_data.items():
            self.matrix.at[symbol1, symbol2] = score
            self.matrix.at[symbol2, symbol1] = score  # symmetric

        # Error if matrix contains NaN values
        if self.matrix.isnull().values.any():
            raise ValueError(
                "Similarity matrix contains NaN values."
                + " Make sure to fill out all values."
            )

    def _data_to_symbols(self, data):
        symbols = set()
        for (symbol1, symbol2), _score in data.items():
            symbols.add(symbol1)
            symbols.add(symbol2)
        return symbols

    def __getitem__(self, key):
        return self.matrix.at[key[0], key[1]]
    
    def symbols(self):
        return list(self.matrix.columns)


class NeedlemanWunsch:
    """
    Implementation of the Needleman-Wunsch algorithm for global sequence alignment
    score calculation. The algorithm uses a similarity matrix and a linear gap penalty.
    """

    def __init__(self, similarity_matrix, gap_penalty):
        self.similarity_matrix = similarity_matrix
        self.gap_penalty = gap_penalty

    def calculate_score(self, a: List[str], b: List[str]):
        """
        Calculates the alignment score between two sequences a and b.
        """
        start_time = time.time()

        matrix = np.zeros((len(a) + 1, len(b) + 1))
        for i in range(len(a) + 1):
            matrix[i, 0] = self.gap_penalty * i
        for j in range(len(b) + 1):
            matrix[0, j] = self.gap_penalty * j
        for i in range(1, len(a) + 1):
            for j in range(1, len(b) + 1):
                match = matrix[i - 1, j - 1] + self.similarity_matrix[a[i-1], b[j-1]]
                delete = matrix[i - 1, j] + self.gap_penalty
                insert = matrix[i, j - 1] + self.gap_penalty
                matrix[i, j] = max(match, delete, insert)
        
        end_time = time.time()
        print(f"Time taken: {(end_time - start_time) * 1000} ms")
        
        return matrix[-1, -1]

In [16]:
similarity_data = {
    ("A", "A"): 10,
    ("A", "G"): -1,
    ("A", "C"): -3,
    ("A", "T"): -4,
    ("G", "G"): 7,
    ("G", "C"): -5,
    ("G", "T"): -3,
    ("C", "C"): 9,
    ("C", "T"): 0,
    ("T", "T"): 8,
}

# similarity_data = {
#     ("A", "A"): 1,
#     ("A", "G"): -1,
#     ("A", "C"): -1,
#     ("A", "T"): -1,
#     ("G", "G"): 1,
#     ("G", "C"): -1,
#     ("G", "T"): -1,
#     ("C", "C"): 1,
#     ("C", "T"): -1,
#     ("T", "T"): 1,
# }

similarity_matrix = SimilarityMatrix(similarity_data)
allowed_symbols = similarity_matrix.symbols()
score = NeedlemanWunsch(similarity_matrix, -1).calculate_score("GCATGCG", "GCATGCA")
print(f"Allowed symbols: {allowed_symbols}")
print(f"🌟 Score: {score}")

Allowed symbols: ['A', 'G', 'T', 'C']
🌟 Score: 49.0


---

### **Prepare phonetics data**

In [34]:
class Phonetics:

    @classmethod
    def preprocess(cls, string) -> List[str]:
        """
        Preprocesses a word by converting it to lowercase and removing whitespace.
        """
        return [cls._preprocess(individual) for individual in string.split(",")]

    @classmethod
    def _preprocess(cls, string) -> str:
        res = string.strip()
        if res[0] == "/":
            res = res[1:]
        if res[-1] == "/":
            res = res[:-1]
        return res

    @classmethod
    def split(cls, string, allowed_symbols) -> List[str]:
        """
        Extracts the phonetic symbols from a word by means of a small 1-look-ahead parser.
        """
        symbols = []

        i = 0
        while i < len(string):
            letter = string[i]

            # Ignore these symbols
            if letter in ["ː", ".", "ʼ", " "]:
                i += 1
                continue

            if letter not in allowed_symbols:
                raise ValueError(f"Invalid symbol '{letter}' in word '{string}'.")

            if letter == "\u0303":
                raise ValueError("Unicode 'Combining tilde' character should have been subsumed into the previous character.")

            if i + 1 < len(string):
                next_letter = string[i + 1]
                if letter in ["ɑ", "ɛ", "ɔ", "œ"] and next_letter == "\u0303":
                    symbols.append(letter + u"\u0303")
                    i += 2
                    continue

                if letter == "d" and next_letter == "ʒ":
                    symbols.append("dʒ")
                    i += 2
                    continue

                if letter == "t" and next_letter == "ʃ":
                    symbols.append("tʃ")
                    i += 2
                    continue

            symbols.append(string[i])
            i += 1

        return symbols

In [None]:
french_allowed_symbols = [
    "b",
    "d",
    "f",
    "g",
    "k",
    "l",
    "m",
    "n",
    "p",
    "s",
    "t",
    "v",
    "z",
    "ɲ",
    "ʁ",
    "ʃ",
    "ʒ",
    "dʒ",
    "tʃ",
    "ŋ",
    "j",
    "w",
    "ɥ",
    "a",
    "ɑ",
    "e",
    "i",
    "o",
    "u",
    "y",
    "ø",
    "œ",
    "ɔ",
    "ə",
    "ɛ",
    "ɑ̃",
    "ɔ̃",
    "ɛ̃",
    "œ̃",
]

In [35]:
phonetic = "aokœ̃mɔmɑ̃"
phonetics_processed = Phonetics.preprocess(phonetic)
print(phonetics_processed)
Phonetics.split(phonetics_processed[0], french_allowed_symbols)

['aokœ̃mɔmɑ̃']


['a', 'o', 'k', 'œ̃', 'm', 'ɔ', 'm', 'ɑ̃']

**Pickle data**

In [42]:
processed_ipas = []

words = get_all_words()
for i, word in enumerate(words):
    words_processed = Phonetics.preprocess(word.phonetic)
    for word_processed in words_processed:
        chars = Phonetics.split(word_processed, french_allowed_symbols)
        to_append = WordIpaChars(word.word, word_processed, chars)
        processed_ipas.append(to_append)

with open("../data/ipa/fr_FR.pkl", "wb") as f:
    pickle.dump(processed_ipas, f)

---

### 🧬 **Calculate scores**

In [45]:
with open("../data/ipa/fr_FR.pkl", "rb") as f:
    data = pickle.load(f)

len(data)

245646

In [47]:
french_allowed_symbols = [
    "b",
    "d",
    "f",
    "g",
    "k",
    "l",
    "m",
    "n",
    "p",
    "s",
    "t",
    "v",
    "z",
    "ɲ",
    "ʁ",
    "ʃ",
    "ʒ",
    "dʒ",
    "tʃ",
    "ŋ",
    "j",
    "w",
    "ɥ",
    "a",
    "ɑ",
    "e",
    "i",
    "o",
    "u",
    "y",
    "ø",
    "œ",
    "ɔ",
    "ə",
    "ɛ",
    "ɑ̃",
    "ɔ̃",
    "ɛ̃",
    "œ̃",
]
len(french_allowed_symbols)

39

In [64]:
# Open from pickle
with open("../data/ipa/fr_FR_similarity_costs.pkl", "rb") as f:
    french_similarity_costs = pickle.load(f)
french_similarity_matrix = SimilarityMatrix(french_similarity_costs)
allowed_symbols = french_similarity_matrix.symbols()
print(f"Allowed symbols: {allowed_symbols}")

test_word = Phonetics.split(Phonetics.preprocess("snifasje")[0], french_allowed_symbols)
test_word2 = Phonetics.split(Phonetics.preprocess("sniaasja")[0], french_allowed_symbols)

score = NeedlemanWunsch(french_similarity_matrix, -1).calculate_score(
    test_word, test_word2
)
print(f"🌟 Score: {score}")

Allowed symbols: ['g', 'n', 'e', 's', 'o', 'ʃ', 't', 'ŋ', 'v', 'z', 'ø', 'tʃ', 'u', 'y', 'l', 'j', 'ɥ', 'ɲ', 'ə', 'b', 'd', 'f', 'ʁ', 'ɔ̃', 'œ̃', 'p', 'ɔ', 'ɛ', 'k', 'ɑ̃', 'i', 'dʒ', 'œ', 'm', 'ɛ̃', 'ʒ', 'w', 'a', 'ɑ']
🌟 Score: 8.0


**Now for real**

In [72]:
# Open from pickle
with open("../data/ipa/fr_FR_similarity_costs.pkl", "rb") as f:
    french_similarity_costs = pickle.load(f)
french_similarity_matrix = SimilarityMatrix(french_similarity_costs)
allowed_symbols = french_similarity_matrix.symbols()
print(f"Allowed symbols: {allowed_symbols}")

# Calculate the score for every possible combination of two words.
# Take into account the symmetry, i.e. don't calculate the score for the same pair twice.
output = []
for i, word in enumerate(data[3000:3010]):
    print(i)
    for j, word2 in enumerate(data[3000:3010]):
        if j < i:
            continue
        score = NeedlemanWunsch(french_similarity_matrix, -1).calculate_score(
            word.ipa_chars, word2.ipa_chars
        )
        output.append((word.word, word.ipa, word2.word, word2.ipa, score))

# df = pd.DataFrame(output, columns=["word1", "word1-ipa", "word2", "word2-ipa", "score"])
# df.to_csv("../data/ipa/graph.csv")

Allowed symbols: ['g', 'n', 'e', 's', 'o', 'ʃ', 't', 'ŋ', 'v', 'z', 'ø', 'tʃ', 'u', 'y', 'l', 'j', 'ɥ', 'ɲ', 'ə', 'b', 'd', 'f', 'ʁ', 'ɔ̃', 'œ̃', 'p', 'ɔ', 'ɛ', 'k', 'ɑ̃', 'i', 'dʒ', 'œ', 'm', 'ɛ̃', 'ʒ', 'w', 'a', 'ɑ']
0
Time taken: 0.3204345703125 ms
Time taken: 0.21576881408691406 ms
Time taken: 0.1957416534423828 ms
Time taken: 0.16427040100097656 ms
Time taken: 0.18477439880371094 ms
Time taken: 0.11706352233886719 ms
Time taken: 0.09083747863769531 ms
Time taken: 0.11038780212402344 ms
Time taken: 0.10991096496582031 ms
Time taken: 0.17309188842773438 ms
1
Time taken: 0.38552284240722656 ms
Time taken: 0.22983551025390625 ms
Time taken: 0.2014636993408203 ms
Time taken: 0.20647048950195312 ms
Time taken: 0.12087821960449219 ms
Time taken: 0.1163482666015625 ms
Time taken: 0.14328956604003906 ms
Time taken: 0.14281272888183594 ms
Time taken: 0.560760498046875 ms
2
Time taken: 1.0423660278320312 ms
Time taken: 0.4165172576904297 ms
Time taken: 0.5002021789550781 ms
Time taken: 0.324