The [Phoible](https://phoible.org/) dataset contains phoneme inventories for thousands of languages and dialects. Many languages/dialects have multiple Phoible records. Here, I'm mapping the data against pre-prepared IPA phoneme tables, then selecting one sample per table per language so that each language is represented only once in the final dataset (to avoid bias from oversampling).



In [None]:
from pathlib import Path
import requests
from tqdm import tqdm

phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

# When True, the symbol-matrix mappers run their internal consistency asserts.
VALIDATE_RESULTS = True

# Download the Phoible CSV once; later runs reuse the cached copy.
if not Path(phobile_file_path).exists():
    # The target directory may not exist on a fresh checkout.
    Path(phobile_file_path).parent.mkdir(parents=True, exist_ok=True)

    response = requests.get(phoible_data_url, stream=True, timeout=60)
    # Fail loudly on an HTTP error instead of caching an error page as the CSV.
    response.raise_for_status()

    # Write to a temporary name first so an interrupted download is never
    # mistaken for a complete file by the exists() check above.
    partial_file_path = Path(phobile_file_path + ".part")
    with open(partial_file_path, "wb") as fh:
        # iter_content() defaults to 1-byte chunks, which is extremely slow.
        for data in tqdm(response.iter_content(chunk_size=65536)):
            fh.write(data)
    partial_file_path.replace(phobile_file_path)

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import numpy as np

# Raw Phoible records: one row per (inventory, phoneme).
phoible_df = pd.read_csv(phobile_file_path, encoding="utf-8", engine="python")

ipa_diacritics_df = pd.read_csv(
    "./data/ipa_diacritics.csv", dtype=str, encoding="utf-8", engine="python"
)

valid_sufixes = set(ipa_diacritics_df.suffix)

# Split each phoneme into a base symbol and a trailing diacritic.
# NOTE(review): "symbol" drops the last character of every multi-character
# phoneme, even when that character is not a known diacritic (in which case
# "suffix" is None) - confirm this is intended.
phoible_df["symbol"] = phoible_df.Phoneme.apply(lambda x: x[0:-1] if len(x) > 1 else x)
phoible_df["suffix"] = phoible_df.Phoneme.apply(
    lambda x: x[-1] if (len(x) > 1 and x[-1] in valid_sufixes) else None
)

# Group the records by (language, dialect); missing dialects are labelled
# "none" so that they are not dropped by groupby.
lang_by_dialect_df = (
    phoible_df[["LanguageName", "SpecificDialect", "Phoneme", "symbol", "suffix"]]
    .fillna({"SpecificDialect": "none"})
    .groupby(["LanguageName", "SpecificDialect"])
)

# Language name of every group, in the groupby's own (sorted) order.
# groupby sorts its keys, so names taken from the unsorted frame in order of
# first appearance would not line up with the rows produced by
# lang_by_dialect_df.apply(...).
language_names = (
    lang_by_dialect_df.size().index.get_level_values("LanguageName").to_numpy()
)

num_lang = len(lang_by_dialect_df)

print("num_lang", num_lang)
print(phoible_df.shape)

phoible_df.head(5)

In [None]:
# util


def map_long_phonemes(c):
    """Return the long form of a symbol by appending "ː".

    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(c, str):
        return c
    return c + "ː"


def create_symbol_matrix(phoneme_tbl_df):
    """Build a mapper from a phoneme inventory to a presence matrix.

    Args:
        phoneme_tbl_df: Table of IPA symbols (strings). Empty-string and NaN
            cells are not counted as symbols.

    Returns:
        A function that takes a DataFrame with a ``Phoneme`` column and
        returns a float array of shape ``(rows, cols, 3)``. Channel 0 marks
        table symbols present in the inventory, channel 1 marks their long
        ("ː"-suffixed) forms, and channel 2 is all zeros.
    """
    phoneme_long_tbl_df = phoneme_tbl_df.map(map_long_phonemes)
    symbols = set(phoneme_tbl_df.stack().replace("", np.nan).dropna().unique().tolist())
    symbols_long = {c + "ː" for c in symbols}

    def map_phonemes(df):
        phonemes = set(df.Phoneme.to_list())
        valid_symbols = phonemes & symbols
        valid_symbols_long = phonemes & symbols_long

        standard = np.where(phoneme_tbl_df.isin(valid_symbols), 1.0, 0.0)
        long = np.where(phoneme_long_tbl_df.isin(valid_symbols_long), 1.0, 0.0)
        empty = np.zeros(standard.shape)

        result = np.stack([standard, long, empty], axis=2)

        if VALIDATE_RESULTS:
            # Every cell's channel sum must equal its standard + long flags.
            assert np.array_equal(result.sum(axis=2), standard + long)

            # These counts only hold if each symbol occurs exactly once in
            # the table.
            assert standard.sum() == len(valid_symbols)
            assert long.sum() == len(valid_symbols_long)
            assert result.sum() == standard.sum() + long.sum() + empty.sum()

        return result

    return map_phonemes

Generate pulmonic consonants table.

In [None]:
# Load the pulmonic consonant table. The first column becomes the row index;
# only the literal "-1" is read as NaN, so empty cells stay empty strings.
cons_pl_tbl_df = pd.read_csv(
    "./data/consonants_plumonic.csv",
    index_col=[0],
    dtype=str,
    na_values=["-1"],
    keep_default_na=False,
)
cons_pl_tbl_df

In [None]:
# One (rows, cols, 3) presence matrix per (language, dialect) group.
cons_pl_mapper = create_symbol_matrix(cons_pl_tbl_df)
cons_pl_per_group = lang_by_dialect_df.apply(cons_pl_mapper, include_groups=False)
cons_pl_npy = np.stack(cons_pl_per_group.to_numpy())

cons_pl_expected_shape = (num_lang,) + tuple(cons_pl_tbl_df.shape) + (3,)
assert cons_pl_npy.shape == cons_pl_expected_shape
cons_pl_npy.shape

Generate non-pulmonic consonants table.

In [None]:
# Load the non-pulmonic consonant table. The first column becomes the row
# index; only the literal "-1" is read as NaN, so empty cells stay "".
cons_npl_csv_options = {
    "dtype": str,
    "index_col": [0],
    "keep_default_na": False,
    "na_values": ["-1"],
}
cons_npl_tbl_df = pd.read_csv(
    "./data/consonants_non_plumonic.csv", **cons_npl_csv_options
)
cons_npl_tbl_df

In [None]:
# One (rows, cols, 3) presence matrix per (language, dialect) group.
cons_npl_mapper = create_symbol_matrix(cons_npl_tbl_df)
cons_npl_per_group = lang_by_dialect_df.apply(cons_npl_mapper, include_groups=False)
cons_npl_npy = np.stack(cons_npl_per_group.to_numpy())

cons_npl_expected_shape = (num_lang,) + tuple(cons_npl_tbl_df.shape) + (3,)
assert cons_npl_npy.shape == cons_npl_expected_shape
cons_npl_npy.shape

In [None]:
# Load the co-articulated consonant table. The first column becomes the row
# index; only the literal "-1" is read as NaN, so empty cells stay "".
cons_coart_tbl_df = pd.read_csv(
    "./data/consonants_coarticulated.csv",
    index_col=[0],
    na_values=["-1"],
    keep_default_na=False,
    dtype=str,
)
cons_coart_tbl_df

In [None]:
# One (rows, cols, 3) presence matrix per (language, dialect) group.
cons_coart_mapper = create_symbol_matrix(cons_coart_tbl_df)
cons_coart_per_group = lang_by_dialect_df.apply(
    cons_coart_mapper, include_groups=False
)
cons_coart_npy = np.stack(cons_coart_per_group.to_numpy())

cons_coart_expected_shape = (num_lang,) + tuple(cons_coart_tbl_df.shape) + (3,)
assert cons_coart_npy.shape == cons_coart_expected_shape
cons_coart_npy.shape

Generate vowel table.

In [None]:
# Load the vowel table. The first column becomes the row index; only the
# literal "-1" is read as NaN, so empty cells stay empty strings.
vowels_csv_options = {
    "dtype": str,
    "index_col": [0],
    "keep_default_na": False,
    "na_values": ["-1"],
}
vowels_tbl_df = pd.read_csv("./data/vowels.csv", **vowels_csv_options)
vowels_tbl_df

In [None]:
# One (rows, cols, 3) presence matrix per (language, dialect) group.
vowels_mapper = create_symbol_matrix(vowels_tbl_df)
vowels_per_group = lang_by_dialect_df.apply(vowels_mapper, include_groups=False)
vowels_npy = np.stack(vowels_per_group.to_numpy())

vowels_expected_shape = (num_lang,) + tuple(vowels_tbl_df.shape) + (3,)
assert vowels_npy.shape == vowels_expected_shape
vowels_npy.shape

In [None]:
# All four tables must describe the same number of inventories.
inventory_counts = {
    table_npy.shape[0]
    for table_npy in (cons_pl_npy, cons_npl_npy, vowels_npy, cons_coart_npy)
}
assert len(inventory_counts) == 1

(cons_pl_npy.shape, cons_npl_npy.shape, vowels_npy.shape, cons_coart_npy.shape)

For languages that have multiple samples in the Phoible dataset, we pick just one sample. Based on previous analysis, the best option seems to be to pick the one with the most phonemes per language, as this generally carries more information (for example, both the long and short versions of the phonemes).

In [None]:
unique_names = np.unique(language_names)
print(language_names.shape, unique_names.shape)


def _select_richest_inventory(table_npy, names, unique):
    """Keep, for each language, the single inventory with the most phonemes.

    Args:
        table_npy: Array whose first axis is aligned with ``names`` (one
            presence matrix per inventory).
        names: Language name of every inventory in ``table_npy``.
        unique: The distinct language names, in output order.

    Returns:
        Array with one presence matrix per entry of ``unique``.
    """
    selected = []
    for name in unique:
        candidates = table_npy[names == name]
        # Count the cells set in each candidate inventory.
        phoneme_counts = candidates.reshape(len(candidates), -1).sum(axis=1)
        # Pick one whole inventory; an element-wise np.max over the
        # candidates would instead merge (union) all of them. argmax returns
        # the first maximum, so ties resolve to the earliest inventory.
        selected.append(candidates[np.argmax(phoneme_counts)])
    return np.array(selected)


cons_pl_selected = _select_richest_inventory(cons_pl_npy, language_names, unique_names)
cons_npl_selected = _select_richest_inventory(
    cons_npl_npy, language_names, unique_names
)
cons_coart_selected = _select_richest_inventory(
    cons_coart_npy, language_names, unique_names
)
vowels_selected = _select_richest_inventory(vowels_npy, language_names, unique_names)

# sense check: one matrix per language, same per-language shape as the input
for table_npy, table_selected in (
    (cons_pl_npy, cons_pl_selected),
    (cons_npl_npy, cons_npl_selected),
    (vowels_npy, vowels_selected),
    (cons_coart_npy, cons_coart_selected),
):
    assert table_selected.shape == (unique_names.size, *table_npy.shape[1:])

cons_pl_selected.shape, cons_npl_selected.shape, vowels_selected.shape, cons_coart_selected.shape

In [None]:
# Persist the per-language selection for each phoneme table.
for npy_path, selected_npy in (
    ("./data/consonants_plumonic.npy", cons_pl_selected),
    ("./data/consonants_non_plumonic.npy", cons_npl_selected),
    ("./data/consonants_coarticulated.npy", cons_coart_selected),
    ("./data/vowels.npy", vowels_selected),
):
    np.save(npy_path, selected_npy)

Merge all the data into a single 4D array. Each language is represented by a 3D matrix in which the tables for the 3 phoneme types (pulmonic consonants, non-pulmonic consonants, vowels) are stacked vertically, with a padding row between each group.

In [None]:
# Widest table (column count) and largest cell value across the three tables
# that get merged.
max_cols = max(
    table_npy.shape[2] for table_npy in (cons_pl_npy, cons_npl_npy, vowels_npy)
)
max_value = max(
    np.max(table_npy) for table_npy in (cons_pl_npy, cons_npl_npy, vowels_npy)
)


def get_padding(arr, max_cols):
    """Return an ``np.pad`` width spec that widens axis 2 of a 4D array.

    Only the end of axis 2 is padded, by ``max_cols - arr.shape[2]``; the
    other three axes are left untouched.
    """
    no_pad = (0, 0)
    extra_cols = max_cols - arr.shape[2]
    return (no_pad, no_pad, (0, extra_cols), no_pad)


# Merge the per-language selections (one row per entry of unique_names), not
# the raw per-dialect arrays: the final dataset must contain each language
# once, and downstream cells index this array by unique_names.
# Separator row (value 0.1) placed between the stacked phoneme tables.
separator_row = np.full((cons_pl_selected.shape[0], 1, max_cols, 3), 0.1)

# np.hstack joins along axis 1, i.e. the tables are stacked vertically within
# each language's matrix after being padded to a common width.
language_phonemes_npy = np.hstack(
    [
        np.pad(
            cons_pl_selected,
            get_padding(cons_pl_selected, max_cols),
            mode="constant",
        ),
        separator_row,
        np.pad(
            cons_npl_selected,
            get_padding(cons_npl_selected, max_cols),
            mode="constant",
        ),
        separator_row,
        np.pad(
            vowels_selected,
            get_padding(vowels_selected, max_cols),
            mode="constant",
        ),
    ]
)

language_phonemes_npy.shape

In [None]:
import matplotlib.pyplot as plt

# Show a random r x c grid of language matrices, titled by language name.
r, c = 2, 4

gen = np.random.default_rng()
sample_indices = gen.choice(unique_names.size, r * c, replace=False)
samples = language_phonemes_npy[sample_indices]
sample_names = unique_names[sample_indices]

fig, axs = plt.subplots(r, c, figsize=(22, 12), linewidth=10)
fig.tight_layout()

# axs.flat walks the grid row by row, matching the order of the samples.
for ax, sample, sample_name in zip(axs.flat, samples, sample_names):
    # Titles are truncated to 20 characters.
    ax.set_title(sample_name[0:20], fontsize=26)
    ax.imshow(sample)
    ax.tick_params(
        left=False, right=False, labelleft=False, labelbottom=False, bottom=False
    )

plt.show()
sample_indices

In [None]:
# Persist the merged array.
np.save(file="./data/language_phonemes.npy", arr=language_phonemes_npy)

In [None]:
# what does an average language look like?
mean_lang = language_phonemes_npy.mean(axis=0)

# not required, but just checking that imshow
# plays nice with an alpha channel
alpha_channel = np.ones(mean_lang.shape[0:2] + (1,))
mean_lang = np.concatenate([mean_lang, alpha_channel], axis=2)

# NOTE(review): per the matplotlib docs, cmap/vmin/vmax are ignored when the
# image is RGB(A), which mean_lang appears to be here - confirm.
fig = plt.imshow(
    mean_lang,
    cmap=plt.get_cmap("copper_r"),
    vmin=mean_lang.min(),
    vmax=mean_lang.max(),
)
plt.axis("off")
plt.title("Average Language")
plt.show()

Investigating unhandled phonemes.

In [None]:
# Every non-empty symbol that appears in any of the four phoneme tables.
handled_symbols = set().union(
    *(
        tbl_df.stack().replace("", np.nan).dropna().unique()
        for tbl_df in (
            cons_pl_tbl_df,
            cons_npl_tbl_df,
            cons_coart_tbl_df,
            vowels_tbl_df,
        )
    )
)
# Long ("ː"-suffixed) variants of the same symbols.
handled_symbols_long = {sym + "ː" for sym in handled_symbols}
all_handled_symbols = handled_symbols.union(handled_symbols_long)

In [None]:
# TODO: replace the long-phoneme implementation above with something that can
# represent all of these diacritics.
diacritic_suffixes = ipa_diacritics_df["suffix"]
print(diacritic_suffixes.shape)

diacritics_npy = diacritic_suffixes.to_numpy()


def create_symbol_matrix2(phoneme_tbl_df):
    """Build a mapper from a phoneme inventory to a two-channel matrix.

    The returned function takes a DataFrame with a ``Phoneme`` column and
    returns a float array of shape ``(rows, cols, 2)``: channel 0 is 1.0
    where the table symbol occurs in the inventory, channel 1 is all zeros.
    Empty-string and NaN table cells are not counted as symbols.
    """
    non_empty_cells = phoneme_tbl_df.stack().replace("", np.nan).dropna()
    table_symbols = set(non_empty_cells.unique().tolist())

    def map_phonemes(df):
        inventory = set(df.Phoneme.to_list())
        present = inventory & table_symbols

        hits = phoneme_tbl_df.isin(present).to_numpy().astype(float)
        placeholder = np.zeros(hits.shape)

        return np.stack([hits, placeholder], axis=2)

    return map_phonemes


# Try the two-channel mapper on the pulmonic consonant table.
tmp_mapper = create_symbol_matrix2(cons_pl_tbl_df)
tmp_per_group = lang_by_dialect_df.apply(tmp_mapper, include_groups=False)
tmp_npy = np.stack(tmp_per_group.to_numpy())
tmp_npy.shape

In [None]:
valid_sufixes = set(ipa_diacritics_df.suffix)


def _drop_last_char(phoneme):
    """Return the phoneme without its last character (unchanged if 1 char)."""
    if len(phoneme) > 1:
        return phoneme[0:-1]
    return phoneme


def _known_suffix(phoneme):
    """Return the last character if it is a known diacritic, else None."""
    if len(phoneme) > 1 and phoneme[-1] in valid_sufixes:
        return phoneme[-1]
    return None


# Distinct (symbol, suffix) pairs over all phonemes in the dataset.
all_phonemes = phoible_df.Phoneme.drop_duplicates().to_frame()
all_phonemes["symbol"] = all_phonemes.Phoneme.apply(_drop_last_char)
all_phonemes["suffix"] = all_phonemes.Phoneme.apply(_known_suffix)
all_phonemes = all_phonemes[["symbol", "suffix"]].drop_duplicates()

# How often each diacritic occurs among the distinct pairs.
all_phonemes.suffix.value_counts()

In [None]:
# Records whose phoneme is in none of the tables (neither short nor long form).
unhandled_symbols = phoible_df[~phoible_df.Phoneme.isin(all_handled_symbols)][
    ["LanguageName", "SpecificDialect", "Phoneme", "symbol", "suffix"]
]

# Missing dialects are labelled "none" so groupby does not drop them.
unhandled_symbols_by_dialect = (
    unhandled_symbols.fillna({"SpecificDialect": "none"})
    .drop_duplicates()
    .groupby(["LanguageName", "SpecificDialect"])
)

# Count per (language, dialect), then average those counts per language.
# The first positional parameter of GroupBy.mean is numeric_only, so passing
# a column name there only acted as a truthy flag; spell the flag out so the
# non-numeric SpecificDialect column is excluded explicitly.
unhandled_counts = (
    unhandled_symbols_by_dialect.count()
    .reset_index()
    .groupby("LanguageName")
    .mean(numeric_only=True)
    .reset_index()
    .sort_values("Phoneme", ascending=False)
)

unhandled_counts.plot.hist(bins=75)

print(unhandled_symbols)

unhandled_counts