The [Phoible](https://phoible.org/) dataset contains phoneme inventories for thousands of languages and dialects. Many languages/dialicts have multiple Phoible records. Here, I'm mapping the data against pre-prepared IPA phoneme tables, then selecting one sample per table per language so that each language is only represented once in the final dataset (to avoid bias by oversampling).

The IPA phoneme tables contain some phonemes with diacritics and some without. They seem to include only 0 or 1 diacritic perphoneme. 

The Phoible data also contains phonemes with and without diacritics, but they may contain sequences of multiple diacritics after each phoneme. These need to be split and handled seperately.



In [49]:
from pathlib import Path
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display


phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

VALIDATE_RESULTS = True

if not Path(phobile_file_path).exists():
    response = requests.get(phoible_data_url, stream=True)

    with open(phobile_file_path, "wb") as fh:
        for data in tqdm(response.iter_content()):
            fh.write(data)

In [50]:
ipa_diacritics_df = pd.read_csv(
    "./data/ipa_diacritics.csv", dtype=str, encoding="utf-8", engine="python"
)
valid_diacritics = ipa_diacritics_df.suffix
ipa_diacritics_df.shape

(38, 2)

Load the Phoible data and split the IPA diacritic suffixes from all phonemes so the raw phoneme can be matched against the pre-prepared IPA phoneme tables.

In [51]:
phoible_df = pd.read_csv(phobile_file_path, encoding="utf-8", engine="python")
dialect_phonemes_df = (
    phoible_df[["ISO6393", "LanguageName", "SpecificDialect", "Phoneme"]]
    .copy()
    .fillna({"SpecificDialect": "none"})
)


def split_symbol(phoneme):
    return "".join([x for x in list(phoneme) if x not in valid_diacritics.values])


def split_diacritics(phoneme):
    return "".join([x for x in list(phoneme) if x in valid_diacritics.values])


def split_phoneme_series(df):
    df["symbol"] = df.Phoneme.apply(split_symbol)
    df["diacritics"] = df.Phoneme.apply(split_diacritics)
    df["diacritic_count"] = df.diacritics.str.len()
    df.loc[df.diacritic_count == 0, "diacritics"] = np.nan
    return df


dialect_phonemes_df = split_phoneme_series(dialect_phonemes_df)

display(dialect_phonemes_df.shape)
dialect_phonemes_df.head(3)

(105467, 7)

Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,diacritic_count
0,kor,Korean,none,a,a,,0
1,kor,Korean,none,aː,a,ː,1
2,kor,Korean,none,æ,æ,,0


In [52]:
lang_by_dialect_df = dialect_phonemes_df.groupby(
    ["LanguageName", "SpecificDialect"]
)

Load the pre-prepared IPA tables.

In [53]:
cons_pl_tbl_df = pd.read_csv(
    "./data/consonants_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(cons_pl_tbl_df)

cons_npl_tbl_df = pd.read_csv(
    "./data/consonants_non_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(cons_npl_tbl_df)

cons_coart_tbl_df = pd.read_csv(
    "./data/consonants_coarticulated.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(cons_coart_tbl_df)

vowels_tbl_df = pd.read_csv(
    "./data/vowels.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(vowels_tbl_df)


cons_pl_tbl_shape = [*cons_pl_tbl_df.shape]
cons_npl_tbl_shape = [*cons_npl_tbl_df.shape]
cons_coart_tbl_shape = [*cons_coart_tbl_df.shape]
vowels_tbl_shape = [*vowels_tbl_df.shape]

cons_pl_tbl_shape, cons_npl_tbl_shape, cons_coart_tbl_shape, vowels_tbl_shape

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


Unnamed: 0,Labial–alveolar,Labial–alveolar_v,Labial–retroflex,Labial–retroflex_v,Labial–palatal,Labial–palatal_v,Labial–velar,Labial–velar_v,Labial–uvular,Labial–uvular_v,Velarized alveolar,Velarized alveolar_v,Uvular–epiglottal,Uvular–epiglottal_v,Palatal-velar,Palatal-velar_v
Nasal,,n͡m,,ɳ͡m,,,,ŋ͡m,,,,,,,,
Plosive,t͡p,d͡b,ʈ͡p,ɖ͡b,,,k͡p,ɡ͡b,q͡p,,,,q͡ʡ,,,
Fricative/approximant,,,,,ɥ̊,ɥ,ʍ,w,,,,,,,ɧ,
Lateral approximant,,,,,,,,,,,,ɫ,,,,
Implosive,,,,,,,ɠ̊͜ɓ̥,ɠ͡ɓ,,,,,,,,
Ejective,,t͡pʼ,,,,,,,,,,,,,,


Unnamed: 0,Front unrounded,Front rounded,Near-front unrounded,Near-front rounded,Central unrounded,Central rounded,Near-back unrounded,Near-back rounded,Back unrounded,Back rounded
Close,i,y,,,ɨ,ʉ,,,ɯ,u
Near-close,,,ɪ,ʏ,,,,ʊ,,
Close-mid,e,ø,,,ɘ,ɵ,,,ɤ,o
Mid,e̞,ø̞,,,ə,,,,ɤ̞,o̞
Open-mid,ɛ,œ,,,ɜ,ɞ,,,ʌ,ɔ
Near-open,æ,,,ɐ,,,,,,
Open,a,ɶ,,,ä,,,,ɑ,ɒ


([8, 22], [7, 8], [6, 16], [7, 10])

In [66]:
def map_tbl_phoneme(phoneme):
    chars = list(phoneme)
    symbols = "".join([x for x in chars if x not in valid_diacritics.values])
    diacritics = np.where(np.isin(valid_diacritics.values, chars), 1, 0)
    if diacritics.sum() == 0 and len(chars) > 0:
        diacritics[0] = 1
    return phoneme, symbols, diacritics

cons_pl_tbl = [
    map_tbl_phoneme(x) for x in cons_pl_tbl_df.stack()
]
cons_pl_tbl

[('p',
  'p',
  array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 ('b',
  'b',
  array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 ('',
  '',
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 ('',
  '',
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 ('t̪',
  't',
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 ('d̪',
  'd',
  array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])),
 ('t',
  't',
  array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0

In [None]:
# do we have duplicates?

table_phonemes_counts = (
    pd.concat(
        [
            cons_pl_tbl_df.stack(),
            cons_npl_tbl_df.stack(),
            cons_coart_tbl_df.stack(),
            vowels_tbl_df.stack(),
        ]
    )
    .replace("", np.nan)
    .dropna()
    .value_counts()
)

assert table_phonemes_counts[table_phonemes_counts == 1].all()

table_phonemes = table_phonemes_counts.index.to_series().reset_index(drop=True)

table_phonemes.head(3)

0      p
1    t͡p
2     ᶑ̥
dtype: object

In [None]:
# stack and split tables to seperate phonemes with/without specific diacritics

cons_pl_tbl_stacked_df = split_phoneme_series(
    cons_pl_tbl_df.stack().reset_index(drop=True).to_frame("Phoneme")
)
cons_npl_tbl_stacked_df = split_phoneme_series(
    cons_npl_tbl_df.stack().reset_index(drop=True).to_frame("Phoneme")
)
cons_coart_tbl_stacked_df = split_phoneme_series(
    cons_coart_tbl_df.stack().reset_index(drop=True).to_frame("Phoneme")
)
vowels_tbl_stacked_df = split_phoneme_series(
    vowels_tbl_df.stack().reset_index(drop=True).to_frame("Phoneme")
)

# combile all table phonemes

all_table_phonemes = (
    pd.concat(
        [
            cons_pl_tbl_stacked_df,
            cons_npl_tbl_stacked_df,
            cons_coart_tbl_stacked_df,
            vowels_tbl_stacked_df,
        ]
    )
)

table_phonemes_with_diacritics = (
    all_table_phonemes[all_table_phonemes.diacritic_count > 0]
    .Phoneme.replace("", np.nan)
    .dropna()
)
table_phonemes_without_diacritics = all_table_phonemes[
    all_table_phonemes.diacritic_count == 0
].Phoneme.replace("", np.nan).dropna()

display(table_phonemes_with_diacritics.head(3))
display(table_phonemes_without_diacritics.head(3))

4     t̪
5     d̪
25    n̪
Name: Phoneme, dtype: object

0    p
1    b
6    t
Name: Phoneme, dtype: object

For each langauge dialect, map all 4 phoneme tables against all the phonemes in the dialect.

If the table phoneme has a diacritic mark, compare the language phoneme and table phoneme directly. Return an array with one phoneme variant indicated for each matching table phoneme.

If the table phoneme doesn't have a diacritic mark, compare the base symbol of the language phoneme against the table phoneme. Return an array with all phoneme variants indicated for each matching table phoneme.

If the table cell is empty, return an array of zeros (length matching the number of phoneme variants) 

In [None]:
# combine duplicate phonemes that don't have explicit diacritic handling
# in one of the tables

handled_phonemes_df = dialect_phonemes_df[
    dialect_phonemes_df.Phoneme.isin(table_phonemes_with_diacritics.values)
]

print(handled_phonemes_df.shape)
display(handled_phonemes_df.head(3))

unhandled_phonemes_df = (
    dialect_phonemes_df[
        ~dialect_phonemes_df.Phoneme.isin(table_phonemes_with_diacritics.values)
    ][
        [
            "ISO6393",
            "LanguageName",
            "SpecificDialect",
            "symbol",
            "diacritics",
            "diacritic_count",
        ]
    ]
    .groupby(["ISO6393", "LanguageName", "SpecificDialect", "symbol"])
    .agg({"diacritics": list, "diacritic_count": "sum"})
    .reset_index()
)
unhandled_phonemes_df["Phoneme"] = unhandled_phonemes_df.symbol
unhandled_phonemes_df = unhandled_phonemes_df[
    [
        "ISO6393",
        "LanguageName",
        "SpecificDialect",
        "Phoneme",
        "symbol",
        "diacritics",
        "diacritic_count",
    ]
]


print(unhandled_phonemes_df.shape)
display(unhandled_phonemes_df.head(3))

phonemes_to_process_df = (
    pd.concat([handled_phonemes_df, unhandled_phonemes_df])
    .sort_values(["LanguageName", "SpecificDialect", "Phoneme"])
    .reset_index(drop=True)
)

print(phonemes_to_process_df.shape)
display(phonemes_to_process_df.head(3))

(3669, 7)


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,diacritic_count
91,lbe,Lak,none,kʼ,k,ʼ,1
98,lbe,Lak,none,pʼ,p,ʼ,1
104,lbe,Lak,none,qʼ,q,ʼ,1


(80029, 7)


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,diacritic_count
0,aae,Arbëresh Albanian,Arbëresh Albanian (Hora e Arbëreshëvet),a,a,[nan],0
1,aae,Arbëresh Albanian,Arbëresh Albanian (Hora e Arbëreshëvet),b,b,[nan],0
2,aae,Arbëresh Albanian,Arbëresh Albanian (Hora e Arbëreshëvet),c,c,[nan],0


(83698, 7)


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,diacritic_count
0,ktz,!XU,none,a,a,"[nan, ̃, ː, ̃ː, ˤ, ̃ˤ, ˤː, ̃ˤː]",12
1,ktz,!XU,none,ae,ae,"[̞, ̞ˤ, ̞̃̃ˤ]",7
2,ktz,!XU,none,ao,ao,"[̞, ̞ˤ, ̞̃̃ˤ]",7


In [None]:
phonemes_to_process_by_dialect_df = phonemes_to_process_df.groupby(
    ["LanguageName", "SpecificDialect"]
)

language_names = (
    phonemes_to_process_by_dialect_df.first()
    .index.get_level_values(0)
    .to_series()
    .reset_index(drop=True)
)

num_lang = len(language_names)

print("num_lang", num_lang)

num_lang 2934


In [None]:
def create_symbol_matrix(tbl_df):
    def map_table(dialect_df):
        # TODO: this should return a np array of shape tbl_df.shape
        result = np.where(
            (
                dialect_df.Phoneme.isin(tbl_df.Phoneme.values)
                | dialect_df.symbol.isin(tbl_df.Phoneme.values)
            ),
            dialect_df.diacritics.map(
                lambda x: (valid_diacritics.isin(list(x))).to_numpy().astype(int)
            ).to_numpy(),
            dialect_df.Phoneme.map(
                lambda _: np.zeros(valid_diacritics.shape[0]),
            ).to_numpy(),
        )
        return result

    return map_table


cons_pl_processed = phonemes_to_process_by_dialect_df.apply(
    create_symbol_matrix(cons_pl_tbl_stacked_df), include_groups=False
).to_numpy()
cons_npl_processed = phonemes_to_process_by_dialect_df.apply(
    create_symbol_matrix(cons_npl_tbl_stacked_df), include_groups=False
).to_numpy()
cons_coart_processed = phonemes_to_process_by_dialect_df.apply(
    create_symbol_matrix(cons_coart_tbl_stacked_df), include_groups=False
).to_numpy()
vowels_processed = phonemes_to_process_by_dialect_df.apply(
    create_symbol_matrix(vowels_tbl_stacked_df), include_groups=False
).to_numpy()


(
    cons_pl_processed.shape,
    cons_npl_processed.shape,
    cons_coart_processed.shape,
    vowels_processed.shape,
)

(66,)
(114,)
(70,)
(20,)
(18,)
(23,)
(44,)
(23,)
(35,)
(34,)
(16,)
(33,)
(27,)
(23,)
(17,)
(24,)
(22,)
(33,)
(23,)
(33,)
(19,)
(20,)
(31,)
(35,)
(23,)
(29,)
(24,)
(23,)
(20,)
(38,)
(20,)
(23,)
(18,)
(33,)
(26,)
(39,)
(30,)
(25,)
(23,)
(17,)
(24,)
(18,)
(36,)
(30,)
(27,)
(33,)
(33,)
(14,)
(31,)
(18,)
(43,)
(44,)
(40,)
(28,)
(23,)
(33,)
(43,)
(23,)
(40,)
(29,)
(19,)
(28,)
(32,)
(35,)
(32,)
(27,)
(42,)
(22,)
(23,)
(26,)
(31,)
(34,)
(40,)
(25,)
(31,)
(41,)
(16,)
(21,)
(35,)
(30,)
(34,)
(36,)
(21,)
(17,)
(32,)
(39,)
(48,)
(33,)
(37,)
(20,)
(30,)
(31,)
(16,)
(17,)
(34,)
(19,)
(24,)
(40,)
(34,)
(28,)
(33,)
(26,)
(24,)
(24,)
(28,)
(22,)
(19,)
(20,)
(41,)
(23,)
(39,)
(32,)
(35,)
(23,)
(18,)
(23,)
(21,)
(33,)
(37,)
(19,)
(20,)
(24,)
(28,)
(39,)
(41,)
(21,)
(31,)
(2,)
(36,)
(24,)
(20,)
(25,)
(34,)
(34,)
(34,)
(37,)
(31,)
(18,)
(33,)
(18,)
(25,)
(28,)
(25,)
(19,)
(24,)
(16,)
(32,)
(25,)
(44,)
(18,)
(22,)
(22,)
(18,)
(26,)
(17,)
(33,)
(37,)
(34,)
(33,)
(19,)
(49,)
(22,)
(31,)
(24,)
(28,)
(27,)
(17,

((2934,), (2934,), (2934,), (2934,))

In [None]:
unique_names = np.unique(language_names)
language_names.shape, unique_names.shape

((2934,), (2727,))

In [None]:
cons_pl_selected = np.array(
    [np.max(cons_pl_processed[language_names == name], axis=0) for name in unique_names]
)
cons_npl_selected = np.array(
    [
        np.max(cons_npl_processed[language_names == name], axis=0)
        for name in unique_names
    ]
)
cons_coart_selected = np.array(
    [
        np.max(cons_coart_processed[language_names == name], axis=0)
        for name in unique_names
    ]
)
vowels_selected = np.array(
    [np.max(vowels_processed[language_names == name], axis=0) for name in unique_names]
)

# sense check
for name in unique_names:
    assert (
        cons_pl_processed[language_names == name].shape[1:]
        == cons_pl_selected.shape[1:]
    )
    assert (
        cons_npl_processed[language_names == name].shape[1:]
        == cons_npl_selected.shape[1:]
    )
    assert (
        cons_coart_processed[language_names == name].shape[1:]
        == cons_coart_selected.shape[1:]
    )
    assert (
        vowels_processed[language_names == name].shape[1:] == vowels_selected.shape[1:]
    )

(
    cons_pl_selected.shape,
    cons_npl_selected.shape,
    vowels_selected.shape,
    cons_coart_selected.shape,
)

ValueError: operands could not be broadcast together with shapes (43,) (44,) 