The [Phoible](https://phoible.org/) dataset contains phoneme inventories for thousands of languages and dialects. Many languages/dialicts have multiple Phoible records. Here, I'm mapping the data against pre-prepared IPA phoneme tables, then selecting one sample per table per language so that each language is only represented once in the final dataset (to avoid bias by oversampling).

The IPA phoneme tables contain some phonemes with diacritics and some without. They seem to include only 0 or 1 diacritic perphoneme. 

The Phoible data also contains phonemes with and without diacritics, but they may contain sequences of multiple diacritics after each phoneme. These need to be split and handled seperately.



In [75]:
from pathlib import Path
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display


phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

VALIDATE_RESULTS = True

if not Path(phobile_file_path).exists():
    response = requests.get(phoible_data_url, stream=True)

    with open(phobile_file_path, "wb") as fh:
        for data in tqdm(response.iter_content()):
            fh.write(data)

In [76]:
ipa_diacritics_df = pd.read_csv(
    "./data/ipa_diacritics.csv", dtype=str, encoding="utf-8", engine="python"
)
ipa_diacritics_df["unicode"] = ipa_diacritics_df.suffix.apply(lambda x: hex(ord(x)))
valid_diacritics = ipa_diacritics_df.suffix
display(ipa_diacritics_df.head(5))
ipa_diacritics_df.shape

Unnamed: 0,suffix,description,unicode
0,,Standard,0x20
1,ː,Long,0x2d0
2,ˑ,Half-long,0x2d1
3,̥,Voiceless,0x325
4,̊,Voiceless,0x30a


(38, 3)

Load the Phoible data and split the IPA diacritic suffixes from all phonemes so the raw phoneme can be matched against the pre-prepared IPA phoneme tables.

In [77]:
phoible_df = pd.read_csv(phobile_file_path, encoding="utf-8", engine="python")
dialect_phonemes_df = (
    phoible_df[["ISO6393", "LanguageName", "SpecificDialect", "Phoneme"]]
    .copy()
    .fillna({"SpecificDialect": "none"})
)


def split_symbol(phoneme):
    return "".join([x for x in list(phoneme) if x not in valid_diacritics.values])


def split_diacritics(phoneme):
    return "".join([x for x in list(phoneme) if x in valid_diacritics.values])


def split_phoneme_series(df):
    df["symbol"] = df.Phoneme.apply(split_symbol)
    df["diacritics"] = df.Phoneme.apply(split_diacritics)
    df.loc[df.diacritics.str.len() == 0, "diacritics"] = " "
    df["versions"] = df.diacritics.apply(lambda d: ",".join([str(x) for x in valid_diacritics.isin(list(d)).astype(int)]))
    return df


dialect_phonemes_df = split_phoneme_series(dialect_phonemes_df)

display(dialect_phonemes_df.shape)
dialect_phonemes_df.head(3)

(105467, 7)

Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,versions
0,kor,Korean,none,a,a,,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
1,kor,Korean,none,aː,a,ː,"0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,kor,Korean,none,æ,æ,,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


In [78]:
lang_by_dialect_df = dialect_phonemes_df.groupby(
    ["LanguageName", "SpecificDialect"]
)

Load the pre-prepared IPA tables.

In [79]:
cons_pl_tbl_df = pd.read_csv(
    "./data/consonants_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(cons_pl_tbl_df)

cons_npl_tbl_df = pd.read_csv(
    "./data/consonants_non_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(cons_npl_tbl_df)

cons_coart_tbl_df = pd.read_csv(
    "./data/consonants_coarticulated.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(cons_coart_tbl_df)

vowels_tbl_df = pd.read_csv(
    "./data/vowels.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)
display(vowels_tbl_df)


cons_pl_tbl_shape = [*cons_pl_tbl_df.shape]
cons_npl_tbl_shape = [*cons_npl_tbl_df.shape]
cons_coart_tbl_shape = [*cons_coart_tbl_df.shape]
vowels_tbl_shape = [*vowels_tbl_df.shape]

cons_pl_tbl_shape, cons_npl_tbl_shape, cons_coart_tbl_shape, vowels_tbl_shape

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


Unnamed: 0,Labial–alveolar,Labial–alveolar_v,Labial–retroflex,Labial–retroflex_v,Labial–palatal,Labial–palatal_v,Labial–velar,Labial–velar_v,Labial–uvular,Labial–uvular_v,Velarized alveolar,Velarized alveolar_v,Uvular–epiglottal,Uvular–epiglottal_v,Palatal-velar,Palatal-velar_v
Nasal,,n͡m,,ɳ͡m,,,,ŋ͡m,,,,,,,,
Plosive,t͡p,d͡b,ʈ͡p,ɖ͡b,,,k͡p,ɡ͡b,q͡p,,,,q͡ʡ,,,
Fricative/approximant,,,,,ɥ̊,ɥ,ʍ,w,,,,,,,ɧ,
Lateral approximant,,,,,,,,,,,,ɫ,,,,
Implosive,,,,,,,ɠ̊͜ɓ̥,ɠ͡ɓ,,,,,,,,
Ejective,,t͡pʼ,,,,,,,,,,,,,,


Unnamed: 0,Front unrounded,Front rounded,Near-front unrounded,Near-front rounded,Central unrounded,Central rounded,Near-back unrounded,Near-back rounded,Back unrounded,Back rounded
Close,i,y,,,ɨ,ʉ,,,ɯ,u
Near-close,,,ɪ,ʏ,,,,ʊ,,
Close-mid,e,ø,,,ɘ,ɵ,,,ɤ,o
Mid,e̞,ø̞,,,ə,,,,ɤ̞,o̞
Open-mid,ɛ,œ,,,ɜ,ɞ,,,ʌ,ɔ
Near-open,æ,,,ɐ,,,,,,
Open,a,ɶ,,,ä,,,,ɑ,ɒ


([8, 22], [7, 8], [6, 16], [7, 10])

Convert the IPA tables to Numpy arrays for use later.

In [80]:
def map_tbl_phoneme(phoneme):
    chars = list(phoneme)
    symbols = [x for x in chars if x not in valid_diacritics.values]
    diacritics = [x for x in chars if x in valid_diacritics.values]
    return np.array([phoneme, "".join(symbols), "".join(diacritics)])

cons_pl_tbl_npy = np.array(
    [map_tbl_phoneme(phoneme) for phoneme in cons_pl_tbl_df.stack()]
)
cons_npl_tbl_npy = np.array(
    [map_tbl_phoneme(phoneme) for phoneme in cons_npl_tbl_df.stack()]
)
cons_coart_tbl_npy = np.array(
    [map_tbl_phoneme(phoneme) for phoneme in cons_coart_tbl_df.stack()]
)
vowels_tbl_npy = np.array(
    [map_tbl_phoneme(phoneme) for phoneme in vowels_tbl_df.stack()]
)

# print a sample
cons_pl_tbl_npy[0], cons_npl_tbl_npy[0], cons_coart_tbl_npy[0], vowels_tbl_npy[0]

(array(['p', 'p', ''], dtype='<U2'),
 array(['pʼ', 'p', 'ʼ'], dtype='<U5'),
 array(['', '', ''], dtype='<U5'),
 array(['i', 'i', ''], dtype='<U2'))

In [81]:
# do we have duplicates?

all_table_entries = np.concatenate(
    [
        cons_pl_tbl_npy,
        cons_npl_tbl_npy,
        cons_coart_tbl_npy,
        vowels_tbl_npy,
    ]
)

table_phonemes = all_table_entries[all_table_entries[:, 0] != ""]

unique_phonemes, table_phonemes_counts = np.unique(
    table_phonemes[:, 0], return_counts=True
)

assert np.max(table_phonemes_counts) == 1

table_phonemes[0:3]

array([['p', 'p', ''],
       ['b', 'b', ''],
       ['t̪', 't', '̪']], dtype='<U5')

In [82]:
# stack and split tables to seperate phonemes with/without specific diacritics

table_phonemes_with_diacritics = (
    table_phonemes[np.char.str_len(table_phonemes[:, 2]) > 0]
)
table_phonemes_without_diacritics = table_phonemes[
    np.char.str_len(table_phonemes[:, 2]) == 0
]

table_phonemes_with_diacritics[0:3], table_phonemes_without_diacritics[0:3]

(array([['t̪', 't', '̪'],
        ['d̪', 'd', '̪'],
        ['n̪', 'n', '̪']], dtype='<U5'),
 array([['p', 'p', ''],
        ['b', 'b', ''],
        ['t', 't', '']], dtype='<U5'))

For each langauge dialect, map all 4 phoneme tables against all the phonemes in the dialect.

If the table phoneme has a diacritic mark, compare the language phoneme and table phoneme directly. Return an array with one phoneme variant indicated for each matching table phoneme.

If the table phoneme doesn't have a diacritic mark, compare the base symbol of the language phoneme against the table phoneme. Return an array with all phoneme variants indicated for each matching table phoneme.

If the table cell is empty, return an array of zeros (length matching the number of phoneme variants) 

In [83]:
# combine duplicate phonemes that don't have explicit diacritic handling
# in one of the tables

handled_phonemes_df = dialect_phonemes_df[
    dialect_phonemes_df.Phoneme.isin(table_phonemes_with_diacritics[:, 0])
]

unhandled_phonemes_df = (
    dialect_phonemes_df[
        ~dialect_phonemes_df.Phoneme.isin(table_phonemes_with_diacritics[:, 0])
    ][
        [
            "ISO6393",
            "LanguageName",
            "SpecificDialect",
            "symbol",
            "diacritics",
            "versions",
        ]
    ]
    .groupby(["ISO6393", "LanguageName", "SpecificDialect", "symbol"])
    .agg({"diacritics": "".join, "versions": "sum"})
    .reset_index()
)
unhandled_phonemes_df["Phoneme"] = unhandled_phonemes_df.symbol
unhandled_phonemes_df = unhandled_phonemes_df[
    [
        "ISO6393",
        "LanguageName",
        "SpecificDialect",
        "Phoneme",
        "symbol",
        "diacritics",
        "versions",
    ]
]


phonemes_to_process_df = (
    pd.concat([handled_phonemes_df, unhandled_phonemes_df])
    .sort_values(["LanguageName", "SpecificDialect", "Phoneme"])
    .reset_index(drop=True)
)


print("dialect_phonemes_df", dialect_phonemes_df.shape)
print("handled_phonemes_df", handled_phonemes_df.shape)
print("unhandled_phonemes_df", unhandled_phonemes_df.shape)
print("phonemes_to_process_df", phonemes_to_process_df.shape)
display(handled_phonemes_df.head(3))
display(unhandled_phonemes_df.head(3))
display(phonemes_to_process_df.head(3))

dialect_phonemes_df (105467, 7)
handled_phonemes_df (3669, 7)
unhandled_phonemes_df (80029, 7)
phonemes_to_process_df (83698, 7)


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,versions
91,lbe,Lak,none,kʼ,k,ʼ,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
98,lbe,Lak,none,pʼ,p,ʼ,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
104,lbe,Lak,none,qʼ,q,ʼ,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,versions
0,aae,Arbëresh Albanian,Arbëresh Albanian (Hora e Arbëreshëvet),a,a,,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
1,aae,Arbëresh Albanian,Arbëresh Albanian (Hora e Arbëreshëvet),b,b,,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,aae,Arbëresh Albanian,Arbëresh Albanian (Hora e Arbëreshëvet),c,c,,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,versions
0,ktz,!XU,none,a,a,̃ː̃ːˤ̃ˤˤː̃ˤː,"1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
1,ktz,!XU,none,ae,ae,̞̞ˤ̞̃̃ˤ,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."
2,ktz,!XU,none,ao,ao,̞̞ˤ̞̃̃ˤ,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,..."


In [84]:
phonemes_to_process_by_dialect_df = phonemes_to_process_df.groupby(
    ["LanguageName", "SpecificDialect"]
)

language_names_and_dialects = phonemes_to_process_by_dialect_df.first().index.to_frame().reset_index(drop=True)

num_lang = len(language_names_and_dialects)

print("num_lang", num_lang)

num_lang 2934


In [85]:
def create_symbol_matrix(phoneme_tbl):
    print("phoneme_tbl", phoneme_tbl.shape)

    def map_table(dialect_df):
        zeros = np.zeros(valid_diacritics.shape[0])
        # this should return a matrix of shape phoneme_tbl.shape

        is_handled = np.isin(phoneme_tbl[:, 0], dialect_df.Phoneme.values)
        is_unhandled = ~np.isin(phoneme_tbl[:, 0], dialect_df.Phoneme.values) & np.isin(
            phoneme_tbl[:, 1], dialect_df.symbol.values
        )
        print("is_handled", is_handled)
        print("is_unhandled", is_unhandled)

        # handled = np.array(
        #     [
        #         valid_diacritics.isin(list(phoneme_tbl[i, 2])).to_numpy()
        #         if x
        #         else zeros.copy()
        #         for i, x in enumerate(is_handled)
        #     ]
        # ).astype(int)

        # phonemes not handled directly in the table
        # is_unhandled = ~np.isin(phoneme_tbl[:, 0], dialect_df.Phoneme.values) | np.isin(
        #     phoneme_tbl[:, 1], dialect_df.symbol.values
        # )
        # unhandled = np.array(
        #     [
        #         valid_diacritics.isin(
        #             list(
        #                 dialect_df[
        #                     (dialect_df.symbol == phoneme_tbl[i, 1])
        #                     & (dialect_df.Phoneme != phoneme_tbl[i, 0])
        #                 ].diacritics.get(0, "")
        #             )
        #         ).to_numpy()
        #         if x
        #         else zeros.copy()
        #         for i, x in enumerate(is_unhandled)
        #     ]
        # ).astype(int)

        # print("handled", handled.shape, handled.sum())
        # print(
        #     "unhandled",
        #     unhandled.shape,
        #     unhandled.sum(),
        # )

        return zeros

    return map_table


cons_pl_processed = phonemes_to_process_by_dialect_df.apply(
    create_symbol_matrix(cons_pl_tbl_npy), include_groups=False
).to_numpy()
# cons_npl_processed = phonemes_to_process_by_dialect_df.apply(
#     create_symbol_matrix(cons_npl_tbl_npy), include_groups=False
# ).to_numpy()
# cons_coart_processed = phonemes_to_process_by_dialect_df.apply(
#     create_symbol_matrix(cons_coart_tbl_npy), include_groups=False
# ).to_numpy()
# vowels_processed = phonemes_to_process_by_dialect_df.apply(
#     create_symbol_matrix(vowels_tbl_npy), include_groups=False
# ).to_numpy()


(
    cons_pl_processed[0].shape,
    cons_pl_processed[0][0].shape,
)

phoneme_tbl (144, 3)


is_handled [ True  True False False False False  True  True False False False False
 False False  True  True False False False False False  True False False
 False False False  True False False False False False False False  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False  True  True  True  True
 False False False False  True False False False False False False  True
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False  True False False False False False False False False
 False False False False False False False False False False False False]
is_unhandled [False False False False  True  True False False False False False False
 False False False False F

((38,), ())

In [86]:
unique_names = np.unique(language_names_and_dialects.LanguageName)
language_names_and_dialects.shape, unique_names.shape

((2934, 2), (2727,))

In [87]:
grouped = [
    {
        "name": name,
        "group": cons_pl_processed[language_names_and_dialects.LanguageName == name][0][0].shape,
    }
    for name in unique_names
]
grouped

[{'name': '!XU', 'group': ()},
 {'name': '!Xóõ', 'group': ()},
 {'name': '!Xun', 'group': ()},
 {'name': 'ABIPON', 'group': ()},
 {'name': 'ACHE', 'group': ()},
 {'name': 'ACHUMAWI', 'group': ()},
 {'name': 'ACOMA', 'group': ()},
 {'name': 'ADZERA', 'group': ()},
 {'name': 'AGHEM', 'group': ()},
 {'name': 'AHTNA', 'group': ()},
 {'name': 'AINU', 'group': ()},
 {'name': 'AIZI', 'group': ()},
 {'name': 'AKAN', 'group': ()},
 {'name': 'AKAWAIO', 'group': ()},
 {'name': 'ALABAMA', 'group': ()},
 {'name': 'ALAMBLAK', 'group': ()},
 {'name': 'ALAWA', 'group': ()},
 {'name': 'ALBANIAN', 'group': ()},
 {'name': 'ALEUT', 'group': ()},
 {'name': 'ALLADIAN', 'group': ()},
 {'name': 'AMAHUACA', 'group': ()},
 {'name': 'AMELE', 'group': ()},
 {'name': 'AMHARIC', 'group': ()},
 {'name': 'AMO', 'group': ()},
 {'name': 'AMUESHA', 'group': ()},
 {'name': 'AMUZGO', 'group': ()},
 {'name': 'ANDAMANESE', 'group': ()},
 {'name': 'ANDOKE', 'group': ()},
 {'name': 'ANGAATIHA', 'group': ()},
 {'name': 'ANGA

In [88]:
cons_pl_selected = np.array(
    [np.max(cons_pl_processed[language_names == name][:], axis=0) for name in unique_names]
)
cons_npl_selected = np.array(
    [
        np.max(cons_npl_processed[language_names == name], axis=0)
        for name in unique_names
    ]
)
cons_coart_selected = np.array(
    [
        np.max(cons_coart_processed[language_names == name], axis=0)
        for name in unique_names
    ]
)
vowels_selected = np.array(
    [np.max(vowels_processed[language_names == name], axis=0) for name in unique_names]
)

# sense check
for name in unique_names:
    assert (
        cons_pl_processed[language_names == name].shape[1:]
        == cons_pl_selected.shape[1:]
    )
    assert (
        cons_npl_processed[language_names == name].shape[1:]
        == cons_npl_selected.shape[1:]
    )
    assert (
        cons_coart_processed[language_names == name].shape[1:]
        == cons_coart_selected.shape[1:]
    )
    assert (
        vowels_processed[language_names == name].shape[1:] == vowels_selected.shape[1:]
    )

(
    cons_pl_selected.shape,
    cons_npl_selected.shape,
    vowels_selected.shape,
    cons_coart_selected.shape,
)

NameError: name 'language_names' is not defined