The [Phoible](https://phoible.org/) dataset contains phoneme inventories for thousands of languages and dialects. Many languages/dialects have multiple Phoible records. Here, I'm mapping the data against pre-prepared IPA phoneme tables, then selecting one sample per table per language so that each language is only represented once in the final dataset (to avoid bias by oversampling).



In [561]:
from pathlib import Path
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display


phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

VALIDATE_RESULTS = True

# Download the Phoible CSV once and cache it locally; later runs reuse the file.
if not Path(phobile_file_path).exists():
    download_target = Path(phobile_file_path)
    # Write to a sibling temp file first so an interrupted download never
    # leaves a truncated CSV that the exists() check above would then trust.
    download_tmp = download_target.with_name(download_target.name + ".part")

    # open() does not create missing directories.
    download_target.parent.mkdir(parents=True, exist_ok=True)

    response = requests.get(phoible_data_url, stream=True)
    # Fail loudly instead of caching an HTTP error page as if it were the data.
    response.raise_for_status()

    with open(download_tmp, "wb") as fh:
        # iter_content() defaults to 1-byte chunks; request a real buffer size.
        # The progress bar therefore counts 64 KiB chunks, not bytes.
        for data in tqdm(response.iter_content(chunk_size=64 * 1024)):
            fh.write(data)

    download_tmp.replace(download_target)

In [562]:
# Read the diacritics table with every column as a string.
ipa_diacritics_df = pd.read_csv(
    "./data/ipa_diacritics.csv",
    engine="python",
    encoding="utf-8",
    dtype=str,
)

# The "suffix" column is what the later cells test phoneme characters against.
valid_diacritics = ipa_diacritics_df["suffix"]

display(valid_diacritics.head(3))
ipa_diacritics_df.shape

0    NaN
1      ː
2      ˑ
Name: suffix, dtype: object

(38, 2)

Load the Phoible data and split the IPA diacritic suffixes from all phonemes so the raw phoneme can be matched against the pre-prepared IPA phoneme tables.

In [563]:
phoible_df = pd.read_csv(phobile_file_path, encoding="utf-8", engine="python")
dialect_phonemes_df = (
    phoible_df[["ISO6393", "LanguageName", "SpecificDialect", "Phoneme"]]
    .copy()
    # Give records without a dialect an explicit label so they still form a
    # group when grouping by (language, dialect).
    .fillna({"SpecificDialect": "none"})
)

# Build the lookup once: set membership is O(1), whereas testing against
# `valid_diacritics.values` rescans the whole array for every character.
# Missing entries are dropped because they can never equal a character.
diacritic_chars = set(valid_diacritics.dropna())


def _strip_diacritics(phoneme):
    """Return *phoneme* with every known diacritic character removed."""
    return "".join(ch for ch in phoneme if ch not in diacritic_chars)


def _only_diacritics(phoneme):
    """Return just the known diacritic characters of *phoneme*, in order."""
    return "".join(ch for ch in phoneme if ch in diacritic_chars)


# Iterate the characters of each phoneme directly instead of splitting on an
# empty pattern and filtering out the empty strings that produces.
dialect_phonemes_df["symbol"] = dialect_phonemes_df.Phoneme.apply(_strip_diacritics)
dialect_phonemes_df["diacritics"] = dialect_phonemes_df.Phoneme.apply(
    _only_diacritics
)
dialect_phonemes_df["diacritic_count"] = dialect_phonemes_df.diacritics.str.len()

display(dialect_phonemes_df.shape)
dialect_phonemes_df.head(3)

(105467, 7)

Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,diacritic_count
0,kor,Korean,none,a,a,,0
1,kor,Korean,none,aː,a,ː,1
2,kor,Korean,none,æ,æ,,0


In [564]:
# Only the first 500 phoneme rows are grouped here.
sample_rows = dialect_phonemes_df.head(500)
lang_by_dialect_df = sample_rows.groupby(["LanguageName", "SpecificDialect"])

# One index entry per (language, dialect) group; level 0 is the language name.
group_index = lang_by_dialect_df.first().index
language_names = group_index.get_level_values(0).to_series().reset_index(drop=True)

# Counts groups, so a language with several dialects contributes several times.
num_lang = len(language_names)

print("num_lang", num_lang)

num_lang 11


Load the pre-prepared IPA tables.

In [565]:
def _read_ipa_table(csv_path):
    """Load one pre-prepared IPA table from *csv_path*.

    The first CSV column becomes the row index and every cell is read as a
    string. Default NA detection is switched off, so blank cells stay ``""``;
    only the literal ``-1`` is turned into NaN.
    """
    return pd.read_csv(
        csv_path,
        dtype=str,
        index_col=[0],
        keep_default_na=False,
        na_values=["-1"],
    )


cons_pl_tbl_df = _read_ipa_table("./data/consonants_plumonic.csv")
display(cons_pl_tbl_df)

cons_npl_tbl_df = _read_ipa_table("./data/consonants_non_plumonic.csv")
display(cons_npl_tbl_df)

cons_coart_tbl_df = _read_ipa_table("./data/consonants_coarticulated.csv")
# A bare `(cons_coart_tbl_df)` in the middle of a cell renders nothing; show
# this table explicitly like the two above.
display(cons_coart_tbl_df)

vowels_tbl_df = _read_ipa_table("./data/vowels.csv")
vowels_tbl_df

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


Unnamed: 0,Front unrounded,Front rounded,Near-front unrounded,Near-front rounded,Central unrounded,Central rounded,Near-back unrounded,Near-back rounded,Back unrounded,Back rounded
Close,i,y,,,ɨ,ʉ,,,ɯ,u
Near-close,,,ɪ,ʏ,,,,ʊ,,
Close-mid,e,ø,,,ɘ,ɵ,,,ɤ,o
Mid,e̞,ø̞,,,ə,,,,ɤ̞,o̞
Open-mid,ɛ,œ,,,ɜ,ɞ,,,ʌ,ɔ
Near-open,æ,,,ɐ,,,,,,
Open,a,ɶ,,,ä,,,,ɑ,ɒ


In [566]:
# do we have duplicates?

# Count how often each non-empty phoneme occurs across all four IPA tables.
table_phonemes_counts = (
    pd.concat(
        [
            cons_pl_tbl_df.stack(),
            cons_npl_tbl_df.stack(),
            cons_coart_tbl_df.stack(),
            vowels_tbl_df.stack(),
        ]
    )
    # Blank cells are stored as "" in the tables; discard them before counting.
    .replace("", np.nan)
    .dropna()
    .value_counts()
)

# Selecting the counts equal to 1 and then calling .all() is always true, so
# it can never flag a duplicate. Look for counts above 1 instead.
duplicated_phonemes = table_phonemes_counts[table_phonemes_counts > 1]
assert duplicated_phonemes.empty, (
    "phonemes listed more than once across the IPA tables: "
    f"{duplicated_phonemes.to_dict()}"
)

table_phonemes = table_phonemes_counts.index.to_series().reset_index(drop=True)

table_phonemes.head(3)

0      p
1    t͡p
2     ᶑ̥
dtype: object

In [567]:
# split tables to separate phonemes with/without specific diacritics


# def map_diacritic(diacritics, keep_diacritic=True):
#     def mapper(p):
#         if keep_diacritic:
#             return p if any(c in diacritics for c in p) else ""
#         else:
#             return "" if any(c in diacritics for c in p) else p

#     return mapper


# def split_table(tbl, diacritics):
#     w = tbl.replace(np.nan, "").map(map_diacritic(diacritics, keep_diacritic=True))
#     wo = tbl.replace(np.nan, "").map(map_diacritic(diacritics, keep_diacritic=False))
#     return w, wo


# cons_pl_w_diacritic_tbl_df, cons_pl_wo_diacritic_tbl_df = split_table(
#     cons_pl_tbl_df, valid_diacritics
# )
# cons_npl_w_diacritic_tbl_df, cons_npl_wo_diacritic_tbl_df = split_table(
#     cons_npl_tbl_df, valid_diacritics
# )
# cons_coart_w_diacritic_tbl_df, cons_coart_wo_diacritic_tbl_df = split_table(
#     cons_coart_tbl_df, valid_diacritics
# )
# vowels_w_diacritic_tbl_df, vowels_wo_diacritic_tbl_df = split_table(
#     vowels_tbl_df, valid_diacritics
# )

# display(vowels_w_diacritic_tbl_df)
# display(vowels_wo_diacritic_tbl_df)

In [592]:
def map_diacritic(diacritics, keep_diacritic=True):
    """Create a cell filter that blanks phonemes based on diacritic presence.

    Args:
        diacritics: Object exposing ``.values`` (e.g. a pandas Series) holding
            the known diacritic characters.
        keep_diacritic: If true, the filter keeps only phonemes that contain
            at least one known diacritic; otherwise it keeps only phonemes
            that contain none.

    Returns:
        A one-argument function mapping a phoneme string either to itself or
        to ``""``.
    """

    def has_known_diacritic(phoneme):
        # Stop at the first character that is a known diacritic.
        for char in phoneme:
            if char in diacritics.values:
                return True
        return False

    def mapper(phoneme):
        marked = has_known_diacritic(phoneme)
        wanted = marked if keep_diacritic else not marked
        return phoneme if wanted else ""

    return mapper


def map_tbl_phoneme(tbl, diacritics, lang_tbl):
    """Create a cell mapper yielding an all-zero or all-one vector.

    Args:
        tbl: Not used by the current implementation.
        diacritics: Sized collection; each vector has ``len(diacritics) + 1``
            entries.
        lang_tbl: Not used by the current implementation.

    Returns:
        A one-argument function returning ``np.zeros(...)`` for the empty
        string and ``np.ones(...)`` for anything else.
    """

    def mapper(p):
        fill = np.zeros if p == "" else np.ones
        return fill(len(diacritics) + 1)

    return mapper


def create_symbol_matrix(tbl, diacritics):
    """Build a per-language mapper that marks which cells of an IPA table occur.

    Args:
        tbl: IPA table as a DataFrame of phoneme strings. NaN cells are
            treated as empty.
        diacritics: pandas Series of the known diacritic characters.

    Returns:
        A function that takes one language's phoneme rows (a DataFrame with
        ``Phoneme``, ``symbol`` and ``diacritics`` columns) and returns a float
        array with the same shape as ``tbl``: 1.0 where the cell's phoneme is
        present in that language, 0.0 elsewhere. The function also prints
        shape/sum diagnostics for the intermediate arrays.
    """
    filled_tbl = tbl.replace(np.nan, "")
    # tbl_w keeps only cells that carry a known diacritic; tbl_wo keeps only
    # cells that carry none. Cells filtered out of either become "".
    tbl_w = filled_tbl.map(map_diacritic(diacritics, keep_diacritic=True))
    tbl_wo = filled_tbl.map(map_diacritic(diacritics, keep_diacritic=False))

    def map_table_diacritics(chars):
        """Return a 0/1 float vector flagging which known diacritics are in *chars*."""
        return diacritics.isin(list(chars)).values.astype(float)

    def to_float_array(cells):
        """Turn an object array whose elements are vectors into a float array."""
        return np.array(cells.tolist(), dtype=np.float64)

    # Depends only on the table, so compute it once rather than per language.
    phoneme_variants_tbl = to_float_array(tbl_w.map(map_table_diacritics).to_numpy())

    def map_phonemes(df):
        new_approach_tbl = to_float_array(
            filled_tbl.map(map_tbl_phoneme(tbl, diacritics, df)).to_numpy()
        )
        print(new_approach_tbl.shape)
        print(new_approach_tbl[0, 0].shape)

        # "" stands for a blank table cell; drop it from the lookup sets so a
        # phoneme whose symbol is empty cannot mark every blank cell as present.
        present_phonemes = set(df.Phoneme) - {""}
        present_symbols = set(df.symbol) - {""}
        # Diacritic-bearing cells must match the full phoneme; plain cells
        # match on the phoneme with its diacritics stripped.
        exists = np.where(
            tbl_w.isin(present_phonemes) | tbl_wo.isin(present_symbols), 1.0, 0.0
        )

        phoneme_variants_lang = to_float_array(
            df.diacritics.map(map_table_diacritics).to_numpy()
        )

        print(
            "phoneme_variants_tbl",
            phoneme_variants_tbl.shape,
            phoneme_variants_tbl.sum(),
        )

        print(
            "phoneme_variants_lang",
            phoneme_variants_lang.shape,
            phoneme_variants_lang.sum(),
        )
        print("exists", exists.shape, exists.sum())
        print("diacritics", diacritics.shape)

        return exists

    return map_phonemes

In [591]:
# Run the pulmonic-consonant mapper once per (language, dialect) group, then
# stack the per-group results along a new leading axis.
per_group_matrices = lang_by_dialect_df.apply(
    create_symbol_matrix(cons_pl_tbl_df, valid_diacritics),
    include_groups=False,
)
cons_pl_npy = np.stack(per_group_matrices)
# assert cons_pl_npy.shape == (num_lang, *cons_pl_tbl_df.shape, 3)
print("cons_pl_npy", type(cons_pl_npy), cons_pl_npy.shape, cons_pl_npy[0, 0, 0])

cons_pl_npy[0]

(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (53, 38) 24.0
exists (8, 22) 23.0
diacritics (38,)
(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (35, 38) 18.0
exists (8, 22) 17.0
diacritics (38,)
(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (56, 38) 55.0
exists (8, 22) 28.0
diacritics (38,)
(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (32, 38) 16.0
exists (8, 22) 13.0
diacritics (38,)
(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (8, 38) 6.0
exists (8, 22) 2.0
diacritics (38,)
(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (40, 38) 20.0
exists (8, 22) 11.0
diacritics (38,)
(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (34, 38) 19.0
exists (8, 22) 20.0
diacritics (38,)
(8, 22, 39)
(39,)
phoneme_variants_tbl (8, 22, 38) 6.0
phoneme_variants_lang (68, 38) 68.0
exists (8, 22) 2

array([[1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 1., 1., 0., 0., 1., 1.,
        1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0.,
        1., 1., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]])

In [570]:
# Toy example: expand every boolean cell of a grouped frame into a length-3
# vector and collect the vectors into one (rows, columns, 3) float array.
example_df = pd.DataFrame(
    [["A", True, True], ["A", True, False], ["B", False, True], ["B", False, False]],
    columns=["grp", "col1", "col2"],
)


def mapper(val):
    """Expand a boolean cell into a length-3 vector of ones (true) or zeros."""
    # Accept NumPy booleans as well: `np.True_ is True` is False, so an
    # identity check would silently map them to zeros.
    is_true = isinstance(val, (bool, np.bool_)) and bool(val)
    return np.ones(3) if is_true else np.zeros(3)


# Same-shaped frame as the non-key columns, each cell holding a length-3 vector.
vector_cells = example_df.groupby(["grp"]).transform(lambda df: df.map(mapper))

# Exploding both columns and reshaping the flat (12, 2) array to (4, 2, 3)
# interleaves col1 and col2 values within a row, e.g. (True, False) becomes
# [[1, 0, 1], [0, 1, 0]]. Stacking the per-cell vectors keeps each cell's
# vector contiguous on the last axis and needs no hard-coded shape.
result = np.array(vector_cells.to_numpy().tolist(), dtype=np.float64)

print(result.shape)

(4, 2, 3)
