The [Phoible](https://phoible.org/) dataset contains phoneme inventories for thousands of languages and dialects. Many languages/dialects have multiple Phoible records. Here, I'm mapping the data against pre-prepared IPA phoneme tables, then selecting one sample per table per language so that each language is only represented once in the final dataset (to avoid bias from oversampling).

The IPA phoneme tables contain some phonemes with diacritics and some without. They seem to include only 0 or 1 diacritic per phoneme.

The Phoible data also contains phonemes with and without diacritics, but they may contain sequences of multiple diacritics after each phoneme. These need to be split and handled separately.



In [36]:
from pathlib import Path
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display


phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

VALIDATE_RESULTS = True

# Download the Phoible CSV once; later runs reuse the cached copy on disk.
if not Path(phobile_file_path).exists():
    # open() does not create missing directories, so make ./downloads first.
    Path(phobile_file_path).parent.mkdir(parents=True, exist_ok=True)

    response = requests.get(phoible_data_url, stream=True)
    # Fail loudly on an HTTP error instead of caching an error page as the CSV.
    response.raise_for_status()

    # Write to a temporary name and rename at the end, so an interrupted
    # download never leaves a truncated file that the exists() check above
    # would treat as a complete cache on the next run.
    partial_file_path = Path(phobile_file_path + ".part")
    with open(partial_file_path, "wb") as fh:
        # iter_content() yields 1-byte chunks by default; ask for larger ones
        # so the file is not written one byte at a time.
        for data in tqdm(response.iter_content(chunk_size=64 * 1024)):
            fh.write(data)
    partial_file_path.replace(phobile_file_path)

In [37]:
# Table of IPA diacritic suffixes and their descriptions, every column read as str.
ipa_diacritics_df = pd.read_csv(
    "./data/ipa_diacritics.csv",
    encoding="utf-8",
    engine="python",
    dtype=str,
)
# The suffix column is what split_symbol / split_diacritics test characters against.
valid_diacritics = ipa_diacritics_df["suffix"]
print(ipa_diacritics_df.shape)
ipa_diacritics_df.head(3)

(38, 2)


Unnamed: 0,suffix,description
0,,Standard
1,ː,Long
2,ˑ,Half-long


Load the Phoible data and split the IPA diacritic suffixes from all phonemes so the raw phoneme can be matched against the pre-prepared IPA phoneme tables.

In [38]:
# Read the full Phoible dump, then keep only the columns that identify a
# language/dialect plus the phoneme itself.
phoible_df = pd.read_csv(phobile_file_path, encoding="utf-8", engine="python")
phoneme_columns = ["ISO6393", "LanguageName", "SpecificDialect", "Phoneme"]
dialect_phonemes_df = phoible_df[phoneme_columns].copy()
# Rows without a dialect get the placeholder string "none"
# (presumably so a later groupby on SpecificDialect does not drop them).
dialect_phonemes_df = dialect_phonemes_df.fillna({"SpecificDialect": "none"})

def split_symbol(phoneme):
    """Return *phoneme* with every character found in ``valid_diacritics`` removed.

    The characters that remain are joined back together in their original order.
    """
    base_chars = []
    for ch in phoneme:
        # Skip empty entries and anything listed as a diacritic suffix.
        if ch == "" or ch in valid_diacritics.values:
            continue
        base_chars.append(ch)
    return "".join(base_chars)


def split_diacritics(phoneme):
    """Return only the characters of *phoneme* that appear in ``valid_diacritics``.

    The kept characters are joined in the order they occur in *phoneme*.
    """
    diacritic_chars = []
    for ch in phoneme:
        # Keep non-empty characters that are listed as a diacritic suffix.
        if ch != "" and ch in valid_diacritics.values:
            diacritic_chars.append(ch)
    return "".join(diacritic_chars)

def split_phoneme_series(df):
    """Add ``symbol``, ``diacritics`` and ``diacritic_count`` columns to *df*.

    *df* must have a ``Phoneme`` column. The new columns are written onto *df*
    itself (it is modified in place) and the same frame is returned.
    """
    phonemes = df["Phoneme"]
    df["symbol"] = phonemes.apply(split_symbol)
    df["diacritics"] = phonemes.apply(split_diacritics)
    # Number of diacritic characters that were split off each phoneme.
    df["diacritic_count"] = df["diacritics"].str.len()
    return df
    

# Add the symbol / diacritics / diacritic_count columns to the Phoible phonemes.
dialect_phonemes_df = split_phoneme_series(dialect_phonemes_df)

# Show the resulting shape, then the first rows as the cell output.
display(dialect_phonemes_df.shape)
dialect_phonemes_df.head(3)

(105467, 7)

Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,symbol,diacritics,diacritic_count
0,kor,Korean,none,a,a,,0
1,kor,Korean,none,aː,a,ː,1
2,kor,Korean,none,æ,æ,,0


In [39]:
# One group per (LanguageName, SpecificDialect) pair.
lang_by_dialect_df = dialect_phonemes_df.groupby(["LanguageName", "SpecificDialect"])

# Level 0 of the group index is the language name.
# NOTE(review): this gives one entry per (language, dialect) group, so a
# language with several dialects is listed several times and num_lang counts
# groups rather than distinct languages - confirm that this is intended.
group_index = lang_by_dialect_df.first().index
language_names = group_index.get_level_values(0).to_series().reset_index(drop=True)

num_lang = len(language_names)

print("num_lang", num_lang)

num_lang 2949


Load the pre-prepared IPA tables.

In [40]:
def _read_ipa_table(csv_path):
    """Read one pre-prepared IPA table from *csv_path*.

    The first column becomes the row index and all cells are read as str.
    Only the literal "-1" is parsed as missing; empty cells stay as "".
    """
    return pd.read_csv(
        csv_path,
        dtype=str,
        index_col=[0],
        keep_default_na=False,
        na_values=["-1"],
    )


cons_pl_tbl_df = _read_ipa_table("./data/consonants_plumonic.csv")
display(cons_pl_tbl_df)

cons_npl_tbl_df = _read_ipa_table("./data/consonants_non_plumonic.csv")
display(cons_npl_tbl_df)

cons_coart_tbl_df = _read_ipa_table("./data/consonants_coarticulated.csv")
display(cons_coart_tbl_df)

vowels_tbl_df = _read_ipa_table("./data/vowels.csv")
display(vowels_tbl_df)


# [rows, columns] of each table, kept as lists.
cons_pl_tbl_shape = list(cons_pl_tbl_df.shape)
cons_npl_tbl_shape = list(cons_npl_tbl_df.shape)
cons_coart_tbl_shape = list(cons_coart_tbl_df.shape)
vowels_tbl_shape = list(vowels_tbl_df.shape)

cons_pl_tbl_shape, cons_npl_tbl_shape, cons_coart_tbl_shape, vowels_tbl_shape

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


Unnamed: 0,Labial–alveolar,Labial–alveolar_v,Labial–retroflex,Labial–retroflex_v,Labial–palatal,Labial–palatal_v,Labial–velar,Labial–velar_v,Labial–uvular,Labial–uvular_v,Velarized alveolar,Velarized alveolar_v,Uvular–epiglottal,Uvular–epiglottal_v,Palatal-velar,Palatal-velar_v
Nasal,,n͡m,,ɳ͡m,,,,ŋ͡m,,,,,,,,
Plosive,t͡p,d͡b,ʈ͡p,ɖ͡b,,,k͡p,ɡ͡b,q͡p,,,,q͡ʡ,,,
Fricative/approximant,,,,,ɥ̊,ɥ,ʍ,w,,,,,,,ɧ,
Lateral approximant,,,,,,,,,,,,ɫ,,,,
Implosive,,,,,,,ɠ̊͜ɓ̥,ɠ͡ɓ,,,,,,,,
Ejective,,t͡pʼ,,,,,,,,,,,,,,


Unnamed: 0,Front unrounded,Front rounded,Near-front unrounded,Near-front rounded,Central unrounded,Central rounded,Near-back unrounded,Near-back rounded,Back unrounded,Back rounded
Close,i,y,,,ɨ,ʉ,,,ɯ,u
Near-close,,,ɪ,ʏ,,,,ʊ,,
Close-mid,e,ø,,,ɘ,ɵ,,,ɤ,o
Mid,e̞,ø̞,,,ə,,,,ɤ̞,o̞
Open-mid,ɛ,œ,,,ɜ,ɞ,,,ʌ,ɔ
Near-open,æ,,,ɐ,,,,,,
Open,a,ɶ,,,ä,,,,ɑ,ɒ


([8, 22], [7, 8], [6, 16], [7, 10])

In [41]:
# do we have duplicates?

# Flatten the four IPA tables into one Series of phonemes and count how often
# each phoneme occurs. Empty cells ("") are turned into NaN and dropped first.
table_phonemes_counts = (
    pd.concat(
        [
            cons_pl_tbl_df.stack(),
            cons_npl_tbl_df.stack(),
            cons_coart_tbl_df.stack(),
            vowels_tbl_df.stack(),
        ]
    )
    .replace("", np.nan)
    .dropna()
    .value_counts()
)

# Look at every count. Filtering down to the counts equal to 1 before calling
# .all() can never fail, so it would not detect a phoneme listed twice.
duplicated_table_phonemes = table_phonemes_counts[table_phonemes_counts > 1]
assert duplicated_table_phonemes.empty, (
    "phonemes listed more than once in the IPA tables: "
    f"{duplicated_table_phonemes.to_dict()}"
)

# The distinct phonemes themselves, as a Series with a fresh 0..n-1 index.
table_phonemes = table_phonemes_counts.index.to_series().reset_index(drop=True)

table_phonemes.head(3)

0      p
1    t͡p
2     ᶑ̥
dtype: object

In [48]:
# stack and split tables to separate phonemes with/without specific diacritics


def _stack_and_split(table_df):
    """Flatten an IPA table into a one-column "Phoneme" frame and split it.

    The stacked cells get a fresh 0..n-1 index before being passed to
    split_phoneme_series, which adds the symbol/diacritics columns.
    """
    phoneme_column = table_df.stack().reset_index(drop=True)
    return split_phoneme_series(phoneme_column.to_frame("Phoneme"))


cons_pl_tbl_stacked_df = _stack_and_split(cons_pl_tbl_df)
cons_npl_tbl_stacked_df = _stack_and_split(cons_npl_tbl_df)
cons_coart_tbl_stacked_df = _stack_and_split(cons_coart_tbl_df)
vowels_tbl_stacked_df = _stack_and_split(vowels_tbl_df)

cons_pl_tbl_stacked_df

Unnamed: 0,Phoneme,symbol,diacritics,diacritic_count
0,p,p,,0
1,b,b,,0
2,,,,0
3,,,,0
4,t̪,t,̪,1
...,...,...,...,...
139,ʎ,ʎ,,0
140,,,,0
141,ʟ,ʟ,,0
142,,,,0
