The [Phoible](https://phoible.org/) dataset contains phoneme inventories for thousands of languages and dialects. Many languages/dialects have multiple Phoible records. Here, I'm mapping the data against pre-prepared IPA phoneme tables, then selecting one sample per table per language so that each language is only represented once in the final dataset (to avoid bias by oversampling).



In [1]:
from pathlib import Path
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import numpy as np

# Pinned (v2.0) release of the Phoible CSV and the local path it is cached at.
phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

VALIDATE_RESULTS = True

# Download the dataset only when there is no local copy yet.
if not Path(phobile_file_path).exists():
    # open() does not create missing parent directories.
    Path(phobile_file_path).parent.mkdir(parents=True, exist_ok=True)

    # Write to a temporary sibling file first: the existence check above is the
    # only cache test, so an interrupted download must not leave a truncated
    # file under the final name.
    partial_file_path = Path(phobile_file_path + ".part")

    response = requests.get(phoible_data_url, stream=True)
    # Fail loudly on an HTTP error instead of saving the error body as the CSV.
    response.raise_for_status()

    with open(partial_file_path, "wb") as fh:
        # iter_content() defaults to chunk_size=1 (one byte per iteration);
        # request sizeable chunks so the loop and progress bar stay cheap.
        for data in tqdm(response.iter_content(chunk_size=64 * 1024)):
            fh.write(data)

    # Publish the finished download under its final name.
    partial_file_path.replace(phobile_file_path)

In [2]:
# Load the IPA diacritics table, reading every column as text.
ipa_diacritics_df = pd.read_csv(
    "./data/ipa_diacritics.csv",
    encoding="utf-8",
    engine="python",
    dtype=str,
)

# Distinct values of the `suffix` column, kept as a set for membership checks.
valid_suffixes = set(ipa_diacritics_df["suffix"])

# Notebook display: (rows, columns) of the diacritics table.
ipa_diacritics_df.shape

(38, 2)

In [3]:
# Load the full Phoible table from the locally cached CSV.
phoible_df = pd.read_csv(phobile_file_path, engine="python", encoding="utf-8")

# Keep only the columns needed downstream, as an independent copy.
dialect_phonemes_df = phoible_df.loc[
    :, ["ISO6393", "LanguageName", "SpecificDialect", "Phoneme", "GlyphID"]
].copy()
# Records without a dialect get the placeholder "none" -- presumably so they
# still form a group below (groupby skips missing keys by default).
dialect_phonemes_df = dialect_phonemes_df.fillna({"SpecificDialect": "none"})

# One group per (language, dialect) pair. Despite the `_df` suffix this is a
# GroupBy object, not a DataFrame.
lang_by_dialect_df = dialect_phonemes_df.groupby(["LanguageName", "SpecificDialect"])

# Language name of each distinct (language, dialect) pair, in order of first
# appearance in the data.
language_names = (
    dialect_phonemes_df.drop_duplicates(subset=["LanguageName", "SpecificDialect"])
    .loc[:, ["LanguageName", "SpecificDialect"]]
    .to_numpy()[:, 0]
)

# Number of (language, dialect) groups.
num_lang = len(lang_by_dialect_df)

print(f"num_lang {num_lang}")
print(dialect_phonemes_df.shape)

# Notebook display: preview of the first rows.
dialect_phonemes_df.head(3)

num_lang 2949
(105467, 5)


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,GlyphID
0,kor,Korean,none,a,0061
1,kor,Korean,none,aː,0061+02D0
2,kor,Korean,none,æ,00E6


In [11]:
# Split every phoneme into its base symbol(s) and its diacritics, one Unicode
# code point at a time: code points listed in `valid_suffixes` go to
# `diacritics`, all others to `symbol`. Relative order is preserved in both.
#
# The phoneme string is iterated directly rather than via `.str.split("")`,
# which only works through a regex split on the empty pattern and produces
# empty strings at both ends that then have to be filtered out.
dialect_phonemes_df["symbol"] = dialect_phonemes_df.Phoneme.apply(
    lambda phoneme: "".join(ch for ch in phoneme if ch not in valid_suffixes)
)
dialect_phonemes_df["diacritics"] = dialect_phonemes_df.Phoneme.apply(
    lambda phoneme: "".join(ch for ch in phoneme if ch in valid_suffixes)
)

# Notebook display: preview including the two new columns.
dialect_phonemes_df.head(5)

Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme,GlyphID,symbol,diacritics
0,kor,Korean,none,a,0061,a,
1,kor,Korean,none,aː,0061+02D0,a,ː
2,kor,Korean,none,æ,00E6,æ,
3,kor,Korean,none,æː,00E6+02D0,æ,ː
4,kor,Korean,none,e,0065,e,
