In [308]:
from pathlib import Path
import requests
from tqdm import tqdm

phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

# Download the PHOIBLE inventory once; later runs reuse the local copy.
if not Path(phobile_file_path).exists():
    # The target directory is not guaranteed to exist on a fresh checkout.
    Path(phobile_file_path).parent.mkdir(parents=True, exist_ok=True)

    # Stream into a temporary file and rename it at the end, so an interrupted
    # download never leaves a truncated CSV that the exists() check above
    # would mistake for a complete one.
    partial_file_path = Path(phobile_file_path + ".part")

    with requests.get(phoible_data_url, stream=True, timeout=30) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the CSV.
        response.raise_for_status()

        with open(partial_file_path, "wb") as fh:
            # iter_content() defaults to 1-byte chunks; use a sensible size.
            for chunk in tqdm(response.iter_content(chunk_size=64 * 1024), unit="chunk"):
                fh.write(chunk)

    partial_file_path.replace(phobile_file_path)

In [309]:
import pandas as pd
import numpy as np

# Parse the downloaded PHOIBLE table.
phoible_df = pd.read_csv(phobile_file_path, encoding="utf-8", engine="python")

# Keep only the columns needed to build per-inventory phoneme sets. Missing
# dialects are labelled "none" so those rows still form a group of their own
# (groupby would otherwise discard rows whose key is NaN).
language_dialect_phoneme_df = phoible_df[["LanguageName", "SpecificDialect", "Phoneme"]].fillna(
    {"SpecificDialect": "none"}
)
phobile_by_language_dialect_df = language_dialect_phoneme_df.groupby(
    ["LanguageName", "SpecificDialect"]
)

# Number of distinct (language, dialect) groups.
num_lang = phobile_by_language_dialect_df.ngroups

In [310]:
# util

def create_symbol_matrix(symbol_type_df):
    symbol_type_long_df = symbol_type_df.map(lambda c: c + "ː" if type(c) ==  "str" else c)
    symbols = set(symbol_type_df.stack().replace("", np.nan).dropna().unique().tolist())
    symbols_long = set([c + "ː" for c in symbols])

    def map_symbols(df):
        valid_symbols = set(df.Phoneme.to_list()).intersection(symbols)
        valid_symbols_long = set(data.Phoneme.to_list()).intersection(symbols_long)

        standard = np.where(symbol_type_df.isin(valid_symbols), 1., 0.)
        long = np.where(symbol_type_long_df.isin(valid_symbols_long), 1., 0.)
        return np.stack([standard, long], axis=2)

    return map_symbols

Generate pulmonic consonants data.

In [311]:
# Read the pulmonic consonant chart as strings. The first CSV column is the
# row index, empty cells stay as "" and only the literal "-1" is parsed as NaN.
consonants_plumonic_df = pd.read_csv(
    "./data/consonants_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)

# Long variant of the chart: append the length mark to every string cell.
consonants_plumonic_long_df = consonants_plumonic_df.map(
    lambda cell: cell + "ː" if isinstance(cell, str) else cell
)
consonants_plumonic_long_df

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,pː,bː,ː,ː,t̪ː,d̪ː,tː,dː,ː,ː,...,cː,ɟː,kː,ɡː,qː,ɢː,ː,,ʔː,
Nasal,ː,mː,ː,ɱː,ː,n̪ː,ː,nː,ː,ː,...,ː,ɲː,ː,ŋː,ː,ɴː,,,,
Trill,ː,ʙː,ː,ː,ː,r̪ː,ː,rː,ː,ː,...,ː,ː,,,ː,ʀː,ː,ː,,
Tap or Flap,ː,ː,ː,ⱱː,ː,ɾ̪ː,ː,ɾː,ː,ː,...,ː,ː,,,ː,ː,ː,ː,,
Fricative,ɸː,βː,fː,vː,θː,ðː,sː,zː,ʃː,ʒː,...,çː,ʝː,xː,ɣː,χː,ʁː,ħː,ʕː,hː,ɦː
Lateral fricative,,,,,ː,ː,ɬː,ɮː,ː,ː,...,ː,ː,ː,ː,ː,ː,,,,
Approximant,ː,ː,ː,ʋː,ː,ː,ː,ɹː,ː,ː,...,ː,jː,ː,ɰː,ː,ː,ː,ː,,
Lateral approximant,,,,,ː,l̪ː,ː,lː,ː,ː,...,ː,ʎː,ː,ʟː,ː,ː,,,,


In [312]:
# sense check
# The chart symbols and their long forms do not depend on the language, so
# build both lookup sets once rather than once per group.
symbols = set(consonants_plumonic_df.stack().replace("", np.nan).dropna().unique().tolist())
symbols_long = {c + "ː" for c in symbols}

for (language, dialect), data in phobile_by_language_dialect_df:
    phonemes = set(data.Phoneme.to_list())

    valid_symbols = phonemes & symbols
    valid_symbols_long = phonemes & symbols_long

    standard = np.where(consonants_plumonic_df.isin(valid_symbols), 1., 0.)
    long = np.where(consonants_plumonic_long_df.isin(valid_symbols_long), 1., 0.)
    result = np.stack([standard, long], axis=2)

    # The number of cells set should equal the number of matched phonemes.
    expected = len(valid_symbols | valid_symbols_long)
    assert result.sum() == expected, (
        f"{language} ({dialect}): {result.sum()} cells set, expected {expected}"
    )

In [313]:
# Encode every (language, dialect) group against the pulmonic chart, then
# stack the per-group matrices along a new leading axis.
build_plumonic_matrix = create_symbol_matrix(consonants_plumonic_df)
plumonic_matrices = phobile_by_language_dialect_df.apply(build_plumonic_matrix, include_groups=False)
consonants_plumonic_npy = np.stack(plumonic_matrices.to_numpy())

# One matrix per group, each with the chart's shape plus the standard/long layer.
expected_plumonic_shape = (num_lang, *consonants_plumonic_df.shape, 2)
assert consonants_plumonic_npy.shape == expected_plumonic_shape

np.save("./data/consonants_plumonic.npy", consonants_plumonic_npy)

Generate non-pulmonic consonants data.

In [314]:
# Read the non-pulmonic consonant chart as strings. The first CSV column is the
# row index, empty cells stay as "" and only the literal "-1" is parsed as NaN.
consonants_non_plumonic_df = pd.read_csv(
    "./data/consonants_non_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)

# Long variant of the chart: append the length mark to every string cell.
consonants_non_plumonic_long_df = consonants_non_plumonic_df.map(
    lambda cell: cell + "ː" if isinstance(cell, str) else cell
)
consonants_non_plumonic_df

Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


In [315]:
# sense check
# The chart symbols and their long forms do not depend on the language, so
# build both lookup sets once rather than once per group.
symbols = set(consonants_non_plumonic_df.stack().replace("", np.nan).dropna().unique().tolist())
symbols_long = {c + "ː" for c in symbols}

for (language, dialect), data in phobile_by_language_dialect_df:
    # Build the group's phoneme set once and reuse it for both lookups.
    phonemes = set(data.Phoneme.to_list())

    valid_symbols = phonemes & symbols
    valid_symbols_long = phonemes & symbols_long

    standard = np.where(consonants_non_plumonic_df.isin(valid_symbols), 1., 0.)
    long = np.where(consonants_non_plumonic_long_df.isin(valid_symbols_long), 1., 0.)
    result = np.stack([standard, long], axis=2)

    # The number of cells set should equal the number of matched phonemes.
    expected = len(valid_symbols | valid_symbols_long)
    assert result.sum() == expected, (
        f"{language} ({dialect}): {result.sum()} cells set, expected {expected}"
    )

In [316]:
# Encode every (language, dialect) group against the non-pulmonic chart, then
# stack the per-group matrices along a new leading axis.
build_non_plumonic_matrix = create_symbol_matrix(consonants_non_plumonic_df)
non_plumonic_matrices = phobile_by_language_dialect_df.apply(build_non_plumonic_matrix, include_groups=False)
consonants_non_plumonic_npy = np.stack(non_plumonic_matrices.to_numpy())

# One matrix per group, each with the chart's shape plus the standard/long layer.
expected_non_plumonic_shape = (num_lang, *consonants_non_plumonic_df.shape, 2)
assert consonants_non_plumonic_npy.shape == expected_non_plumonic_shape

np.save("./data/consonants_non_plumonic.npy", consonants_non_plumonic_npy)

Generate vowel data.

In [317]:
# Inspect the vowel phonemes PHOIBLE lists for Welsh.
is_welsh_vowel = (phoible_df.LanguageName == "Welsh") & (phoible_df.SegmentClass == "vowel")
phoible_df.loc[is_welsh_vowel, ["Phoneme"]]

Unnamed: 0,Phoneme
86146,a
86147,ai
86148,au
86149,ɑː
86152,eː
86153,ə
86154,əi
86155,əu
86156,ɛ
86157,ɛu
