In [166]:
from pathlib import Path
import requests
from tqdm import tqdm

phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

# Download the PHOIBLE CSV once; later runs reuse the copy already on disk.
if not Path(phobile_file_path).exists():
    # On a fresh checkout the target directory may not exist yet.
    Path(phobile_file_path).parent.mkdir(parents=True, exist_ok=True)

    # Stream into a sibling ".part" file and rename at the end, so an
    # interrupted download never leaves a truncated CSV that the exists()
    # guard above would accept on the next run.
    partial_path = Path(phobile_file_path + ".part")

    with requests.get(phoible_data_url, stream=True, timeout=60) as response:
        # Fail loudly on 4xx/5xx instead of saving an error page as the CSV.
        response.raise_for_status()

        with open(partial_path, "wb") as fh:
            # iter_content() yields 1-byte chunks by default; ask for larger
            # chunks so the loop does not perform one write per byte.
            for data in tqdm(response.iter_content(chunk_size=64 * 1024)):
                fh.write(data)

    partial_path.replace(phobile_file_path)

In [167]:
import pandas as pd
import numpy as np

# Load the full PHOIBLE table from the downloaded CSV.
phoible_df = pd.read_csv(
    phobile_file_path,
    encoding="utf-8",
    engine="python",
)

# Group the phonemes by (language, dialect). Rows without a dialect get the
# placeholder "none" so they form a group of their own instead of being dropped
# by groupby's NaN-key handling.
phobile_by_language_dialect_df = (
    phoible_df
    .loc[:, ["LanguageName", "SpecificDialect", "Phoneme"]]
    .fillna({"SpecificDialect": "none"})
    .groupby(["LanguageName", "SpecificDialect"])
)

# Number of (language, dialect) groups.
num_lang = len(phobile_by_language_dialect_df)

In [168]:
# util

def create_symbol_matrix(symbol_type_df, symbols):
    """Build a per-group encoder that marks which chart cells a group uses.

    Args:
        symbol_type_df: Chart of symbols (a DataFrame); the returned matrices
            have the same shape as this frame.
        symbols: Collection of symbols that are considered part of the chart.

    Returns:
        A function taking a DataFrame with a ``Phoneme`` column and returning a
        float array shaped like ``symbol_type_df``: 1.0 where the chart cell
        holds a symbol that is both in ``symbols`` and in the group's
        ``Phoneme`` values, 0.0 elsewhere.
    """
    def map_symbols(df):
        group_phonemes = set(df.Phoneme.to_list())
        # Only symbols known to the chart can mark a cell.
        matched = group_phonemes.intersection(symbols)
        cell_is_used = symbol_type_df.isin(matched)
        return cell_is_used.to_numpy().astype(np.float64)

    return map_symbols

Generate pulmonic consonants data.

In [169]:
# Read the pulmonic consonant chart. Everything is kept as text; empty cells
# stay as "" (keep_default_na=False) and only the literal "-1" is read as NaN.
consonants_plumonic_df = pd.read_csv(
    "./data/consonants_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)

# Flatten the chart into the set of distinct symbols it contains, discarding
# empty ("") and missing cells.
consonants_plumonic = set(
    consonants_plumonic_df
    .stack()
    .replace("", np.nan)
    .dropna()
    .unique()
    .tolist()
)

consonants_plumonic_df

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


In [170]:
# Sense check: for every (language, dialect) group, the number of chart cells
# that get marked must equal the number of the group's phonemes found in the
# pulmonic symbol set.
for (language, dialect), data in phobile_by_language_dialect_df:
    valid_symbols = set(data.Phoneme.to_list()).intersection(consonants_plumonic)
    result = np.where(consonants_plumonic_df.isin(valid_symbols), 1., 0.)
    # Name the offending group so a failure can be acted on directly.
    assert result.sum() == len(valid_symbols), (
        f"{language} ({dialect}): {int(result.sum())} chart cells marked "
        f"for {len(valid_symbols)} matched symbols"
    )

In [171]:
# Encode every (language, dialect) group as a 0/1 matrix shaped like the
# pulmonic chart, then stack the matrices along a new leading axis.
consonants_plumonic_npy = np.stack(
    phobile_by_language_dialect_df
    .apply(
        create_symbol_matrix(consonants_plumonic_df, consonants_plumonic),
        include_groups=False,
    )
    .to_numpy()
)

# One matrix per group, each with the chart's own shape.
assert consonants_plumonic_npy.shape == (num_lang,) + consonants_plumonic_df.shape

np.save("./data/consonants_plumonic.npy", consonants_plumonic_npy)

Generate non-pulmonic consonants data.

In [172]:
# Read the non-pulmonic consonant chart. Everything is kept as text; empty
# cells stay as "" (keep_default_na=False) and only the literal "-1" is read
# as NaN.
consonants_non_plumonic_df = pd.read_csv(
    "./data/consonants_non_plumonic.csv",
    dtype=str,
    index_col=[0],
    keep_default_na=False,
    na_values=["-1"],
)

# Flatten the chart into the set of distinct symbols it contains, discarding
# empty ("") and missing cells.
consonants_non_plumonic = set(
    consonants_non_plumonic_df
    .stack()
    .replace("", np.nan)
    .dropna()
    .unique()
    .tolist()
)

consonants_non_plumonic_df

Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


In [173]:
# Sense check: for every (language, dialect) group, the number of chart cells
# that get marked must equal the number of the group's phonemes found in the
# non-pulmonic symbol set.
for (language, dialect), data in phobile_by_language_dialect_df:
    valid_symbols = set(data.Phoneme.to_list()).intersection(consonants_non_plumonic)
    result = np.where(consonants_non_plumonic_df.isin(valid_symbols), 1., 0.)
    # Name the offending group so a failure can be acted on directly.
    assert result.sum() == len(valid_symbols), (
        f"{language} ({dialect}): {int(result.sum())} chart cells marked "
        f"for {len(valid_symbols)} matched symbols"
    )

In [174]:
# Encode every (language, dialect) group as a 0/1 matrix shaped like the
# non-pulmonic chart, then stack the matrices along a new leading axis.
consonants_non_plumonic_npy = np.stack(
    phobile_by_language_dialect_df
    .apply(
        create_symbol_matrix(consonants_non_plumonic_df, consonants_non_plumonic),
        include_groups=False,
    )
    .to_numpy()
)

# One matrix per group, each with the chart's own shape.
assert consonants_non_plumonic_npy.shape == (num_lang,) + consonants_non_plumonic_df.shape

np.save("./data/consonants_non_plumonic.npy", consonants_non_plumonic_npy)

Generate vowel data.

In [177]:
# Inspect the vowel phonemes PHOIBLE lists for Welsh.
phoible_df.loc[
    (phoible_df.LanguageName == "Welsh") & (phoible_df.SegmentClass == "vowel"),
    ["Phoneme"],
]

Unnamed: 0,Phoneme
86146,a
86147,ai
86148,au
86149,ɑː
86152,eː
86153,ə
86154,əi
86155,əu
86156,ɛ
86157,ɛu


In [182]:
# Distinct (segment class, phoneme) pairs whose phoneme ends in the length
# mark "ː". Long sounds haven't been accounted for and will be missing from
# the consonant data.
phoible_df.loc[
    phoible_df.Phoneme.str.endswith("ː"),
    ["SegmentClass", "Phoneme"],
].drop_duplicates()

Unnamed: 0,SegmentClass,Phoneme
1,vowel,aː
3,vowel,æː
5,vowel,eː
7,vowel,ɤː
10,vowel,iː
...,...,...
93916,vowel,ɯ̰̽ː
93920,vowel,y̰ː
93941,consonant,kxʰː
94701,vowel,u̯ɔː
