In [417]:
from pathlib import Path
import requests
from tqdm import tqdm

phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

# Download the PHOIBLE dump only when no local copy exists; later runs reuse it.
if not Path(phobile_file_path).exists():
    # open(..., "wb") does not create missing parent directories.
    Path(phobile_file_path).parent.mkdir(parents=True, exist_ok=True)

    # Write to a sibling ".part" file first: an interrupted download must not
    # leave a truncated CSV that the exists() check above would accept next run.
    partial_path = Path(phobile_file_path + ".part")

    with requests.get(phoible_data_url, stream=True, timeout=60) as response:
        # Fail loudly instead of saving an HTTP error page as the CSV.
        response.raise_for_status()

        with open(partial_path, "wb") as fh:
            # iter_content() defaults to 1-byte chunks; use a real buffer size.
            # The progress bar therefore counts chunks, not bytes.
            for chunk in tqdm(response.iter_content(chunk_size=64 * 1024)):
                fh.write(chunk)

    partial_path.replace(phobile_file_path)

In [418]:
import pandas as pd
import numpy as np

# Parse the downloaded PHOIBLE CSV with the pure-Python parser.
phoible_df = pd.read_csv(phobile_file_path, engine="python", encoding="utf-8")

# Group the phonemes by (language, dialect). Rows without a dialect get the
# placeholder "none" so that they still end up in a group of their own.
phobile_by_language_dialect_df = (
    phoible_df
    .loc[:, ["LanguageName", "SpecificDialect", "Phoneme"]]
    .fillna({"SpecificDialect": "none"})
    .groupby(by=["LanguageName", "SpecificDialect"])
)

# Number of (language, dialect) groups; used to check output shapes later.
num_lang = len(phobile_by_language_dialect_df)

In [419]:
# util

def create_symbol_matrix(symbol_type_df):
    symbol_type_long_df = symbol_type_df.map(lambda c: c + "ː" if type(c) ==  "str" else c)
    symbols = set(symbol_type_df.stack().replace("", np.nan).dropna().unique().tolist())
    symbols_long = set([c + "ː" for c in symbols])

    def map_symbols(df):
        valid_symbols = set(df.Phoneme.to_list()).intersection(symbols)
        valid_symbols_long = set(data.Phoneme.to_list()).intersection(symbols_long)

        standard = np.where(symbol_type_df.isin(valid_symbols), 1., 0.)
        long = np.where(symbol_type_long_df.isin(valid_symbols_long), 1., 0.)
        return np.stack([standard, long], axis=2)

    return map_symbols

Generate pulmonic consonants data.

In [420]:
# Pulmonic consonant chart: the first CSV column becomes the row index and all
# cells are read as strings. Only the literal "-1" is parsed as missing, so
# empty cells stay as empty strings.
consonants_plumonic_df = pd.read_csv(
    "./data/consonants_plumonic.csv",
    index_col=[0],
    dtype=str,
    na_values=["-1"],
    keep_default_na=False,
)

# Same chart with the length mark "ː" appended to every string cell.
consonants_plumonic_long_df = consonants_plumonic_df.map(
    lambda cell: cell + "ː" if type(cell) is str else cell
)
consonants_plumonic_df

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


In [421]:
# sense check
# The chart's symbol sets do not depend on the language, so build them once
# instead of once per group.
symbols = set(consonants_plumonic_df.stack().replace("", np.nan).dropna().unique().tolist())
symbols_long = {c + "ː" for c in symbols}

for (language, dialect), data in phobile_by_language_dialect_df:
    phonemes = set(data.Phoneme.to_list())

    valid_symbols = phonemes.intersection(symbols)
    valid_symbols_long = phonemes.intersection(symbols_long)

    standard = np.where(consonants_plumonic_df.isin(valid_symbols), 1., 0.)
    long = np.where(consonants_plumonic_long_df.isin(valid_symbols_long), 1., 0.)
    result = np.stack([standard, long], axis=2)

    # Every matched phoneme must light up exactly one cell across both channels.
    assert result.sum() == len(valid_symbols | valid_symbols_long), (
        f"flagged cells != matched phonemes for {language!r} / {dialect!r}"
    )

In [422]:
# Encode every (language, dialect) group against the pulmonic chart and stack
# the per-group arrays along a new leading axis.
consonants_plumonic_npy = np.stack(
    phobile_by_language_dialect_df
    .apply(create_symbol_matrix(consonants_plumonic_df), include_groups=False)
    .to_numpy()
)

# One chart-shaped, two-channel array per group.
assert (num_lang, *consonants_plumonic_df.shape, 2) == consonants_plumonic_npy.shape

np.save("./data/consonants_plumonic.npy", consonants_plumonic_npy)
consonants_plumonic_npy.shape

(2949, 8, 22, 2)

Generate non-pulmonic consonants data.

In [423]:
# Non-pulmonic consonant chart: the first CSV column becomes the row index and
# all cells are read as strings. Only the literal "-1" is parsed as missing, so
# empty cells stay as empty strings.
consonants_non_plumonic_df = pd.read_csv(
    "./data/consonants_non_plumonic.csv",
    index_col=[0],
    dtype=str,
    na_values=["-1"],
    keep_default_na=False,
)

# Same chart with the length mark "ː" appended to every string cell.
consonants_non_plumonic_long_df = consonants_non_plumonic_df.map(
    lambda cell: cell + "ː" if type(cell) is str else cell
)
consonants_non_plumonic_df

Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


In [424]:
# sense check
# The chart's symbol sets do not depend on the language, so build them once
# instead of once per group.
symbols = set(consonants_non_plumonic_df.stack().replace("", np.nan).dropna().unique().tolist())
symbols_long = {c + "ː" for c in symbols}

for (language, dialect), data in phobile_by_language_dialect_df:
    phonemes = set(data.Phoneme.to_list())

    valid_symbols = phonemes.intersection(symbols)
    valid_symbols_long = phonemes.intersection(symbols_long)

    standard = np.where(consonants_non_plumonic_df.isin(valid_symbols), 1., 0.)
    long = np.where(consonants_non_plumonic_long_df.isin(valid_symbols_long), 1., 0.)
    result = np.stack([standard, long], axis=2)

    # Every matched phoneme must light up exactly one cell across both channels.
    assert result.sum() == len(valid_symbols | valid_symbols_long), (
        f"flagged cells != matched phonemes for {language!r} / {dialect!r}"
    )

In [425]:
# Encode every (language, dialect) group against the non-pulmonic chart and
# stack the per-group arrays along a new leading axis.
consonants_non_plumonic_npy = np.stack(
    phobile_by_language_dialect_df
    .apply(create_symbol_matrix(consonants_non_plumonic_df), include_groups=False)
    .to_numpy()
)

# One chart-shaped, two-channel array per group.
assert (num_lang, *consonants_non_plumonic_df.shape, 2) == consonants_non_plumonic_npy.shape

np.save("./data/consonants_non_plumonic.npy", consonants_non_plumonic_npy)
consonants_non_plumonic_npy.shape

(2949, 7, 8, 2)

Generate vowel data.

In [426]:
# Vowel chart: the first CSV column becomes the row index and all cells are
# read as strings. Only the literal "-1" is parsed as missing, so empty cells
# stay as empty strings.
vowels_df = pd.read_csv(
    "./data/vowels.csv",
    index_col=[0],
    dtype=str,
    na_values=["-1"],
    keep_default_na=False,
)

# Same chart with the length mark "ː" appended to every string cell.
vowels_long_df = vowels_df.map(
    lambda cell: cell + "ː" if type(cell) is str else cell
)
vowels_df

Unnamed: 0,Front unrounded,Front rounded,Near-front unrounded,Near-front rounded,Central unrounded,Central rounded,Near-back unrounded,Near-back rounded,Back unrounded,Back rounded
Close,i,y,,,ɨ,ʉ,,,ɯ,u
Near-close,,,ɪ,ʏ,,,,ʊ,,
Close-mid,e,ø,,,ɘ,ɵ,,,ɤ,o
Mid,e̞,ø̞,,,ə,,,,ɤ̞,o̞
Open-mid,ɛ,œ,,,ɜ,ɞ,,,ʌ,ɔ
Near-open,æ,,,ɐ,,,,,,
Open,a,ɶ,,,ä,,,,ɑ,ɒ


In [427]:
# sense check
# The chart's symbol sets do not depend on the language, so build them once
# instead of once per group.
symbols = set(vowels_df.stack().replace("", np.nan).dropna().unique().tolist())
symbols_long = {c + "ː" for c in symbols}

for (language, dialect), data in phobile_by_language_dialect_df:
    phonemes = set(data.Phoneme.to_list())

    valid_symbols = phonemes.intersection(symbols)
    valid_symbols_long = phonemes.intersection(symbols_long)

    standard = np.where(vowels_df.isin(valid_symbols), 1., 0.)
    long = np.where(vowels_long_df.isin(valid_symbols_long), 1., 0.)
    result = np.stack([standard, long], axis=2)

    # Every matched phoneme must light up exactly one cell across both channels.
    assert result.sum() == len(valid_symbols | valid_symbols_long), (
        f"flagged cells != matched phonemes for {language!r} / {dialect!r}"
    )

In [428]:
# Encode every (language, dialect) group against the vowel chart and stack the
# per-group arrays along a new leading axis.
vowels_npy = np.stack(
    phobile_by_language_dialect_df
    .apply(create_symbol_matrix(vowels_df), include_groups=False)
    .to_numpy()
)

# One chart-shaped, two-channel array per group.
assert (num_lang, *vowels_df.shape, 2) == vowels_npy.shape

np.save("./data/vowels.npy", vowels_npy)
vowels_npy.shape

(2949, 7, 10, 2)

Merge all the data into a single NumPy array, zero-padding each chart to a common shape.

In [429]:
# Axis-wise maximum over the three array shapes: the common shape to pad to.
max_shape = tuple(
    max(axis_sizes)
    for axis_sizes in zip(
        consonants_plumonic_npy.shape,
        consonants_non_plumonic_npy.shape,
        vowels_npy.shape,
    )
)

def get_padding(max_shape, shape):
    """Return per-axis ``(before, after)`` pad widths growing ``shape`` to ``max_shape``.

    Nothing is added before an axis; the whole difference goes after it.
    Axes are paired positionally, stopping at the shorter of the two inputs.
    """
    widths = []
    for target, current in zip(max_shape, shape):
        widths.append((0, target - current))
    return tuple(widths)

# Zero-pad each array at the end of every axis up to the common shape.
consonants_plumonic_padded_npy = np.pad(
    consonants_plumonic_npy,
    pad_width=get_padding(max_shape, consonants_plumonic_npy.shape),
    mode='constant',
)
consonants_non_plumonic_padded_npy = np.pad(
    consonants_non_plumonic_npy,
    pad_width=get_padding(max_shape, consonants_non_plumonic_npy.shape),
    mode='constant',
)
vowels_padded_npy = np.pad(
    vowels_npy,
    pad_width=get_padding(max_shape, vowels_npy.shape),
    mode='constant',
)


In [430]:
# Combine the three padded charts along a new axis 1, in the order
# pulmonic consonants, non-pulmonic consonants, vowels.
language_phonemes_npy = np.stack(
    (
        consonants_plumonic_padded_npy,
        consonants_non_plumonic_padded_npy,
        vowels_padded_npy,
    ),
    axis=1,
)
language_phonemes_npy.shape

(2949, 3, 8, 22, 2)

In [431]:
# Persist the merged array next to the per-chart .npy files.
np.save(file="./data/language_phonemes.npy", arr=language_phonemes_npy)