In [43]:
from pathlib import Path
import requests
from tqdm import tqdm

phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

if not Path(phobile_file_path).exists():
    response = requests.get(phoible_data_url, stream=True)

    with open(phobile_file_path, "wb") as fh:
        for data in tqdm(response.iter_content()):
            fh.write(data)

In [44]:
import pandas as pd
import numpy as np

phoible_df = pd.read_csv(phobile_file_path, encoding="utf-8", engine="python")

In [45]:
consonants_plumonic_df = pd.read_csv("./consonants_plumonic.csv", dtype=str, index_col=[0], keep_default_na=False, na_values=["-1"])
consonants_plumonic = set(consonants_plumonic_df.stack().replace("", np.nan).dropna().unique().tolist())
consonants_plumonic_df

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


In [46]:
welsh_plumonics = phoible_df[(phoible_df.LanguageName == "Welsh") & (phoible_df.Phoneme.isin(consonants_plumonic))][['Phoneme', 'SegmentClass']]
welsh_plumonics.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19 entries, 86150 to 86193
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Phoneme       19 non-null     object
 1   SegmentClass  19 non-null     object
dtypes: object(2)
memory usage: 456.0+ bytes


In [47]:
phobile_consonants_df = phoible_df[phoible_df["SegmentClass"] == "consonant"][["LanguageName", "SpecificDialect", "Phoneme"]]
phobile_consonants_df.fillna({"SpecificDialect": "none"}, inplace=True)

grouped_consonants = phobile_consonants_df.groupby(["LanguageName", "SpecificDialect"])

In [54]:
# sense check
for (l, d), data in grouped_consonants:
    plumonics = set(data.Phoneme.to_list()).intersection(consonants_plumonic)
    result = np.where(consonants_plumonic_df.isin(plumonics), 1., 0.)
    assert result.sum() == len(plumonics)

In [74]:
def np_map(df):
    plumonics = set(df.Phoneme.to_list()).intersection(consonants_plumonic)
    matrix = np.where(consonants_plumonic_df.isin(plumonics), 1., 0.)
    return matrix
    
mapped = grouped_consonants.apply(np_map, include_groups=False).to_numpy()
print(type(mapped))
consonants_plumonic_npy = np.array(mapped)

print(consonants_plumonic_npy.shape)
consonants_plumonic_npy

<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class 'numpy.ndarray'>
(8, 22)
<class '

array([array([[1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 1.,
               0., 0., 0., 0., 0., 0.],
              [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
               0., 0., 0., 0., 0., 0.],
              [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
               0., 0., 0., 0., 0., 0.],
              [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
               0., 0., 0., 0., 0., 0.],
              [0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 0., 0., 0., 0., 1., 0.,
               0., 0., 0., 0., 0., 1.],
              [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
               0., 0., 0., 0., 0., 0.],
              [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
               0., 0., 0., 0., 0., 0.],
              [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
               0., 0., 0., 0., 0., 0.]])                                      ,
       a

In [None]:
# sense check
for (l, d), data in grouped_consonants:
    plumonics = set(data.Phoneme.to_list()).intersection(consonants_plumonic)
    # result = np.where(consonants_plumonic_df.isin(lang_consonants), 1, 0)
    result = consonants_plumonic_df.isin(plumonics).astype(int)