In [51]:
from pathlib import Path
import requests
from tqdm import tqdm
import matplotlib.pyplot as plt

%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display


# Remote location of the PHOIBLE v2.0 dump and the local path it is cached at.
phoible_data_url = "https://raw.githubusercontent.com/phoible/dev/v2.0/data/phoible.csv"
phobile_file_path = "./downloads/phoible.csv"

# NOTE(review): not referenced in the cells visible here; presumably consumed
# by later validation cells - confirm.
VALIDATE_RESULTS = True

# Download the dump only when there is no cached copy yet.
if not Path(phobile_file_path).exists():
    download_target = Path(phobile_file_path)
    # open() does not create missing directories, so make sure the cache
    # directory exists on a fresh checkout.
    download_target.parent.mkdir(parents=True, exist_ok=True)
    # Stream into a temporary sibling and rename at the very end, so that an
    # interrupted download never leaves a truncated file that would satisfy
    # the exists() guard above on the next run.
    partial_target = download_target.with_name(download_target.name + ".part")

    with requests.get(phoible_data_url, stream=True, timeout=60) as response:
        # Fail loudly instead of caching an HTTP error page as the CSV.
        response.raise_for_status()
        with open(partial_target, "wb") as fh:
            # iter_content() defaults to chunk_size=1 (one byte per
            # iteration); use 64 KiB chunks instead.
            for data in tqdm(response.iter_content(chunk_size=64 * 1024)):
                fh.write(data)

    partial_target.replace(download_target)

In [52]:
# Parse the cached PHOIBLE dump with the pure-Python CSV engine.
phoible_df = pd.read_csv(
    phobile_file_path,
    engine="python",
    encoding="utf-8",
)
# Show the available column names as a single-column frame.
phoible_df.columns.to_frame(name="col")

Unnamed: 0,col
InventoryID,InventoryID
Glottocode,Glottocode
ISO6393,ISO6393
LanguageName,LanguageName
SpecificDialect,SpecificDialect
GlyphID,GlyphID
Phoneme,Phoneme
Allophones,Allophones
Marginal,Marginal
SegmentClass,SegmentClass


In [53]:
# Columns kept from the raw dump: the language/dialect labels and the phoneme.
phoneme_columns = ["ISO6393", "LanguageName", "SpecificDialect", "Phoneme"]

# Stripping of the diacritics "ː", "ʰ" and "ˀ" from Phoneme is currently
# disabled; if re-enabled it belongs before the de-duplication step below.
phonemes_df = (
    phoible_df[phoneme_columns]
    # A missing dialect label becomes the literal string "none".
    .fillna({"SpecificDialect": "none"})
    # Collapse rows that are identical across all four kept columns.
    .drop_duplicates()
)
display(phonemes_df.head())
phoible_df.shape, phonemes_df.shape

Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme
0,kor,Korean,none,a
1,kor,Korean,none,aː
2,kor,Korean,none,æ
3,kor,Korean,none,æː
4,kor,Korean,none,e


((105467, 48), (103790, 4))

In [54]:
def _read_ipa_table(csv_path):
    """Read one IPA chart CSV as a table of strings.

    The first CSV column becomes the index. Default NA parsing is switched
    off, so only cells containing "-1" are read as NaN and empty cells stay
    as empty strings.

    Parameters
    ----------
    csv_path : str
        Path of the chart CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The chart with every value read as ``str``.
    """
    return pd.read_csv(
        csv_path,
        dtype=str,
        index_col=[0],
        keep_default_na=False,
        na_values=["-1"],
    )


# Load and show each chart; all four files share the same parsing options.
cons_pl_tbl_df = _read_ipa_table("./data/consonants_plumonic.csv")
display(cons_pl_tbl_df)

cons_npl_tbl_df = _read_ipa_table("./data/consonants_non_plumonic.csv")
display(cons_npl_tbl_df)

cons_coart_tbl_df = _read_ipa_table("./data/consonants_coarticulated.csv")
display(cons_coart_tbl_df)

vowels_tbl_df = _read_ipa_table("./data/vowels.csv")
display(vowels_tbl_df)

# Flatten every chart into one Series of symbols, discarding the empty cells.
all_ipa_phonemes = (
    pd.concat(
        [
            chart.stack()
            for chart in (
                cons_pl_tbl_df,
                cons_npl_tbl_df,
                cons_coart_tbl_df,
                vowels_tbl_df,
            )
        ]
    )
    .replace("", np.nan)
    .dropna()
    .reset_index(drop=True)
)

print("all_ipa_phonemes", all_ipa_phonemes.shape)

cons_pl_tbl_df.shape, cons_npl_tbl_df.shape, cons_coart_tbl_df.shape, vowels_tbl_df.shape

Unnamed: 0,Bilabial,Bilabial_v,Labiodental,Labiodental_v,Dental,Dental_v,Alveolar,Alveolar_v,Postalveolar,Postalveolar_v,...,Palatal,Palatal_v,Velar,Velar_v,Uvular,Uvular_v,Pharyngeal,Pharyngeal_v,Glottal,Glottal_v
Plosive,p,b,,,t̪,d̪,t,d,,,...,c,ɟ,k,ɡ,q,ɢ,,,ʔ,
Nasal,,m,,ɱ,,n̪,,n,,,...,,ɲ,,ŋ,,ɴ,,,,
Trill,,ʙ,,,,r̪,,r,,,...,,,,,,ʀ,,,,
Tap or Flap,,,,ⱱ,,ɾ̪,,ɾ,,,...,,,,,,,,,,
Fricative,ɸ,β,f,v,θ,ð,s,z,ʃ,ʒ,...,ç,ʝ,x,ɣ,χ,ʁ,ħ,ʕ,h,ɦ
Lateral fricative,,,,,,,ɬ,ɮ,,,...,,,,,,,,,,
Approximant,,,,ʋ,,,,ɹ,,,...,,j,,ɰ,,,,,,
Lateral approximant,,,,,,l̪,,l,,,...,,ʎ,,ʟ,,,,,,


Unnamed: 0,0,1,2,3,4,5,6,7
Ejective Stop,pʼ,tʼ,ʈʼ,cʼ,kʼ,qʼ,ʡʼ,
Ejective Fricative,fʼ,θʼ,sʼ,ʃʼ,ʂʼ,ɕʼ,xʼ,χʼ
Ejective Affricate,tsʼ,t̠ʃʼ,ʈʂʼ,kxʼ,qχʼ,,,
Ejective Lateral,ɬʼ,tɬʼ,cʎ̝̊ʼ,kʟ̝̊ʼ,,,,
Click,kʘ,kǀ,kǃ,kǂ,kǁ,,,
Implosive Voiced,ɓ,ɗ,ᶑ,ʄ,ɠ,ʛ,,
Implosive Voiceless,ɓ̥,ɗ̥,ᶑ̥,ʄ̥,ɠ̊,ʛ̥,,


Unnamed: 0,Labial–alveolar,Labial–alveolar_v,Labial–retroflex,Labial–retroflex_v,Labial–palatal,Labial–palatal_v,Labial–velar,Labial–velar_v,Labial–uvular,Labial–uvular_v,Velarized alveolar,Velarized alveolar_v,Uvular–epiglottal,Uvular–epiglottal_v,Palatal-velar,Palatal-velar_v
Nasal,,n͡m,,ɳ͡m,,,,ŋ͡m,,,,,,,,
Plosive,t͡p,d͡b,ʈ͡p,ɖ͡b,,,k͡p,ɡ͡b,q͡p,,,,q͡ʡ,,,
Fricative/approximant,,,,,ɥ̊,ɥ,ʍ,w,,,,,,,ɧ,
Lateral approximant,,,,,,,,,,,,ɫ,,,,
Implosive,,,,,,,ɠ̊͜ɓ̥,ɠ͡ɓ,,,,,,,,
Ejective,,t͡pʼ,,,,,,,,,,,,,,


Unnamed: 0,Front unrounded,Front rounded,Near-front unrounded,Near-front rounded,Central unrounded,Central rounded,Near-back unrounded,Near-back rounded,Back unrounded,Back rounded
Close,i,y,,,ɨ,ʉ,,,ɯ,u
Near-close,,,ɪ,ʏ,,,,ʊ,,
Close-mid,e,ø,,,ɘ,ɵ,,,ɤ,o
Mid,e̞,ø̞,,,ə,,,,ɤ̞,o̞
Open-mid,ɛ,œ,,,ɜ,ɞ,,,ʌ,ɔ
Near-open,æ,,,ɐ,,,,,,
Open,a,ɶ,,,ä,,,,ɑ,ɒ


all_ipa_phonemes (159,)


((8, 22), (7, 8), (6, 16), (7, 10))

In [55]:
# A phoneme counts as "handled" when its exact string appears in one of the
# IPA chart tables; everything else is "unhandled".
in_ipa_charts = phonemes_df.Phoneme.isin(all_ipa_phonemes.values)
handled_phonemes = phonemes_df[in_ipa_charts]
unhandled_phonemes = phonemes_df[~in_ipa_charts]

display(handled_phonemes.head())
display(unhandled_phonemes.head())

handled_phonemes.shape, unhandled_phonemes.shape

Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme
0,kor,Korean,none,a
2,kor,Korean,none,æ
4,kor,Korean,none,e
6,kor,Korean,none,ɤ
8,kor,Korean,none,h


Unnamed: 0,ISO6393,LanguageName,SpecificDialect,Phoneme
1,kor,Korean,none,aː
3,kor,Korean,none,æː
5,kor,Korean,none,eː
7,kor,Korean,none,ɤː
10,kor,Korean,none,iː


((66774, 4), (37016, 4))

In [66]:
# Count the handled phonemes per (ISO code, language name, dialect) group.
dialect_keys = ["ISO6393", "LanguageName", "SpecificDialect"]
dialect_phoneme_counts = handled_phonemes.groupby(dialect_keys).count()
dialect_phoneme_counts

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Phoneme
ISO6393,LanguageName,SpecificDialect,Unnamed: 3_level_1
aae,Arbëresh Albanian,Arbëresh Albanian (Hora e Arbëreshëvet),33
aae,Arbëresh Albanian,Arbëresh Albanian (Kundisa),30
aal,KOTOKO,none,31
aap,"Arára, Pará",none,16
aar,Afar,none,23
...,...,...,...
zul,Zulu,none,31
zum,Kumzari,"Kumzari spoken in Khasab, Oman",21
zun,ZUNI,none,14
zun,Zuni,none,22
