In [18]:
%reload_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from IPython.display import display


In [19]:
# Load the generated languages and their weirdness scores.
# NOTE(review): allow_pickle=True lets a crafted archive execute arbitrary code
# while unpickling — only load files from a trusted source.
# The context manager closes the underlying .npz handle; every array is
# materialised into a plain dict so `data[...]` keeps working afterwards.
with np.load("./data/generated_languages.npz", allow_pickle=True) as npz_file:
    data = {name: npz_file[name] for name in npz_file.files}

generated_languages = np.squeeze(data["generated_languages"])
weirdness = data["weirdness"]

# np.squeeze also removes the leading axis when the file holds exactly one
# language; restore it so axis 0 always enumerates languages and the
# three-index slices below stay valid.
if generated_languages.ndim == 2:
    generated_languages = generated_languages[np.newaxis, ...]

l = generated_languages.shape[0]  # number of generated languages

# Indices of the languages with the highest / lowest weirdness score.
most_different = np.argmax(weirdness)
least_different = np.argmin(weirdness)

# Carve each per-language grid into the four phoneme-table regions.
# NOTE(review): row 14 and rows 22-23 are not selected by any slice below —
# presumably padding/separators in the generated grid; confirm against the generator.
generated_cons_pl = generated_languages[:, 0:14, 0:24]
generated_cons_npl = generated_languages[:, 15:22, 0:10]
generated_cons_coart = generated_languages[:, 15:18, 10:14]
generated_vowels = generated_languages[:, 15:22, 14:24]

generated_languages.shape, generated_cons_pl.shape, generated_cons_npl.shape, generated_cons_coart.shape, generated_vowels.shape

((100, 24, 24), (100, 14, 24), (100, 7, 10), (100, 3, 4), (100, 7, 10))

In [20]:
# Load the phoneme lookup tables, one CSV per region of the generated grid.

def _read_phoneme_table(csv_path, **overrides):
    """Read one phoneme table CSV as strings, indexed by its first column.

    Cells equal to "-1" are parsed as NaN; the default NaN markers are
    disabled, so empty cells stay as empty strings. Extra keyword arguments
    are forwarded to ``pd.read_csv``.
    """
    return pd.read_csv(
        csv_path,
        dtype=str,
        index_col=[0],
        keep_default_na=False,
        na_values=["-1"],
        **overrides,
    )


# NOTE(review): only the first table tolerates malformed rows (they are
# reported as warnings); the other three use the read_csv default and raise.
# Confirm whether this difference is intentional.
cons_pl_tbl_df = _read_phoneme_table("./data/consonants_plumonic.csv", on_bad_lines="warn")
cons_npl_tbl_df = _read_phoneme_table("./data/consonants_non_plumonic.csv")
cons_coart_tbl_df = _read_phoneme_table("./data/consonants_coarticulated.csv")
vowels_tbl_df = _read_phoneme_table("./data/vowels.csv")

# Display the four table shapes as the cell output.
tuple(tbl.shape for tbl in (cons_pl_tbl_df, cons_npl_tbl_df, cons_coart_tbl_df, vowels_tbl_df))

((14, 24), (7, 10), (3, 4), (7, 10))

In [21]:
def map_generated_to_phonemes(tbl):
    """Build a function that turns a generated grid into phoneme symbols.

    Parameters
    ----------
    tbl : pandas.DataFrame
        Phoneme table. Cells that are NaN or the empty string are rendered
        as "*". The table is read once, when the mapper is built.

    Returns
    -------
    callable
        ``mapper(lang)`` returns an array holding the table's symbol wherever
        ``lang > 0`` and "" elsewhere. ``lang`` must broadcast against the
        table, so a single grid or a stack of grids with a leading axis both
        work.
    """
    # Loop-invariant: fill the blank cells once per mapper instead of on
    # every call.
    symbols = tbl.replace(np.nan, "*").replace("", "*").to_numpy()

    def mapper(lang):
        return np.where(np.asarray(lang) > 0, symbols, "")

    return mapper
    
# Translate every generated grid into phoneme symbols.
# Each mapper is built once per table and applied to the whole
# (language, row, column) stack: np.where broadcasts the 2-D table across the
# leading language axis, so no per-language Python loop (and no per-language
# rebuild of the mapper) is needed.
generated_cons_pl_phonemes = map_generated_to_phonemes(cons_pl_tbl_df)(generated_cons_pl)
generated_cons_npl_phonemes = map_generated_to_phonemes(cons_npl_tbl_df)(generated_cons_npl)
generated_cons_coart_phonemes = map_generated_to_phonemes(cons_coart_tbl_df)(generated_cons_coart)
generated_vowels_phonemes = map_generated_to_phonemes(vowels_tbl_df)(generated_vowels)

# To inspect a single language as a labelled table, wrap one slice in a
# DataFrame that reuses the source table's index and columns, e.g.:
# display(pd.DataFrame(generated_cons_pl_phonemes[5], index=cons_pl_tbl_df.index, columns=cons_pl_tbl_df.columns))
# display(pd.DataFrame(generated_cons_npl_phonemes[5], index=cons_npl_tbl_df.index, columns=cons_npl_tbl_df.columns))
# display(pd.DataFrame(generated_cons_coart_phonemes[5], index=cons_coart_tbl_df.index, columns=cons_coart_tbl_df.columns))
# display(pd.DataFrame(generated_vowels_phonemes[5], index=vowels_tbl_df.index, columns=vowels_tbl_df.columns))

generated_cons_pl_phonemes.shape, generated_cons_npl_phonemes.shape, generated_cons_coart_phonemes.shape, generated_vowels_phonemes.shape

((100, 14, 24), (100, 7, 10), (100, 3, 4), (100, 7, 10))

In [22]:
def _unique_symbols(symbol_rows):
    """Return, for each row, its sorted unique symbols without the "" filler."""
    return [[s for s in np.unique(row) if s != ""] for row in symbol_rows]


# One flat row of symbols per language: the three consonant grids are
# flattened and concatenated side by side; the vowel grid is flattened alone.
consonants = _unique_symbols(np.hstack([
    generated_cons_pl_phonemes.reshape(l, -1),
    generated_cons_npl_phonemes.reshape(l, -1),
    generated_cons_coart_phonemes.reshape(l, -1),
]))

vowels = _unique_symbols(generated_vowels_phonemes.reshape(l, -1))

# Inventory of the least weird language, then of the weirdest one.
# (Label spelling fixed: "weirdness", matching the variable name.)
print(f"weirdness: {np.min(weirdness)}")
print(consonants[least_different])
print(vowels[least_different])
print(f"weirdness: {np.max(weirdness)}")
print(consonants[most_different])
print(vowels[most_different])

wierdness: 0.0
['b', 'h', 'j', 'k', 'm', 'n', 'p', 's', 't', 't̠ʃ', 'w', 'ŋ', 'ɡ', 'ɲ']
['a', 'e', 'i', 'o', 'u']
wierdness: 1.0
['b', 'd̠ʒ', 'f', 'h', 'j', 'k', 'l', 'm', 'n', 'p', 's', 't', 't̠ʃ', 'v', 'w', 'x', 'z', 'ŋ', 'ɡ', 'ɣ', 'ɲ', 'ɻ', 'ʃ', 'ʈ', 'ʒ']
['a', 'e', 'i', 'o', 'u', 'ɔ', 'ə', 'ɛ']
