In [128]:
import numpy as np
import pandas as pd

In [129]:
# Read the raw arrays from disk first, then derive the working arrays from them.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
phonemes_raw = np.load("./data/language_phonemes.npy", allow_pickle=True)
names_raw = np.load("./data/language_names.npy", allow_pickle=True)

# Phoneme entries are cast to integers; only the first column of the names array is kept.
language_phonemes = phonemes_raw.astype(int)
language_names = names_raw[:, 0]

# Every name row must be paired with exactly one phoneme matrix.
assert len(language_names) == len(language_phonemes)
language_names.shape, language_phonemes.shape

((2949,), (2949, 8, 42))

For languages that have multiple samples in the Phoible dataset, we need to either choose a single sample or combine the samples into one.

In [130]:
# Split the language names by how many samples each one has.
unique_names, unique_names_count = np.unique(language_names, return_counts=True)
multiple_samples = unique_names[unique_names_count > 1]
single_sample = unique_names[unique_names_count == 1]

# Map each language name to the stack of all its sample matrices
# (first axis = sample index within that language).
language_data = {
    name: {"data": language_phonemes[language_names == name]} for name in unique_names
}

# sense check: every language holds exactly as many samples as np.unique counted
# for it, and each sample keeps the per-sample shape of the source array.
sample_shape = language_phonemes.shape[1:]
for name, expected_count in zip(unique_names, unique_names_count):
    assert language_data[name]["data"].shape == (expected_count, *sample_shape), name

# sense check: no sample was dropped or duplicated by the grouping.
assert unique_names_count.sum() == language_phonemes.shape[0]

In [131]:
def _value_counts(matrix, values):
    """Return, for each entry of `values` (in order), how many cells of `matrix` equal it."""
    return [int(np.count_nonzero(matrix == value)) for value in values]


# np.unique already returns its result sorted, so no extra sort is needed.
all_values = np.unique(language_phonemes)

# Set lookup avoids a linear scan of the name array for every language.
multi_sample_names = set(multiple_samples)

for name, v in language_data.items():
    samples = v["data"]
    if name in multi_sample_names:
        # Element-wise aggregates across all samples of this language.
        v["min"] = samples.min(axis=0)
        v["max"] = samples.max(axis=0)
        v["mean"] = np.rint(samples.mean(axis=0))
        v["diff"] = v["max"] - v["min"]
    else:
        # A single sample is its own min, max and mean; nothing can differ.
        v["min"] = samples[0]
        v["max"] = samples[0]
        v["mean"] = samples[0]
        v["diff"] = np.zeros(samples[0].shape)

    # Occurrences of every encoded value in the min matrix, followed by the same
    # for the max matrix. Local names deliberately avoid `min`/`max` so the
    # builtins are not shadowed for the rest of the kernel session.
    v["counts"] = np.array(
        _value_counts(v["min"], all_values) + _value_counts(v["max"], all_values)
    )

# sense check: every aggregate matrix has the shape of a single sample
assert all(
    metric.shape == language_phonemes.shape[1:]
    for v in language_data.values()
    for key, metric in v.items()
    if key not in ("data", "counts")
)

In [132]:
# One row per multi-sample language: name followed by the sums of its
# min / max / mean / diff matrices.
# dtype=object keeps the numbers numeric; without it NumPy coerces the mixed
# name/number rows to strings and the sort below becomes lexicographic
# (e.g. "9" would rank above "88").
lang_stats = np.array(
    [
        [name, v["min"].sum(), v["max"].sum(), v["mean"].sum(), v["diff"].sum()]
        for name, v in language_data.items()
        if name in multiple_samples
    ],
    dtype=object,
)

lang_stats_df = (
    pd.DataFrame(
        lang_stats[:, 1:],
        index=lang_stats[:, 0],
        columns=["min", "max", "mean", "diff"],
    )
    # Convert the object columns to proper numeric dtypes before sorting.
    .apply(pd.to_numeric)
    .sort_values("diff", ascending=False)
)
lang_stats_df.head(5)

Unnamed: 0,min,max,mean,diff
Remo,9,97,56.0,88
Iron Ossetic,0,88,18.0,88
Laz,3,90,21.0,87
Lithuanian,4,87,34.0,83
Spanish,3,84,23.0,81


The table above shows a huge difference between the max and min values across the samples of each language.

The values shown are sums over all entries of each language's `language_phonemes` matrix.

In [133]:
# Stack the per-language value counts (min half followed by max half) for every
# language that has more than one sample.
phoneme_type_counts = np.array(
    [
        stats["counts"]
        for lang, stats in language_data.items()
        if lang in multiple_samples
    ]
)

# Column labels follow the layout of the "counts" vectors: all *_min, then all *_max.
min_columns = [f"{value}_min" for value in all_values]
max_columns = [f"{value}_max" for value in all_values]

counts_table = pd.DataFrame(
    data=phoneme_type_counts,
    index=lang_stats[:, 0],
    columns=min_columns + max_columns,
)

# Rank languages by how many cells of their max matrix hold the value 3.
phoneme_type_counts_df = counts_table.sort_values(by="3_max", ascending=False)
phoneme_type_counts_df.head(10)

Unnamed: 0,0_min,1_min,2_min,3_min,0_max,1_max,2_max,3_max
Shan,331,5,0,0,291,11,1,33
Remo,327,9,0,0,295,13,0,28
Mingrelian,327,9,0,0,283,24,2,27
Lithuanian,332,4,0,0,295,18,0,23
English (British),321,15,0,0,305,7,1,23
Kabardian,331,5,0,0,293,22,0,21
Greek,321,15,0,0,292,24,1,19
Western Balochi,329,7,0,0,296,22,0,18
Friulian,330,6,0,0,307,11,0,18
Spanish,333,3,0,0,282,38,2,14


It seems most of the difference can be explained by some datasets including information on phoneme length. In the `language_phonemes` matrix, a bitmask-like encoding is used:

- 0 = phoneme not present
- 1 = phoneme present (regular length)
- 2 = phoneme present (long length)
- 3 = phoneme present (regular & long length)

Based on that knowledge, it probably makes sense to use the `max` sample for each language. It probably contains the most phonemes for the language and also encodes more information about each phoneme than samples lacking any data on phoneme length.