In [112]:
# Install the WALS dataset package (egg name `cldfbench_wals`) in editable mode
# straight from the cldf-datasets/wals GitHub repository, pinned to the v2020 tag
# so re-runs fetch the same release. `--quiet` suppresses pip's progress output.
%pip install -e git+https://github.com/cldf-datasets/wals.git@v2020#egg=cldfbench_wals --quiet

Note: you may need to restart the kernel to use updated packages.


In [113]:
from pathlib import Path

import numpy as np
import pandas as pd
from cldfbench import get_dataset

In [114]:
# Look up the dataset registered under the name "wals" and open a reader
# over its CLDF data; every table below is pulled through this reader.
wals_dataset = get_dataset("wals")
wals = wals_dataset.cldf_reader()

In [115]:
# Language metadata: keep only the identifier, display name and ISO 639-3 code.
lang_df = (
    pd.DataFrame(wals.iter_rows("LanguageTable"))
    .loc[:, ["ID", "Name", "ISO639P3code"]]
)

# Preview a few rows, then report the overall (rows, columns) size.
display(lang_df.head(n=3))
lang_df.shape

Unnamed: 0,ID,Name,ISO639P3code
0,aab,Arapesh (Abu),
1,aar,Aari,aiw
2,aba,Abau,aau


(2662, 3)

In [116]:
# Parameter (feature) metadata, indexed by (Area, Chapter). The free-text
# description and contributor columns are dropped, and parameters filed under
# the "Sign Languages" and "Other" areas are excluded in the same chain so
# `param_df` is only ever assigned once.
param_df = (
    pd.DataFrame(wals.iter_rows("ParameterTable"))
    .set_index(["Area", "Chapter"])
    .drop(columns=["Description", "Contributor_ID"])
    .loc[
        lambda frame: ~frame.index.get_level_values("Area").isin(
            ["Sign Languages", "Other"]
        )
    ]
)

# Distinct values of each index level, in order of first appearance.
areas = param_df.index.get_level_values("Area").unique()
chapters = param_df.index.get_level_values("Chapter").unique()

print(f"Areas: {len(areas)}")
print(f"Chapters: {len(chapters)}")
print(f"Parameters: {len(param_df)}")

# Parallel arrays of parameter IDs and names, in `param_df` row order.
param_ids, param_names = (
    param_df[column].to_numpy() for column in ("ID", "Name")
)

# Show the first two parameters of every area as a sanity check.
param_df.groupby(level="Area").head(2)

Areas: 9
Chapters: 140
Parameters: 188


Unnamed: 0_level_0,Unnamed: 1_level_0,ID,Name
Area,Chapter,Unnamed: 2_level_1,Unnamed: 3_level_1
Phonology,Consonant Inventories,1A,Consonant Inventories
Phonology,Vowel Quality Inventories,2A,Vowel Quality Inventories
Morphology,Fusion of Selected Inflectional Formatives,20A,Fusion of Selected Inflectional Formatives
Morphology,Exponence of Selected Inflectional Formatives,21A,Exponence of Selected Inflectional Formatives
Nominal Categories,Number of Genders,30A,Number of Genders
Nominal Categories,Sex-based and Non-sex-based Gender Systems,31A,Sex-based and Non-sex-based Gender Systems
Nominal Syntax,Obligatory Possessive Inflection,58A,Obligatory Possessive Inflection
Nominal Syntax,Obligatory Possessive Inflection,58B,Number of Possessive Nouns
Verbal Categories,Perfective/Imperfective Aspect,65A,Perfective/Imperfective Aspect
Verbal Categories,The Past Tense,66A,The Past Tense


In [117]:
# Code table: the rows read from the dataset's "CodeTable", loaded as-is.
code_rows = list(wals.iter_rows("CodeTable"))
code_df = pd.DataFrame(code_rows)
code_df.head(n=3)

Unnamed: 0,ID,Parameter_ID,Name,Description,Number,icon
0,1A-1,1A,Small,Small,1,c0000dd
1,1A-2,1A,Moderately small,Moderately small,2,c9999ff
2,1A-3,1A,Average,Average,3,cffffff


In [118]:
# Value table reduced to (Language_ID, Parameter_ID, Value); the bookkeeping
# columns are dropped and `Value` is cast to int within the same chain.
value_df = (
    pd.DataFrame(wals.iter_rows("ValueTable"))
    .drop(columns=["ID", "Code_ID", "Comment", "Source", "Example_ID"])
    .assign(Value=lambda frame: frame["Value"].astype(int))
)

value_df.head(n=3)

Unnamed: 0,Language_ID,Parameter_ID,Value
0,aab,81A,2
1,aab,82A,1
2,aab,83A,2


In [119]:
# Example table: the rows read from the dataset's "ExampleTable", loaded as-is.
example_rows = list(wals.iter_rows("ExampleTable"))
example_df = pd.DataFrame(example_rows)
example_df.head(n=3)

Unnamed: 0,ID,Language_ID,Primary_Text,Analyzed_Word,Gloss,Translated_Text,Meta_Language_ID,Comment
0,igt-1,mnd,tāmen tōu zìxíngchē,"[tāmen, tōu, zìxíngchē]","[3PL, steal, bicycle]",They steal bicycles.,,
1,igt-2,mnd,[wǒ gěi nǐ de] shū,"[[wǒ, gěi, nǐ, de], shū]","[[1SG, give, 2SG, LINK], book]",the book [that I gave you],,
2,igt-3,nbd,dımın ıdw ṍww(ı),"[dımın, ıdw, ṍww(ı)]","[ten, eight, two]",82,,


In [120]:
def map_params(df, ids=None):
    """Lay one language's feature values out in canonical parameter order.

    Each (parameter ID, value) row of ``df`` is written to the slot that the
    parameter occupies in ``ids``. Matching is done by ID, so the order of the
    rows in ``df`` does not matter. (The previous implementation paired values
    taken in *row* order with slots taken in *parameter* order, which assigned
    values to the wrong parameters whenever the two orders differed.)

    Parameters
    ----------
    df : pandas.DataFrame
        Rows for a single language. Accessed positionally: column 0 is the
        parameter ID and column 1 is the value.
    ids : array-like, optional
        Parameter IDs defining the output layout. Defaults to the
        notebook-level ``param_ids`` array.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``ids``. Slots for parameters that
        have no row in ``df`` stay 0. Rows whose parameter ID is not in ``ids``
        are ignored. If a parameter appears more than once, the last row wins.
    """
    if ids is None:
        ids = param_ids
    ids = np.asarray(ids)

    # Explicit ID -> slot lookup; this is what keeps values aligned with IDs.
    slot_of = {pid: slot for slot, pid in enumerate(ids.tolist())}

    result = np.zeros(ids.shape)
    lang = df.to_numpy()
    for pid, value in zip(lang[:, 0], lang[:, 1]):
        slot = slot_of.get(pid)
        if slot is not None:
            result[slot] = value
    return result


# Sorted language IDs. `groupby` below also sorts its keys by default, so row i
# of the stacked matrix is expected to belong to `unique_languages[i]`.
unique_languages = np.unique(value_df.Language_ID)

# One row per language, one column per parameter (0 = no value recorded).
per_language = np.stack(
    value_df.groupby("Language_ID")
    .apply(map_params, include_groups=False)
    .to_numpy()
).astype(int)
assert per_language.shape[0] == unique_languages.shape[0], "language rows misaligned"

# Each language vector is folded into a GRID_ROWS x N grid. The vector is
# zero-padded up to the next multiple of GRID_ROWS instead of by a hard-coded
# amount, so the reshape keeps working if the number of parameters changes
# (188 parameters -> 4 padding columns -> 16 x 12).
GRID_ROWS = 16
padding = (-per_language.shape[1]) % GRID_ROWS

# NOTE: stacking with the float zero block upcasts the result to float64,
# exactly as before; the dtype is left unchanged for downstream consumers.
language_parameters = np.hstack(
    [per_language, np.zeros((per_language.shape[0], padding))]
).reshape(per_language.shape[0], GRID_ROWS, -1)

language_parameters.shape, param_ids.shape, param_names.shape

((2660, 16, 12), (188,), (188,))

In [121]:
# Persist the feature grid together with the labels needed to interpret it.
output_path = Path("./data/language_parameters.npz")
# Create the target directory on a fresh checkout instead of failing with
# FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)

# The label arrays are converted to fixed-width unicode before saving.
# Object-dtype arrays would be pickled into the archive and could then only be
# read back with `np.load(..., allow_pickle=True)`.
# NOTE(review): `language_names` actually holds the language IDs taken from
# `value_df.Language_ID`, not display names; the key is kept for compatibility.
np.savez_compressed(
    output_path,
    language_parameters=language_parameters,
    language_names=np.asarray(unique_languages, dtype=str),
    param_ids=np.asarray(param_ids, dtype=str),
    param_names=np.asarray(param_names, dtype=str),
)