# WALS Correlation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline
# Load IPython's autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
# (Keep comments on their own lines: a line magic treats the rest of its line
# as its argument.)
%load_ext autoreload
%autoreload 2

In [2]:
# Read the four WALS CLDF tables used by this notebook. The frames are unpacked
# in the same order as the paths are listed.
values_df, parameters_df, codes_df, language_names_df = map(
    pd.read_csv,
    [
        '../data/wals-2020/cldf/values.csv',
        '../data/wals-2020/cldf/parameters.csv',
        '../data/wals-2020/cldf/codes.csv',
        '../data/wals-2020/cldf/language_names.csv',
    ],
)

## Denormalize values table

In [3]:
# Collapse language_names to a single row per language. groupby(...).first()
# keeps the first non-null value of every column within each Language_ID group
# (not necessarily one intact source row) and moves Language_ID into the index.
# The rebinding of language_names_df is kept as-is because cells outside this
# view may rely on the collapsed table.
language_names_df = language_names_df.groupby('Language_ID').first()

# Resolve the IDs stored in values_df into readable labels, one lookup table at
# a time. After each join only the columns needed downstream are kept, and the
# generic 'Name' column is renamed right away so that it cannot collide with
# the 'Name' column of the next table.
# NOTE(review): every merge uses the default inner join, so a value row without
# a match in language_names_df, parameters_df or codes_df is dropped silently;
# confirm that this is intended.
df = (
    values_df
    # Language_ID -> language name (Language_ID is the index of the right side).
    .merge(language_names_df, on='Language_ID')
    .loc[:, ['Language_ID', 'Name', 'Parameter_ID', 'Value']]
    .rename(columns={'Name': 'Language_Name', 'Value': 'Parameter_Value'})
    # Parameter_ID -> parameter name and area.
    .merge(parameters_df, left_on='Parameter_ID', right_on='ID')
    .loc[:, ['Language_ID', 'Language_Name', 'Parameter_ID', 'Name', 'Area', 'Parameter_Value']]
    .rename(columns={'Name': 'Parameter_Name'})
    # (Parameter_ID, value number) -> code name and description. The composite
    # keys are passed as lists, the documented form for multi-column keys; a
    # tuple can be read by pandas as one single (tuple-valued) column label.
    .merge(
        codes_df,
        left_on=['Parameter_ID', 'Parameter_Value'],
        right_on=['Parameter_ID', 'Number'],
    )
    .loc[:, ['Language_ID', 'Language_Name', 'Area', 'Parameter_Name', 'Name', 'Description']]
    .rename(columns={'Name': 'Value'})
)
df

Unnamed: 0,Language_ID,Language_Name,Area,Parameter_Name,Value,Description
0,aar,Aari,Morphology,Prefixing vs. Suffixing in Inflectional Morpho...,Strongly suffixing,Predominantly suffixing
1,abi,Abipon,Morphology,Prefixing vs. Suffixing in Inflectional Morpho...,Strongly suffixing,Predominantly suffixing
2,abn,Arabana,Morphology,Prefixing vs. Suffixing in Inflectional Morpho...,Strongly suffixing,Predominantly suffixing
3,acu,Achuar-Shiwiar,Morphology,Prefixing vs. Suffixing in Inflectional Morpho...,Strongly suffixing,Predominantly suffixing
4,adn,Adnyamathanha,Morphology,Prefixing vs. Suffixing in Inflectional Morpho...,Strongly suffixing,Predominantly suffixing
...,...,...,...,...,...,...
74109,hna,Mina,Word Order,Double-headed relative clauses,Double-headed as nondominant type,Double-headed as nondominant type
74110,ygr,"Yagaria, Move dialect",Word Order,Double-headed relative clauses,Double-headed as nondominant type,Double-headed as nondominant type
74111,jms,"Dogon, Jamsay",Word Order,Double-headed relative clauses,Double-headed or internally-headed,Double-headed or internally-headed
74112,kmb,Kombai,Word Order,Double-headed relative clauses,Double-headed dominant,Double-headed dominant


## Show the features on which two languages differ

In [4]:
# Slice the denormalized table down to the rows of each language to compare.
lang1_df = df.loc[df['Language_ID'] == 'slo']
lang2_df = df.loc[df['Language_ID'] == 'pol']

# Pair the two languages up on the parameters both have a value for (inner
# join on Parameter_Name). Columns present on both sides get the default
# _x (first language) / _y (second language) suffixes.
both_df = (
    lang1_df
    .merge(lang2_df, on='Parameter_Name')
    .loc[:, ['Area_x', 'Parameter_Name', 'Value_x', 'Value_y']]
    .rename(columns={'Value_x': 'Lang1', 'Value_y': 'Lang2'})
)

# Display only the parameters whose values differ between the two languages.
both_df.loc[both_df['Lang1'].ne(both_df['Lang2'])]

Unnamed: 0,Area_x,Parameter_Name,Lang1,Lang2
3,Word Order,Order of Subject and Verb,SV,No dominant order
6,Word Order,Order of Genitive and Noun,No dominant order,Noun-Genitive
12,Simple Clauses,Expression of Pronominal Subjects,Subject affixes on verb,Subject clitics on variable host
27,Phonology,Fixed Stress Locations,No fixed stress,Penultimate
28,Phonology,Weight-Sensitive Stress,Unbounded: Stress can be anywhere,Fixed stress (no weight-sensitivity)
29,Phonology,Weight Factors in Weight-Sensitive Stress Systems,Prominence,No weight
33,Verbal Categories,Epistemic Possibility,Other,Verbal constructions
38,Lexicon,Tea,Words derived from Sinitic cha,Others
41,Phonology,Rhythm Types,No rhythmic stress,Trochaic
