In [67]:
import pandas
import lingtypology
import math
from lingtypology.db_apis import Wals
from scipy.stats import chi2_contingency, fisher_exact

In [11]:
#Download all Wals
#If you don't want to do it, just read 'full_wals.csv' with pandas
features_list = Wals().features_list
w = Wals(*features_list)
w.show_citation = False
data = w.get_df(join_how='outer')
#data = pandas.read_csv('full_wals.csv', low_memory=False)

In [17]:
data.to_csv('full_wals.csv')

In [13]:
data.head()

Unnamed: 0,wals_code,language,genus,family,coordinates,_1A_area,_1A,_1A_num,_1A_desc,_2A_area,...,_144W_num,_144W_desc,_144X_area,_144X,_144X_num,_144X_desc,_144Y_area,_144Y,_144Y_num,_144Y_desc
0,kiw,Kiwai (Southern),Kiwaian,Kiwaian,"(-8.0, 143.5)",Phonology,1. Small,1.0,Small,Phonology,...,,,,,,,,,,
1,xoo,!Xóõ,Tu,Tu,"(-24.0, 21.5)",Phonology,5. Large,5.0,Large,Phonology,...,,,,,,,,,,
2,ani,//Ani,Khoe-Kwadi,Khoe-Kwadi,"(-18.9166666667, 21.9166666667)",Phonology,5. Large,5.0,Large,Phonology,...,,,,,,,,,,
3,abi,Abipón,South Guaicuruan,Guaicuruan,"(-29.0, -61.0)",Phonology,2. Moderately small,2.0,Moderately small,Phonology,...,,,,,,,,,,
4,abk,Abkhaz,Northwest Caucasian,Northwest Caucasian,"(43.0833333333, 41.0)",Phonology,5. Large,5.0,Large,Phonology,...,,,,,,,,,,


In [24]:
binary = {}
for feature in data:
    if feature.endswith('_desc'):
        values = set([value for value in data[feature] if isinstance(value, str)])
        if len(values) == 2:
            binary[feature] = list(values)
binary

{'_10A_desc': ['Contrast absent', 'Contrast present'],
 '_25B_desc': ['Non-zero marking', 'Zero-marking'],
 '_39B_desc': ['Inclusive and exclusive differentiated',
  'No inclusive/exclusive opposition'],
 '_47A_desc': ['Differentiated', 'Identical'],
 '_58A_desc': ['Absent', 'Exists'],
 '_63A_desc': ["'And' different from 'with'", "'And' identical to 'with'"],
 '_65A_desc': ['Grammatical marking', 'No grammatical marking'],
 '_67A_desc': ['No inflectional future', 'Inflectional future exists'],
 '_73A_desc': ['Inflectional optative absent',
  'Inflectional optative present'],
 '_90F_desc': ['Adjoined relative clause dominant', 'Adjoined or correlative'],
 '_107A_desc': ['Present', 'Absent'],
 '_119A_desc': ['Identical', 'Different'],
 '_120A_desc': ['Impossible', 'Possible'],
 '_129A_desc': ['Identical', 'Different'],
 '_130A_desc': ['Identical', 'Different'],
 '_136B_desc': ['No m in first person singular', 'm in first person singular'],
 '_137B_desc': ['m in second person singular',


In [72]:
matrix = pandas.DataFrame({
    'feature': list(binary)
})
simplified_matrix = pandas.DataFrame({
    'feature': list(binary)
})
for main in binary:
    verbose = []
    simple = []
    for depending in binary:
        if not main == depending:
            # Get the necessary data and strip nans
            df = data[['wals_code', main, depending]]
            df = df[df[main].astype('str') != 'nan']
            df = df[df[depending].astype(str) != 'nan']
            main_value = binary[main][0]
            # Default distribution
            default = [
                len(df[df[depending] == binary[depending][0]]),
                len(df[df[depending] == binary[depending][1]])
            ]
            # Get the main value only
            df = df[df[main] == main_value]
            # Count lenght all depending values
            test = [
                len(df[df[depending] == binary[depending][0]]),
                len(df[df[depending] == binary[depending][1]])
            ]
            # Count chi square
            table = [default, test]
            try:
                test_result = chi2_contingency(table)
                test = 'chi2'
            except Exception:
            #    test_result = fisher_exact(table)
            #    test = 'Fisher'
                test_result = [math.nan, math.nan]
                test = 'None'
        else:
            test_result = [1, 1.0]
        verbose.append({
            'main value': main_value,
            'depending': binary[depending],
            'test': test,
            'pvalue': test_result[1]
        })
        simple.append('%.05f' % test_result[1])
    matrix[main] = verbose
    simplified_matrix[main] = simple


In [73]:
simplified_matrix

Unnamed: 0,feature,_10A_desc,_25B_desc,_39B_desc,_47A_desc,_58A_desc,_63A_desc,_65A_desc,_67A_desc,_73A_desc,_90F_desc,_107A_desc,_119A_desc,_120A_desc,_129A_desc,_130A_desc,_136B_desc,_137B_desc,_141A_desc
0,_10A_desc,1.0,0.94327,,0.63296,0.99779,0.80975,0.91771,0.86412,0.98945,,0.97514,0.84055,0.61625,0.64395,0.12491,0.50258,0.62273,
1,_25B_desc,0.90442,1.0,,0.96609,0.99078,0.88464,0.84589,0.34463,0.92299,,0.9851,0.77664,0.81411,0.87967,0.78495,0.94111,0.98871,
2,_39B_desc,1.0,1.0,1.0,0.66501,1.0,1.0,,,1.0,0.5207,0.71188,0.66783,1.0,0.95231,0.62988,1.0,,
3,_47A_desc,0.8212,0.98463,0.66501,1.0,0.94862,0.87929,0.56504,0.74412,0.74823,,0.97962,0.89754,0.66652,0.77152,0.06119,0.93108,0.37411,
4,_58A_desc,0.99359,0.98912,,0.80643,1.0,0.89656,0.62163,0.82154,0.932,,0.83584,0.9888,0.9892,0.67262,0.16527,0.42401,0.15776,
5,_63A_desc,0.89407,0.9383,,0.80389,0.92806,1.0,0.5017,0.95157,0.73758,1.0,0.92092,0.05295,0.67791,0.31932,0.30268,0.87106,0.22178,
6,_65A_desc,0.97888,0.92766,1.0,0.55886,0.86265,0.64709,1.0,0.91288,0.85752,,0.92346,0.94508,0.39401,1.0,0.46192,0.96863,0.85658,
7,_67A_desc,0.89986,0.76561,1.0,0.71917,0.94626,0.95085,0.91035,1.0,0.83677,0.57615,0.53791,0.83045,0.5481,0.27406,0.12489,0.72031,0.81673,
8,_73A_desc,0.9749,0.9413,,0.33082,0.91946,0.54947,0.58411,0.53013,1.0,,0.98298,0.58046,0.36147,0.68742,0.80634,0.84187,0.65556,
9,_90F_desc,1.0,1.0,0.5207,,1.0,,,0.50499,1.0,1.0,0.40174,1.0,0.81948,,0.57615,1.0,,
