In [2]:
import pandas
import lingtypology
import math
from lingtypology.db_apis import Wals
from scipy.stats import chi2_contingency, fisher_exact

In [3]:
#Download all Wals
#If you don't want to do it, just read 'full_wals.csv' with pandas
features_list = Wals().features_list
w = Wals(*features_list)
w.show_citation = False
data = w.get_df(join_how='outer')
#data = pandas.read_csv('full_wals.csv', low_memory=False)

In [4]:
data.to_csv('full_wals.csv')

In [5]:
data.head()

Unnamed: 0,wals_code,language,genus,family,coordinates,_1A_area,_1A,_1A_num,_1A_desc,_2A_area,...,_144W_num,_144W_desc,_144X_area,_144X,_144X_num,_144X_desc,_144Y_area,_144Y,_144Y_num,_144Y_desc
0,kiw,Kiwai (Southern),Kiwaian,Kiwaian,"(-8.0, 143.5)",Phonology,1. Small,1.0,Small,Phonology,...,,,,,,,,,,
1,xoo,!Xóõ,Tu,Tu,"(-24.0, 21.5)",Phonology,5. Large,5.0,Large,Phonology,...,,,,,,,,,,
2,ani,//Ani,Khoe-Kwadi,Khoe-Kwadi,"(-18.9166666667, 21.9166666667)",Phonology,5. Large,5.0,Large,Phonology,...,,,,,,,,,,
3,abi,Abipón,South Guaicuruan,Guaicuruan,"(-29.0, -61.0)",Phonology,2. Moderately small,2.0,Moderately small,Phonology,...,,,,,,,,,,
4,abk,Abkhaz,Northwest Caucasian,Northwest Caucasian,"(43.0833333333, 41.0)",Phonology,5. Large,5.0,Large,Phonology,...,,,,,,,,,,


In [6]:
binary = {}
for feature in data:
    if feature.endswith('_desc'):
        values = set([value for value in data[feature] if isinstance(value, str)])
        if len(values) == 2:
            binary[feature] = list(values)
binary

{'_10A_desc': ['Contrast absent', 'Contrast present'],
 '_25B_desc': ['Zero-marking', 'Non-zero marking'],
 '_39B_desc': ['No inclusive/exclusive opposition',
  'Inclusive and exclusive differentiated'],
 '_47A_desc': ['Differentiated', 'Identical'],
 '_58A_desc': ['Exists', 'Absent'],
 '_63A_desc': ["'And' identical to 'with'", "'And' different from 'with'"],
 '_65A_desc': ['Grammatical marking', 'No grammatical marking'],
 '_67A_desc': ['No inflectional future', 'Inflectional future exists'],
 '_73A_desc': ['Inflectional optative absent',
  'Inflectional optative present'],
 '_90F_desc': ['Adjoined or correlative', 'Adjoined relative clause dominant'],
 '_107A_desc': ['Present', 'Absent'],
 '_119A_desc': ['Different', 'Identical'],
 '_120A_desc': ['Impossible', 'Possible'],
 '_129A_desc': ['Different', 'Identical'],
 '_130A_desc': ['Different', 'Identical'],
 '_136B_desc': ['m in first person singular', 'No m in first person singular'],
 '_137B_desc': ['m in second person singular',


In [10]:
matrix = pandas.DataFrame({
    'feature': list(binary)
})
simplified_matrix = pandas.DataFrame({
    'feature': list(binary)
})
for main in binary:
    verbose = []
    simple = []
    for depending in binary:
        main_value = binary[main][0]
        if not main == depending:
            # Get the necessary data and strip nans
            df = data[['wals_code', main, depending]]
            df = df[df[main].astype('str') != 'nan']
            df = df[df[depending].astype(str) != 'nan']
            # Default distribution
            default = [
                len(df[df[depending] == binary[depending][0]]),
                len(df[df[depending] == binary[depending][1]])
            ]
            # Get the main value only
            df = df[df[main] == main_value]
            # Count lenght all depending values
            test = [
                len(df[df[depending] == binary[depending][0]]),
                len(df[df[depending] == binary[depending][1]])
            ]
            # Count chi square
            table = [default, test]
            try:
                test_result = chi2_contingency(table)
                test = 'chi2'
            except Exception:
            #    test_result = fisher_exact(table)
            #    test = 'Fisher'
                test_result = [math.nan, math.nan]
                test = 'None'
        else:
            test_result = [1, 1.0]
            test = None
        verbose.append({
            'main value': main_value,
            'depending': binary[depending],
            'test': test,
            'pvalue': test_result[1]
        })
        simple.append('%.05f' % test_result[1])
    matrix[main] = verbose
    simplified_matrix[main] = simple


  expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
  observed = observed + 0.5 * np.sign(expected - observed)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [11]:
simplified_matrix

Unnamed: 0,feature,_10A_desc,_25B_desc,_39B_desc,_47A_desc,_58A_desc,_63A_desc,_65A_desc,_67A_desc,_73A_desc,_90F_desc,_107A_desc,_119A_desc,_120A_desc,_129A_desc,_130A_desc,_136B_desc,_137B_desc,_141A_desc
0,_10A_desc,1.0,0.99444,,0.63296,0.85429,0.69362,0.91771,0.86412,0.98945,,0.97514,0.9185,0.61625,0.83437,0.63398,0.19119,0.62273,
1,_25B_desc,0.90442,1.0,,0.96609,0.82507,0.92484,0.84589,0.34463,0.92299,,0.9851,0.91172,0.81411,0.86925,0.97363,0.75445,0.98871,
2,_39B_desc,1.0,,1.0,0.66501,,,,,1.0,0.27029,0.71188,0.70514,1.0,0.91135,0.76981,,,
3,_47A_desc,0.8212,0.84267,0.66501,1.0,0.62565,0.74572,0.56504,0.74412,0.74823,,0.97962,0.90094,0.66652,0.91034,0.66789,0.83953,0.37411,
4,_58A_desc,0.99359,0.79088,,0.80643,1.0,0.90317,0.62163,0.82154,0.932,,0.83584,0.96488,0.9892,0.85478,0.69289,0.05867,0.15776,
5,_63A_desc,0.89407,0.99169,,0.80389,0.92873,1.0,0.5017,0.95157,0.73758,,0.92092,0.19855,0.67791,0.43482,0.89728,0.83558,0.22178,
6,_65A_desc,0.97888,0.87432,,0.55886,0.45505,0.4289,1.0,0.91288,0.85752,,0.92346,0.97399,0.39401,0.97668,0.89212,0.88036,0.85658,
7,_67A_desc,0.89986,0.07144,,0.71917,0.69214,0.96398,0.91035,1.0,0.83677,0.50499,0.53791,0.88731,0.5481,0.48062,0.724,0.4436,0.81673,
8,_73A_desc,0.9749,0.46067,,0.33082,0.94114,0.21691,0.58411,0.53013,1.0,,0.98298,0.67955,0.36147,0.76975,0.98827,0.60808,0.65556,
9,_90F_desc,1.0,,0.27029,,,,,0.50499,1.0,1.0,0.40174,,0.81948,1.0,0.50499,,,


In [18]:
for_tex = simplified_matrix[simplified_matrix.columns[:6]].head(6)

In [21]:
print(for_tex.to_latex())

\begin{tabular}{lllllll}
\toprule
{} &    feature & \_10A\_desc & \_25B\_desc & \_39B\_desc & \_47A\_desc & \_58A\_desc \\
\midrule
0 &  \_10A\_desc &   1.00000 &   0.99444 &       nan &   0.63296 &   0.85429 \\
1 &  \_25B\_desc &   0.90442 &   1.00000 &       nan &   0.96609 &   0.82507 \\
2 &  \_39B\_desc &   1.00000 &       nan &   1.00000 &   0.66501 &       nan \\
3 &  \_47A\_desc &   0.82120 &   0.84267 &   0.66501 &   1.00000 &   0.62565 \\
4 &  \_58A\_desc &   0.99359 &   0.79088 &       nan &   0.80643 &   1.00000 \\
5 &  \_63A\_desc &   0.89407 &   0.99169 &       nan &   0.80389 &   0.92873 \\
\bottomrule
\end{tabular}



In [23]:
len(simplified_matrix)

18