In [90]:
import pandas
import os
import lingtypology
import math
import numpy as np
import matplotlib.pyplot as plt
from lingtypology.db_apis import Phoible
from scipy.stats import linregress, chi2_contingency
from functools import reduce

In [19]:
# Collect every PHOIBLE column that holds a binary feature, i.e. a column in
# which both '+' and '-' occur among the values.
p = Phoible(aggregated=False)
df = p.get_df()
# Use a subset test instead of comparing against a list built from a set:
# set iteration order for strings is not guaranteed, so an order-sensitive
# comparison such as `== ['-', '+']` can silently miss columns.
binary_features = [col for col in df if {'+', '-'} <= set(df[col])]
binary_features

Moran, Steven & McCloy, Daniel (eds.) 2019.
PHOIBLE 2.0.
Jena: Max Planck Institute for the Science of Human History.
(Available online at http://phoible.org, Accessed on 2019-05-23.)


['syllabic',
 'short',
 'long',
 'consonantal',
 'sonorant',
 'continuant',
 'delayedRelease',
 'approximant',
 'tap',
 'trill',
 'nasal',
 'lateral',
 'labial',
 'round',
 'labiodental',
 'coronal',
 'anterior',
 'distributed',
 'strident',
 'dorsal',
 'high',
 'low',
 'front',
 'back',
 'tense',
 'retractedTongueRoot',
 'advancedTongueRoot',
 'periodicGlottalSource',
 'epilaryngealSource',
 'spreadGlottis',
 'constrictedGlottis',
 'fortis',
 'raisedLarynxEjective',
 'loweredLarynxImplosive',
 'click']

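Для каждого бинарного признака и каждого датасета из Phoible проверим, связано ли наличие признака в языке с высотой над уровнем моря (критерий хи-квадрат; для абруптивных — ещё и линейная регрессия).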

In [157]:
def fwrite(path, data):
    """Write the string ``data`` to the file at ``path`` as UTF-8 text.

    The file is opened in write mode, so existing content is replaced.
    """
    with open(path, mode='w', encoding='utf-8') as handle:
        handle.write(data)

def _save_regression_plot(x, y, regression, path):
    """Scatter ``x`` against ``y``, overlay the fitted line and save it as PNG."""
    fig, ax = plt.subplots()
    ax.scatter(x, y, color='black')
    # Draw the fitted line across the x-range chosen by the scatter plot.
    x_vals = np.array(ax.get_xlim())
    y_vals = regression.intercept + regression.slope * x_vals
    ax.plot(x_vals, y_vals, linewidth=3)
    fig.savefig(path, format='PNG')
    # Close explicitly so figures do not pile up when called in a loop.
    plt.close(fig)


def _format_regression(regression):
    """Render slope, intercept, r-value and p-value of a regression as text."""
    reg_str = 'Slope:\t{slope}\nIntercept:\t{intercept}\nR_value:\t{rvalue}\nP_value:\t{pvalue}'
    return reg_str.format(
        slope=regression.slope,
        intercept=regression.intercept,
        rvalue=regression.rvalue,
        pvalue=regression.pvalue,
    )


def count_stats(phoible, subset, feature, count_regressions=False, elevation_threshold=1500):
    """Test whether having ``feature`` is associated with elevation in one subset.

    Sets ``phoible.subset`` to ``subset``, loads its data frame and, for each
    Glottocode, counts the rows whose ``feature`` value is ``'+'``. Languages
    are then split by ``elevation_threshold`` and by presence of the feature,
    and a chi-squared test is run on the resulting 2x2 table. Optionally two
    linear regressions are fitted as well.

    Side effects: mutates ``phoible.subset`` and writes result files (chi2
    report, raw tables and, with ``count_regressions``, regression reports and
    plots) into ``phoible_results/<subset>/``.

    Parameters:
        phoible: object with a ``subset`` attribute and a ``get_df()`` method
            returning a frame with a ``Glottocode`` column and feature columns.
        subset: name of the dataset to analyse; also the output directory name.
        feature: name of the feature column to analyse.
        count_regressions: if true, also fit, plot and save linear regressions.
        elevation_threshold: elevation separating "higher" from "lower"
            languages; the default of 1500 reproduces the original analysis.

    Returns:
        ``None`` (after printing ``'No data: <subset>'``) when no language with
        the feature has a usable elevation. Otherwise ``(subset, chi)``, or
        ``(subset, chi, regression_no_zeros, regression_with_zeros)`` when
        ``count_regressions`` is true. ``chi`` is the ``chi2_contingency``
        result, or a list of four NaN values if that call raised ``ValueError``.
    """
    phoible.subset = subset
    # Read from the object that was passed in, not from a notebook-level global.
    data = phoible.get_df()
    amount_with_feature = data[data[feature] == '+'].groupby('Glottocode').size()

    languages = [lingtypology.glottolog.get_by_glot_id(glot_id) for glot_id in amount_with_feature.index]
    with_feature = pandas.DataFrame({
        'language': languages,
        feature: amount_with_feature,
        'elevation': lingtypology.get_elevations(languages),
    })
    # Rows whose elevation is the empty string are dropped (presumably how a
    # missing elevation is reported; confirm against lingtypology).
    with_feature = with_feature[with_feature.elevation != '']
    if with_feature.empty:
        print('No data: ' + subset)
        return

    if count_regressions:
        # Among languages that have the feature: is the number of '+' rows
        # related to elevation? Note x is the count and y is the elevation.
        regression_no_zeros = linregress(
            list(map(int, with_feature[feature])),
            list(map(int, with_feature.elevation))
        )

    # Languages of the subset without a single '+' row get a count of 0.
    no_feature = data[~data.Glottocode.isin(list(amount_with_feature.index))]
    no_feature = no_feature.drop_duplicates(subset='Glottocode')
    languages = [lingtypology.glottolog.get_by_glot_id(glot_id) for glot_id in no_feature.Glottocode]
    no_feature = pandas.DataFrame({
        'language': languages,
        feature: 0,
        'elevation': lingtypology.get_elevations(languages),
    })
    no_feature = no_feature[no_feature.elevation != '']
    all_ = pandas.concat((with_feature, no_feature))

    if count_regressions:
        # Same question over all languages, including those with a zero count.
        regression_with_zeros = linregress(
            list(map(int, all_[feature])),
            list(map(int, all_.elevation))
        )

    # 2x2 contingency table: rows are above / not above the threshold,
    # columns are with / without the feature.
    higher = all_[all_.elevation > elevation_threshold]
    higher = [len(higher[higher[feature] > 0]), len(higher[higher[feature] == 0])]
    lower = all_[all_.elevation <= elevation_threshold]
    lower = [len(lower[lower[feature] > 0]), len(lower[lower[feature] == 0])]
    table = [higher, lower]

    # Is being above the threshold associated with having the feature?
    try:
        chi = chi2_contingency(table)
    except ValueError:
        # The test could not be computed for this table; report NaN instead.
        chi = [math.nan, math.nan, math.nan, math.nan]

    # Write all plots and reports; makedirs also creates a missing parent
    # directory and tolerates an existing one.
    cdir = os.path.join('phoible_results', subset)
    os.makedirs(cdir, exist_ok=True)

    if count_regressions:
        # Regression plot for languages that have the feature.
        _save_regression_plot(
            with_feature[feature],
            with_feature.elevation,
            regression_no_zeros,
            os.path.join(cdir, '{}_linear_regression_only.png'.format(feature)),
        )
        # Regression plot for all languages.
        _save_regression_plot(
            all_[feature],
            all_.elevation,
            regression_with_zeros,
            os.path.join(cdir, '{}_linear_regression_all.png'.format(feature)),
        )

        # Regression summaries.
        fwrite(
            os.path.join(cdir, '{}_linear_regression_only.csv'.format(feature)),
            _format_regression(regression_no_zeros)
        )
        fwrite(
            os.path.join(cdir, '{}_linear_regression_all.csv'.format(feature)),
            _format_regression(regression_with_zeros)
        )

    # Chi-squared summary.
    fwrite(
        os.path.join(cdir, '{}_chi2.csv'.format(feature)),
        'chi2:\t{chi2}\nP_value:\t{pvalue}\nDegrees of freedom:\t{dof}\nExpected:\t{ex}'.format(
            chi2=chi[0],
            pvalue=chi[1],
            dof=chi[2],
            ex=chi[3]
        )
    )

    # Raw tables.
    with_feature.to_csv(os.path.join(cdir, '{}_with_raw.csv'.format(feature)))
    all_.to_csv(os.path.join(cdir, '{}_all_raw.csv'.format(feature)))
    if count_regressions:
        return subset, chi, regression_no_zeros, regression_with_zeros
    return subset, chi

In [159]:
if __name__ == '__main__':
    # Run count_stats for every feature on every dataset and collect p-values.
    # To restrict the run, replace binary_features with a shorter list, e.g.
    # ['loweredLarynxImplosive', 'raisedLarynxEjective', 'long', 'short'].
    features = binary_features
    # The only feature for which linear regressions are computed as well.
    regression_feature = 'raisedLarynxEjective'
    subsets = ['UPSID', 'SPA', 'AA', 'PH', 'GM', 'RA', 'SAPHON']
    results = {}
    os.makedirs('phoible_results', exist_ok=True)
    p = Phoible(subset='all', aggregated=False)
    p.show_citation = False
    for feature in features:
        with_regressions = feature == regression_feature
        processed_subsets = []
        regressions_no_zeros = []
        regressions_with_zeros = []
        chi2s = []
        for subset in subsets:
            r = count_stats(p, subset, feature, count_regressions=with_regressions)
            if not r:
                # count_stats returns None when the subset has no usable data.
                continue
            processed_subsets.append(r[0])
            chi2s.append(r[1])
            if with_regressions:
                regressions_no_zeros.append(r[2])
                regressions_with_zeros.append(r[3])
        plt.close()
        chi2_pvalues = [c[1] for c in chi2s]
        if with_regressions:
            regressed_result = pandas.DataFrame({
                'Dataset': processed_subsets,
                'Regression (only with feature)': ['%.015f' % r.pvalue for r in regressions_no_zeros],
                'Regression (all languages)': ['%.015f' % r.pvalue for r in regressions_with_zeros],
                # NaN p-values are kept (rendered as 'nan'): dropping them would
                # make this column shorter than the others and the constructor
                # would raise.
                'Chi2 Test': ['%.015f' % pvalue for pvalue in chi2_pvalues]
            })
        else:
            valid_pvalues = [pvalue for pvalue in chi2_pvalues if not math.isnan(pvalue)]
            # Skip features for which no dataset produced a usable p-value.
            if valid_pvalues:
                result = pandas.DataFrame({
                    'Dataset': processed_subsets + ['Median'],
                    # Per-dataset p-values are strings; the median is a float.
                    feature: ['%.015f' % pvalue for pvalue in chi2_pvalues] + \
                    [np.median(valid_pvalues)]
                })
                results[feature] = result

Elevations for these languages were not found: Kaliai, Nama, Katcha
Elevations for these languages were not found: Ikwo, Ezaa
Elevations for these languages were not found: Mianmin, Bikele, Korafe, Mvumbo, Karo, Saanich, Lorette Huron, Endo, Kuay
Elevations for these languages were not found: Chaha, Besleri, Efutu, Frafra, Zayse, Ezha, Soddo, Mmani, Copi, Kambe, Oko, Pana, Ikalanga, Kauma, Dinka, Gumer, Moghamo
Elevations for these languages were not found: Naiki, Mising, Abujmaria
Elevations for these languages were not found: Karo, Miraña, Khithaulhu, Shipibo
Elevations for these languages were not found: Kaliai, Nama, Katcha
No data: AA
No data: PH
Elevations for these languages were not found: Chaha, Besleri, Efutu, Frafra, Zayse, Soddo, Ezha, Mmani, Copi, Kambe, Oko, Pana, Ikalanga, Kauma, Dinka, Gumer, Moghamo
Elevations for these languages were not found: Abujmaria
Elevations for these languages were not found: Naiki, Mising
No data: SAPHON
Elevations for these languages were no

Elevations for these languages were not found: Naiki, Mising, Abujmaria
Elevations for these languages were not found: Karo, Miraña, Khithaulhu, Shipibo
Elevations for these languages were not found: Nama, Katcha
Elevations for these languages were not found: Kaliai
Elevations for these languages were not found: Ikwo, Ezaa
Elevations for these languages were not found: Mianmin, Bikele, Korafe, Mvumbo, Kuay
Elevations for these languages were not found: Saanich, Karo, Endo, Lorette Huron
Elevations for these languages were not found: Chaha, Besleri, Efutu, Frafra, Ezha, Mmani, Copi, Kambe, Oko, Pana, Ikalanga, Kauma, Soddo, Gumer, Moghamo
Elevations for these languages were not found: Zayse, Dinka
Elevations for these languages were not found: Naiki
Elevations for these languages were not found: Mising, Abujmaria
Elevations for these languages were not found: Karo, Miraña, Khithaulhu, Shipibo
Elevations for these languages were not found: Kaliai, Nama, Katcha
Elevations for these langua

Elevations for these languages were not found: Karo, Miraña, Khithaulhu, Shipibo
No data: UPSID
No data: SPA
No data: AA
Elevations for these languages were not found: Mianmin, Bikele, Korafe, Mvumbo, Saanich, Karo, Lorette Huron, Endo, Kuay
Elevations for these languages were not found: Chaha, Besleri, Efutu, Frafra, Zayse, Soddo, Ezha, Mmani, Copi, Kambe, Oko, Pana, Ikalanga, Kauma, Dinka, Gumer, Moghamo
No data: RA
No data: SAPHON
Elevations for these languages were not found: Kaliai, Nama
Elevations for these languages were not found: Katcha
Elevations for these languages were not found: Ikwo
Elevations for these languages were not found: Ezaa
Elevations for these languages were not found: Mianmin, Korafe, Mvumbo, Karo, Saanich, Lorette Huron, Kuay
Elevations for these languages were not found: Endo, Bikele
Elevations for these languages were not found: Chaha, Besleri, Efutu, Frafra, Zayse, Ezha, Soddo, Copi, Kambe, Oko, Pana, Ikalanga, Kauma, Dinka, Gumer
Elevations for these lang

In [160]:
# Show the regression and chi-squared p-values per dataset. This name is only
# assigned by the main loop when 'raisedLarynxEjective' is among the features.
regressed_result

Unnamed: 0,Dataset,Regression (only with feature),Regression (all languages),Chi2 Test
0,UPSID,0.950559282993466,4.4964081592e-05,3.2921681908e-05
1,SPA,0.475539733143422,5.592842023e-06,0.000176784757431
2,PH,0.731523538203316,0.392451413030472,0.160190111324293
3,GM,0.038586492300174,0.0,0.0
4,SAPHON,0.018874875617294,5.031926e-09,0.000377241915218


In [161]:
# Combine the per-feature p-value tables into one table: one row per dataset,
# one column per feature. The outer merge keeps datasets that are missing for
# some features.
if results:
    df = reduce(
        lambda left, right: pandas.merge(left, right, how='outer', on='Dataset'),
        results.values(),
    )
else:
    df = pandas.DataFrame(columns=['Dataset'])
# Move the 'Median' row to the bottom by value rather than by hard-coded row
# positions, so this works for any number of datasets and any merge ordering.
is_median = df['Dataset'] == 'Median'
df = pandas.concat([df[~is_median], df[is_median]])
df.to_csv('phoible_result.csv')

In [162]:
# Show the merged table of chi-squared p-values: one row per dataset, with the
# 'Median' row last.
df

Unnamed: 0,Dataset,short,long,delayedRelease,tap,trill,nasal,lateral,labial,round,...,back,tense,retractedTongueRoot,advancedTongueRoot,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,loweredLarynxImplosive,click
0,UPSID,0.730419672713904,0.620511832347898,0.610642343383107,0.92722783157449,0.517374876065757,0.738778553384912,0.117359459300816,,0.266708754830617,...,,0.266708754830617,0.124305390808548,,,0.362376239087925,0.127966661243741,,0.565424335905343,
1,SPA,0.497428201856995,0.831051620467149,0.433495995608892,0.987328733151668,0.960465877523403,,0.546315254202639,0.378695084030329,0.378695084030329,...,,,0.893600396371416,0.378695084030329,,0.885780256031918,0.132809308454472,0.808314671561521,0.877635260028769,
2,GM,0.658720709358762,0.006986869609299,0.843461590175212,0.836723984942173,0.949874476413349,0.160318898286199,0.641480155123851,,0.160318898286199,...,0.160318898286199,0.160318898286199,0.824181832981538,,0.160318898286199,0.048041180787709,0.005662174866105,,0.224522852678438,0.160318898286199
3,RA,0.08259276921486,0.112500539872434,,0.112500539872434,0.062153287592109,,0.930140153281564,,,...,,,0.930140153281564,,,0.894069996254116,0.12442786417134,,0.321514293603832,
5,AA,,0.755885144448328,,0.907576007474965,0.486470077078995,,0.049114227976661,,,...,,,0.225210272657504,,,0.130185303900607,0.649133295389312,,0.567918568194081,
6,PH,,0.254948600104008,0.905140088960697,0.790767254178321,0.132695245671694,0.757345703655888,0.320518850359147,,,...,,0.255246281705311,0.866506575329833,0.255246281705311,0.255246281705311,0.808968585395863,0.14319697650972,,0.94554733555659,
7,SAPHON,,0.02869427959889,0.485630464439581,0.349615490771714,0.85201833901043,0.711266316433214,1.332869234e-06,,,...,,,,0.186412393772568,,0.009041205598374,0.342260126597239,,0.643234605638429,
4,Median,0.578074,0.254949,0.610642,0.836724,0.517375,0.725022,0.320519,0.378695,0.266709,...,0.160319,0.255246,0.845344,0.255246,0.207783,0.362376,0.132809,0.808315,0.567919,0.160319
