In [2]:
import pandas
import os
import lingtypology
import math
import numpy as np
import matplotlib.pyplot as plt
from lingtypology.db_apis import Phoible
from scipy.stats import linregress, chi2_contingency
from functools import reduce

In [13]:
# Extract every feature column that takes both '+' and '-' values.
p = Phoible(aggregated=False)
df = p.get_df()
binary_features = [
    column
    for column in df
    if {'+', '-'}.issubset(set(df[column]))
]
binary_features

Moran, Steven & McCloy, Daniel (eds.) 2019.
PHOIBLE 2.0.
Jena: Max Planck Institute for the Science of Human History.
(Available online at http://phoible.org, Accessed on 2019-05-30.)


['syllabic',
 'short',
 'long',
 'consonantal',
 'sonorant',
 'continuant',
 'delayedRelease',
 'approximant',
 'tap',
 'trill',
 'nasal',
 'lateral',
 'labial',
 'round',
 'labiodental',
 'coronal',
 'anterior',
 'distributed',
 'strident',
 'dorsal',
 'high',
 'low',
 'front',
 'back',
 'tense',
 'retractedTongueRoot',
 'advancedTongueRoot',
 'periodicGlottalSource',
 'epilaryngealSource',
 'spreadGlottis',
 'constrictedGlottis',
 'fortis',
 'raisedLarynxEjective',
 'loweredLarynxImplosive',
 'click']

Посчитаем всё про бинарные фичи для датасетов из Phoible

In [14]:
def fwrite(path, data):
    """Write the string ``data`` to ``path`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so existing content is replaced.
    """
    with open(path, mode='w', encoding='utf-8') as out_file:
        out_file.write(data)

def _save_regression_plot(x, y, regression, path):
    """Scatter ``y`` against ``x``, overlay the fitted line and save the figure as PNG."""
    fig, ax = plt.subplots()
    ax.scatter(x, y, color='black')
    # Draw the fitted line across the x-range chosen for the scatter plot.
    x_vals = np.array(ax.get_xlim())
    y_vals = regression.intercept + regression.slope * x_vals
    ax.plot(x_vals, y_vals, linewidth=3)
    fig.savefig(path, format='PNG')
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(fig)


def _regression_report(regression):
    """Format slope, intercept, r-value and p-value of a ``linregress`` result as text."""
    return 'Slope:\t{slope}\nIntercept:\t{intercept}\nR_value:\t{rvalue}\nP_value:\t{pvalue}'.format(
        slope=regression.slope,
        intercept=regression.intercept,
        rvalue=regression.rvalue,
        pvalue=regression.pvalue
    )


def count_stats(phoible, subset, feature, count_regressions=False, elevation_threshold=1500):
    """Relate one binary feature to language elevation for one Phoible subset.

    For every Glottocode in the subset the number of rows marked '+' for
    ``feature`` is counted. Languages are then split by ``elevation_threshold``
    and a chi-square test is run on the 2x2 table (above/below threshold x
    has/lacks the feature). Optionally two linear regressions are fitted as
    well. Results, plots and the underlying tables are written to
    ``phoible_results/<subset>/``.

    Parameters:
        phoible: object exposing a ``subset`` attribute and ``get_df()``;
            its ``subset`` attribute is overwritten with ``subset``.
        subset: name of the Phoible subset; also used as output directory name.
        feature: name of the feature column to analyse.
        count_regressions: when true, also fit, plot and save linear
            regressions (languages with the feature only, and all languages).
        elevation_threshold: elevation separating "higher" from "lower"
            languages in the chi-square table (default 1500).

    Returns:
        ``None`` (after printing 'No data: <subset>') when no language with the
        feature has an elevation. Otherwise ``(subset, chi)``, or
        ``(subset, chi, regression_no_zeros, regression_with_zeros)`` when
        ``count_regressions`` is true. ``chi`` is the ``chi2_contingency``
        result, or a list of four NaNs if that call raised ``ValueError``.
    """
    phoible.subset = subset
    # Query the object that was passed in instead of relying on a global.
    data = phoible.get_df()
    amount_with_feature = data[data[feature] == '+'].groupby('Glottocode').size()

    languages = [lingtypology.glottolog.get_by_glot_id(glot_id) for glot_id in amount_with_feature.index]
    with_feature = pandas.DataFrame({
        'language': languages,
        feature: amount_with_feature,
        'elevation': lingtypology.get_elevations(languages),
    })
    # Rows whose elevation is the empty string carry no usable elevation.
    with_feature = with_feature[with_feature.elevation != '']
    if with_feature.empty:
        print('No data: ' + subset)
        return

    if count_regressions:
        # Does the feature count depend on elevation among languages that have the feature?
        regression_no_zeros = linregress(
            list(map(int, with_feature[feature])),
            list(map(int, with_feature.elevation))
        )

    # Languages of the subset without a single '+' row: one row each, count 0.
    no_feature = data[~data.Glottocode.isin(list(amount_with_feature.index))]
    no_feature = no_feature.drop_duplicates(subset='Glottocode')
    languages = [lingtypology.glottolog.get_by_glot_id(glot_id) for glot_id in no_feature.Glottocode]
    no_feature = pandas.DataFrame({
        'language': languages,
        feature: 0,
        'elevation': lingtypology.get_elevations(languages),
    })
    no_feature = no_feature[no_feature.elevation != '']
    all_ = pandas.concat((with_feature, no_feature))

    if count_regressions:
        # Does the feature count depend on elevation across all languages?
        regression_with_zeros = linregress(
            list(map(int, all_[feature])),
            list(map(int, all_.elevation))
        )

    # 2x2 contingency table: rows are above / not above the threshold,
    # columns are languages with / without the feature.
    higher = all_[all_.elevation > elevation_threshold]
    lower = all_[all_.elevation <= elevation_threshold]
    table = [
        [len(higher[higher[feature] > 0]), len(higher[higher[feature] == 0])],
        [len(lower[lower[feature] > 0]), len(lower[lower[feature] == 0])],
    ]

    # Is having the feature associated with being above the threshold?
    try:
        chi = chi2_contingency(table)
    except ValueError:
        # Keep the four-slot shape of the result so callers can still index it.
        chi = [math.nan, math.nan, math.nan, math.nan]

    # Draw the plots and write all results to files.
    cdir = os.path.join('phoible_results', subset)
    # Also creates the parent directory and tolerates an existing one.
    os.makedirs(cdir, exist_ok=True)

    if count_regressions:
        # Regression plot for languages with the feature.
        _save_regression_plot(
            with_feature[feature],
            with_feature.elevation,
            regression_no_zeros,
            os.path.join(cdir, '{}_linear_regression_only.png'.format(feature))
        )
        # Regression plot for all languages.
        _save_regression_plot(
            all_[feature],
            all_.elevation,
            regression_with_zeros,
            os.path.join(cdir, '{}_linear_regression_all.png'.format(feature))
        )

        # Regression results.
        fwrite(
            os.path.join(cdir, '{}_linear_regression_only.csv'.format(feature)),
            _regression_report(regression_no_zeros)
        )
        fwrite(
            os.path.join(cdir, '{}_linear_regression_all.csv'.format(feature)),
            _regression_report(regression_with_zeros)
        )

    # Chi-square results.
    fwrite(
        os.path.join(cdir, '{}_chi2.csv'.format(feature)),
        'chi2:\t{chi2}\nP_value:\t{pvalue}\nDegrees of freedom:\t{dof}\nExpected:\t{ex}'.format(
            chi2=chi[0],
            pvalue=chi[1],
            dof=chi[2],
            ex=chi[3]
        )
    )

    # Raw data.
    with_feature.to_csv(os.path.join(cdir, '{}_with_raw.csv'.format(feature)))
    all_.to_csv(os.path.join(cdir, '{}_all_raw.csv'.format(feature)))
    if count_regressions:
        return subset, chi, regression_no_zeros, regression_with_zeros
    return subset, chi

In [15]:
if __name__ == '__main__':
    # Run the elevation analysis for every binary feature and every subset.
    # `results` maps feature name -> table of chi-square p-values per dataset;
    # `regressed_result` is the summary for the one feature that also gets
    # linear regressions.
    regression_feature = 'raisedLarynxEjective'
    # Narrow this list (e.g. ['loweredLarynxImplosive', 'raisedLarynxEjective',
    # 'long', 'short']) to analyse fewer features.
    features = binary_features
    subsets = ['UPSID', 'SPA', 'AA', 'PH', 'GM', 'RA', 'SAPHON']
    results = {}
    os.makedirs('phoible_results', exist_ok=True)
    p = Phoible(subset='all', aggregated=False)
    p.show_citation = False
    for feature in features:
        with_regressions = feature == regression_feature
        processed_subsets = []
        regressions_no_zeros = []
        regressions_with_zeros = []
        chi2s = []
        for subset in subsets:
            r = count_stats(p, subset, feature, count_regressions=with_regressions)
            if not r:
                # count_stats returned nothing for this subset; skip it.
                continue
            processed_subsets.append(r[0])
            chi2s.append(r[1])
            if with_regressions:
                regressions_no_zeros.append(r[2])
                regressions_with_zeros.append(r[3])
        plt.close()
        p_values = [c[1] for c in chi2s]
        if with_regressions:
            # Every column must have one entry per processed subset, so NaN
            # p-values are kept (formatted as 'nan') rather than dropped.
            regressed_result = pandas.DataFrame({
                'Dataset': processed_subsets,
                'Regression (only with feature)': ['%.015f' % r.pvalue for r in regressions_no_zeros],
                'Regression (all languages)': ['%.015f' % r.pvalue for r in regressions_with_zeros],
                'Chi2 Test': ['%.015f' % value for value in p_values]
            })
        elif not all(math.isnan(value) for value in p_values):
            # Per-dataset p-values are stored as formatted strings; the final
            # 'Median' entry is the float median of the non-NaN p-values.
            result = pandas.DataFrame({
                'Dataset': processed_subsets + ['Median'],
                feature: ['%.04f' % value for value in p_values] + \
                [np.median([value for value in p_values if not math.isnan(value)])]
            })
            results[feature] = result

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)
  result = method(y)


Elevations for these languages were not found: Nama, Katcha, Kaliai
Elevations for these languages were not found: Ikwo, Ezaa
Elevations for these languages were not found: Bikele, Karo, Saanich, Mianmin, Kuay, Endo, Lorette Huron, Mvumbo, Korafe
Elevations for these languages were not found: Pana, Frafra, Dinka, Ezha, Soddo, Kambe, Ikalanga, Oko, Chaha, Kauma, Efutu, Gumer, Copi, Moghamo, Besleri, Mmani, Zayse
Elevations for these languages were not found: Mising, Naiki, Abujmaria
Elevations for these languages were not found: Shipibo, Khithaulhu, Karo, Miraña
Elevations for these languages were not found: Nama, Katcha, Kaliai
No data: AA
No data: PH
Elevations for these languages were not found: Pana, Frafra, Dinka, Ezha, Soddo, Kambe, Ikalanga, Oko, Chaha, Kauma, Efutu, Gumer, Copi, Moghamo, Besleri, Mmani, Zayse
Elevations for these languages were not found: Abujmaria
Elevations for these languages were not found: Naiki, Mising
No data: SAPHON
Elevations for these languages were no

Elevations for these languages were not found: Mising, Naiki, Abujmaria
Elevations for these languages were not found: Shipibo, Khithaulhu, Karo, Miraña
Elevations for these languages were not found: Nama, Katcha
Elevations for these languages were not found: Kaliai
Elevations for these languages were not found: Ikwo, Ezaa
Elevations for these languages were not found: Bikele, Mianmin, Kuay, Mvumbo, Korafe
Elevations for these languages were not found: Karo, Lorette Huron, Endo, Saanich
Elevations for these languages were not found: Pana, Frafra, Ezha, Soddo, Kambe, Ikalanga, Oko, Chaha, Kauma, Efutu, Gumer, Copi, Moghamo, Besleri, Mmani
Elevations for these languages were not found: Dinka, Zayse
Elevations for these languages were not found: Naiki
Elevations for these languages were not found: Mising, Abujmaria
Elevations for these languages were not found: Shipibo, Khithaulhu, Karo, Miraña
Elevations for these languages were not found: Nama, Katcha, Kaliai
Elevations for these langua

Elevations for these languages were not found: Shipibo, Khithaulhu, Karo, Miraña
No data: UPSID
No data: SPA
No data: AA
Elevations for these languages were not found: Bikele, Karo, Saanich, Mianmin, Kuay, Endo, Lorette Huron, Mvumbo, Korafe
Elevations for these languages were not found: Pana, Frafra, Dinka, Ezha, Soddo, Kambe, Ikalanga, Oko, Chaha, Kauma, Efutu, Gumer, Copi, Moghamo, Besleri, Mmani, Zayse
No data: RA
No data: SAPHON
Elevations for these languages were not found: Nama, Kaliai
Elevations for these languages were not found: Katcha
Elevations for these languages were not found: Ikwo
Elevations for these languages were not found: Ezaa
Elevations for these languages were not found: Karo, Saanich, Mianmin, Kuay, Lorette Huron, Mvumbo, Korafe
Elevations for these languages were not found: Bikele, Endo
Elevations for these languages were not found: Pana, Frafra, Dinka, Ezha, Soddo, Kambe, Ikalanga, Oko, Chaha, Kauma, Efutu, Gumer, Copi, Besleri, Zayse
Elevations for these lang

In [16]:
# Display the p-value summary (regressions and chi-square) built for 'raisedLarynxEjective'.
regressed_result

Unnamed: 0,Dataset,Regression (only with feature),Regression (all languages),Chi2 Test
0,UPSID,0.950559282993466,4.4964081592e-05,3.2921681908e-05
1,SPA,0.475539733143422,5.592842023e-06,0.000176784757431
2,PH,0.731523538203316,0.392451413030472,0.160190111324293
3,GM,0.038586492300174,0.0,0.0
4,SAPHON,0.018874875617294,5.031926e-09,0.000377241915218


In [17]:
# Combine the per-feature p-value tables into one wide table keyed by 'Dataset'.
tables = list(results.values())
if tables:
    df = reduce(
        lambda left, right: pandas.merge(left, right, how='outer', on='Dataset'),
        tables
    )
    # Move the 'Median' row to the bottom by label, so the ordering does not
    # depend on how many datasets ended up in the table.
    is_median = df['Dataset'] == 'Median'
    df = pandas.concat([df[~is_median], df[is_median]])
else:
    # Nothing was collected; keep an empty frame.
    df = pandas.DataFrame()
df.to_csv('phoible_result.csv')

In [18]:
# Display the merged per-dataset table of chi-square p-values.
df

Unnamed: 0,Dataset,short,long,delayedRelease,tap,trill,nasal,lateral,labial,round,...,back,tense,retractedTongueRoot,advancedTongueRoot,epilaryngealSource,spreadGlottis,constrictedGlottis,fortis,loweredLarynxImplosive,click
0,UPSID,0.7304,0.6205,0.6106,0.9272,0.5174,0.7388,0.1174,,0.2667,...,,0.2667,0.1243,,,0.3624,0.128,,0.5654,
1,SPA,0.4974,0.8311,0.4335,0.9873,0.9605,,0.5463,0.3787,0.3787,...,,,0.8936,0.3787,,0.8858,0.1328,0.8083,0.8776,
2,GM,0.6587,0.007,0.8435,0.8367,0.9499,0.1603,0.6415,,0.1603,...,0.1603,0.1603,0.8242,,0.1603,0.048,0.0057,,0.2245,0.1603
3,RA,0.0826,0.1125,,0.1125,0.0622,,0.9301,,,...,,,0.9301,,,0.8941,0.1244,,0.3215,
5,AA,,0.7559,,0.9076,0.4865,,0.0491,,,...,,,0.2252,,,0.1302,0.6491,,0.5679,
6,PH,,0.2549,0.9051,0.7908,0.1327,0.7573,0.3205,,,...,,0.2552,0.8665,0.2552,0.2552,0.809,0.1432,,0.9455,
7,SAPHON,,0.0287,0.4856,0.3496,0.852,0.7113,0.0,,,...,,,,0.1864,,0.009,0.3423,,0.6432,
4,Median,0.578074,0.254949,0.610642,0.836724,0.517375,0.725022,0.320519,0.378695,0.266709,...,0.160319,0.255246,0.845344,0.255246,0.207783,0.362376,0.132809,0.808315,0.567919,0.160319


In [26]:
# LaTeX table, part 1: the 'short' through 'nasal' columns of the summary frame.
I = df.loc[:, [
    'Dataset',
    'short',
    'long',
    'delayedRelease',
    'tap',
    'trill',
    'nasal',
]]
print(I.to_latex())

\begin{tabular}{llllllll}
\toprule
{} & Dataset &     short &      long & delayedRelease &       tap &     trill &     nasal \\
\midrule
0 &   UPSID &    0.7304 &    0.6205 &         0.6106 &    0.9272 &    0.5174 &    0.7388 \\
1 &     SPA &    0.4974 &    0.8311 &         0.4335 &    0.9873 &    0.9605 &       nan \\
2 &      GM &    0.6587 &    0.0070 &         0.8435 &    0.8367 &    0.9499 &    0.1603 \\
3 &      RA &    0.0826 &    0.1125 &            nan &    0.1125 &    0.0622 &       nan \\
5 &      AA &       NaN &    0.7559 &            nan &    0.9076 &    0.4865 &       nan \\
6 &      PH &       NaN &    0.2549 &         0.9051 &    0.7908 &    0.1327 &    0.7573 \\
7 &  SAPHON &       NaN &    0.0287 &         0.4856 &    0.3496 &    0.8520 &    0.7113 \\
4 &  Median &  0.578074 &  0.254949 &       0.610642 &  0.836724 &  0.517375 &  0.725022 \\
\bottomrule
\end{tabular}



In [28]:
# LaTeX table, part 2: the 'lateral' through 'strident' columns of the summary frame.
II = df.loc[:, [
    'Dataset',
    'lateral',
    'labial',
    'round',
    'labiodental',
    'distributed',
    'strident',
]]
print(II.to_latex())

\begin{tabular}{llllllll}
\toprule
{} & Dataset &   lateral &    labial &     round & labiodental & distributed & strident \\
\midrule
0 &   UPSID &    0.1174 &       nan &    0.2667 &      0.8925 &      0.8872 &   0.5576 \\
1 &     SPA &    0.5463 &    0.3787 &    0.3787 &      0.1592 &      0.2771 &   0.7159 \\
2 &      GM &    0.6415 &       nan &    0.1603 &      0.5869 &      0.4575 &   0.3861 \\
3 &      RA &    0.9301 &       nan &       nan &      0.9249 &         nan &   0.3215 \\
5 &      AA &    0.0491 &       nan &       nan &      0.1428 &      0.8365 &      nan \\
6 &      PH &    0.3205 &       nan &       nan &      0.8006 &      0.0753 &   0.4896 \\
7 &  SAPHON &    0.0000 &       nan &       nan &      0.8457 &      0.0139 &   0.3705 \\
4 &  Median &  0.320519 &  0.378695 &  0.266709 &    0.800579 &    0.367317 &  0.43784 \\
\bottomrule
\end{tabular}



In [30]:
# LaTeX table, part 3: the 'low' through 'advancedTongueRoot' columns of the summary frame.
III = df.loc[:, [
    'Dataset',
    'low',
    'front',
    'back',
    'tense',
    'retractedTongueRoot',
    'advancedTongueRoot',
]]
print(III.to_latex())

\begin{tabular}{llllllll}
\toprule
{} & Dataset &       low &     front &      back &     tense & retractedTongueRoot & advancedTongueRoot \\
\midrule
0 &   UPSID &    0.2667 &       nan &       nan &    0.2667 &              0.1243 &                NaN \\
1 &     SPA &    0.3787 &       nan &       nan &       nan &              0.8936 &             0.3787 \\
2 &      GM &    0.4430 &    0.1603 &    0.1603 &    0.1603 &              0.8242 &                NaN \\
3 &      RA &    0.3215 &       nan &       nan &       nan &              0.9301 &                NaN \\
5 &      AA &       nan &       nan &       nan &       nan &              0.2252 &                NaN \\
6 &      PH &    0.5906 &       nan &       nan &    0.2552 &              0.8665 &             0.2552 \\
7 &  SAPHON &       nan &       nan &       nan &       nan &                 NaN &             0.1864 \\
4 &  Median &  0.378695 &  0.160319 &  0.160319 &  0.255246 &            0.845344 &           0.255246 \\
\

In [31]:
# LaTeX table, part 4: the 'epilaryngealSource' through 'click' columns of the summary frame.
IV = df.loc[:, [
    'Dataset',
    'epilaryngealSource',
    'spreadGlottis',
    'constrictedGlottis',
    'fortis',
    'loweredLarynxImplosive',
    'click',
]]
print(IV.to_latex())

\begin{tabular}{llllllll}
\toprule
{} & Dataset & epilaryngealSource & spreadGlottis & constrictedGlottis &    fortis & loweredLarynxImplosive &     click \\
\midrule
0 &   UPSID &                NaN &        0.3624 &             0.1280 &       NaN &                 0.5654 &       NaN \\
1 &     SPA &                NaN &        0.8858 &             0.1328 &    0.8083 &                 0.8776 &       NaN \\
2 &      GM &             0.1603 &        0.0480 &             0.0057 &       NaN &                 0.2245 &    0.1603 \\
3 &      RA &                NaN &        0.8941 &             0.1244 &       NaN &                 0.3215 &       NaN \\
5 &      AA &                NaN &        0.1302 &             0.6491 &       NaN &                 0.5679 &       NaN \\
6 &      PH &             0.2552 &        0.8090 &             0.1432 &       NaN &                 0.9455 &       NaN \\
7 &  SAPHON &                NaN &        0.0090 &             0.3423 &       NaN &                 0