In [25]:
import numpy as np
import pandas as pd
import scipy
from scipy import stats

In [5]:
# Load the tab-separated illiteracy dataset into a DataFrame.
data = pd.read_csv('illiteracy.txt', delimiter='\t')

In [6]:
# Preview the first five rows to check that the file was parsed as expected.
data.head(n=5)

Unnamed: 0,Country,Illit,Births
0,Albania,20.5,1.78
1,Algeria,39.1,2.44
2,Bahrain,15.0,2.34
3,Belize,5.9,2.97
4,Benin,73.5,5.6


In [12]:
# Spearman rank correlation (with its p-value) between the 'Illit' and
# 'Births' columns.
stats.spearmanr(
    a=data.loc[:, 'Illit'],
    b=data.loc[:, 'Births'],
)

SpearmanrResult(correlation=0.752962213732534, pvalue=2.085857122146067e-18)

In [15]:
# Load the tab-separated water dataset.
# NOTE: this rebinds `data`, replacing the illiteracy frame loaded earlier.
data = pd.read_csv('water.txt', delimiter='\t')

In [16]:
# Preview the first five rows of the water dataset.
data.head(n=5)

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [17]:
# Pearson correlation matrix of mortality and hardness over all towns.
# The numeric columns are selected explicitly because `data` also holds the
# string columns 'location' and 'town': pandas >= 2.0 raises on non-numeric
# columns in corr() instead of silently dropping them.
data[['mortality', 'hardness']].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.654849
hardness,-0.654849,1.0


In [18]:
# Spearman rank correlation (with its p-value) between the 'mortality' and
# 'hardness' columns over all towns.
stats.spearmanr(
    a=data.loc[:, 'mortality'],
    b=data.loc[:, 'hardness'],
)

SpearmanrResult(correlation=-0.6316646189166502, pvalue=4.79546153722838e-08)

In [19]:
# Split the towns into the two regions recorded in the 'location' column.
data_south = data.loc[data['location'].eq('South')]
data_north = data.loc[data['location'].eq('North')]

In [20]:
# Pearson correlation of mortality and hardness for the southern towns.
# The numeric columns are selected explicitly: the frame also contains string
# columns, which corr() no longer drops silently in pandas >= 2.0.
data_south[['mortality', 'hardness']].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [21]:
# Pearson correlation of mortality and hardness for the northern towns.
# The numeric columns are selected explicitly: the frame also contains string
# columns, which corr() no longer drops silently in pandas >= 2.0.
data_north[['mortality', 'hardness']].corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.368598
hardness,-0.368598,1.0


In [26]:
# 2x2 contingency table of counts, stored as floats.
# NOTE(review): later cells compare column 1 against column 0 as "men" vs
# "women" and use row 0 as the count of interest - confirm the layout against
# the data source.
bars_sex = np.array([[203, 239],
                     [718, 515]], dtype=float)

In [27]:
# p-value of the chi-square independence test on the 2x2 table
# (element 1 of the chi2_contingency result).
matthews_p_value = stats.chi2_contingency(bars_sex)[1]
print('Matthews significance p-value: %f' % matthews_p_value)

Matthews significance p-value: 0.000011


In [28]:
def proportions_diff_confint_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for a difference of proportions.

    Each sample is an indexable sequence of counts: its first element is used
    as the numerator of the proportion and the sum of all elements as the
    sample size. The two samples are treated as independent.

    Parameters
    ----------
    sample1, sample2 : sequence of numbers
        Count vectors for the two groups.
    alpha : float, optional
        Significance level; the interval has confidence level ``1 - alpha``.

    Returns
    -------
    tuple
        ``(left_boundary, right_boundary)`` of the interval for ``p1 - p2``.
    """
    size1 = np.sum(sample1)
    size2 = np.sum(sample2)
    share1 = sample1[0] / size1
    share2 = sample2[0] / size2

    # Two-sided standard normal quantile for the requested confidence level.
    quantile = stats.norm.ppf(1 - alpha / 2.)
    # Unpooled standard error of the difference of the two proportions.
    std_err = np.sqrt(share1 * (1 - share1)/ size1 + share2 * (1 - share2)/ size2)

    centre = share1 - share2
    return (centre - quantile * std_err, centre + quantile * std_err)

In [29]:
# Interval for the difference between the proportion taken from column 1 and
# the one taken from column 0 of the table.
sex_diff_confint = proportions_diff_confint_ind(bars_sex[:, 1], bars_sex[:, 0])
print('95%% confidence interval for a difference of men and women: [%.4f, %.4f]' %
      sex_diff_confint)

95% confidence interval for a difference of men and women: [0.0539, 0.1392]


In [32]:
def proportions_diff_z_stat_ind(sample1, sample2):
    """Z statistic for the difference of proportions of two independent samples.

    Each sample is an indexable sequence of counts: its first element is used
    as the numerator of the proportion and the sum of all elements as the
    sample size. The standard error uses the proportion pooled over both
    samples.

    Parameters
    ----------
    sample1, sample2 : sequence of numbers
        Count vectors for the two groups.

    Returns
    -------
    float
        ``(p1 - p2)`` divided by the pooled standard error.
    """
    size1, size2 = np.sum(sample1), np.sum(sample2)
    share1, share2 = sample1[0] / size1, sample2[0] / size2

    # Proportion pooled across the two samples.
    pooled = float(share1*size1 + share2*size2) / (size1 + size2)
    pooled_std_err = np.sqrt(pooled * (1 - pooled) * (1. / size1 + 1. / size2))

    return (share1 - share2) / pooled_std_err

In [30]:
def proportions_diff_z_test(z_stat, alternative = 'two-sided'):
    """P-value of a z-test given its statistic, under the standard normal law.

    Parameters
    ----------
    z_stat : float
        Value of the z statistic.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Form of the alternative hypothesis.

    Returns
    -------
    float
        The p-value for the requested alternative.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three supported values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    # Upper tails use the survival function rather than ``1 - cdf``: the
    # subtraction cancels to exactly 0 once cdf rounds to 1.0 (large |z|),
    # whereas sf keeps the tiny tail probability.
    if alternative == 'two-sided':
        return 2 * stats.norm.sf(np.abs(z_stat))

    if alternative == 'less':
        return stats.norm.cdf(z_stat)

    return stats.norm.sf(z_stat)

In [33]:
# Two-sided z-test p-value for the proportions taken from column 1 and
# column 0 of the table.
sex_z_stat = proportions_diff_z_stat_ind(bars_sex[:, 1], bars_sex[:, 0])
proportions_diff_z_test(sex_z_stat)

8.153453089576601e-06

In [34]:
# 3x3 contingency table of counts, stored as floats.
happiness = np.array(
    [
        [197, 111, 33],
        [382, 685, 331],
        [110, 342, 333],
    ],
    dtype=float,
)

In [35]:
# Chi-square statistic of the independence test (element 0 of the result).
# The test is run once; the earlier bare call discarded its result and only
# repeated the computation.
print('Chi2 stat value: %.4f' % stats.chi2_contingency(happiness)[0])

Chi2 stat value: 293.6831


In [36]:
# p-value of the chi-square independence test (element 1 of the result),
# printed with 62 decimals because it is far below ordinary float formatting.
happiness_p_value = stats.chi2_contingency(happiness)[1]
print('Chi2 stat p-value: %.62f' % happiness_p_value)

Chi2 stat p-value: 0.00000000000000000000000000000000000000000000000000000000000002


In [37]:
def cramers_stat(confusion_matrix):
    """Cramér's V association measure for a two-way contingency table.

    Computed as ``sqrt(chi2 / (n * (min(rows, cols) - 1)))``, where ``chi2``
    is the Pearson chi-square statistic of the table and ``n`` is the total
    count.

    Parameters
    ----------
    confusion_matrix : array-like, 2-D
        Table of observed counts.

    Returns
    -------
    float
        Cramér's V.
    """
    table = np.asarray(confusion_matrix)
    # correction=False: scipy applies Yates' continuity correction by default
    # to 2x2 tables, which shrinks chi2 and would bias V downwards (a
    # perfectly associated 2x2 table would give 0.9 instead of 1).
    chi2 = stats.chi2_contingency(table, correction=False)[0]
    n = table.sum()
    return np.sqrt(chi2 / (n*(min(table.shape)-1)))

In [38]:
# Cramér's V for the happiness table, printed to four decimals.
happiness_cramers_v = cramers_stat(happiness)
print('V Cramer stat value: %.4f' % happiness_cramers_v)

V Cramer stat value: 0.2412
