In [1]:
import pandas as pd
import numpy as np

import scipy.stats as sts
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('illiteracy.txt', sep='\t')
data.head()

Unnamed: 0,Country,Illit,Births
0,Albania,20.5,1.78
1,Algeria,39.1,2.44
2,Bahrain,15.0,2.34
3,Belize,5.9,2.97
4,Benin,73.5,5.6


In [4]:
data.corr(method='spearman')

Unnamed: 0,Illit,Births
Illit,1.0,0.752962
Births,0.752962,1.0


In [5]:
data = pd.read_csv('water.txt', sep='\t')
data.head()

Unnamed: 0,location,town,mortality,hardness
0,South,Bath,1247,105
1,North,Birkenhead,1668,17
2,South,Birmingham,1466,5
3,North,Blackburn,1800,14
4,North,Blackpool,1609,18


In [7]:
data.corr(method='spearman')

Unnamed: 0,mortality,hardness
mortality,1.0,-0.631665
hardness,-0.631665,1.0


In [8]:
north = data[data['location']=='North']
south = data[data['location']=='South']

In [9]:
north.corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.368598
hardness,-0.368598,1.0


In [10]:
south.corr()

Unnamed: 0,mortality,hardness
mortality,1.0,-0.602153
hardness,-0.602153,1.0


In [12]:
sts.chi2_contingency([[203, 239], [718, 515]])

(19.40753078854304,
 1.0558987006638725e-05,
 1,
 array([[243.03402985, 198.96597015],
        [677.96597015, 555.03402985]]))

In [19]:
n1 = (203+718)
n2 = (239+515)

p1 = 203 / n1
p2 = 239 / n2

In [20]:
p1, p2

(0.22041259500542887, 0.3169761273209549)

In [29]:
P = (203 + 209)/(n1 + n2) 
(p2 - p1) - sts.norm.ppf(1-0.05/2) * np.sqrt(P*(1-P)*(1/n1 + 1/n2))

0.055108669662276615

In [31]:
z_obs = (p2 - p1) / np.sqrt(P*(1-P)*(1/n1 + 1/n2))

In [34]:
2*(1 - sts.norm.cdf(z_obs))

4.98369948909172e-06

In [35]:
data = [
    [197, 111, 33],
    [382, 685, 331],
    [110, 342, 333]
]

In [36]:
data

[[197, 111, 33], [382, 685, 331], [110, 342, 333]]

In [37]:
sts.chi2_contingency(data)

(293.68311039689746,
 2.4964299580093467e-62,
 4,
 array([[ 93.08597464, 153.74722662,  94.16679873],
        [381.6251981 , 630.318542  , 386.0562599 ],
        [214.28882726, 353.93423138, 216.77694136]]))

In [39]:
np.sqrt(293.68311039689746 / (np.sum(data) * 2))

0.2412013934500338

In [40]:
data = pd.read_csv('AUCs.txt', sep='\t')
data.head()

Unnamed: 0.1,Unnamed: 0,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
0,adult (sample),0.763,0.768,0.771,0.798
1,breast cancer,0.599,0.591,0.59,0.569
2,breast cancer wisconsin,0.954,0.971,0.968,0.967
3,cmc,0.628,0.661,0.654,0.657
4,ionosphere,0.882,0.888,0.886,0.898


In [44]:
from itertools import combinations
combs = list(combinations(data.columns[1:], 2))

In [54]:
data[list(combs[0])].values[:, 0]

array([0.763, 0.599, 0.954, 0.628, 0.882, 0.936, 0.661, 0.583, 0.775,
       1.   , 0.94 , 0.619, 0.972, 0.957])

In [57]:
subset = data[list(combs[0])].values
x = subset[:, 0]
y = subset[:, 1]

sts.wilcoxon(x, y)[1]

0.01075713311978963

In [58]:
ps = []
for comb in combs:
    subset = data[list(comb)].values
    x = subset[:, 0]
    y = subset[:, 1]

    ps.append((comb, sts.wilcoxon(x, y)[1]))

In [59]:
ps

[(('C4.5', 'C4.5+m'), 0.01075713311978963),
 (('C4.5', 'C4.5+cf'), 0.861262330095348),
 (('C4.5', 'C4.5+m+cf'), 0.015906444101703374),
 (('C4.5+m', 'C4.5+cf'), 0.046332729793395394),
 (('C4.5+m', 'C4.5+m+cf'), 0.3278256758446406),
 (('C4.5+cf', 'C4.5+m+cf'), 0.022909099354356588)]

In [60]:
from statsmodels.stats.multitest import multipletests

In [63]:
ps_ = []
for pp in ps:
    ps_.append(pp[1])

In [64]:
ps_

[0.01075713311978963,
 0.861262330095348,
 0.015906444101703374,
 0.046332729793395394,
 0.3278256758446406,
 0.022909099354356588]

In [65]:
multipletests(ps_, method='holm')

(array([False, False, False, False, False, False]),
 array([0.0645428 , 0.86126233, 0.07953222, 0.13899819, 0.65565135,
        0.0916364 ]),
 0.008512444610847103,
 0.008333333333333333)

In [66]:
multipletests(ps_, method='fdr_bh')

(array([ True, False,  True, False, False,  True]),
 array([0.0458182 , 0.86126233, 0.0458182 , 0.06949909, 0.39339081,
        0.0458182 ]),
 0.008512444610847103,
 0.008333333333333333)