In [2]:
from scipy import stats

In [3]:
# Observed sample and the reference value m0 that is subtracted from every
# observation before the Wilcoxon signed-rank test further down.
data = [49,58,75,110,112,132,151,276,281,362]
m0 = 200

In [4]:
# Shift every observation by m0. A list comprehension is used instead of
# map() so that `dt` is a real list on both Python 2 and Python 3.
dt = [x - m0 for x in data]

In [5]:
# Wilcoxon signed-rank test on the shifted observations `dt`.
stats.wilcoxon(dt)

WilcoxonResult(statistic=17.0, pvalue=0.2845026979112075)

In [6]:
# Two samples (12 and 9 observations) compared with stats.mannwhitneyu below.
dt_1 = [22,22,15,13,19,19,18,20,21,13,13,15]
dt_2 = [17,18,18,15,12,4,14,15,10]

In [7]:
# Mann-Whitney U test on the two samples.
# NOTE(review): no `alternative` is passed, so the reported p-value depends on
# SciPy's default for this function — confirm it is the intended alternative.
stats.mannwhitneyu(dt_1, dt_2)

MannwhitneyuResult(statistic=27.0, pvalue=0.02900499272087373)

In [8]:
import pandas as pd
import numpy as np
# Read the tab-separated file 'challenger.csv' and preview its first rows.
df = pd.read_csv('challenger.csv', sep='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,Temperature,Incident
0,Apr12.81,18.9,0
1,Nov12.81,21.1,1
2,Mar22.82,20.6,0
3,Nov11.82,20.0,0
4,Apr04.83,19.4,0


In [9]:
# Temperatures split by the Incident flag: X1 for rows with Incident == 1,
# X2 for rows with Incident == 0. A single .loc lookup selects rows and
# column together instead of chaining two indexing steps.
X1 = df.loc[df['Incident'] == 1, 'Temperature'].values
X2 = df.loc[df['Incident'] == 0, 'Temperature'].values

In [10]:
# Parenthesised print works (and prints the same text) on Python 2 and 3.
print(X1)
print(X2)

[21.1 13.9 17.2 21.1 11.7 23.9 14.4]
[18.9 20.6 20.  19.4 22.2 22.8 21.1 25.6 19.4 19.4 23.9 21.1 27.2 24.4
 26.1 24.4]


In [11]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go

# Initialise plotly's offline notebook mode before any figure is shown.
init_notebook_mode(connected=True)

In [24]:
from plotly.subplots import make_subplots

# Side-by-side histograms of Temperature: X1 (Incident == 1) on the left and
# X2 (Incident == 0) on the right. The figure is created directly by
# make_subplots; a separate go.Figure() beforehand would only be overwritten.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "histogram"}, {"type": "histogram"}]],
    subplot_titles=["Temperature, Incident == 1", "Temperature, Incident == 0"]
)
fig.add_trace(go.Histogram(x=np.array(X1), nbinsx=7, name="Incident == 1"), 1, 1)
fig.add_trace(go.Histogram(x=np.array(X2), nbinsx=7, name="Incident == 0"), 1, 2)
fig.show()

In [13]:
def get_bootstrap_samples(data, n_samples):
    """Generate ``n_samples`` bootstrap resamples of ``data``.

    Each resample has the same length as ``data`` and is drawn with
    replacement using NumPy's global random state.

    Parameters
    ----------
    data : array-like, 1-D
        Observations to resample. Lists and tuples are accepted; they are
        converted to an ndarray so that integer-array indexing works.
    n_samples : int
        Number of resamples to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, len(data))``; row ``i`` is one resample.
    """
    data = np.asarray(data)  # plain sequences do not support 2-D index arrays
    indices = np.random.randint(0, len(data), (n_samples, len(data)))
    return data[indices]
def stat_intervals(stat, alpha):
    """Return the percentile confidence interval for a statistic.

    Parameters
    ----------
    stat : array-like
        Values of the statistic (e.g. one per bootstrap resample).
    alpha : float
        Significance level; the interval spans the ``100 * alpha / 2`` and
        ``100 * (1 - alpha / 2)`` percentiles of ``stat``.

    Returns
    -------
    numpy.ndarray
        Two-element array ``[lower, upper]``.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [14]:
n_samples = 1000  # number of bootstrap resamples drawn for each group
np.random.seed(0)  # seed NumPy's global RNG so the resamples are reproducible
X1_b = get_bootstrap_samples(X1, n_samples)
X2_b = get_bootstrap_samples(X2, n_samples)

In [15]:
# Sanity check: one row per resample, one column per original observation.
X1_b.shape

(1000L, 7L)

In [22]:
# Mean of every bootstrap resample (one value per row). The vectorised form
# yields a concrete array on both Python 2 and Python 3, whereas map() is a
# lazy iterator on Python 3.
x1_mean_scores = np.mean(X1_b, axis=1)
x2_mean_scores = np.mean(X2_b, axis=1)

In [28]:
# Convert the per-resample means to arrays so they can be subtracted
# element-wise, then take the 2.5th and 97.5th percentiles (alpha=0.05) of the
# difference of bootstrap means (X1 minus X2).
x1_mean_scores = np.array(x1_mean_scores)
x2_mean_scores = np.array(x2_mean_scores)
stat_intervals(x1_mean_scores-x2_mean_scores, alpha=0.05)

array([-8.06457589, -1.45040179])

In [35]:
def permutation_t_stat_ind(sample1, sample2):
    """Return the test statistic: mean of ``sample1`` minus mean of ``sample2``."""
    mean_first = np.mean(sample1)
    mean_second = np.mean(sample2)
    return mean_first - mean_second

def get_random_combinations(n1, n2, max_combinations):
    """Generate random splits of the indices ``0 .. n1 + n2 - 1`` into two groups.

    The unshuffled (identity) ordering is always included. The index list is
    then shuffled ``max_combinations - 1`` times using NumPy's global random
    state; repeated orderings are stored only once, so fewer than
    ``max_combinations`` splits may be returned.

    Parameters
    ----------
    n1 : int
        Size of the first group.
    n2 : int
        Size of the second group.
    max_combinations : int
        Upper bound on the number of splits to generate.

    Returns
    -------
    list of (tuple, tuple)
        Each pair holds the first ``n1`` and the remaining ``n2`` indices of
        one ordering.
    """
    # Materialise the range: np.random.shuffle works in place and therefore
    # needs a mutable sequence (a Python 3 range object is immutable).
    index = list(range(n1 + n2))
    indices = set([tuple(index)])
    for _ in range(max_combinations - 1):
        np.random.shuffle(index)
        indices.add(tuple(index))
    # A distinct loop name avoids shadowing the working list `index`.
    return [(perm[:n1], perm[n1:]) for perm in indices]

def permutation_zero_dist_ind(sample1, sample2, max_combinations = None):
    """Build the null distribution of the difference of means under permutation.

    The two samples are pooled, and for every considered split of the pooled
    positions into groups of sizes ``len(sample1)`` and ``len(sample2)`` the
    difference ``mean(first group) - mean(second group)`` is recorded.

    Parameters
    ----------
    sample1, sample2 : array-like, 1-D
        The two independent samples.
    max_combinations : int or None
        If truthy, random splits are drawn via ``get_random_combinations``;
        otherwise every possible split is enumerated exhaustively.

    Returns
    -------
    list of float
        One difference of means per split.
    """
    # Local import: itertools is not imported in the visible notebook cells,
    # and it is only needed by the exhaustive branch below.
    import itertools

    joined_sample = np.hstack((sample1, sample2))
    n1 = len(sample1)
    n = len(joined_sample)

    if max_combinations:
        indices = get_random_combinations(n1, len(sample2), max_combinations)
    else:
        positions = range(n)
        indices = []
        for first in itertools.combinations(positions, n1):
            chosen = set(first)  # O(1) membership when building the complement
            rest = [i for i in positions if i not in chosen]
            indices.append((list(first), rest))

    distr = [joined_sample[list(first)].mean() - joined_sample[list(second)].mean()
             for first, second in indices]
    return distr

def permutation_test(sample, mean, max_permutations = None, alternative = 'two-sided'):
    """Compute the p-value of a two-sample permutation test on the difference of means.

    Parameters
    ----------
    sample : array-like, 1-D
        First sample.
    mean : array-like, 1-D
        Second sample (despite the name, it is passed on as the second sample
        to ``permutation_t_stat_ind`` and ``permutation_zero_dist_ind``).
    max_permutations : int or None
        Forwarded to ``permutation_zero_dist_ind`` as ``max_combinations``.
    alternative : {'two-sided', 'less', 'greater'}
        Which tail(s) of the null distribution count as extreme.

    Returns
    -------
    float
        Fraction of null-distribution values at least as extreme as the
        observed statistic.

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the three recognised values.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_ind(sample, mean)
    null_values = permutation_zero_dist_ind(sample, mean, max_permutations)

    # Pick the "at least as extreme" predicate for the requested alternative.
    if alternative == 'two-sided':
        is_extreme = lambda value: abs(value) >= abs(observed)
    elif alternative == 'less':
        is_extreme = lambda value: value <= observed
    else:  # 'greater' — the only value left after validation
        is_extreme = lambda value: value >= observed

    hits = sum(1. if is_extreme(value) else 0. for value in null_values)
    return hits / len(null_values)

In [36]:
# Parenthesised print works (and prints the same text) on Python 2 and 3.
print("p-value: %f" % permutation_test(X1, X2, max_permutations = 10000))

p-value: 0.004200
