In [4]:
import numpy as np
import pandas as pd

import scipy
from statsmodels.stats.weightstats import *
from statsmodels.stats.proportion import proportion_confint

# 1

In [5]:
# Standard-normal quantile at probability 0.9985.
# NOTE(review): 0.9985 = 1 - 0.003/2, so this is presumably z_{1-alpha/2} for a
# 99.7% two-sided interval -- confirm against the wording of question 1.
scipy.stats.norm.ppf(0.9985)

2.9677379253417944

# 5

In [6]:
# Group sizes (`*_all`) and counts of "bad" outcomes (`*_bad`) for the aspirin
# and placebo groups. They are float literals, so ratios built from them use
# true division even under Python 2.
# NOTE(review): "bad" presumably means heart attacks -- confirm against the task text.
aspirine_all = 11037.0
aspirine_bad = 104.0

placebo_all = 11034.0
placebo_bad = 189.0

In [7]:
# Absolute difference between the two groups' proportions of bad outcomes.
abs(aspirine_bad/aspirine_all - placebo_bad/placebo_all)

0.0077060239760047815

# 6

## Доверительный интервал для разности долей (независимые выборки)

  | | $X_1$ | $X_2$ |
  |---|---|---|
  | 1 | a | b |
  | 0 | c | d |
  | $\sum$ | $n_1$ | $n_2$ |
  
$$ \hat{p}_1 = \frac{a}{n_1}$$

$$ \hat{p}_2 = \frac{b}{n_2}$$


$$\text{Доверительный интервал для }p_1 - p_2\colon \;\; \hat{p}_1 - \hat{p}_2 \pm z_{1-\frac{\alpha}{2}}\sqrt{\frac{\hat{p}_1(1 - \hat{p}_1)}{n_1} + \frac{\hat{p}_2(1 - \hat{p}_2)}{n_2}}$$

In [9]:
def proportions_confint_diff_ind(sample1, sample2, alpha = 0.05):
    """Normal-approximation confidence interval for p1 - p2 (independent samples).

    Each sample is a sequence of 0/1 outcomes; its proportion is estimated as
    sum(sample) / len(sample).

    Parameters
    ----------
    sample1, sample2 : sequences of numbers
        Observed outcomes of the two independent groups.
    alpha : float, optional
        Significance level; the interval has nominal coverage 1 - alpha.

    Returns
    -------
    tuple
        (left_boundary, right_boundary) of the interval for p1 - p2, i.e.
        (p1 - p2) -/+ z_{1 - alpha/2} * sqrt(p1(1-p1)/n1 + p2(1-p2)/n2).
    """
    quantile = scipy.stats.norm.ppf(1 - alpha / 2.)

    n1, n2 = len(sample1), len(sample2)
    share1 = float(sum(sample1)) / n1
    share2 = float(sum(sample2)) / n2

    # The standard error of the difference is shared by both boundaries.
    std_err = np.sqrt(share1 * (1 - share1)/ n1 + share2 * (1 - share2)/ n2)

    return ((share1 - share2) - quantile * std_err,
            (share1 - share2) + quantile * std_err)

In [10]:
# Event counts used to build the 0/1 outcome samples for the two groups.
# Note: this rebinds `aspirine_bad` (earlier assigned the float 104.0) to an int.
aspirine_bad = 104
no_aspirine_bad = 189

In [49]:
# 0/1 outcome vectors (float64): the first `*_bad` positions are 1.0, the rest 0.0.
# The lengths 11037 and 11034 are the aspirin and placebo group sizes.
population_test = (np.arange(11037) < aspirine_bad).astype(float)
population_control = (np.arange(11034) < no_aspirine_bad).astype(float)

In [60]:
# Interval for p_control - p_test (placebo minus aspirin). Passing the control
# sample first gives the positive bounds [0.004688, 0.010724] shown in the
# recorded output; with the arguments the other way round the function returns
# the same bounds negated.
print("confidence interval: [%f, %f]" % proportions_confint_diff_ind(population_control, population_test))

confidence interval: [0.004688, 0.010724]


# 7

In [14]:
# Odds of a bad outcome in each group: events / (group size - events).
no_aspirine_odd = 189.0 / (11034.0 - 189.0)
aspirine_odd = 104.0 / (11037.0 - 104.0)

In [15]:
# Display the odds computed for the group without aspirin.
no_aspirine_odd

0.017427385892116183

In [16]:
# Display the odds computed for the aspirin group.
aspirine_odd

0.009512485136741973

In [17]:
# Odds ratio: odds without aspirin divided by odds with aspirin.
no_aspirine_odd/aspirine_odd

1.8320539419087138

# 8

In [50]:
def get_bootstrap_samples(data, n_samples = 1000):
    """Draw bootstrap resamples of `data` (sampling with replacement).

    Indices are drawn with `np.random.randint`, i.e. from NumPy's global random
    state, so call `np.random.seed` beforehand for reproducible results.

    Parameters
    ----------
    data : array-like
        Observations to resample along the first axis. Lists and other
        sequences are accepted; they are converted with `np.asarray`
        (an ndarray is used as-is).
    n_samples : int, optional
        Number of bootstrap resamples to draw.

    Returns
    -------
    numpy.ndarray
        For 1-D input, an array of shape (n_samples, len(data)); each row is
        one resample.
    """
    # Integer-array indexing needs an ndarray; a plain list would raise TypeError.
    data = np.asarray(data)
    indices = np.random.randint(0, len(data), (n_samples, len(data)))
    samples = data[indices]
    return samples

In [51]:
def stat_intervals(stat, alpha = 0.05):
    """Percentile interval of the values in `stat`.

    Parameters
    ----------
    stat : array-like
        Values of a statistic (e.g. one value per bootstrap resample).
    alpha : float, optional
        Significance level; alpha/2 is cut from each tail.

    Returns
    -------
    numpy.ndarray
        Two values: the 100*alpha/2 and 100*(1 - alpha/2) percentiles of `stat`.
    """
    percent_levels = [100 * alpha / 2., 100 * (1 - alpha / 2.)]
    return np.percentile(stat, percent_levels)

In [52]:
# Seed NumPy's global random state; get_bootstrap_samples draws its indices
# from it via np.random.randint, so this makes the resampling repeatable.
np.random.seed(0)

In [53]:
# Bootstrap resamples of each group's 0/1 outcomes (default n_samples = 1000),
# one resample per row. The two calls consume the global random state in this order.
population_test_choice = get_bootstrap_samples(population_test)
population_control_choice = get_bootstrap_samples(population_control)

In [57]:
def _bootstrap_odds(samples):
    """Odds (events / non-events) for every row of a 2-D array of 0/1 outcomes."""
    events = samples.sum(axis=1)
    return events / (samples.shape[1] - events)

# One (test odds, control odds) pair per bootstrap replicate. The row sums are
# computed once per group with NumPy instead of calling the builtin sum() twice
# on every row, and the pairs are materialized as a list so they can be
# iterated more than once.
odd_scores = list(zip(_bootstrap_odds(population_test_choice),
                      _bootstrap_odds(population_control_choice)))

In [58]:
# Odds ratio per bootstrap replicate: control odds (second element of each pair)
# divided by test odds (first element).
delta_odd_scores = list(control_odd / test_odd for (test_odd, control_odd) in odd_scores)

In [59]:
# Percentile bootstrap interval for the odds ratio; the label and the string
# form of the two bounds are separated by a single space.
print(" ".join(["95% confidence interval:", str(stat_intervals(delta_odd_scores))]))

95% confidence interval: [ 1.44419465  2.34321168]
