In [34]:
import itertools
import pandas as pd
from matplotlib import pylab
from scipy.stats import stats, probplot, wilcoxon, mannwhitneyu
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib as plt
import scipy
import numpy as np
from statsmodels.stats.weightstats import *
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


In [7]:
# Load the Challenger data set from a tab-separated text file into a DataFrame.
data = pd.read_csv('challenger.txt', sep='\t')

In [8]:
data.head()

Unnamed: 0.1,Unnamed: 0,Temperature,Incident
0,Apr12.81,18.9,0
1,Nov12.81,21.1,1
2,Mar22.82,20.6,0
3,Nov11.82,20.0,0
4,Apr04.83,19.4,0


In [9]:
data.describe()

Unnamed: 0,Temperature,Incident
count,23.0,23.0
mean,20.86087,0.304348
std,3.919501,0.470472
min,11.7,0.0
25%,19.4,0.0
50%,21.1,0.0
75%,23.9,1.0
max,27.2,1.0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   23 non-null     object 
 1   Temperature  23 non-null     float64
 2   Incident     23 non-null     int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 680.0+ bytes


In [11]:
# Give the unnamed first column a meaningful name; rebinding `data` instead of
# mutating in place keeps the cell safe to re-run.
data = data.rename(columns={'Unnamed: 0': 'Date'})
data.head()

Unnamed: 0,Date,Temperature,Incident
0,Apr12.81,18.9,0
1,Nov12.81,21.1,1
2,Mar22.82,20.6,0
3,Nov11.82,20.0,0
4,Apr04.83,19.4,0


In [14]:
# Rows where Incident == 1, keeping everything except the Date/Incident columns.
challenger_broken = (
    data[data['Incident'] == 1]
    .drop(columns=['Date', 'Incident'])
)
challenger_broken.head()

Unnamed: 0,Temperature
1,21.1
8,13.9
9,17.2
10,21.1
13,11.7


In [15]:
# Rows where Incident != 1, keeping everything except the Date/Incident columns.
challenger_not_broken = (
    data[data['Incident'] != 1]
    .drop(columns=['Date', 'Incident'])
)
challenger_not_broken.head()

Unnamed: 0,Temperature
0,18.9
2,20.6
3,20.0
4,19.4
5,22.2


In [16]:
def get_bootstrap_samples(data, n_samples):
    """Draw bootstrap resamples (sampling with replacement) from ``data``.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of observations. It is converted with
        ``np.asarray`` so that lists and pandas Series work as well as
        NumPy arrays.
    n_samples : int
        Number of bootstrap resamples to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_samples, len(data))``; each row is one resample.

    Notes
    -----
    Uses the global NumPy random state, so call ``np.random.seed`` beforehand
    for reproducible results.
    """
    # Fancy indexing with a 2-D integer array needs an ndarray; a plain list
    # would raise TypeError and a Series would be indexed by label.
    values = np.asarray(data)
    indices = np.random.randint(0, len(values), (n_samples, len(values)))
    return values[indices]

In [17]:
def stat_intervals(stat, alpha):
    """Return the percentile interval of ``stat`` at significance level ``alpha``.

    The result is a two-element array holding the ``100 * alpha / 2`` and
    ``100 * (1 - alpha / 2)`` percentiles of ``stat``.
    """
    lower_pct = 100 * alpha / 2.
    upper_pct = 100 * (1 - alpha / 2.)
    return np.percentile(stat, [lower_pct, upper_pct])

In [27]:
# Bootstrap distribution of the mean temperature for each group (1000 resamples
# each). The incident group is resampled first, so the random stream is
# consumed in a fixed order after seeding.
np.random.seed(0)
challenger_broken_bs_mean = np.array(list(map(
    np.mean,
    get_bootstrap_samples(challenger_broken['Temperature'].values, 1000),
)))
challenger_not_broken_bs_mean = np.array(list(map(
    np.mean,
    get_bootstrap_samples(challenger_not_broken['Temperature'].values, 1000),
)))

In [28]:
challenger_broken_bs_mean

array([19.12857143, 18.64285714, 15.22857143, 18.5       , 19.98571429,
       19.51428571, 16.11428571, 17.68571429, 18.25714286, 15.55714286,
       16.1       , 18.64285714, 14.52857143, 14.77142857, 20.4       ,
       18.88571429, 18.01428571, 19.75714286, 18.87142857, 17.6       ,
       16.65714286, 14.67142857, 20.54285714, 17.67142857, 19.6       ,
       19.42857143, 19.98571429, 16.51428571, 18.57142857, 22.3       ,
       16.18571429, 19.12857143, 19.91428571, 18.        , 17.14285714,
       17.68571429, 19.98571429, 19.04285714, 18.57142857, 19.6       ,
       15.94285714, 18.97142857, 16.98571429, 20.        , 19.02857143,
       19.04285714, 16.04285714, 16.34285714, 19.45714286, 20.4       ,
       15.3       , 17.85714286, 17.14285714, 16.97142857, 17.45714286,
       20.54285714, 18.57142857, 15.87142857, 14.6       , 21.74285714,
       18.65714286, 17.21428571, 18.8       , 16.67142857, 19.02857143,
       16.02857143, 17.61428571, 19.11428571, 14.84285714, 18.25

In [29]:
# 95% percentile interval (alpha = 0.05) for the difference between the
# bootstrap means: incident launches minus launches without an incident.
stat_intervals(challenger_broken_bs_mean - challenger_not_broken_bs_mean, 0.05)

array([-8.06457589, -1.45040179])

На данных предыдущей задачи проверьте гипотезу об одинаковой средней температуре воздуха в дни, когда уплотнительные кольца повреждались, и дни, когда повреждений не было. Используйте перестановочный критерий и двустороннюю альтернативу. Чему равен достигаемый уровень значимости? Округлите до четырёх знаков после десятичной точки.

Чтобы получить такое же значение, как мы:

установите random seed = 0; возьмите 10000 перестановок.

In [30]:
def permutation_t_stat_ind(sample1, sample2):
    """Test statistic for two independent samples: mean(sample1) - mean(sample2)."""
    first_mean = np.mean(sample1)
    second_mean = np.mean(sample2)
    return first_mean - second_mean

In [40]:
def get_random_combinations(n1, n2, max_combinations):
    """Generate random splits of ``range(n1 + n2)`` into two index groups.

    Starts from the identity ordering and adds ``max_combinations - 1``
    shuffles of it (global NumPy random state). Orderings are stored in a set,
    so repeated shuffles collapse and fewer than ``max_combinations`` pairs may
    be returned.

    Returns a list of ``(first, second)`` tuples, where ``first`` holds the
    first ``n1`` positions of an ordering and ``second`` the remaining ``n2``.
    """
    order = list(range(n1 + n2))
    seen_orderings = set()
    seen_orderings.add(tuple(order))
    for _ in range(max_combinations - 1):
        np.random.shuffle(order)  # shuffles in place; snapshot it as a tuple
        seen_orderings.add(tuple(order))
    return [(ordering[:n1], ordering[n1:]) for ordering in seen_orderings]

In [35]:
def permutation_zero_dist_ind(sample1, sample2, max_combinations=None):
    """Build the permutation null distribution of the difference in means.

    The two samples are pooled and repeatedly re-split into groups of sizes
    ``len(sample1)`` and ``len(sample2)``; for every split the difference
    ``mean(group1) - mean(group2)`` is recorded.

    Parameters
    ----------
    sample1, sample2 : array-like
        The two independent samples.
    max_combinations : int or None, optional
        If truthy, use random splits produced by ``get_random_combinations``
        (at most this many). If ``None`` (or 0), enumerate every way of
        choosing ``len(sample1)`` positions out of the pooled sample.

    Returns
    -------
    list
        One difference in means per split.
    """
    joined_sample = np.hstack((sample1, sample2))
    n1 = len(sample1)
    n = len(joined_sample)

    if max_combinations:
        indices = get_random_combinations(n1, len(sample2), max_combinations)
    else:
        # Compute each complement eagerly. A lazy ``filter`` with a lambda
        # closing over the loop variable is only evaluated later in Python 3,
        # by which time the variable holds the last combination, so every
        # split would get the same (wrong) second group.
        positions = range(n)
        indices = []
        for first in itertools.combinations(positions, n1):
            chosen = set(first)
            second = [i for i in positions if i not in chosen]
            indices.append((list(first), second))

    return [joined_sample[list(first)].mean() - joined_sample[list(second)].mean()
            for first, second in indices]

In [36]:
def permutation_test(sample, mean, max_permutations = None, alternative='two-sided'):
    """Permutation test for a difference in means of two independent samples.

    Parameters
    ----------
    sample : array-like
        First sample.
    mean : array-like
        Second sample. Despite its name, it is passed on as the second sample
        to both the statistic and the null-distribution helpers.
    max_permutations : int or None, optional
        Forwarded to ``permutation_zero_dist_ind`` as its ``max_combinations``.
    alternative : {'two-sided', 'less', 'greater'}
        Which tail(s) of the null distribution count as extreme.

    Returns
    -------
    float
        Share of null-distribution values at least as extreme as the
        observed statistic (the achieved significance level).

    Raises
    ------
    ValueError
        If ``alternative`` is not one of the recognised options.
    """
    if alternative not in ('two-sided', 'less', 'greater'):
        raise ValueError("alternative not recognized\n"
                         "should be 'two-sided', 'less' or 'greater'")

    observed = permutation_t_stat_ind(sample, mean)
    null_values = permutation_zero_dist_ind(sample, mean, max_permutations)

    # Pick the "at least as extreme" predicate for the requested alternative.
    if alternative == 'two-sided':
        def is_extreme(value):
            return abs(value) >= abs(observed)
    elif alternative == 'less':
        def is_extreme(value):
            return value <= observed
    else:  # 'greater' is the only remaining validated option
        def is_extreme(value):
            return value >= observed

    extreme_count = sum([1. if is_extreme(value) else 0. for value in null_values])
    return extreme_count / len(null_values)

In [41]:
# Two-sided permutation test on the temperatures of the two groups, using
# 10000 random permutations and a fixed seed for reproducibility.
np.random.seed(0)
permutation_test(
    challenger_broken['Temperature'].values,
    challenger_not_broken['Temperature'].values,
    max_permutations=10000,
)

0.0057