In [1]:
import numpy as np
import pandas as pd
from scipy import stats
%precision 3
np.random.seed(1111)

In [2]:
df = pd.read_csv('../data/ch11_potato.csv')
sample = np.array(df['重さ'])
sample

array([122.02, 131.73, 130.6 , 131.82, 132.05, 126.12, 124.43, 132.89,
       122.79, 129.95, 126.14, 134.45, 127.64, 125.68])

In [3]:
s_mean = np.mean(sample)
s_mean

128.4507142857143

In [4]:
rv = stats.norm(130, np.sqrt(9/14))
rv.isf(0.95)

128.68118313069039

In [16]:
rv.ppf(0.05)

128.68118313069039

In [17]:
z = (s_mean - 130) / np.sqrt(9/14)
z

-1.932298779026813

In [18]:
rv = stats.norm()
rv.isf(0.95)

-1.6448536269514722

In [19]:
rv.cdf(z) # the p-value can also be obtained from the cumulative distribution function

0.026661319523126635

In [21]:
z = (s_mean - 130) / np.sqrt(9/14)
z

-1.932298779026813

In [22]:
rv = stats.norm()
rv.interval(0.95)

(-1.959963984540054, 1.959963984540054)

In [23]:
rv.cdf(z) * 2

0.05332263904625327

In [24]:
rv = stats.norm(130, 3)

In [27]:
# Simulation: repeatedly draw 14 values from `rv` (set in the previous cell), round them to
# 2 decimals, standardize the sample mean against mean 130 / variance 9, and report the
# fraction of trials whose z falls below the lower 5% point of the standard normal.
n_samples = 10000
c = stats.norm().isf(0.95)
std_err = np.sqrt(9/14)  # loop-invariant denominator of z
cnt = 0
for _ in range(n_samples):
    sample_ = rv.rvs(14).round(2)
    s_mean_ = sample_.mean()
    z = (s_mean_ - 130) / std_err
    cnt += int(z < c)
cnt / n_samples

0.052

In [29]:
rv = stats.norm(128, 3)

In [31]:
# Simulation: repeatedly draw 14 values from `rv` (set in the previous cell), round them to
# 2 decimals, standardize the sample mean against mean 130 / variance 9, and report the
# fraction of trials whose z is NOT below the lower 5% point of the standard normal.
n_samples = 10000
c = stats.norm().isf(0.95)
std_err = np.sqrt(9/14)  # loop-invariant denominator of z
cnt = 0
for _ in range(n_samples):
    sample_ = rv.rvs(14).round(2)
    s_mean_ = sample_.mean()
    z = (s_mean_ - 130) / std_err
    cnt += int(z >= c)
cnt / n_samples

0.197

In [32]:
def pmean_test(sample, mean0, p_var, alpha=0.05):
    """Two-sided z-test of the sample mean against ``mean0`` using variance ``p_var``.

    The statistic is ``(mean(sample) - mean0) / sqrt(p_var / n)``. It is compared with the
    central ``1 - alpha`` interval of the standard normal distribution, and two lines are
    printed: the decision (accept / reject the null hypothesis, in Japanese) and the
    two-sided p-value with three decimals. Nothing is returned.

    Parameters
    ----------
    sample : sequence of numbers
        Observations; its length is used as n.
    mean0 : number
        Mean value the sample mean is tested against.
    p_var : number
        Variance used for the standard error of the mean.
    alpha : float, optional
        Significance level (default 0.05).
    """
    center = np.mean(sample)
    size = len(sample)
    std_normal = stats.norm()
    lower, upper = std_normal.interval(1-alpha)

    z = (center - mean0) / np.sqrt(p_var/size)

    # Inside the interval (bounds included) -> "accept", otherwise "reject".
    verdict = '帰無仮説を採択' if lower <= z <= upper else '帰無仮説を棄却'
    print(verdict)

    # Two-sided p-value: double the tail on the side where z lies.
    tail = std_normal.cdf(z) if z < 0 else 1-std_normal.cdf(z)
    p = tail * 2
    print(f'p値は{p:.3f}')

In [34]:
pmean_test(sample, 130, 9)

帰無仮説を採択
p値は0.053


In [39]:
def pvar_test(sample, var0, alpha=0.05):
    """Two-sided chi-square test of the sample's unbiased variance against ``var0``.

    The statistic is ``(n - 1) * var(sample, ddof=1) / var0``. It is compared with the
    central ``1 - alpha`` interval of the chi-square distribution with ``n - 1`` degrees
    of freedom, and two lines are printed: the decision (accept / reject the null
    hypothesis, in Japanese) and the two-sided p-value with three decimals.
    Nothing is returned.

    Parameters
    ----------
    sample : sequence of numbers
        Observations; its length is used as n.
    var0 : number
        Variance value the sample variance is tested against.
    alpha : float, optional
        Significance level (default 0.05).
    """
    unbiased_var = np.var(sample, ddof=1)
    size = len(sample)
    chi2_dist = stats.chi2(df=size-1)
    lower, upper = chi2_dist.interval(1-alpha)

    y = (size-1) * unbiased_var / var0

    # Inside the interval (bounds included) -> "accept", otherwise "reject".
    verdict = '帰無仮説を採択' if lower <= y <= upper else '帰無仮説を棄却'
    print(verdict)

    # The distribution is not symmetric, so the side is decided against its median
    # (isf(0.5)); the tail on that side is then doubled.
    tail = chi2_dist.cdf(y) if y < chi2_dist.isf(0.5) else 1-chi2_dist.cdf(y)
    p = tail * 2
    print(f'p値は{p:.3f}')

In [40]:
pvar_test(sample, 9)

帰無仮説を採択
p値は0.085


In [42]:
def pmean_test(sample, mean0, alpha=0.05):
    """Two-sided one-sample t-test of the sample mean against ``mean0``.

    NOTE: this reuses the name of the z-test ``pmean_test(sample, mean0, p_var, alpha)``
    defined earlier in the notebook and replaces it from this cell onward; this version
    takes no ``p_var`` and estimates the variance from the sample.

    The statistic is ``(mean(sample) - mean0) / sqrt(var(sample, ddof=1) / n)``. It is
    compared with the central ``1 - alpha`` interval of the t distribution with ``n - 1``
    degrees of freedom, and two lines are printed: the decision (accept / reject the null
    hypothesis, in Japanese) and the two-sided p-value with three decimals.
    Nothing is returned.

    Parameters
    ----------
    sample : sequence of numbers
        Observations; its length is used as n.
    mean0 : number
        Mean value the sample mean is tested against.
    alpha : float, optional
        Significance level (default 0.05).
    """
    center = np.mean(sample)
    unbiased_var = np.var(sample, ddof=1)
    size = len(sample)
    t_dist = stats.t(df=size-1)
    lower, upper = t_dist.interval(1-alpha)

    t = (center-mean0) / np.sqrt(unbiased_var/size)

    # Inside the interval (bounds included) -> "accept", otherwise "reject".
    verdict = '帰無仮説を採択' if lower <= t <= upper else '帰無仮説を棄却'
    print(verdict)

    # Two-sided p-value: double the tail on the side where t lies.
    tail = t_dist.cdf(t) if t < 0 else 1-t_dist.cdf(t)
    p = tail * 2
    print(f'p値は{p:.3f}')

In [43]:
pmean_test(sample, 130)

帰無仮説を採択
p値は0.169


In [44]:
t, p = stats.ttest_1samp(sample, 130)
t, p

(-1.4551960206404198, 0.16933464230414275)

In [46]:
training_rel = pd.read_csv('../data/ch11_training_rel.csv')
print(training_rel.shape)
training_rel.head()

(20, 2)


Unnamed: 0,前,後
0,59,41
1,52,63
2,55,68
3,61,59
4,59,84


In [48]:
training_rel['差'] = training_rel['後'] - training_rel['前']
training_rel.head()

Unnamed: 0,前,後,差
0,59,41,-18
1,52,63,11
2,55,68,13
3,61,59,-2
4,59,84,25


In [49]:
t, p = stats.ttest_1samp(training_rel['差'], 0)
p

0.04004419061842953

In [50]:
t, p = stats.ttest_rel(training_rel['後'], training_rel['前'])

In [51]:
p

0.04004419061842953

In [52]:
training_ind = pd.read_csv('../data/ch11_training_ind.csv')
print(training_ind.shape)
training_ind.head()

(20, 2)


Unnamed: 0,A,B
0,47,49
1,50,52
2,37,54
3,60,48
4,39,51


In [53]:
t, p = stats.ttest_ind(training_ind['A'], training_ind['B'], equal_var = False)
p

0.08695731107259361

In [66]:
training_rel = pd.read_csv('../data/ch11_training_rel.csv')
toy_df = training_rel[:6].copy()
toy_df

Unnamed: 0,前,後
0,59,41
1,52,63
2,55,68
3,61,59
4,59,84
5,45,37


In [67]:
diff = toy_df['後'] - toy_df['前']
toy_df['差'] = diff
toy_df

Unnamed: 0,前,後,差
0,59,41,-18
1,52,63,11
2,55,68,13
3,61,59,-2
4,59,84,25
5,45,37,-8


In [68]:
rank = stats.rankdata(abs(diff)).astype(int)
toy_df['順位'] = rank
toy_df

Unnamed: 0,前,後,差,順位
0,59,41,-18,5
1,52,63,11,3
2,55,68,13,4
3,61,59,-2,1
4,59,84,25,6
5,45,37,-8,2


In [69]:
r_minus = np.sum((diff<0)*rank)
r_plus = np.sum((diff>0)*rank)
r_minus, r_plus

(8, 13)

In [70]:
toy_df['後'] = toy_df['前'] + np.arange(1, 7)
diff = toy_df['後'] - toy_df['前']
rank = stats.rankdata(abs(diff)).astype(int)
toy_df['差'] = diff
toy_df['順位'] = rank
toy_df

Unnamed: 0,前,後,差,順位
0,59,60,1,1
1,52,54,2,2
2,55,58,3,3
3,61,65,4,4
4,59,64,5,5
5,45,51,6,6


In [72]:
r_minus = np.sum((diff<0)*rank)
r_plus = np.sum((diff>0)*rank)
r_minus, r_plus

(0, 21)

In [73]:
toy_df['後'] = toy_df['前'] + [1,-2,-3,4,5,-6]
diff = toy_df['後'] - toy_df['前']
rank = stats.rankdata(abs(diff)).astype(int)
toy_df['差'] = diff
toy_df['順位'] = rank
toy_df

Unnamed: 0,前,後,差,順位
0,59,60,1,1
1,52,50,-2,2
2,55,52,-3,3
3,61,65,4,4
4,59,64,5,5
5,45,39,-6,6


In [74]:
T, p = stats.wilcoxon(training_rel['前'], training_rel['後'])
p

0.037999792729223686

In [77]:
T, p = stats.wilcoxon(training_rel['後']-training_rel['前'])
p

0.037999792729223686

In [79]:
n = 10000
diffs = np.round(stats.norm(3,4).rvs(size=(n,20)))

In [131]:
# Fraction of the simulated rows in `diffs` for which the one-sample t-test
# against 0 gives p < alpha (`diffs` and `n` come from the previous cell).
alpha = 0.05
p_values = []
for diff in diffs:
    t, p = stats.ttest_1samp(diff, 0)
    p_values.append(p)
cnt = sum(1 for p_value in p_values if p_value < alpha)
cnt / n

0.780

In [132]:
# Fraction of the simulated rows in `diffs` for which the Wilcoxon signed-rank
# test gives p < alpha (`diffs` and `n` come from an earlier cell).
alpha = 0.05
p_values = []
for diff in diffs:
    t, p = stats.wilcoxon(diff)
    p_values.append(p)
cnt = sum(1 for p_value in p_values if p_value < alpha)
cnt / n

0.770

In [93]:
# 以下オリジナル

In [159]:
import numpy as np
from scipy import stats

n = 10000
sample_size = 10
alpha = 0.05
diffs = 1 + np.random.normal(size=(n, sample_size))

# Apply each test to every simulated row and print the fraction of rows with
# p < alpha (the printed label means "power"):
#   1. one-sample t-test against 0
#   2. Wilcoxon signed-rank test
for run_test in (lambda row: stats.ttest_1samp(row, 0), stats.wilcoxon):
    result = []
    for diff in diffs:
        _, p = run_test(diff)
        result.append(p < alpha)
    print ('検出力: {}'.format(np.mean(result)))

検出力: 0.8069
検出力: 0.7882


In [151]:
# 終了

In [160]:
training_ind = pd.read_csv('../data/ch11_training_ind.csv')
toy_df = training_ind[:5].copy()
toy_df

Unnamed: 0,A,B
0,47,49
1,50,52
2,37,54
3,60,48
4,39,51


In [162]:
# Rank all A and B values together, then split the joint ranks back into one column per
# group. The split point is taken from the column length rather than the literals 5/10,
# so the cell stays correct if toy_df is rebuilt from a different number of rows.
n_a = len(toy_df['A'])
rank = stats.rankdata(np.concatenate([toy_df['A'],toy_df['B']]))
# NOTE(review): rankdata's default method gives tied values an averaged (possibly
# fractional) rank, which astype(int) would truncate; the values shown for this toy
# data are all distinct, so the displayed ranks are unaffected.
rank_df = pd.DataFrame({'A':rank[:n_a],'B':rank[n_a:]}).astype(int)
rank_df

Unnamed: 0,A,B
0,3,5
1,6,8
2,1,9
3,10,4
4,2,7


In [163]:
n1 = len(rank_df['A'])
u = rank_df['A'].sum() - (n1*(n1+1))/2
u

7.0

In [164]:
rank_df = pd.DataFrame(np.arange(1,11).reshape(2, 5).T, columns=['A','B'])
rank_df

Unnamed: 0,A,B
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [165]:
u = rank_df['A'].sum() - (n1*(n1+1))/2
u

0.0

In [166]:
rank_df = pd.DataFrame(np.arange(1,11).reshape(2,5)[::-1].T, columns=['A','B'])
rank_df

Unnamed: 0,A,B
0,6,1
1,7,2
2,8,3
3,9,4
4,10,5


In [167]:
u = rank_df['A'].sum() - (n1*(n1+1))/2
u

25.0

In [169]:
u, p = stats.mannwhitneyu(training_ind['A'], training_ind['B'], alternative='two-sided')
p

0.05948611166127324

In [170]:
# カイ二乗検定

In [171]:
ad_df = pd.read_csv('../data/ch11_ad.csv')
n = len(ad_df)
print(n)
ad_df.head()

1000


Unnamed: 0,広告,購入
0,B,しなかった
1,B,しなかった
2,A,した
3,A,した
4,B,しなかった


In [172]:
ad_cross = pd.crosstab(ad_df['広告'], ad_df['購入'])
ad_cross

購入,した,しなかった
広告,Unnamed: 1_level_1,Unnamed: 2_level_1
A,49,351
B,51,549


In [174]:
ad_cross['した'] / (ad_cross['した']+ad_cross['しなかった'])

広告
A    0.1225
B    0.0850
dtype: float64

In [175]:
n_yes, n_not = ad_cross.sum()
n_yes, n_not

(100, 900)

In [176]:
n_adA, n_adB = ad_cross.sum(axis=1)
n_adA, n_adB

(400, 600)

In [178]:
# Build the table whose cells are (row total * column total) / n, using the marginal
# totals computed in the previous cells; rows are the ads, columns the purchase outcome.
row_totals = (n_adA, n_adB)
col_totals = {'した': n_yes, 'しなかった': n_not}
ad_ef = pd.DataFrame(
    {label: [row_total*col_total/n for row_total in row_totals]
     for label, col_total in col_totals.items()},
    index=['A','B'],
)
ad_ef

Unnamed: 0,した,しなかった
A,40.0,360.0
B,60.0,540.0


In [179]:
y = ((ad_cross-ad_ef)**2/ad_ef).sum().sum()
y

3.75

In [180]:
rv = stats.chi2(1)
1 - rv.cdf(y)

0.052807511416113395

In [181]:
chi2, p, dof, ef = stats.chi2_contingency(ad_cross, correction=False)
chi2, p, dof

(3.75, 0.052807511416113395, 1)

In [182]:
ef

array([[ 40., 360.],
       [ 60., 540.]])