In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

### 1. 독립표본 T-test

- 소표본일 경우 t-test를 사용, 자유도 n1+n2-2의 t분포 / 만일 모분산의 등분산이 가정되지 않을 경우 특정 자유도 '세타'를 따르는 t분포 이용
- 대표본일 경우 t분포는 정규분포에 근사하기 때문에 Z검정으로 계산 가능
- equal_val = 등분산 여부
- alternative = 양측/단측 검정

In [47]:
female = [63.8, 56.4, 55.2, 58.5, 64.0, 51.6, 54.6, 71.0]
male = [75.5, 83.9, 75.7, 72.5, 56.2, 73.4, 67.7, 87.9]

# 정규성(소표본이기 때문에)
print(stats.shapiro(df1.before))
print(stats.shapiro(df1.after))

# 등분산 검정
print(stats.levene(df1.before,df1.after))
print(stats.bartlett(df1.before,df1.after))

# 만일 이분산 검정일 경우 equal_var = False 설정
stats.ttest_ind(male, female)

ShapiroResult(statistic=0.9147204160690308, pvalue=0.4295216202735901)
ShapiroResult(statistic=0.9612342715263367, pvalue=0.8292515873908997)
LeveneResult(statistic=0.10869565217391308, pvalue=0.7473178000070186)
BartlettResult(statistic=0.05642329335305127, pvalue=0.8122410888700984)


Ttest_indResult(statistic=3.587521542539111, pvalue=0.002971151455277398)

### 2. 대응표본 T-test

In [48]:
bf = [72,80,83,63,66,76,82]
af = [78,82,82,68,70,75,88]
df1 = pd.DataFrame({'before' : bf, 'after' : af}, index = np.arange(1,len(bf)+1))

# 정규성(소표본이기 때문에)
print(stats.shapiro(df1.before))
print(stats.shapiro(df1.after))

# 등분산 검정
print(stats.levene(df1.before,df1.after))
print(stats.bartlett(df1.before,df1.after))

# ttest_rel모듈은 대응표본에 사용
stats.ttest_rel(df1.before, df1.after)

ShapiroResult(statistic=0.9147204160690308, pvalue=0.4295216202735901)
ShapiroResult(statistic=0.9612342715263367, pvalue=0.8292515873908997)
LeveneResult(statistic=0.10869565217391308, pvalue=0.7473178000070186)
BartlettResult(statistic=0.05642329335305127, pvalue=0.8122410888700984)


Ttest_relResult(statistic=-2.5980762113533156, pvalue=0.04076740686322286)

### 3. ANOVA

- 독립성(카이제곱검정)
- 정규성(shapiro, kstest)
- 등분산성(levene, bartlett)

#### 3-1 one-way ANOVA

In [98]:
df2 = pd.DataFrame({'X' : [84,83,82,85,89,86,93,94,96,89,89,87],
                   'type' : [1,1,1,2,2,2,3,3,3,4,4,4]})

print(stats.levene(df2.X[df2.type==1], 
             df2.X[df2.type==2],
             df2.X[df2.type==3],
             df2.X[df2.type==4]))

# 음수가 나왔으나 간단한 분석이니 넘어간다.
print(stats.shapiro(df2.X[df2.type==1]),'\n',stats.shapiro(df2.X[df2.type==2]),'\n',stats.shapiro(df2.X[df2.type==3]),'\n',stats.shapiro(df2.X[df2.type==4]))

df2_ols = ols('X~C(type)', data = df2).fit()
df2_anova = sm.stats.anova_lm(df2_ols)
display(df2_anova)
print('-'*50)

df2_tukey=pairwise_tukeyhsd(df2.X, df2.type, alpha = 0.05)
df2_tukey.summary()

LeveneResult(statistic=0.24444444444444444, pvalue=0.8630385124791773)
ShapiroResult(statistic=1.0, pvalue=0.999998927116394) 
 ShapiroResult(statistic=0.9230769872665405, pvalue=0.46326181292533875) 
 ShapiroResult(statistic=0.9642858505249023, pvalue=0.6368862986564636) 
 ShapiroResult(statistic=0.7499999403953552, pvalue=-1.1383646096874145e-06)


Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(type),3.0,200.916667,66.972222,29.765432,0.000109
Residual,8.0,18.0,2.25,,


--------------------------------------------------


group1,group2,meandiff,p-adj,lower,upper,reject
1,2,3.6667,0.0671,-0.2558,7.5891,False
1,3,11.3333,0.001,7.4109,15.2558,True
1,4,5.3333,0.0104,1.4109,9.2558,True
2,3,7.6667,0.0011,3.7442,11.5891,True
2,4,1.6667,0.5516,-2.2558,5.5891,False
3,4,-6.0,0.0052,-9.9225,-2.0775,True


#### 3-2 two-way ANOVA

In [117]:
df3 = pd.DataFrame({'X' : [97.8,97.5,96.9,98.5,98.8,97.1,99.2,98.4,98.1,98.2,97.5,96.8],
                    'type1' : [1,1,1,2,2,2,3,3,3,4,4,4],
                   'type2' : [1,2,3,1,2,3,1,2,3,1,2,3]})

df3_ols = ols('X~C(type1)+C(type2)', data = df3).fit()
df3_anova = sm.stats.anova_lm(df3_ols)
display(df3_anova)

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(type1),3.0,2.726667,0.908889,8.039312,0.015947
C(type2),2.0,3.015,1.5075,13.334152,0.006195
Residual,6.0,0.678333,0.113056,,
