In [34]:
import pandas as pd
import numpy as np
import pingouin as pg
from scipy import stats
from statsmodels.stats import weightstats as ws
from pydataset import data

In [35]:
df = data('iris')

# T-test

Assumptions: Independent samples are taken from a normal distribution. In this case, the normalized version of $\bar{X}$ using the sample standard deviation $S$ instead of the population standard deviation $\sigma$ follows the $t$-distribution with $n-1$ Degree of Freedom (dof).
- It is robust to the normality assumption

Test statistic: $T=\frac{\bar{X}-\mu_0}{S/\sqrt{n}}$



## 1-sample *t*-test

Population: Sepal.Length of all 3 species (without differentiation among species).
- $H_0$: $\mu=5$
- $H_a$: $\mu\neq5$

In [44]:
pg.ttest(
    x=df['Sepal.Length'], 
    y=5, 
    paired=False,
    tail='two-sided', #'greater', 'less'
    correction='auto', #True, False
    confidence=0.95
)

Unnamed: 0,T,dof,tail,p-val,CI95%,cohen-d,BF10,power
T-test,12.473257,149,two-sided,6.670742e-25,"[5.71, 5.98]",1.018437,5.88e+21,1.0


## 2-sample *t*-test

- Population 1: Sepal.Length of setosa
- Population 2: Sepal.Length of versicolor
- $H_0$: $D_0 = \mu_1 − \mu_2 = 0$
- $H_a$: $D_a = \mu_1 − \mu_2 \neq 0$

In [53]:
pg.ttest(
    x=df.loc[df['Species']=='setosa','Sepal.Length'], 
    y=df.loc[df['Species']=='versicolor','Sepal.Length'],
    paired=False, # True for paired samples
    tail='two-sided', #'greater', 'less'
    correction='auto', #True, False
    confidence=0.95
)

Unnamed: 0,T,dof,tail,p-val,CI95%,cohen-d,BF10,power
T-test,-10.520986,98,two-sided,8.985235e-18,"[-1.11, -0.75]",2.104197,419000000000000.0,1.0


# *Z*-Test

Assumptions: When sample size is large, the distribution of sample mean $(\bar{X})$ is approximately normal with
- $E(\bar{X}) = \mu$ 
- $Var(\bar{X}) = \sigma^2/n$

Test statistic: $Z=\frac{\bar{X}-\mu_0}{\sigma/\sqrt{N}}$

## 1-sample *Z*-test

Population: Sepal.Length of all 3 species (without differentiation among species).
- $H_0$: $\mu=5$
- $H_a$: $\mu\neq5$

In [36]:
zstat, pval = ws.ztest(
    x1=df['Sepal.Length'], 
    x2=None, 
    value=5,
    alternative='two-sided' #'larger', 'smaller'
)
print("z-stat:", ztest, "p-value:", pval)

z-stat: 0.04930141164702255 p-value: 1.0446696695008141e-35


## 2-sample *Z*-test

- Population 1: Sepal.Length of setosa
- Population 2: Sepal.Length of versicolor
- $H_0$: $D_0 = \mu_1 − \mu_2 = 0$
- $H_a$: $D_a = \mu_1 − \mu_2 \neq 0$

In [37]:
zstat, pval = ws.ztest(
    x1 = df.loc[df['Species']=='setosa','Sepal.Length'], 
    x2 = df.loc[df['Species']=='versicolor','Sepal.Length'],
    value = 0,
    alternative = 'two-sided' #'larger', 'smaller'
)
print("z-stat:", ztest, "p-value:", pval)

z-stat: 0.04930141164702255 p-value: 6.914595261207391e-26


# Chi-Square test

## Hypotheses Concerning a Population Variance

- Assumption: Independent samples are taken from a normal distribution. In this case, $\chi^2 = \frac{(n-1)S^2}{\sigma^2}$
    - not robust to the normality assumption
- Test Statistic: $\chi^2 = \frac{(n-1)S^2}{\sigma_0^2}$
- Population: Sepal.Length of all 3 species (without differentiation among species).
- $H_0$: $\sigma=0.6857$
- $H_a$: $\sigma\neq0.6857$

In [216]:
def calculate_chi2_stat(x, var0):
    return (len(x)-1)*np.var(x)/var0

def var_test(x, var0, tail='two-sided'):
    chi2_stat = calculate_chi2_stat(x, var0)
    left_p = stats.chi2.cdf(chi2_stat, df=len(x)-1) # P(chi2<=chi2_stat)
    if tail=='greater':
        return chi2_stat, 1-left_p
    if tail=='less':
        return chi2_stat, left_p
    else:
        return chi2_stat, (left_p if left_p<0.5 else 1-left_p) / 2

In [217]:
var_test(
    x = df['Sepal.Length'], 
    var0 = 1,
    tail='two-sided'
)

(101.48721111111111, 0.0005252940518779105)

## Comparison of Two Variances

- Assumption: Independent samples are taken from a normal distribution. In this case: $F = \frac{(n-1)S_1^2/\sigma_1^2}{(n-1)S_2^2/\sigma_2^2} = \frac{S_1^2/\sigma_1^2}{S_2^2/\sigma_2^2}$
    - not robust to the normality assumption
- Test Statistic: $F = \frac{S_1^2}{S_2^2}$

- Population 1: Sepal.Length of setosa
- Population 2: Sepal.Length of versicolor
- $H_0$: $\sigma_1=\sigma_2$
- $H_a$: $\sigma_1\neq\sigma_2$

In [222]:
def var_F_test(x, y, tail='two-sided'):
    f_stat = np.var(x)/np.var(y)
    left_p = stats.f.cdf(f_stat, dfn=len(x)-1, dfd=len(y)-1) # P(chi2<=chi2_stat)
    if tail=='greater':
        return f_stat, 1-left_p
    if tail=='less':
        return f_stat, left_p
    else:
        return f_stat, (left_p if left_p<0.5 else 1-left_p) / 2

In [223]:
var_F_test(
    x = df.loc[df['Species']=='setosa','Sepal.Length'], 
    y = df.loc[df['Species']=='versicolor','Sepal.Length'],
    tail = 'two-tail'
)

(0.4663429131686986, 0.002164297090674954)