In [4]:
from scipy import stats
import numpy as np 
import pandas as pd
import math

one sample t-test

In [5]:
N=30
a = np.random.randn(N)+2

In [6]:
stats.ttest_1samp(a, popmean=2)

Ttest_1sampResult(statistic=-0.1550199600010982, pvalue=0.877879516831054)

In [7]:
def My1sampTTest(sample, popmean, tail=2):
    N = len(sample)
    sample_mean = np.mean(sample)
    sample_std = np.std(sample, ddof=1)
    sample_se = sample_std/np.sqrt(N)
    
    t = (sample_mean - popmean)/sample_se
    df = N - 1 # degrees of freedom
    if tail == 1:
        p = 1 - stats.t.cdf(abs(t), df=df) # one-sided test p-value
    elif tail == 2:
        p = 1 - stats.t.cdf(abs(t), df=df) # two-sided test p-value
        p = 2*p
    print("t:", t)
    print("p:", p) # two-sided test p-value

In [8]:
# null hypothesis: population mean is 2
# p = 0.87; which means under null hypothesis, there is 87% probability you could obtain a t value that is at least as 
# extreme as the observed t value (so the observed t value is not large enough to show population mean 
# is different from 2). Therefore, we could not reject the null hypothesis.
My1sampTTest(a, popmean=2)

t: -0.1550199600010982
p: 0.8778795168310538


two sample t-test

In [9]:
b = np.random.randn(N)+2
c = np.random.randn(N)+3

In [10]:
stats.ttest_ind(b, c)

Ttest_indResult(statistic=-4.306047655531677, pvalue=6.489551717501184e-05)

In [11]:
stats.ttest_ind(b, c, equal_var=False)

Ttest_indResult(statistic=-4.306047655531677, pvalue=6.88923265397696e-05)

In [18]:
def My2SampleTTest(sample1, sample2, equal_var=True, tail=2):
    N1= len(sample1)
    N2= len(sample2)
    
    mean_diff = np.mean(sample1) - np.mean(sample2)
    var_1 = np.var(sample1, ddof=1)
    var_2 = np.var(sample2, ddof=1)
    
    ## equal variance assumption
    if equal_var:
        var_1 = np.var(sample1, ddof=1)
        var_2 = np.var(sample2, ddof=1)
        pooled_std = np.sqrt((var_1+var_2)/2)
        se_diff = pooled_std*np.sqrt(2/N1)
        t = mean_diff/se_diff
        df = 2*(N - 1) # degrees of freedom
    
    ## unequal varaince assumption: Welch's test
    else:
        pooled_variance = var_1/N1 + var_2/N2
        pooled_se = np.sqrt(pooled_variance)
        t = mean_diff/pooled_se
        df = (var_1/N1 + var_2/N2)**2 / ((var_1/N1)**2/(N1-1) + (var_2/N2)**2/(N2-1))

    if tail == 1:
        p = 1 - stats.t.cdf(abs(t), df=df) # one-sided test p-value
    elif tail == 2:
        p = 1 - stats.t.cdf(abs(t), df=df) # two-sided test p-value
        p = 2*p
    print("t:", t)
    print("p:", p) # two-sided test p-value

In [19]:
My2SampleTTest(b, c)

t: -4.306047655531678
p: 6.48955171751453e-05


In [20]:
My2SampleTTest(b, c, equal_var=False)

t: -4.306047655531677
p: 6.889232653972677e-05
