In [1]:
import scipy.stats as stats
import numpy as np

In [2]:
# In this notebook we compare made-up statistics for a vaccine clinical trial
# using a two-sample t-test computed from summary statistics.
# The example is based on the SciPy documentation
# (see the section that begins with "Suppose we instead have binary data..."):
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind_from_stats.html

In [22]:
# Let's make up some numbers for a Covid vaccine trial. These are not the real
# numbers from any trial; we are only exploring different possible scenarios.
#
# In a vaccine trial roughly half of the participants receive a placebo (an
# injection that contains no vaccine) and the other half receive the real
# vaccine. They then live their lives for a few weeks or months and are tested
# to see whether they caught the disease. If the vaccine works, we expect fewer
# people to catch the disease in the vaccine group than in the placebo group.

# number of people who got the placebo
placebosize = 15000
# number of people in the placebo group who got Covid
placebocovid = 200
# proportion of the placebo group who got Covid (the mean of the 0/1 outcomes);
# derived from placebosize so the two values cannot drift apart
placebomean = placebocovid / placebosize
# variance of a 0/1 outcome with proportion p, estimated as p * (1 - p)
placebovariance = placebomean * (1 - placebomean)

# the same quantities for the vaccine group
vaccinesize = 15000
vaccinecovid = 180
vaccinemean = vaccinecovid / vaccinesize
vaccinevariance = vaccinemean * (1 - vaccinemean)


In [23]:
# Run the two-sample t-test directly from the summary statistics.
# The groups differ by only 20 cases (200 vs 180) out of 15000 participants
# each, so the p-value is NOT small (about 0.30): this difference alone is not
# enough to reject the hypothesis that both groups have the same infection rate.
stats.ttest_ind_from_stats(
    mean1=placebomean,
    mean2=vaccinemean,
    std1=np.sqrt(placebovariance),
    std2=np.sqrt(vaccinevariance),
    nobs1=placebosize,
    nobs2=vaccinesize,
)

Ttest_indResult(statistic=1.032556951173647, pvalue=0.30181960713232897)

In [20]:
# Second scenario: a much smaller trial with 100 participants per group.
# Here 10 people in the placebo group caught Covid and nobody in the vaccine
# group did. That looks like a relationship -- is it statistically detectable?
# NOTE(review): an earlier version of this comment said "only half as many" got
# Covid in the vaccine group, which would mean vaccinecovid = 5 rather than 0;
# confirm which scenario was intended.

# group sizes and number of Covid cases per group
placebosize, vaccinesize = 100, 100
placebocovid, vaccinecovid = 10, 0

# proportion infected in each group (mean of the 0/1 outcomes)
placebomean = placebocovid / placebosize
vaccinemean = vaccinecovid / vaccinesize

# variance of a 0/1 outcome with proportion p, estimated as p * (1 - p)
placebovariance = placebomean * (1 - placebomean)
vaccinevariance = vaccinemean * (1 - vaccinemean)

In [21]:
# Run the same t-test for the small trial (100 + 100 = 200 participants).
# With 10 cases versus 0 the p-value comes out small (about 0.001), so even
# this small sample flags the difference as statistically significant.
# Caveat: the vaccine group has zero cases, so its estimated variance is 0 and
# the event counts are tiny; the t-test approximation is presumably shaky here
# and an exact test may be more appropriate -- TODO confirm.
stats.ttest_ind_from_stats(
    mean1=placebomean,
    mean2=vaccinemean,
    std1=np.sqrt(placebovariance),
    std2=np.sqrt(vaccinevariance),
    nobs1=placebosize,
    nobs2=vaccinesize,
)

Ttest_indResult(statistic=3.333333333333333, pvalue=0.0010241414728258344)