# Hypothesis Testing for one & two Proportions

<br>1. Test based on P_value formula
<br>2. Test based on confidence interval
<br>3. Z test (statsmodel)
<br>4. Bionomial Test
<br>5. Test based on bootstrapping
<br>6. Chi Square test

In [1]:
# Required libraries
import numpy as np
from scipy import stats
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest

<h2> Single proportion

<p> <b> PROBLEM: Iphone claims 43% of people use iphone. A survey is done : 44 user out of 83 said they use iphone. Statistically verify the claim.

<h3>Test by using formula

<b> Std Error </b>
$$
std_{error} = \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}
$$
<br>
<b>Z score :</b>
$$
Z_{score} = \frac{\hat{p} - p}{std_{error}} 
$$
p = population proportion
<br>p_hat = sample proportion
<br> n = sample size

In [2]:
# H0 : p = 0.43 
# Ha : p != 0.43 ( two tailed test ) 

alpha  = 0.05   # critical value upto which null hypothesis will be rejected
p = 0.43        # claimed proportion

n = 83                   # number of people taken into sample
p_survey = 44 / 83       # proportion in the sample
q_survey = 1 - p_survey  
std_error = np.sqrt( p_survey * q_survey /n )

z = (p_survey - p) / std_error

# calculate P_value
p_value =  ( 1 - stats.norm.cdf(z) ) * 2  # double it as two tailed test


print('----------- Result based on formula ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')


----------- Result based on formula ---------
P_value : 0.06760942957382698
alpha : 0.05
Conclusion : Fail to reject H0


<h3> Test by using confidence interval

In [3]:
# z critical for 95% confidence interval 
z_critical = stats.norm.ppf(0.975)

error = z_critical * std_error 

# Confidence interval is sample proportion +- error
low =  p_survey - error
high = p_survey + error
interval = (low, high)

# If Claimed proportion lies between the interval then null hypothesis can't be rejected

print('----------- Result based on confidence interval ---------')
print("Claimed Proportion :", p)
print('interval :',interval)
print ('Conclusion : Fail to reject H0' if p > low and p < high  else 'Conclusion : Reject H0')

----------- Result based on confidence interval ---------
Claimed Proportion : 0.43
interval : (0.42274884004489, 0.6374921238105318)
Conclusion : Fail to reject H0


<h3> Using statsmodel Z test for proprtion 

In [4]:
statistics , p_value = proportions_ztest(count= 44, nobs = 83, value=0.43, alternative='two-sided')

alpha = 0.05

print('----------- Result based on proportion Z test ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')


----------- Result based on proportion Z test ---------
P_value : 0.06760942957382698
alpha : 0.05
Conclusion : Fail to reject H0


<h3> Using single sample T-test & Bionomial Test

In [5]:
p = 0.43
n = 83
alpha = 0.05
positive = 44
survey = [1] * positive + [0] * (n-positive)

result = stats.ttest_1samp(a = survey ,popmean = 0.43)
p_value = result.pvalue

print('----------- Result based on T single sample test ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')

p_value = stats.binom_test(44, n=83, p = 0.43, alternative='two-sided')
print('')
print('----------- Result based on Bionomial test ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')

----------- Result based on T single sample test ---------
P_value : 0.07293884375456926
alpha : 0.05
Conclusion : Fail to reject H0

----------- Result based on Bionomial test ---------
P_value : 0.07571259733234031
alpha : 0.05
Conclusion : Fail to reject H0


<h3> Test by usning bootstapping

In [6]:
p = 0.43
n = 83
alpha = 0.05
positive = 44
survey = ["Yes"] * positive + ["No"] * (n-positive)

n_trials = 100000
proportions = []
for trial in range(n_trials):
    np.random.seed(trial)
    sample = list(np.random.choice(survey, size=n, replace= True))
    p_sample =  sample.count('Yes')/n
    proportions.append(p_sample)
    
greater = [x for x in proportions if x > p ]
p_value  = 2 * ( 1 - len(greater)/n_trials) # two tailed test
    
print('----------- Result based on random walk ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')

----------- Result based on random walk ---------
P_value : 0.06152000000000002
alpha : 0.05
Conclusion : Fail to reject H0


# Two Proportions

<p> <b> A resturant wants to know are teens more likely to orders deserts more than old people.In a servey:
    <br> 84 out of old 33 order desert
    <br> 91 out of teens 46 order desert.

<h3> Test by using formula </h3>
<br> <b> proportion pool </b>
$$
P_{pool} = \frac{n_{teen} * p_{teen} + n_{old}* p_{old}}{n_{teen} + n_{old}}
$$

<br><b> Std Error </b>
$$
  std_{error} = \sqrt{P_{pool} * (1-P_{pool}) * (\frac{1}{n_{teen}} + \frac{1}{n_{old}})}
$$
<br><b> Z Score </b>
$$
Z_{score} = \frac{p_{teen} - p_{old}}{std_{error}}
$$
    
<br>n = sample size 

In [7]:
# H0 : p_teen = p_old
# Ha : p_teen > p_old

alpha = 0.1

n_teen = 91
x_teen = 46
p_teen = x_teen / n_teen

n_old = 89
x_old = 33
p_old = x_old / n_old

p_pool = (n_teen * p_teen + n_old *p_old)/ (n_teen + n_old)

std_error = np.sqrt( p_pool * (1-p_pool) * (1/n_teen + 1/n_old))

z = (p_teen - p_old) / std_error # either use argument in cdf function or Z value
p_value = 1 - stats.norm.cdf(z)

print('----------- Result based on formula ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')


----------- Result based on formula ---------
P_value : 0.034315818633500306
alpha : 0.1
Conclusion : Reject H0


In [8]:
# H0 : p_teen = p_old
# Ha : p_teen > p_old

alpha = 0.1
count_array = np.array([x_teen,x_old])
sample_array = np.array([n_teen,n_old])

statistics , p_value = proportions_ztest(count= count_array, nobs = sample_array, value= 0, alternative='larger')

print('----------- Result based on formula ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')

----------- Result based on formula ---------
P_value : 0.03431581863350027
alpha : 0.1
Conclusion : Reject H0


<h3>Test by using bootstrapping

In [9]:
# H0 : p_teen = p_old
# Ha : p_teen > p_old

# creating servey samples for teen and old customers

teen = ['yes'] * 46 + ['no'] * (n_teen - 46) # teen customers
old = ['yes'] * 33 + ['no'] * (n_old - 33)   # old customers
p_diff =  p_teen - p_old

# creating samples and random walk with n_trial

prop = []
n_trials = 100000
mix = teen + old # H0 considers two group to be same so we mix servey result data together

for trial in range(n_trials):
    np.random.seed(trial)
    mix = np.random.permutation(mix)
    
    mix_teen = list(mix[0:n_teen])  # creating sample from mixed pool with provided number of data points
    p_mix_teen = mix_teen.count('yes') / n_teen
    
    mix_old = list(mix[n_teen:])
    p_mix_old = mix_old.count('yes') / n_old
    
    mix_diff = p_mix_teen - p_mix_old
    prop.append(mix_diff)
    
more = [x for x in prop if x > p_diff]
p_value = len(more)/n_trials

print('----------- Result based on random walk ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')

----------- Result based on random walk ---------
P_value : 0.02393
alpha : 0.1
Conclusion : Reject H0


<h3> Using Chi Square Test</h3>
<br><br><b> Ho : </b> There is no relationship between Age and ordering desert
<br><b> Ha :</b> There is relationship between Age and ordering desert

In [10]:
teen_respose = ['yes'] * 46 + ['no'] * (n_teen - 46) # teen customers
old_response = ['yes'] * 33 + ['no'] * (n_old - 33)   # old customers
response = teen_respose + old_response

customer = ['teen']* n_teen + ['old']* n_old

survey = pd.DataFrame({'customer':customer,'response':response})

observed = pd.crosstab(survey.customer,survey.response)
observed.index = ['old','teen']
observed.columns = ['No','Yes']
print(observed)

result = stats.chi2_contingency(observed = observed)
chi_square_stats = result [0] 
p_value = result [1] 
alpha = 0.1
print('----------- Result based on P Value ---------')
print('P_value :', p_value)
print('alpha :', alpha)
print ('Conclusion : Fail to reject H0' if p_value > alpha else 'Conclusion : Reject H0')

      No  Yes
old   56   33
teen  45   46
----------- Result based on P Value ---------
P_value : 0.09479469022476349
alpha : 0.1
Conclusion : Reject H0
