In [24]:
import pandas as pd
import numpy as np
from scipy.stats import chi2,norm,f
import warnings
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sampling import random_sampling, stratified_sampling

In [25]:
PATH = "/kaggle/input/heart-closed-dataset/heart_2022_cleaned.csv"
PATH="dataset/heart_2022_cleaned.csv"

In [26]:
data = pd.read_csv(PATH)

# Z-test for proportion  #

In [27]:
def do_proportion_test(data,column,threshold,alpha,po=0.5):
    p = (data[column]>threshold).sum()/len(data[column])
    z = (p-po)/np.sqrt(po*(1-po)/len(data[column]))
    print("z stat : ",z)
    critical_value = norm.ppf(1-alpha)
    print("critical_value : ",critical_value)

    if z<=critical_value:
        print("Reject H0")
    else:
        print("Fail to reject H0")

### Stratified_Sampling ###

In [28]:
sample = stratified_sampling(data,'GeneralHealth',8000)

In [29]:
do_proportion_test(sample,"BMI",25,0.05,0.4)  

z stat :  118.81045873224572
critical_value :  1.6448536269514722
Fail to reject H0


### Random Sampling ###

In [30]:
sample_random = random_sampling(data,40000)

In [31]:
do_proportion_test(sample_random,"BMI",25,0.05,0.4)

z stat :  117.58571386085413
critical_value :  1.6448536269514722
Fail to reject H0


# ANOVA Test #

In [32]:
def perform_anova_test(data,column):
    grouped_data = [group[column].values for name,group in data.groupby('GeneralHealth')]   
    x_bars = [np.mean(group) for group in grouped_data]
    print(x_bars)
    x_bar_ka_bar = np.mean(x_bars)
    print(x_bar_ka_bar)
    SSC = sum([len(group)*(x_bar-x_bar_ka_bar)**2 for group,x_bar in zip(grouped_data,x_bars)])
    SSE = sum([sum((group-x_bar)**2) for group,x_bar in zip(grouped_data,x_bars)])
    n1 = 4
    n2 = len(data[column])-n1
    MSC = SSC/n1
    MSE = SSE/n2
    f_stat = MSC/MSE
    f_critical = f.ppf(0.95,n1,n2)
    print("f_stat : ",f_stat)
    print("f_critical : ",f_critical)
    if f_stat<f_critical:
        print("Fail to reject H0")
    else:
        print("Reject H0")



### Stratified Sampling ###

In [33]:
samples1 = stratified_sampling(data,'GeneralHealth',4000)
samples2 = stratified_sampling(data,'GeneralHealth',8000)

### Sample Size : 20000 ###

In [34]:
perform_anova_test(samples1,"BMI")

[25.98209, 30.88079, 29.530625, 30.545012500000002, 27.597955]
28.9072945
f_stat :  360.86329146631385
f_critical :  2.372376403974728
Reject H0


### Sample Size : 40000

In [35]:
perform_anova_test(samples2,"BMI")

[25.998510000000003, 30.625445, 29.553262500000002, 30.4096925, 27.601425]
28.837667000000003
f_stat :  680.1707533920603
f_critical :  2.3721542977937946
Reject H0


### Random Sampling ###

In [36]:
samples1_random = random_sampling(data,20000)
samples2_random = random_sampling(data,40000)

### Sample Size : 20000 ###

In [37]:
perform_anova_test(samples1_random,"BMI")

[26.11230817413091, 30.742379323451157, 29.642923994494573, 30.88330264672037, 27.660585106382978]
29.008299849035996
f_stat :  328.5781668887431
f_critical :  2.372376403974728
Reject H0


### Sample Size : 40000 ###

In [38]:
perform_anova_test(samples2_random,"BMI")

[26.00219413233458, 30.653567577616652, 29.495302238514935, 30.67286627906977, 27.579561146869516]
28.88069827488109
f_stat :  662.1529635171618
f_critical :  2.3721542977937946
Reject H0
