In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chi2,norm
import warnings
import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [8]:
# Location of the cleaned heart-disease CSV. The notebook is run both on
# Kaggle and locally; previously the Kaggle path was assigned and then
# unconditionally overwritten, so it was dead code. Prefer the Kaggle input
# when it exists and fall back to the local copy otherwise.
KAGGLE_PATH = "/kaggle/input/heart-closed-dataset/heart_2022_cleaned.csv"
LOCAL_PATH = "dataset/heart_2022_cleaned.csv"
PATH = KAGGLE_PATH if os.path.exists(KAGGLE_PATH) else LOCAL_PATH

In [9]:
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by the
# sampling helpers below — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [10]:
def stratified_sampling(df, column_name, num_samples, random_state=None):
    """Draw the same number of rows from every stratum of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    column_name : str
        Column whose distinct values define the strata.
    num_samples : int
        Number of rows drawn from *each* stratum, without replacement.
    random_state : int, numpy.random.Generator or None, optional
        Seed forwarded to pandas so a draw can be reproduced. ``None``
        (the default) keeps the previous non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` rows per stratum, keeping the original index labels
        and all columns (including ``column_name``).

    Raises
    ------
    ValueError
        Raised by pandas when a stratum holds fewer than ``num_samples`` rows.
    """
    # GroupBy.sample always keeps the grouping column in the result, whereas
    # groupby(...).apply(...) on the grouping column is deprecated and drops
    # it in newer pandas releases.
    return df.groupby(column_name, group_keys=False).sample(
        n=num_samples, replace=False, random_state=random_state
    )

In [11]:
def random_sampling(df, num_samples, random_state=None):
    """Draw a simple random sample of rows from ``df`` without replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    num_samples : int
        Total number of rows to draw.
    random_state : int, numpy.random.Generator or None, optional
        Seed forwarded to ``DataFrame.sample`` so a draw can be reproduced.
        ``None`` (the default) keeps the previous non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        ``num_samples`` distinct rows of ``df`` with their original index labels.

    Raises
    ------
    ValueError
        Raised by pandas when ``num_samples`` exceeds ``len(df)``.
    """
    return df.sample(n=num_samples, replace=False, random_state=random_state)

In [12]:
# Load the full dataset once; the sampling helpers draw from this frame.
data = pd.read_csv(PATH)

In [13]:
def do_chi2_test(data,column1,column2,alpha = 0.01):
    """Run Pearson's chi-square test of independence between two columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both categorical columns.
    column1, column2 : str
        Names of the two columns to cross-tabulate.
    alpha : float, optional
        Significance level; the critical value is the ``1 - alpha`` quantile
        of the chi-square distribution.

    Returns
    -------
    list
        ``[chi2_stat, critical, decision]`` where ``decision`` is ``1`` when
        ``chi2_stat >= critical`` (reject independence) and ``-1`` otherwise.

    Notes
    -----
    When either column has a single category the degrees of freedom are 0,
    ``chi2.ppf`` yields NaN and the comparison falls through to ``-1``.
    """
    # Contingency table with column2 categories as rows and column1
    # categories as columns, both in sorted order. Raw arrays are passed so
    # that crosstab does not attempt any index alignment.
    observed = pd.crosstab(
        data[column2].to_numpy(), data[column1].to_numpy()
    ).to_numpy()

    row_sums = observed.sum(axis=1)
    col_sums = observed.sum(axis=0)
    total = row_sums.sum()

    # Expected count under independence: (row total * column total) / N.
    expected = np.outer(row_sums, col_sums) / total

    chi2_stat = ((observed - expected)**2 / expected).sum()

    dof = (len(row_sums)-1)*(len(col_sums)-1)
    critical = chi2.ppf(1-alpha, dof)

    decision = 1 if chi2_stat >= critical else -1
    return [chi2_stat, critical, decision]

In [14]:
# Columns stored with object dtype are treated as the categorical features
# that the experiments below iterate over.
categorical_columns = data.select_dtypes(include=['object']).columns
categorical_columns

Index(['State', 'Sex', 'GeneralHealth', 'LastCheckupTime',
       'PhysicalActivities', 'RemovedTeeth', 'HadHeartAttack', 'HadAngina',
       'HadStroke', 'HadAsthma', 'HadSkinCancer', 'HadCOPD',
       'HadDepressiveDisorder', 'HadKidneyDisease', 'HadArthritis',
       'HadDiabetes', 'DeafOrHardOfHearing', 'BlindOrVisionDifficulty',
       'DifficultyConcentrating', 'DifficultyWalking',
       'DifficultyDressingBathing', 'DifficultyErrands', 'SmokerStatus',
       'ECigaretteUsage', 'ChestScan', 'RaceEthnicityCategory', 'AgeCategory',
       'AlcoholDrinkers', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver',
       'TetanusLast10Tdap', 'HighRiskLastYear', 'CovidPos'],
      dtype='object')

### $\chi^2$ Test for independence  ###

In [None]:
def create_files(sampling_type,range_lower,range_upper,step):
    """Run the chi-square experiment and store the raw results on disk.

    For every categorical column except ``GeneralHealth`` and every sample
    size in ``range(range_lower, range_upper, step)``, five samples are drawn
    from the global ``data`` frame and each is tested for independence
    against ``GeneralHealth`` at ``alpha = 0.01``.

    Results are written to
    ``output/<sampling_type>/<range_lower>_to_<range_upper>/<num_samples>/<col>.txt``
    (relative to the current working directory). Each iteration contributes
    four lines: an ``<Iteration-k>`` marker, the chi-square statistic, the
    critical value and the decision (``1`` or ``-1``).

    Parameters
    ----------
    sampling_type : str
        ``"stratified_sampling"`` samples ``num_samples`` rows per
        ``GeneralHealth`` stratum; any other value uses simple random
        sampling of ``num_samples`` rows in total.
    range_lower, range_upper, step : int
        Arguments to ``range`` that generate the sample sizes.
    """
    for col in categorical_columns:
        if col=='GeneralHealth':
            continue
        for num_samples in range(range_lower,range_upper,step):
            # Create and write to the *same* relative directory; previously the
            # directory was created under /kaggle/working while the file was
            # opened relative to the CWD, which only matched on Kaggle.
            out_dir = os.path.join(
                "output", sampling_type, f"{range_lower}_to_{range_upper}", str(num_samples)
            )
            os.makedirs(out_dir, exist_ok=True)
            # The context manager closes the file even if a draw or test raises.
            with open(os.path.join(out_dir, f"{col}.txt"), "w") as f:
                for iteration in range(1,6):
                    if sampling_type=="stratified_sampling":
                        sample = stratified_sampling(data,'GeneralHealth',num_samples)
                    else:
                        sample = random_sampling(data,num_samples)
                    f.write(f"<Iteration-{iteration}>\n")
                    chi2_stat,chi2_critical,result = do_chi2_test(sample,col,'GeneralHealth',0.01)
                    f.write(f"{chi2_stat}\n")
                    f.write(f"{chi2_critical}\n")
                    f.write(f"{result}\n")
                    print(f"Iteration-{iteration} done")
            print(f"Sample Size-{num_samples} done")
        print(f"{col} done")

In [None]:
def make_experiment_plots(sampling_technique,lower_bound,upper_bound,step_size):
    """Plot, per feature, the summed test decisions for every sample size.

    Reads the result files found under
    ``output/<sampling_technique>/<lower_bound>_to_<upper_bound>/<num_samples>/<feature>.txt``
    and draws one bar chart per categorical feature (``GeneralHealth``
    excluded). Each bar is the sum of the per-iteration decisions stored in
    the file, i.e. every fourth line starting at the fourth one.

    Parameters
    ----------
    sampling_technique : str
        Sub-directory name of the sampling scheme to plot.
    lower_bound, upper_bound, step_size : int
        ``range`` arguments identifying the sample sizes; they must match the
        values the result files were generated with.
    """
    # Allocate axes only for the features that are actually plotted so no
    # blank subplot is left behind for the skipped 'GeneralHealth' column.
    features = [name for name in categorical_columns if name != 'GeneralHealth']
    sample_sizes = list(range(lower_bound,upper_bound,step_size))

    # squeeze=False keeps `axes` two-dimensional even for a single feature.
    fig, axes = plt.subplots(len(features), 1, figsize=(10, 100), squeeze=False)
    for ax, categorical_feature in zip(axes[:, 0], features):
        feature_output = []
        for num_samples in sample_sizes:
            with open(f"output/{sampling_technique}/{lower_bound}_to_{upper_bound}/{num_samples}/{categorical_feature}.txt", "r") as f:
                lines = f.readlines()
            # Lines 3, 7, 11, ... (0-based) hold the decision of each iteration.
            decisions = [float(lines[j].strip().split()[0]) for j in range(3, len(lines), 4)]
            feature_output.append(sum(decisions))
        # Named plot_frame rather than `data` to avoid shadowing the dataset.
        plot_frame = pd.DataFrame({'Num Samples': sample_sizes, 'Test Result': feature_output})
        sns.barplot(ax=ax,x="Num Samples",y="Test Result", data = plot_frame)
        ax.set_yticks(range(-5,6))
        ax.axhline(0,color='black',linewidth=0.5)
        ax.set_title(f"{categorical_feature} vs Num Samples")
    plt.tight_layout()


### Stratified Sampling ###

### Sample Size : 1000 to 10000 ###

In [None]:
# Stratified runs for per-stratum sample sizes 1000, 2000, ..., 10000.
create_files('stratified_sampling',1000,10001,1000)

In [None]:
# Bounds must match the create_files call that produced these result files.
make_experiment_plots("stratified_sampling",1000,10001,1000)

### Sample Size : 100 to 1000 ###

In [None]:
# Stratified runs for per-stratum sample sizes 100, 200, ..., 1000.
create_files('stratified_sampling',100,1001,100)

In [None]:
# Bounds must match the create_files call that produced these result files.
make_experiment_plots("stratified_sampling",100,1001,100)

### Random Sampling ###

### Sample Size : 5000 to 50000 ###

In [None]:
# Simple-random runs for total sample sizes 5000, 10000, ..., 50000.
create_files("random_sampling",5000,50001,5000)

In [None]:
# Bounds must match create_files("random_sampling",5000,50001,5000); a lower
# bound of 500 pointed at a "500_to_50001" directory that is never written.
make_experiment_plots("random_sampling",5000,50001,5000)

### Sample Size : 500 to 5000 ###

In [None]:
# Simple-random runs for total sample sizes 500, 1000, ..., 5000.
create_files("random_sampling",500,5001,500)

In [None]:
# Bounds must match the create_files call that produced these result files.
make_experiment_plots("random_sampling",500,5001,500)