In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from itertools import combinations

# Cell 2: Load the dataset
# Read the Excel workbook at `file_path` into the DataFrame `dataset`, which the
# later steps of this cell analyse.
# Replace with the path to your dataset
file_path = '/content/test-dataset.xlsx'
dataset = pd.read_excel(file_path)
# Preview of the first rows. NOTE(review): a bare expression is only rendered
# when it is the last statement of the executed notebook cell.
dataset.head()

# Cell 3: Define helper functions
def is_categorical(series, threshold=10):
    """Decide whether a column should be treated as categorical.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.
    threshold : int, default 10
        A column with strictly fewer distinct values (as counted by
        ``Series.nunique``) than this is treated as categorical.

    Returns
    -------
    bool
        True when the column has fewer than ``threshold`` distinct values or
        its dtype is named ``"category"``; False otherwise.
    """
    # Low cardinality is checked first; the dtype is only consulted otherwise.
    if series.nunique() < threshold:
        return True
    return series.dtype.name == "category"

def interpret_p_value(p_val, alpha=0.05):
    """Label a p-value as significant or not at significance level ``alpha``.

    Parameters
    ----------
    p_val : float
        The p-value to interpret.
    alpha : float, default 0.05
        Significance level. The default keeps the previously hard-coded 0.05.

    Returns
    -------
    str
        ``"Significant"`` when ``p_val`` is strictly below ``alpha``,
        otherwise ``"Not Significant"``.
    """
    return "Significant" if p_val < alpha else "Not Significant"

def chi_square_test(data, var1, var2):
    """Run a chi-square test of independence between two columns of ``data``.

    A contingency table of ``data[var1]`` against ``data[var2]`` is built with
    ``pd.crosstab`` and handed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    var1, var2 : column labels
        The two columns to cross-tabulate.

    Returns
    -------
    tuple
        ``(chi2_stat, p_val, dof, interpretation)`` for a valid test, where
        ``interpretation`` is produced by ``interpret_p_value``. When the
        contingency table cannot support a test the result is
        ``(None, None, None, message)`` with ``message`` describing why.
    """
    table = pd.crosstab(data[var1], data[var2])
    if table.size == 0 or np.any(table.sum(axis=0) == 0) or np.any(table.sum(axis=1) == 0):
        return None, None, None, "Invalid Test (Empty or Zero Row/Column)"
    # A variable with a single observed level gives a 1xN or Nx1 table with zero
    # degrees of freedom. chi2_contingency then returns chi2=0 and p=1, which
    # would be reported as "Not Significant" although nothing was tested.
    if min(table.shape) < 2:
        return None, None, None, "Invalid Test (Fewer Than Two Categories)"
    chi2_stat, p_val, dof, _ = chi2_contingency(table)
    return chi2_stat, p_val, dof, interpret_p_value(p_val)

# Cell 4: Perform Chi-square tests for all pairs
# Column order of the results table.
RESULT_COLUMNS = ["Feature 1", "Feature 2", "Chi-square Statistic", "P-value", "Degrees of Freedom", "Interpretation"]

# Filter the dataset to include only categorical columns
categorical_columns = [col for col in dataset.columns if is_categorical(dataset[col])]

# Create all possible pairs of these categorical columns
categorical_pairs = list(combinations(categorical_columns, 2))

# Perform Chi-square test for each pair, collecting one record per valid test.
# The records become a DataFrame once, after the loop: growing a DataFrame with
# pd.concat on every iteration re-copies all earlier rows each time, and
# concatenating onto an empty frame is deprecated in recent pandas releases.
result_rows = []
for feature1, feature2 in categorical_pairs:
    chi2_stat, p_val, dof, interpretation = chi_square_test(dataset, feature1, feature2)
    if chi2_stat is None:
        # chi_square_test reports an invalid test with None statistics; skip it.
        continue
    result_rows.append({
        "Feature 1": feature1,
        "Feature 2": feature2,
        "Chi-square Statistic": chi2_stat,
        "P-value": p_val,
        "Degrees of Freedom": dof,
        "Interpretation": interpretation,
    })

# Passing `columns` keeps the expected header even when no pair gave a valid test.
results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)

results_df.head()

# Cell 5: Analyze significant results
# Keep only the pairs labelled 'Significant' and renumber the rows from 0.
significant_results = results_df[results_df['Interpretation'] == 'Significant']
significant_results.reset_index(drop=True)


Unnamed: 0,Feature 1,Feature 2,Chi-square Statistic,P-value,Degrees of Freedom,Interpretation
0,total_income,is_freedom_fighter,14.214156,0.002627679,3,Significant
1,total_income,diabetic,38.433059,2.288488e-08,3,Significant
2,total_income,profile_hypertensive,9.958818,0.01891946,3,Significant
3,total_income,RESULT_STAT_BP,159.239608,1.173648e-24,18,Significant
4,total_income,RESULT_STAT_BMI,45.668069,6.003162e-05,15,Significant
5,total_income,TAG_NAME,14.035484,0.02924229,6,Significant
6,total_income,RESULT_STAT_SUGAR,33.876211,0.01304617,18,Significant
7,total_income,RESULT_STAT_PR,38.247122,1.005064e-06,6,Significant
8,total_income,RESULT_STAT_SPO2,12.897644,0.04469047,6,Significant
9,gender,is_freedom_fighter,4.402061,0.03589552,1,Significant


In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from itertools import combinations

# Cell 2: Load the dataset
# Read the Excel workbook at `file_path` into the DataFrame `dataset`, which the
# later steps of this cell analyse.
# Replace with the path to your dataset
file_path = '/content/test-dataset.xlsx'
dataset = pd.read_excel(file_path)
# Preview of the first rows. NOTE(review): a bare expression is only rendered
# when it is the last statement of the executed notebook cell.
dataset.head()

# Cell 3: Define helper functions
def is_categorical(series, threshold=10):
    """Decide whether a column should be treated as categorical.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.
    threshold : int, default 10
        A column with strictly fewer distinct values (as counted by
        ``Series.nunique``) than this is treated as categorical.

    Returns
    -------
    bool
        True when the column has fewer than ``threshold`` distinct values or
        its dtype is named ``"category"``; False otherwise.
    """
    # Low cardinality is checked first; the dtype is only consulted otherwise.
    if series.nunique() < threshold:
        return True
    return series.dtype.name == "category"

def interpret_p_value(p_val, alpha=0.05):
    """Label a p-value as significant or not at significance level ``alpha``.

    Parameters
    ----------
    p_val : float
        The p-value to interpret.
    alpha : float, default 0.05
        Significance level. The default keeps the previously hard-coded 0.05.

    Returns
    -------
    str
        ``"Significant"`` when ``p_val`` is strictly below ``alpha``,
        otherwise ``"Not Significant"``.
    """
    return "Significant" if p_val < alpha else "Not Significant"

def chi_square_test(data, var1, var2):
    """Run a chi-square test of independence between two columns of ``data``.

    A contingency table of ``data[var1]`` against ``data[var2]`` is built with
    ``pd.crosstab`` and handed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    var1, var2 : column labels
        The two columns to cross-tabulate.

    Returns
    -------
    tuple
        ``(chi2_stat, p_val, dof, interpretation)`` for a valid test, where
        ``interpretation`` is produced by ``interpret_p_value``. When the
        contingency table cannot support a test the result is
        ``(None, None, None, message)`` with ``message`` describing why.
    """
    table = pd.crosstab(data[var1], data[var2])
    if table.size == 0 or np.any(table.sum(axis=0) == 0) or np.any(table.sum(axis=1) == 0):
        return None, None, None, "Invalid Test (Empty or Zero Row/Column)"
    # A variable with a single observed level gives a 1xN or Nx1 table with zero
    # degrees of freedom. chi2_contingency then returns chi2=0 and p=1, which
    # would be reported as "Not Significant" although nothing was tested.
    if min(table.shape) < 2:
        return None, None, None, "Invalid Test (Fewer Than Two Categories)"
    chi2_stat, p_val, dof, _ = chi2_contingency(table)
    return chi2_stat, p_val, dof, interpret_p_value(p_val)

# Cell 4: Perform Chi-square tests for all pairs
# Column order of the results table.
RESULT_COLUMNS = ["Feature 1", "Feature 2", "Chi-square Statistic", "P-value", "Degrees of Freedom", "Interpretation"]

# Filter the dataset to include only categorical columns
categorical_columns = [col for col in dataset.columns if is_categorical(dataset[col])]

# Create all possible pairs of these categorical columns
categorical_pairs = list(combinations(categorical_columns, 2))

# Perform Chi-square test for each pair, collecting one record per valid test.
# The records become a DataFrame once, after the loop: growing a DataFrame with
# pd.concat on every iteration re-copies all earlier rows each time, and
# concatenating onto an empty frame is deprecated in recent pandas releases.
result_rows = []
for feature1, feature2 in categorical_pairs:
    chi2_stat, p_val, dof, interpretation = chi_square_test(dataset, feature1, feature2)
    if chi2_stat is None:
        # chi_square_test reports an invalid test with None statistics; skip it.
        continue
    result_rows.append({
        "Feature 1": feature1,
        "Feature 2": feature2,
        "Chi-square Statistic": chi2_stat,
        "P-value": p_val,
        "Degrees of Freedom": dof,
        "Interpretation": interpretation,
    })

# Passing `columns` keeps the expected header even when no pair gave a valid test.
results_df = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)

# Cell 5: Display all results
# Lift the row/column display limits only for this block so the full table shows.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(results_df)


Unnamed: 0,Feature 1,Feature 2,Chi-square Statistic,P-value,Degrees of Freedom,Interpretation
0,total_income,gender,1.189601,0.7554994,3,Not Significant
1,total_income,is_poor,0.0,1.0,0,Not Significant
2,total_income,is_freedom_fighter,14.214156,0.002627679,3,Significant
3,total_income,had_stroke,0.900317,0.8253513,3,Not Significant
4,total_income,has_cardiovascular_disease,4.929155,0.1770589,3,Not Significant
5,total_income,disabilities_name,8.481028,0.9030546,15,Not Significant
6,total_income,diabetic,38.433059,2.288488e-08,3,Significant
7,total_income,profile_hypertensive,9.958818,0.01891946,3,Significant
8,total_income,RESULT_STAT_BP,159.239608,1.173648e-24,18,Significant
9,total_income,RESULT_STAT_BMI,45.668069,6.003162e-05,15,Significant


In [None]:
# Cell 1: Import necessary libraries
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from itertools import combinations

# Cell 2: Load the dataset
# Read the Excel workbook at `file_path` into the DataFrame `dataset`, which the
# later steps of this cell analyse.
# Replace with the path to your dataset
file_path = '/content/test-dataset.xlsx'
dataset = pd.read_excel(file_path)
# Preview of the first rows. NOTE(review): a bare expression is only rendered
# when it is the last statement of the executed notebook cell.
dataset.head()

# Cell 3: Define helper functions
def is_categorical(series, threshold=10):
    """Decide whether a column should be treated as categorical.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.
    threshold : int, default 10
        A column with strictly fewer distinct values (as counted by
        ``Series.nunique``) than this is treated as categorical.

    Returns
    -------
    bool
        True when the column has fewer than ``threshold`` distinct values or
        its dtype is named ``"category"``; False otherwise.
    """
    # Low cardinality is checked first; the dtype is only consulted otherwise.
    if series.nunique() < threshold:
        return True
    return series.dtype.name == "category"

def interpret_p_value(p_val, alpha=0.05):
    """Label a p-value as significant or not at significance level ``alpha``.

    Parameters
    ----------
    p_val : float
        The p-value to interpret.
    alpha : float, default 0.05
        Significance level. The default keeps the previously hard-coded 0.05.

    Returns
    -------
    str
        ``"Significant"`` when ``p_val`` is strictly below ``alpha``,
        otherwise ``"Not Significant"``.
    """
    return "Significant" if p_val < alpha else "Not Significant"

def chi_square_test(data, var1, var2):
    """Run a chi-square test of independence between two columns of ``data``.

    A contingency table of ``data[var1]`` against ``data[var2]`` is built with
    ``pd.crosstab`` and handed to ``scipy.stats.chi2_contingency``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns.
    var1, var2 : column labels
        The two columns to cross-tabulate.

    Returns
    -------
    tuple
        ``(chi2_stat, p_val, dof, interpretation)`` for a valid test, where
        ``interpretation`` is produced by ``interpret_p_value``. When the
        contingency table cannot support a test the result is
        ``(None, None, None, message)`` with ``message`` describing why.
    """
    table = pd.crosstab(data[var1], data[var2])
    if table.size == 0 or np.any(table.sum(axis=0) == 0) or np.any(table.sum(axis=1) == 0):
        return None, None, None, "Invalid Test (Empty or Zero Row/Column)"
    # A variable with a single observed level gives a 1xN or Nx1 table with zero
    # degrees of freedom. chi2_contingency then returns chi2=0 and p=1, which
    # would be reported as "Not Significant" although nothing was tested.
    if min(table.shape) < 2:
        return None, None, None, "Invalid Test (Fewer Than Two Categories)"
    chi2_stat, p_val, dof, _ = chi2_contingency(table)
    return chi2_stat, p_val, dof, interpret_p_value(p_val)

# Cell 4: Perform Chi-square tests for all pairs
# Column order shared by both result tables.
RESULT_COLUMNS = ["Feature 1", "Feature 2", "Chi-square Statistic", "P-value", "Degrees of Freedom", "Interpretation"]

# Filter the dataset to include only categorical columns
categorical_columns = [col for col in dataset.columns if is_categorical(dataset[col])]

# Create all possible pairs of these categorical columns
categorical_pairs = list(combinations(categorical_columns, 2))

# Perform Chi-square test for each pair and sort each valid result into the
# significant or non-significant bucket. Records are gathered in plain lists and
# each DataFrame is built once after the loop: growing a DataFrame with
# pd.concat on every iteration re-copies all earlier rows each time, and
# concatenating onto an empty frame is deprecated in recent pandas releases.
significant_rows = []
non_significant_rows = []
for feature1, feature2 in categorical_pairs:
    chi2_stat, p_val, dof, interpretation = chi_square_test(dataset, feature1, feature2)
    if chi2_stat is None:
        # chi_square_test reports an invalid test with None statistics; skip it.
        continue
    row = {
        "Feature 1": feature1,
        "Feature 2": feature2,
        "Chi-square Statistic": chi2_stat,
        "P-value": p_val,
        "Degrees of Freedom": dof,
        "Interpretation": interpretation,
    }
    if interpretation == "Significant":
        significant_rows.append(row)
    else:
        non_significant_rows.append(row)

# Passing `columns` keeps the expected header even when a bucket is empty.
significant_df = pd.DataFrame(significant_rows, columns=RESULT_COLUMNS)
non_significant_df = pd.DataFrame(non_significant_rows, columns=RESULT_COLUMNS)

# Cell 5: Display significant results
# Display limits are lifted only inside each block so the full tables show.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("Significant Results:")
    display(significant_df)

# Cell 6: Display non-significant results
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("Non-Significant Results:")
    display(non_significant_df)


Significant Results:


Unnamed: 0,Feature 1,Feature 2,Chi-square Statistic,P-value,Degrees of Freedom,Interpretation
0,total_income,is_freedom_fighter,14.214156,0.002627679,3,Significant
1,total_income,diabetic,38.433059,2.288488e-08,3,Significant
2,total_income,profile_hypertensive,9.958818,0.01891946,3,Significant
3,total_income,RESULT_STAT_BP,159.239608,1.173648e-24,18,Significant
4,total_income,RESULT_STAT_BMI,45.668069,6.003162e-05,15,Significant
5,total_income,TAG_NAME,14.035484,0.02924229,6,Significant
6,total_income,RESULT_STAT_SUGAR,33.876211,0.01304617,18,Significant
7,total_income,RESULT_STAT_PR,38.247122,1.005064e-06,6,Significant
8,total_income,RESULT_STAT_SPO2,12.897644,0.04469047,6,Significant
9,gender,is_freedom_fighter,4.402061,0.03589552,1,Significant


Non-Significant Results:


Unnamed: 0,Feature 1,Feature 2,Chi-square Statistic,P-value,Degrees of Freedom,Interpretation
0,total_income,gender,1.189601,0.755499,3,Not Significant
1,total_income,is_poor,0.0,1.0,0,Not Significant
2,total_income,had_stroke,0.900317,0.825351,3,Not Significant
3,total_income,has_cardiovascular_disease,4.929155,0.177059,3,Not Significant
4,total_income,disabilities_name,8.481028,0.903055,15,Not Significant
5,total_income,RESULT_STAT_MUAC,0.1647,0.92095,2,Not Significant
6,gender,is_poor,0.0,1.0,0,Not Significant
7,gender,has_cardiovascular_disease,0.117574,0.731681,1,Not Significant
8,gender,disabilities_name,5.837379,0.322366,5,Not Significant
9,gender,diabetic,1.174407,0.278498,1,Not Significant
