In [5]:
import pandas as pd
from scipy.stats import chi2_contingency
import numpy as np

# Load the dataset
df = pd.read_excel("Desktop\test dataset.xlsx")


# --- Data Preprocessing ---

# 1. Age Categorization
def categorize_age(age):
    if age < 18:
        return 'Child/Adolescent'
    elif 18 <= age <= 39:
        return 'Young Adult'
    elif 40 <= age <= 59:
        return 'Middle-aged Adult'
    else: # age >= 60
        return 'Older Adult'

df['age_group'] = df['age'].apply(categorize_age)

# 2. Recode 'disabilities_name'
df['disabilities_name'] = df['disabilities_name'].replace(0, 'No Disability')

# Convert boolean columns to string/object type for consistent categorical handling
boolean_cols = ['diabetic', 'profile_hypertensive', 'is_poor', 'is_freedom_fighter', 'had_stroke', 'has_cardiovascular_disease']
for col in boolean_cols:
    df[col] = df[col].astype(str)

# Identify categorical variables for Chi-square testing
# These are based on the previous analysis and variable descriptions.
categorical_features = []

# Define target variables
target_variables = ['has_cardiovascular_disease', 'diabetic']

# --- Chi-square Test Function ---
def perform_chi_square_test(dataframe, feature1, feature2):
    """
    Performs a Chi-square test of independence between two categorical features.

    Args:
        dataframe (pd.DataFrame): The input DataFrame.
        feature1 (str): Name of the first categorical column.
        feature2 (str): Name of the second categorical column.

    Returns:
        tuple: (chi2_statistic, p_value, degrees_of_freedom, contingency_table)
               Returns (None, None, None, None) if data is insufficient.
    """
    # Drop rows with missing values for the specific pair of features
    temp_df = dataframe[[feature1, feature2]].dropna()

    if temp_df.empty:
        return None, None, None, None

    # Create a contingency table
    contingency_table = pd.crosstab(temp_df[feature1], temp_df[feature2])

    # Check assumptions for Chi-square test (expected frequencies)
    # scipy.stats.chi2_contingency handles this internally and warns if violated.
    # However, it's good practice to be aware.
    # For simplicity, we proceed with the test and rely on scipy's robustness.

    try:
        chi2, p_value, dof, expected = chi2_contingency(contingency_table)
        return chi2, p_value, dof, contingency_table
    except ValueError as e:
        # This can happen if a row/column in the contingency table has all zeros
        # or if there's only one unique value in a feature after dropping NaNs.
        print(f"Could not perform Chi-square test for {feature1} vs {feature2}: {e}")
        return None, None, None, None

# --- Perform and Display Tests ---
results = []

print("Performing Chi-square tests...\n")

for target in target_variables:
    print(f"--- Analyzing features against: {target} ---")
    for feature in categorical_features:
        if feature == target: # Skip testing a variable against itself
            continue

        chi2, p_value, dof, _ = perform_chi_square_test(df, feature, target)

        if chi2 is not None:
            significance = "Significant" if p_value <= 0.05 else "Not Significant"
            results.append({
                'Independent Variable': feature,
                'Dependent Variable': target,
                'Chi2 Statistic': f"{chi2:.3f}",
                'df': dof,
                'p-value': f"{p_value:.3f}",
                'Significance (alpha=0.05)': significance
            })
            print(f"  {feature} vs {target}: Chi2={chi2:.3f}, p={p_value:.3f}, df={dof} ({significance})")
        else:
            results.append({
                'Independent Variable': feature,
                'Dependent Variable': target,
                'Chi2 Statistic': 'N/A',
                'df': 'N/A',
                'p-value': 'N/A',
                'Significance (alpha=0.05)': 'Insufficient Data'
            })
            print(f"  {feature} vs {target}: Insufficient data for test.")
    print("-" * 40)

# Display results in a structured table
results_df = pd.DataFrame(results)
print("\n--- Summary of Chi-square Test Results ---")
print(results_df.to_markdown(index=False))

print("\nNote: 'N/A' or 'Insufficient Data' indicates that the Chi-square test could not be performed due to missing values or lack of variance in the selected columns after dropping NaNs.")

OSError: [Errno 22] Invalid argument: 'Desktop\test dataset.xlsx'