In [4]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway

# Load the data
data = pd.read_csv('/Users/apple/Desktop/Project/Reprocessing/data_with_cluster_info.csv')

print(data.head())
print(data['Diabetes_012'].value_counts())

   GenHlth  HighBP  Age  HighChol  DiffWalk  Income  HeartDiseaseorAttack  \
0        5       1    4         1         1       2                     0   
1        3       0    3         0         0       1                     0   
2        5       1    4         1         1       5                     0   
3        2       1    5         0         0       3                     0   
4        2       1    5         1         0       2                     0   

   Membership  Diabetes_012   BMI  PhysHlth  
0          41           0.0  40.0      15.0  
1          20           0.0  25.0       0.0  
2           5           0.0  28.0      30.0  
3          17           0.0  27.0       0.0  
4          26           0.0  24.0       0.0  
0.0    213703
2.0     35346
1.0      4631
Name: Diabetes_012, dtype: int64


In [5]:
# Identify numerical features
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()
p_values = []
features = []

# Perform ANOVA for each numerical feature
for feature in numerical_features:
    # Prepare the data for each category
    group0 = data[data['Diabetes_012'] == 0][feature]
    group1 = data[data['Diabetes_012'] == 1][feature]
    group2 = data[data['Diabetes_012'] == 2][feature]
    
    # Perform ANOVA and collect p-values
    f_stat, p = f_oneway(group0.dropna(), group1.dropna(), group2.dropna())
    p_values.append(p)
    features.append(feature)

# Create a DataFrame to hold feature names and their corresponding p-values
p_value_df = pd.DataFrame({
    'Feature': features,
    'P_value': p_values
})



In [6]:
# Sort p-values
p_value_df = p_value_df.sort_values(by='P_value')

# Apply the Benjamini-Hochberg correction
m = len(p_value_df)  # Total number of hypotheses
p_value_df['Rank'] = range(1, m + 1)
p_value_df['Adjusted_P_value'] = (p_value_df['P_value'] * m) / p_value_df['Rank']

# Set an FDR threshold
fdr_threshold = 0.05
p_value_df['Reject_H0'] = p_value_df['Adjusted_P_value'] < fdr_threshold

# View the results
print(p_value_df)

                 Feature       P_value  Rank  Adjusted_P_value  Reject_H0
0                GenHlth  0.000000e+00     1      0.000000e+00       True
1                 HighBP  0.000000e+00     2      0.000000e+00       True
2                    Age  0.000000e+00     3      0.000000e+00       True
3               HighChol  0.000000e+00     4      0.000000e+00       True
4               DiffWalk  0.000000e+00     5      0.000000e+00       True
5                 Income  0.000000e+00     6      0.000000e+00       True
6   HeartDiseaseorAttack  0.000000e+00     7      0.000000e+00       True
8           Diabetes_012  0.000000e+00     8      0.000000e+00       True
9                    BMI  0.000000e+00     9      0.000000e+00       True
10              PhysHlth  0.000000e+00    10      0.000000e+00       True
7             Membership  1.023434e-15    11      1.023434e-15       True
