In [None]:
import pandas as pd
from itertools import combinations
from scipy.stats import chi2_contingency, fisher_exact
from scipy.stats import mannwhitneyu
from statsmodels.stats.multitest import multipletests
# Load the analysis dataset from the local CSV export.
df1 = pd.read_csv(r"D:\临床数据\NHANES数据清洗\df1.csv", low_memory=False)

# Distribution of the smoking answer within each '5 year heart death' group.
# NOTE: despite the `marital_` prefix, these tables hold the smoking variable;
# the names are kept unchanged so any code that reads them keeps working.
smoking_by_group = df1.groupby('5 year heart death')['Smoked at least 100 cigarettes in life']
marital_counts_by_group = smoking_by_group.value_counts().unstack(fill_value=0)
marital_percentage_by_group = (
    smoking_by_group.value_counts(normalize=True).unstack(fill_value=0) * 100
)

# Whole-number percentages; independent rounding can make a row sum to 99 or 101.
rounded_percentage = marital_percentage_by_group.round()
percentage_sum = rounded_percentage.sum(axis=1)

# Push the rounding remainder of each row onto its largest category so that
# every displayed row adds up to exactly 100.
for idx, row in rounded_percentage.iterrows():
    largest_category = row.idxmax()
    diff = 100 - row.sum()
    rounded_percentage.loc[idx, largest_category] += diff

# Render each cell as "count (percent%)".
count_text = marital_counts_by_group.astype(str)
percent_text = rounded_percentage.astype(int).astype(str)
formatted_counts = count_text + " (" + percent_text + "%)"

print("调整后的Smoked at least 100 cigarettes in life类别数量和百分比在不同分组中的分布:")
print(formatted_counts)
# Pairwise comparison of smoking status between every two outcome groups,
# followed by a Bonferroni correction over all comparisons.
#
# Missing group labels are dropped before pairing: Series.unique() keeps NaN,
# and a NaN "group" would be paired with every real group, yielding degenerate
# contingency tables and inflating the number of comparisons that the
# Bonferroni correction multiplies by.
group_labels = df1['5 year heart death'].dropna().unique()
group_combinations = list(combinations(group_labels, 2))
p_values = []
comparison_labels = []
for group1, group2 in group_combinations:
    print(f"\n比较组别: {group1} vs {group2}")
    subset = df1[df1['5 year heart death'].isin([group1, group2])]
    contingency_table = pd.crosstab(subset['5 year heart death'], subset['Smoked at least 100 cigarettes in life'])
    print("列联表：")
    print(contingency_table)
    # Any observed cell count below 5 triggers the small-sample path.
    if (contingency_table.values < 5).any():
        print("使用 Fisher 精确检验")
        if contingency_table.shape == (2, 2):
            _, p_value = fisher_exact(contingency_table)
        else:
            # scipy's fisher_exact is only called here for 2x2 tables; larger
            # tables fall back to the chi-square test.
            print("表格不是 2x2，Fisher 精确检验无法应用，改用卡方检验")
            _, p_value, _, _ = chi2_contingency(contingency_table)
    else:
        print("使用卡方检验")
        _, p_value, _, _ = chi2_contingency(contingency_table)
    p_values.append(p_value)
    comparison_labels.append(f"{group1} vs {group2}")

# With fewer than two groups there is nothing to correct; skip multipletests,
# which cannot handle an empty list of p-values.
if p_values:
    _, corrected_p_values, _, _ = multipletests(p_values, method='Bonferroni')
else:
    corrected_p_values = []

print("\n两两比较的结果（包括 Bonferroni 校正）：")
for comparison, p, corrected_p in zip(comparison_labels, p_values, corrected_p_values):
    print(f"{comparison}: 原始 p 值 = {p:.3f}, 校正后的 p 值 = {corrected_p:.3f}")
    if corrected_p < 0.05:
        print(f"结果显著：{comparison} 存在显著差异")
    else:
        print(f"结果不显著：{comparison} 没有显著差异")
# Median (Q1–Q3) of Globulin per outcome group, stored as display strings in
# `summary_stats` keyed by group label.
print("各组的 Globulin (g/L) 中位数及百分位数:")
summary_stats = {}
for group, group_data in df1.groupby('5 year heart death'):
    globulin_values = group_data['Globulin (g/L)'].dropna()
    if globulin_values.empty:
        # No measurements in this group: report a placeholder instead of
        # failing when a NaN statistic is converted to an integer.
        summary_stats[group] = "NA"
    else:
        # Round all three statistics the same way. Truncating the quartiles
        # with int() while rounding the median would mix two different rules
        # in one summary (e.g. Q3 = 28.75 shown as 28 instead of 29).
        median = int(round(float(globulin_values.median())))
        q1 = int(round(float(globulin_values.quantile(0.25))))
        q3 = int(round(float(globulin_values.quantile(0.75))))
        summary_stats[group] = f"{median} ({q1}–{q3})"
    print(f"{group}: {summary_stats[group]}")
# Pairwise two-sided Mann-Whitney U tests of Globulin between outcome groups,
# followed by a manual Bonferroni correction.
#
# Missing group labels are dropped before pairing: Series.unique() keeps NaN,
# but `== NaN` never matches a row (empty sample) and groupby-built
# `summary_stats` has no NaN key, so a NaN label would break the loop below.
group_labels = df1['5 year heart death'].dropna().unique()
group_combinations = list(combinations(group_labels, 2))
p_values = []
print("\n两两组别比较结果:")
for group1, group2 in group_combinations:
    print(f"\n比较组别: {group1} vs {group2}")
    # Drop missing measurements so NaN values are not fed into the rank test.
    data1 = df1.loc[df1['5 year heart death'] == group1, 'Globulin (g/L)'].dropna()
    data2 = df1.loc[df1['5 year heart death'] == group2, 'Globulin (g/L)'].dropna()
    stat, p_value = mannwhitneyu(data1, data2, alternative='two-sided')
    p_values.append((group1, group2, p_value))
    print(f"{group1}: {summary_stats[group1]}")
    print(f"{group2}: {summary_stats[group2]}")
    print(f"未经校正的 p 值: {p_value:.3f}")

# Bonferroni: multiply each raw p-value by the number of comparisons, capped at 1.
n_comparisons = len(group_combinations)
corrected_p_values = [(g1, g2, min(p * n_comparisons, 1.0)) for g1, g2, p in p_values]
print("\n两两比较的结果（Bonferroni 校正后）:")
for g1, g2, corrected_p in corrected_p_values:
    print(f"{g1} vs {g2}: 校正后的 p 值: {corrected_p:.3f}")
    if corrected_p < 0.05:
        print("结果显著：两组的 Globulin (g/L) 存在显著差异")
    else:
        print("结果不显著：两组的 Globulin (g/L) 没有显著差异")