In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Configure the chart style and CJK-capable fonts.
# NOTE: the seaborn style has to be applied FIRST. A seaborn style carries its
# own 'font.sans-serif' list, so calling sns.set_style() after the font
# assignment would replace the CJK fonts and the Chinese titles/labels used
# throughout this script would render as empty boxes.
sns.set_style("whitegrid")
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'DejaVu Sans']
# Draw the minus sign as an ASCII hyphen so it renders with the fonts above.
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams['figure.figsize'] = (12, 8)

# ============================================================================
# 1. Data loading and initial exploration
# ============================================================================

print("=" * 80)
print("1. 数据加载与初步探索")
print("=" * 80)

# Load the raw dataset; the path is relative to the current working directory.
df = pd.read_csv('diabetes_012_health_indicators_BRFSS2021.csv')

# Basic overview: dimensions, column names and the first rows.
print(f"数据集形状: {df.shape} (行数, 列数)")
print(f"列名列表: {list(df.columns)}")
print("\n前5行数据预览:")
print(df.head())

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\n数据信息:")
df.info()

# Missing values: per-column count and percentage of rows. Only the columns
# that actually contain missing entries are printed.
print("\n缺失值统计:")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    '缺失数量': missing_data,
    '缺失百分比%': missing_percent
})
print(missing_df[missing_df['缺失数量'] > 0])

# Number of rows that exactly repeat an earlier row.
duplicates = df.duplicated().sum()
print(f"\n重复行数量: {duplicates}")

# ============================================================================
# 2. Data cleaning
# ============================================================================

print("\n" + "=" * 80)
print("2. 数据清理")
print("=" * 80)

# 2.1 Remove duplicated rows when the exploration step found any; otherwise
# continue with an independent copy so the raw frame stays untouched.
if duplicates == 0:
    df_clean = df.copy()
    print("没有发现重复行")
else:
    df_clean = df.drop_duplicates()
    print(f"删除 {duplicates} 个重复行")

# 2.2 Descriptive statistics of the numeric / ordinal columns
# (one row per column after transposing).
print("\n数值型变量的描述性统计:")
numerical_cols = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
print(df_clean[numerical_cols].describe().transpose())

# 2.3 Distinct values observed in every categorical column.
print("\n分类变量的唯一值:")
categorical_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
]

for cat_col in categorical_cols:
    print(f"{cat_col}: {sorted(df_clean[cat_col].unique())}")

# 2.4 Count implausible values using domain rules. The rows are only counted
# here; nothing is removed from df_clean.
# BMI is treated as plausible inside the 15-60 range.
bmi_values = df_clean['BMI']
bmi_outliers = df_clean[bmi_values.lt(15) | bmi_values.gt(60)]
print(f"\nBMI异常值数量 (BMI < 15 或 BMI > 60): {len(bmi_outliers)}")

# Mental / physical health day counts are expected to lie within 0-30.
mental_days = df_clean['MentHlth']
physical_days = df_clean['PhysHlth']
mental_outliers = df_clean[mental_days.lt(0) | mental_days.gt(30)]
physical_outliers = df_clean[physical_days.lt(0) | physical_days.gt(30)]
print(f"心理健康天数异常值数量: {len(mental_outliers)}")
print(f"身体健康天数异常值数量: {len(physical_outliers)}")

# 2.5 Working copy of the cleaned data used by all following sections.
df_clean_copy = df_clean.copy()
print(f"\n清理后数据形状: {df_clean_copy.shape}")

# ============================================================================
# 3. Data visualisation
# ============================================================================

print("\n" + "=" * 80)
print("3. 数据可视化")
print("=" * 80)

# 3.1 Distribution of the target variable
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Class counts ordered by class label (0, 1, 2) instead of by frequency.
# value_counts() sorts by count, which would pair `colors` with frequency rank
# rather than with the class itself; sorting by index and picking the colour by
# label keeps colour i bound to class i, matching the per-status charts drawn
# later in this script.
diabetes_counts = df_clean_copy['Diabetes_012'].value_counts().sort_index()
diabetes_labels = {0: '无糖尿病', 1: '前期糖尿病', 2: '糖尿病'}
colors = ['#66c2a5', '#fc8d62', '#8da0cb']
status_names = [diabetes_labels[status] for status in diabetes_counts.index]
status_colors = [colors[int(status)] for status in diabetes_counts.index]

# Left panel: bar chart of the class counts.
ax1 = axes[0]
bar_positions = range(len(diabetes_counts))
bars = ax1.bar(bar_positions, diabetes_counts.values, color=status_colors)
ax1.set_title('糖尿病状态分布', fontsize=14, fontweight='bold')
ax1.set_xlabel('糖尿病状态', fontsize=12)
ax1.set_ylabel('样本数量', fontsize=12)
ax1.set_xticks(bar_positions)
ax1.set_xticklabels(status_names)

# Annotate every bar with its count and its share of all rows.
for bar, count in zip(bars, diabetes_counts.values):
    percentage = (count / len(df_clean_copy)) * 100
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 100,
            f'{count:,}\n({percentage:.1f}%)', ha='center', fontsize=10)

# Right panel: the same counts as proportions in a pie chart.
ax2 = axes[1]
wedges, texts, autotexts = ax2.pie(diabetes_counts.values, labels=status_names,
                                   autopct='%1.1f%%', colors=status_colors, startangle=90)
ax2.set_title('糖尿病状态比例', fontsize=14, fontweight='bold')

# Make the percentage labels inside the wedges readable.
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')

plt.suptitle('目标变量：糖尿病状态分析', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# 3.2 Distribution of the numeric features
print("\n数值特征分布可视化...")
fig, axes = plt.subplots(2, 4, figsize=(18, 10))
axes = axes.flatten()

numerical_features = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income']
feature_names = {
    'BMI': '身体质量指数',
    'GenHlth': '总体健康状况',
    'MentHlth': '心理健康天数',
    'PhysHlth': '身体健康天数',
    'Age': '年龄分组',
    'Education': '教育程度',
    'Income': '收入水平'
}

# Panels 1-7: one histogram with a KDE overlay per feature, plus vertical
# markers for the mean (red) and the median (green).
for ax, col in zip(axes, numerical_features):
    sns.histplot(data=df_clean_copy, x=col, bins=30, kde=True, ax=ax, color='skyblue', alpha=0.7)

    mean_val = df_clean_copy[col].mean()
    median_val = df_clean_copy[col].median()

    ax.axvline(mean_val, color='red', linestyle='--', linewidth=2, label=f'均值: {mean_val:.2f}')
    ax.axvline(median_val, color='green', linestyle='--', linewidth=2, label=f'中位数: {median_val:.2f}')

    ax.set_title(f'{feature_names[col]}分布', fontsize=12, fontweight='bold')
    ax.set_xlabel(feature_names[col], fontsize=10)
    ax.set_ylabel('频数', fontsize=10)
    ax.legend(fontsize=9)

# Panel 8: box-plot summary of all numeric features.
# This used to live inside the loop behind `if i == 7`, which can never be
# true for seven features (the index runs 0..6), so the panel stayed empty.
ax_box = axes[-1]
# Plain arrays avoid any dependence on the (possibly gapped) row index, and
# missing values are dropped so a single NaN cannot blank out a box.
box_data = [df_clean_copy[col].dropna().to_numpy() for col in numerical_features]
ax_box.boxplot(box_data)
# Tick labels are set explicitly instead of through boxplot's `labels=`
# keyword, which newer matplotlib releases renamed.
ax_box.set_xticklabels([feature_names[col] for col in numerical_features])
ax_box.set_title('数值特征箱线图汇总', fontsize=12, fontweight='bold')
ax_box.set_ylabel('数值范围', fontsize=10)
ax_box.tick_params(axis='x', rotation=45)

plt.suptitle('数值特征分布分析', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# 3.3 Distribution of the categorical features
print("\n分类特征分布可视化...")
fig, axes = plt.subplots(3, 5, figsize=(20, 15))
axes = axes.flatten()

categorical_features = [
    'HighBP', 'HighChol', 'Smoker', 'Stroke', 'HeartDiseaseorAttack',
    'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump',
    'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex',
]

# Display name of every categorical column.
feature_names_cat = {
    'HighBP': '高血压',
    'HighChol': '高胆固醇',
    'Smoker': '吸烟者',
    'Stroke': '中风史',
    'HeartDiseaseorAttack': '心脏病史',
    'PhysActivity': '体力活动',
    'Fruits': '水果摄入',
    'Veggies': '蔬菜摄入',
    'HvyAlcoholConsump': '重度饮酒',
    'AnyHealthcare': '医疗保险',
    'NoDocbcCost': '因费用不看医生',
    'DiffWalk': '行走困难',
    'Sex': '性别'
}

# One bar chart per feature; zip() stops at the shorter sequence, so a
# feature list longer than the grid could never index past the last panel.
for ax, col in zip(axes, categorical_features):
    display_name = feature_names_cat[col]

    # Count and percentage of every category, ordered by category value.
    value_counts = df_clean_copy[col].value_counts().sort_index()
    percentages = (value_counts / len(df_clean_copy)) * 100

    positions = range(len(value_counts))
    palette = plt.cm.Set2(np.linspace(0, 1, len(value_counts)))
    bars = ax.bar(positions, value_counts.values, color=palette)

    ax.set_title(f'{display_name}分布', fontsize=11, fontweight='bold')
    ax.set_xlabel(display_name, fontsize=10)
    ax.set_ylabel('样本数量', fontsize=10)

    # Two-valued columns get no/yes (or female/male for Sex) tick labels;
    # anything else is labelled with the raw category value.
    ax.set_xticks(positions)
    if len(value_counts) != 2:
        ax.set_xticklabels([f'类别{j}' for j in value_counts.index], fontsize=9, rotation=45)
    elif col == 'Sex':
        ax.set_xticklabels(['女', '男'], fontsize=9)
    else:
        ax.set_xticklabels(['否', '是'], fontsize=9)

    # Count and percentage printed above every bar.
    for bar, count, pct in zip(bars, value_counts.values, percentages.values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 50,
               f'{count:,}\n({pct:.1f}%)', ha='center', fontsize=8)

# Hide the grid panels that have no feature assigned.
for unused_ax in axes[len(categorical_features):]:
    unused_ax.set_visible(False)

plt.suptitle('分类特征分布分析', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# 3.4 Relationship between selected features and the target
print("\n特征与糖尿病状态的关系分析...")

# A handful of key features to inspect.
key_features = ['BMI', 'HighBP', 'HighChol', 'Age', 'PhysActivity', 'GenHlth']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# Features drawn as per-status density curves; the rest are drawn as
# stacked percentage bars.
density_features = ('BMI', 'Age', 'GenHlth')
target = df_clean_copy['Diabetes_012']

for ax, feature in zip(axes, key_features):
    if feature in density_features:
        display_name = feature_names.get(feature, feature)

        # One KDE curve per diabetes status.
        for diabetes_status in (0, 1, 2):
            subset = df_clean_copy.loc[target == diabetes_status, feature]
            sns.kdeplot(subset, label=diabetes_labels[diabetes_status], ax=ax, linewidth=2)

        ax.set_title(f'{display_name} vs 糖尿病状态', fontsize=12, fontweight='bold')
        ax.set_xlabel(display_name, fontsize=10)
        ax.set_ylabel('密度', fontsize=10)
        ax.legend(title='糖尿病状态', fontsize=9)
        continue

    display_name = feature_names_cat.get(feature, feature)

    # Row-normalised cross table: share of every status within a category.
    cross_tab = pd.crosstab(df_clean_copy[feature], target)
    cross_tab_percent = cross_tab.div(cross_tab.sum(axis=1), axis=0) * 100
    n_groups = len(cross_tab_percent)

    # Stack the status shares on top of each other.
    bottom = np.zeros(n_groups)
    for status, status_color in zip((0, 1, 2), colors):
        values = cross_tab_percent[status].values
        ax.bar(range(n_groups), values, bottom=bottom,
              label=diabetes_labels[status], color=status_color, alpha=0.8)
        bottom = bottom + values

    ax.set_title(f'{display_name} vs 糖尿病状态', fontsize=12, fontweight='bold')
    ax.set_xlabel(display_name, fontsize=10)
    ax.set_ylabel('百分比%', fontsize=10)

    # Tick labels: no/yes (female/male for Sex) for two-valued features,
    # raw category values otherwise.
    if n_groups == 2:
        labels = ['女', '男'] if feature == 'Sex' else ['否', '是']
        ax.set_xticks(range(len(labels)))
        ax.set_xticklabels(labels, fontsize=10)
    else:
        ax.set_xticks(range(n_groups))
        ax.set_xticklabels([f'类别{j}' for j in cross_tab_percent.index], fontsize=9)

    ax.legend(title='糖尿病状态', fontsize=9)

plt.suptitle('关键特征与糖尿病状态的关系', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

# 3.5 Correlation analysis
print("\n特征相关性分析...")

# Pairwise correlations between the numeric features and the target.
corr_columns = numerical_features + ['Diabetes_012']
correlation_matrix = df_clean_copy[corr_columns].corr()

# Heatmap of the lower triangle only: the mask hides the diagonal and the
# redundant upper triangle.
plt.figure(figsize=(12, 10))
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={'shrink': 0.8},
)
sns.heatmap(correlation_matrix, **heatmap_options)

plt.title('特征相关性热力图', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# 3.6 Summary of diabetes-related risk factors
print("\n糖尿病相关风险因素总结...")

# Indicators compared across the diabetes statuses.
metrics = ['BMI', 'Age', 'HighBP', 'HighChol', 'GenHlth', 'PhysActivity']

# Mean of every indicator per diabetes status, rounded to two decimals, with
# the numeric status replaced by its display label.
diabetes_summary = df_clean_copy.groupby('Diabetes_012')[metrics].mean().round(2)
diabetes_summary.index = [diabetes_labels[i] for i in diabetes_summary.index]
print("\n按糖尿病状态分组的关键指标平均值:")
print(diabetes_summary.T)

# Grouped bar chart: one group per indicator, one bar per status.
fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(metrics))
width = 0.25
statuses = [0, 1, 2]

for slot, status in enumerate(statuses):
    status_row = diabetes_summary.loc[diabetes_labels[status]]
    values = [status_row[metric] for metric in metrics]
    # Shift by (slot - 1) bar widths so the three bars centre on the tick.
    ax.bar(x + slot*width - width, values, width, label=diabetes_labels[status],
           color=colors[slot], alpha=0.8)

ax.set_xlabel('指标', fontsize=12)
ax.set_ylabel('平均值', fontsize=12)
ax.set_title('不同糖尿病状态下的关键指标对比', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(['BMI', '年龄', '高血压比例', '高胆固醇比例', '总体健康状况', '体力活动比例'], fontsize=10)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)

# Value label above every bar.
for slot, status in enumerate(statuses):
    status_row = diabetes_summary.loc[diabetes_labels[status]]
    for group, metric in enumerate(metrics):
        value = status_row[metric]
        ax.text(group + slot*width - width, value + 0.5, f'{value:.2f}',
                ha='center', fontsize=8, fontweight='bold')

plt.tight_layout()
plt.show()

# ============================================================================
# 4. Data quality report
# ============================================================================

print("\n" + "=" * 80)
print("4. 数据质量报告")
print("=" * 80)

# Total number of missing cells in the raw frame, computed once and reused.
total_missing = df.isnull().sum().sum()

print(f"原始数据集大小: {df.shape}")
print(f"清理后数据集大小: {df_clean_copy.shape}")
print(f"删除的重复行: {duplicates}")
print(f"缺失值总数: {total_missing}")

# Quality score: start at 100 and subtract the percentage of duplicated rows
# and the percentage of missing cells.
quality_score = 100
if duplicates > 0:
    quality_score -= (duplicates / len(df)) * 100
if total_missing > 0:
    quality_score -= (total_missing / df.size) * 100

# Map the score to a usability grade.
if quality_score >= 90:
    usability = '优秀'
elif quality_score >= 80:
    usability = '良好'
elif quality_score >= 70:
    usability = '一般'
else:
    usability = '较差'

print(f"\n数据质量评分: {quality_score:.1f}/100")
print(f"数据可用性: {usability}")

# Recommendations derived from the findings above.
print("\n建议:")
if duplicates > 0:
    print(f"- 已删除 {duplicates} 个重复样本")
if total_missing > 0:
    print("- 数据集包含缺失值，建议进一步处理")
if len(bmi_outliers) > 0:
    print(f"- 发现 {len(bmi_outliers)} 个BMI异常值，建议检查或处理")

print("\n数据清理和可视化完成！")

# Persist the cleaned data (without the row index) to the working directory.
df_clean_copy.to_csv('diabetes_cleaned_data.csv', index=False)
print("\n清理后的数据已保存为: diabetes_cleaned_data.csv")

1. 数据加载与初步探索


FileNotFoundError: [Errno 2] No such file or directory: 'diabetes_012_health_indicators_BRFSS2021.csv'