In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

In [8]:
# 1. Paths: inputs are read from the current directory, plots go to save_dir
base_path = '.'
save_dir = './analysis_plots'
os.makedirs(save_dir, exist_ok=True)

print("Starting data loading...")
print("Current directory:", os.getcwd())

# 2. Load the arrays directly (deliberately not wrapped in try/except, so a
#    missing or unreadable file fails loudly right here)
X_path = os.path.join(base_path, 'X_all_features.npy')
y_path = os.path.join(base_path, 'y_all_features.npy')
feat_path = os.path.join(base_path, 'feature_names_all.npy')
targ_path = os.path.join(base_path, 'target_names.npy')

# SECURITY NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
# and unpickling can execute arbitrary code. Only point these paths at files
# from a trusted source. If an array is not object-dtype, allow_pickle=False
# is sufficient and safer -- TODO confirm the dtypes and tighten.
X_data = np.load(X_path, allow_pickle=True)
y_data = np.load(y_path, allow_pickle=True)
feature_names = np.load(feat_path, allow_pickle=True)
target_names = np.load(targ_path, allow_pickle=True)

# 3. Wrap the arrays in labelled DataFrames
df_X = pd.DataFrame(X_data, columns=feature_names)
df_y = pd.DataFrame(y_data, columns=target_names)

# Features and targets are later joined column-wise on the default row index.
# A row-count mismatch would not raise there; it would pad the shorter frame
# with NaN and silently misalign samples, so fail early instead.
if len(df_X) != len(df_y):
    raise ValueError(
        f"Sample count mismatch: X has {len(df_X)} rows but y has {len(df_y)} rows"
    )

print("Data Loaded Successfully.")
print("Samples:", df_X.shape[0])
print("Features:", df_X.shape[1])

Starting data loading...
Current directory: /root
Data Loaded Successfully.
Samples: 780
Features: 78


In [10]:
# 4. Missingness heatmap, drawn "vertically": one row per feature
print("Generating Vertical Heatmap...")

# Transposed null mask: rows (Y axis) are features, columns (X axis) are samples
heatmap_data = df_X.isnull().T

# 0.4 inch of height per feature, with a floor of 10 inches
fig_height = max(10, df_X.shape[1] * 0.4)
fig, ax = plt.subplots(figsize=(8, fig_height))

# X tick labels are hidden because there is one per sample (too many to read);
# Y tick labels are forced on because they carry the feature names.
sns.heatmap(
    heatmap_data,
    cmap='viridis',
    cbar=False,
    xticklabels=False,
    yticklabels=True,
    ax=ax,
)

ax.set_title('Vertical Heatmap of Missingness')
ax.set_xlabel(f'Samples (Patients, N={df_X.shape[0]})', fontsize=12)
ax.set_ylabel('Features', fontsize=12)

fig.tight_layout()
# Write the figure to disk and release it (nothing is shown inline)
save_path_1 = os.path.join(save_dir, 'fig_missing_heatmap_vertical.png')
fig.savefig(save_path_1, dpi=300)
plt.close(fig)

# 5. Correlation between per-feature missingness and the outcome columns
print("Generating Correlation Matrix...")

# 1 where a value is missing, 0 where it is present
missing_indicators = df_X.isnull().astype(int)
# Only features with at least one missing value are of interest
cols_with_missing = missing_indicators.columns[missing_indicators.sum() > 0]

if len(cols_with_missing) == 0:
    print("No missing values found.")
else:
    target_cols = list(target_names)
    feature_missing_cols = list(cols_with_missing)

    # Put missingness indicators and outcomes side by side, then correlate
    # every column with every other column
    combined_data = pd.concat(
        [missing_indicators[cols_with_missing], df_y],
        axis=1,
    )
    corr_matrix = combined_data.corr()

    # Keep just the (missingness indicator) x (outcome) sub-matrix
    plot_data = corr_matrix.loc[feature_missing_cols, target_cols]

    # Order rows by absolute correlation with the first outcome, strongest first
    if target_cols:
        ranked = plot_data.abs().sort_values(by=target_cols[0], ascending=False)
        sort_idx = ranked.index
        plot_data = plot_data.loc[sort_idx]

    # Draw: 0.3 inch of height per feature row, with a floor of 4 inches
    fig, ax = plt.subplots(figsize=(6, max(4, len(feature_missing_cols) * 0.3)))
    sns.heatmap(plot_data, annot=True, fmt='.2f', cmap='RdBu_r', center=0, ax=ax)
    ax.set_title('Missingness vs Outcome')
    fig.tight_layout()
    fig.savefig(os.path.join(save_dir, 'fig_missing_correlation.png'), dpi=300)
    plt.close(fig)

    print("Done! Check the analysis_plots folder.")

Generating Vertical Heatmap...
Generating Correlation Matrix...
Done! Check the analysis_plots folder.
