# 📋 模型预测报告：GHQ心理健康预测

In [9]:
# 初始化环境与路径配置
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from utils.config import load_config, abspath
cfg, root = load_config()


In [15]:
# 加载模型预测与指标
preds_path = abspath(root, cfg["modeling"]["outputs"]["predictions_file"])
metrics_path = abspath(root, cfg["modeling"]["outputs"]["metrics_file"])

df_pred = pd.read_parquet(preds_path)
import json
with open(metrics_path, "r", encoding="utf-8") as f:
    metrics = json.load(f)

pd.DataFrame(metrics, index=["score"]).T  # 转置成一列格式

Unnamed: 0,score
accuracy,1.0
precision,1.0
recall,1.0
f1,1.0
balanced_accuracy,1.0
roc_auc,1.0


## 🎯 目标变量分布

In [17]:
# 目标变量分布图
sns.countplot(data=df_pred, x="target_cls")
plt.title("GHQ Caseness 标签分布")
plt.xlabel("GHQ ≥ 4 是否心理困扰")
plt.ylabel("样本数")
plt.show()


ValueError: Could not interpret value `target_cls` for `x`. An entry with this name does not appear in `data`.

## 📊 模型混淆矩阵与准确率

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(df_pred["target_cls"], df_pred["y_pred"])
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()


## 🧠 特征重要性（若适用）

In [None]:
# 示例代码（如使用基于 coef_ 或 feature_importances_ 的模型）
try:
    model_path = abspath(root, cfg["modeling"]["outputs"]["model_file"])
    import joblib
    model = joblib.load(model_path)

    importances = None
    if hasattr(model, "coef_"):
        importances = model.coef_[0]
    elif hasattr(model, "feature_importances_"):
        importances = model.feature_importances_

    if importances is not None:
        features = df_pred.drop(columns=["target_cls", "y_pred", "y_proba"]).columns
        fi_df = pd.DataFrame({"feature": features, "importance": importances})
        fi_df = fi_df.sort_values("importance", ascending=False).head(20)

        sns.barplot(data=fi_df, x="importance", y="feature")
        plt.title("Top 20 特征重要性")
        plt.show()
except Exception as e:
    print("模型不支持特征重要性可视化:", e)
