In [1]:
import os
import numpy as np
from numpy.linalg import lstsq

# Atom count per three-letter residue name. fit_and_predict_U looks up the
# requested residue here and reads that many leading rows from each .ef file.
# The ACE and NME entries are listed alongside the amino acids.
# NOTE(review): the values are taken as given; what exactly is counted
# (e.g. hydrogens, capping groups) is not visible here — confirm against the
# tool that produces the .ef files.
AMINO_ACID_ATOM_COUNTS = {
    "ALA": 12,
    "ARG": 26,
    "ASN": 16,
    "ASP": 14,
    "CYS": 13,
    "GLN": 19,
    "GLU": 17,
    "GLY": 9,
    "HID": 19,
    "ILE": 21,
    "LEU": 21,
    "LYS": 24,
    "MET": 19,
    "PHE": 22,
    "PRO": 16,
    "SER": 13,
    "THR": 16,
    "TRP": 26,
    "TYR": 23,
    "VAL": 18,
    "ACE": 7,
    "NME": 7,
}

def _read_feature_from_ef(ef_file: str, n_atoms: int, aggregate: bool = True):
    """
    从单个 .ef 文件中读取特征。
    如果 aggregate=True，返回 5 维聚合特征（对 n_atoms 求和）：
      [sum(E_paral^2), sum(E_verti^2), sum(E_paral*E_verti), sum(|E_paral|), sum(|E_verti|)]
    如果 aggregate=False，返回按原子拼接的 5*n_atoms 维特征（与原实现相同）。
    """
    ef_data = np.loadtxt(ef_file, skiprows=1)
    ef_data = np.atleast_2d(ef_data)

    if ef_data.shape[0] < n_atoms:
        raise ValueError(f"EF文件行数不足: 需要 {n_atoms}, 实际 {ef_data.shape[0]} -> {ef_file}")

    # 列索引: 11 -> |E_parallel|, 12 -> |E_vertical|
    E_paral = ef_data[:n_atoms, 11]
    E_verti = ef_data[:n_atoms, 12]

    paral_sq = E_paral**2
    verti_sq = E_verti**2
    prod = E_paral * E_verti
    paral_abs = np.abs(E_paral)
    verti_abs = np.abs(E_verti)

    if aggregate:
        # 聚合为 5 个全分子特征（默认用 sum，若你更偏好均值可改为 mean）
        return np.array([
            paral_sq.sum(),
            verti_sq.sum(),
            prod.sum(),
            paral_abs.sum(),
            verti_abs.sum(),
        ])
    else:
        # 原始逐原子拼接
        return np.concatenate([paral_sq, verti_sq, prod, paral_abs, verti_abs])

def _load_features_or_none(ef_dir: str, name: str, n_atoms: int):
    """Return the 5 aggregated features of sample ``name``, or None if unavailable.

    A missing or unreadable ``<name>.ef`` file is reported on stdout and mapped
    to None so the caller can skip the sample (best-effort behaviour).
    """
    ef_file = os.path.join(ef_dir, name + ".ef")
    if not os.path.isfile(ef_file):
        print(f"⚠️ 缺少EF文件, 跳过: {ef_file}")
        return None
    try:
        return _read_feature_from_ef(ef_file, n_atoms, aggregate=True)  # 5 features
    except Exception as e:
        # Deliberately broad: one bad file must not abort the whole run.
        print(f"⚠️ 无法读取EF文件, 跳过: {ef_file}, 错误: {e}")
        return None


def fit_and_predict_U(work_dir: str, residue_name: str = "ALA", n_train: int = 1000):
    """Fit 5 polarizability parameters by least squares and predict U for all samples.

    Inputs are read from ``<work_dir>/<residue_name>/``:

    * ``raw_realU`` - text file; each usable line is ``name real_U ...``
      (lines with fewer than two whitespace-separated fields are ignored).
    * ``ef/<name>.ef`` - per-sample field file, reduced to 5 aggregated
      features by ``_read_feature_from_ef``.

    Outputs are written to the same directory:

    * ``fit_alpha`` - the fitted 5-element alpha vector (one row, with header).
    * ``raw_realU_fitU`` - one row per sample with a readable ``.ef`` file:
      ``name real_U fit_U <5 features> <5 alphas>``.

    Parameters
    ----------
    work_dir : str
        Root directory containing one sub-directory per residue.
    residue_name : str, default "ALA"
        Key into ``AMINO_ACID_ATOM_COUNTS``; also the sub-directory name.
    n_train : int, default 1000
        The fit uses the slice ``data[:n_train]`` of the samples listed in
        ``raw_realU``; samples whose ``.ef`` file is missing or unreadable
        are skipped.

    Returns
    -------
    numpy.ndarray
        The fitted alpha vector (length 5). The model has no intercept:
        ``fit_U = features @ alpha``.

    Raises
    ------
    KeyError
        If ``residue_name`` is not in ``AMINO_ACID_ATOM_COUNTS``.
    RuntimeError
        If ``raw_realU`` yields no samples, or no training sample is usable.
    ValueError
        If the second field of a ``raw_realU`` line is not a number.
    """
    if residue_name not in AMINO_ACID_ATOM_COUNTS:
        raise KeyError(f"未在 AMINO_ACID_ATOM_COUNTS 中找到残基 {residue_name} 的原子数")
    n_atoms = AMINO_ACID_ATOM_COUNTS[residue_name]

    residue_dir = os.path.join(work_dir, residue_name)
    ef_dir = os.path.join(residue_dir, "ef")
    raw_realU_file = os.path.join(residue_dir, "raw_realU")
    output_alpha_file = os.path.join(residue_dir, "fit_alpha")
    output_combined_file = os.path.join(residue_dir, "raw_realU_fitU")

    # Read (name, real_U) pairs; blank and one-field lines are ignored.
    data = []
    with open(raw_realU_file, "r") as f:
        for line in f:
            parts = line.split()
            if len(parts) < 2:
                continue
            data.append((parts[0], float(parts[1])))

    if not data:
        raise RuntimeError("未能从 raw_realU 读取到有效数据")

    train_data = data[:n_train]
    # Report the real size of the training slice, which can be smaller than
    # n_train when raw_realU lists fewer samples.
    print(f"从 {residue_name} 总共读取到 {len(data)} 个样本, 其中前 {len(train_data)} 个用于拟合")

    # ---------------- Fitting stage (5 features per sample) ----------------
    # Features are cached by sample index so that the prediction stage below
    # does not parse every training .ef file a second time. A None entry marks
    # a sample that was already reported as missing/unreadable.
    cached_feats = {}
    A_rows, U_vals = [], []
    for idx, (name, real_U) in enumerate(train_data):
        feats5 = _load_features_or_none(ef_dir, name, n_atoms)
        cached_feats[idx] = feats5
        if feats5 is None:
            continue
        A_rows.append(feats5)
        U_vals.append(real_U)

    A_all = np.array(A_rows)
    U_all = np.array(U_vals)

    if A_all.shape[0] == 0:
        raise RuntimeError("没有可用的训练样本, 无法进行拟合")

    print("开始进行最小二乘拟合...")
    alpha, _residuals, rank, _singular_values = lstsq(A_all, U_all, rcond=None)  # alpha has length 5
    if rank < A_all.shape[1]:
        # lstsq still returns a (minimum-norm) solution, but it is not unique.
        print(f"⚠️ 设计矩阵秩亏 (rank={rank}, 特征数={A_all.shape[1]}), 拟合得到的 alpha 不唯一")

    # Save alpha as a single row.
    np.savetxt(
        output_alpha_file, alpha.reshape(1, -1),
        header="alpha_paral_sq alpha_verti_sq alpha_prod alpha_paral_abs alpha_verti_abs"
    )
    print(f"✅ 极化率参数已保存至: {output_alpha_file}")

    # ---------------- Prediction and combined output ----------------
    header_cols = [
        "name", "real_U", "fit_U",
        "feat_paral_sq", "feat_verti_sq", "feat_prod", "feat_paral_abs", "feat_verti_abs",
        "alpha_paral_sq", "alpha_verti_sq", "alpha_prod", "alpha_paral_abs", "alpha_verti_abs"
    ]
    # alpha is identical on every row, so format it once.
    alpha_text = " ".join(f"{a:.6f}" for a in alpha.tolist())

    with open(output_combined_file, "w") as f:
        f.write("# " + " ".join(header_cols) + "\n")

        for idx, (name, real_U) in enumerate(data):
            if idx in cached_feats:
                feats5 = cached_feats[idx]  # already loaded (or already reported) above
            else:
                feats5 = _load_features_or_none(ef_dir, name, n_atoms)
            if feats5 is None:
                continue

            U_fit = float(feats5 @ alpha)
            feats_text = " ".join(f"{v:.6f}" for v in feats5.tolist())

            # One row per sample: name, real_U, fit_U, 5 features, 5 alphas.
            f.write(f"{name} {real_U:.6f} {U_fit:.6f} {feats_text} {alpha_text}\n")

    print(f"✅ 所有结果已写入: {output_combined_file}")
    return alpha


In [None]:
# Driver cell: run the fit/predict pipeline for each residue listed below.
# NOTE(review): work_dir is a hard-coded absolute path, and the saved output
# under this cell reports files written to a different directory — presumably
# left over from an earlier run with another work_dir; re-run to confirm.
work_dir = "/mnt/xyz_folder"
for amino in ["ALA"]:
    # The returned alpha vector is discarded here; fit_and_predict_U also
    # writes it to the residue's fit_alpha file.
    fit_and_predict_U(work_dir, residue_name=amino, n_train=1000)

从 ALA 总共读取到 6921 个样本, 其中前 1000 个用于拟合
开始进行最小二乘拟合...
✅ 极化率参数已保存至: /home/wsren/Codes/proteinff-model/dipole/alpha/alpha_mono/pdb_amber2_del4c_new/ALA/fit_alpha
✅ 所有结果已写入: /home/wsren/Codes/proteinff-model/dipole/alpha/alpha_mono/pdb_amber2_del4c_new/ALA/raw_realU_fitU
