In [3]:
import re
import pandas as pd
from IPython.display import display

In [None]:
# Q3 of work_QA NBO
# Input locations, given relative to the notebook's working directory.
node_template_path = "node_features.xlsx"      # Excel workbook path
txt_path = "NBO/C24_results/C24-7-1-1.txt"     # text file for C24 (7-1-1)


In [None]:
import re
import pandas as pd
from pathlib import Path

# ---------- 1) Tiny parser for "per-atom lines" ----------
def parse_atom_scalar_from_lines(txt_path: str, value_col: str, mol_id: str) -> pd.DataFrame:
    """
    Parse per-atom scalar lines such as 'Atom   1(C ):   -0.0487'.

    The whole file is scanned and every occurrence of the pattern becomes one
    row, wherever it appears in the file.

    Parameters
    ----------
    txt_path : path of the text file to scan (read as UTF-8, undecodable
        bytes are ignored).
    value_col : name of the output column that receives the parsed number.
    mol_id : value written to the ``mol_id`` column of every row.

    Returns
    -------
    DataFrame sorted by ``atom_idx`` with columns:
      mol_id, atom_idx (0-based), element, is_H, <value_col>

    Raises
    ------
    ValueError : if no line in the file matches the expected format.
    """
    pat = re.compile(
        # Element symbol: 1-3 word characters, optionally space-padded inside
        # the parentheses, so 'C ', 'H ' and two-letter symbols such as 'Cl'
        # or 'Br' are all accepted (a single-character group would silently
        # drop the latter).
        r"Atom\s+(\d+)\(\s*(\w{1,3})\s*\):\s*"
        # Number: optional sign, digits with optional fraction (or a bare
        # '.ddd' fraction), optional exponent.
        r"([+-]?(?:\d+(?:\.\d*)?|\.\d+)(?:[Ee][+-]?\d+)?)"
    )
    with open(txt_path, "r", encoding="utf-8", errors="ignore") as f:
        raw = f.read()
    rows = [
        {
            "mol_id": mol_id,
            "atom_idx": int(atom_1based) - 1,   # 1-based → 0-based
            "element": elem,
            "is_H": 1 if elem == "H" else 0,
            value_col: float(val),
        }
        for atom_1based, elem, val in pat.findall(raw)
    ]
    if not rows:
        raise ValueError(f"No atom lines parsed in {txt_path}—check the format/regex.")
    return pd.DataFrame(rows).sort_values("atom_idx").reset_index(drop=True)

# ---------- 2) Accumulator that merges into your Excel ----------
def merge_into_node_excel(template_xlsx: str, out_xlsx: str, df_feature: pd.DataFrame) -> None:
    """
    Merge per-atom feature columns into the "nodes" sheet of an Excel workbook.

    Loads every sheet of ``template_xlsx``, outer-joins the "nodes" sheet with
    ``df_feature`` on (mol_id, atom_idx), and writes the result to ``out_xlsx``
    as a "nodes" sheet. A "README" sheet, if the template has one, is copied
    through unchanged; any other sheets are not written.

    Column handling:
      * A feature column the template does not have is simply added.
      * A feature column the template already has keeps the template's
        non-null values; only null cells are filled from ``df_feature``.

    Parameters
    ----------
    template_xlsx : path of the workbook to read (must contain a "nodes"
        sheet with ``mol_id`` and ``atom_idx`` columns).
    out_xlsx : path of the workbook to write (uses the xlsxwriter engine).
    df_feature : DataFrame holding ``mol_id``, ``atom_idx`` and one or more
        feature columns.

    Raises
    ------
    KeyError : if the template has no "nodes" sheet.
    ValueError : if the "nodes" sheet lacks ``mol_id`` or ``atom_idx``.
    """
    key_cols = ["mol_id", "atom_idx"]

    sheets = pd.read_excel(template_xlsx, sheet_name=None)
    nodes = sheets["nodes"].copy()
    # Ensure key columns exist
    for c in key_cols:
        if c not in nodes.columns:
            raise ValueError(f"Template missing required column: {c}")

    # Only columns present on BOTH sides get a "_new"-suffixed twin from the
    # merge below. Work this out before merging: afterwards every feature
    # column is in `nodes`, so the twin cannot be assumed to exist.
    shared_cols = [
        col for col in df_feature.columns
        if col not in key_cols and col in nodes.columns
    ]

    # Outer-join on (mol_id, atom_idx): rows found only in the template or
    # only in df_feature are both kept.
    nodes = nodes.merge(
        df_feature, on=key_cols, how="outer", suffixes=("", "_new")
    )

    # Reconcile columns that already existed in the template: keep existing
    # non-null values and fill only the gaps from the new data.
    for col in shared_cols:
        new_col = col + "_new"
        nodes[col] = nodes[col].where(nodes[col].notna(), nodes[new_col])
        nodes.drop(columns=[new_col], inplace=True)

    with pd.ExcelWriter(out_xlsx, engine="xlsxwriter") as w:
        nodes.to_excel(w, sheet_name="nodes", index=False)
        if "README" in sheets:
            sheets["README"].to_excel(w, sheet_name="README", index=False)

# ---------- 3) Example: parse C24 Hirshfeld (7-1-1) and save ----------
if __name__ == "__main__":
    # Example run for molecule C24: parse the per-atom values from the text
    # file, then merge them into the node workbook and save it under a new
    # name.
    template, out = (
        "/mnt/data/node_features_template.xlsx",
        "/mnt/data/node_features_C24.xlsx",
    )
    mol_id = "C24"

    # A) Hirshfeld charge (7-1-1) → q_hirsh
    df_hirsh = parse_atom_scalar_from_lines(
        txt_path="/mnt/data/C24-7-1-1.txt",
        mol_id=mol_id,
        value_col="q_hirsh",
    )
    merge_into_node_excel(
        template_xlsx=template, out_xlsx=out, df_feature=df_hirsh
    )
    print(f"Saved: {out}")
