In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

H3_FEATURES = "h3_features.csv"

df = pd.read_csv(H3_FEATURES)

# Derive the grouping key when the CSV does not ship one: upper-case the
# identifier and keep the text before its first underscore. "id" is preferred
# over "pdb_id" when both are present.
if "base_pdb_id" not in df.columns:
    id_source = next((name for name in ("id", "pdb_id") if name in df.columns), None)
    if id_source is None:
        # ValueError (not SystemExit): inside a notebook SystemExit is reported as
        # an attempt to quit the kernel rather than as a data problem.
        raise ValueError(f"Can't derive base_pdb_id: columns are {list(df.columns)}")
    df["base_pdb_id"] = (
        df[id_source].astype(str).str.strip().str.upper().str.split("_").str[0]
    )

# Normalise even a pre-existing base_pdb_id column so later equality lookups
# against upper-cased, stripped IDs match.
df["base_pdb_id"] = df["base_pdb_id"].astype(str).str.strip().str.upper()

print("Loaded rows:", len(df))
print("Columns:", len(df.columns))

Loaded rows: 1388
Columns: 59


In [2]:
def pick_one_row(group: pd.DataFrame) -> pd.Series:
    """Choose the representative row of one group.

    The first row whose ``h3_seq`` is a non-empty string wins; if the column is
    absent, or every sequence is missing/empty, the first row of the group is
    returned. Missing ``h3_seq`` values are turned into ``""`` on a private copy,
    so the caller's frame is never modified.
    """
    candidates = group.copy()
    if "h3_seq" not in candidates.columns:
        return candidates.iloc[0]

    candidates["h3_seq"] = candidates["h3_seq"].fillna("").astype(str)
    has_sequence = candidates["h3_seq"].str.len() > 0
    if has_sequence.any():
        return candidates.loc[has_sequence].iloc[0]
    return candidates.iloc[0]

# One representative row per PDB entry (see pick_one_row).
# Selecting every column explicitly keeps "base_pdb_id" in the frame handed to
# pick_one_row while avoiding the pandas deprecation warning about
# DataFrameGroupBy.apply operating on the grouping columns.
df_target = (
    df.groupby("base_pdb_id", as_index=False)[list(df.columns)]
      .apply(pick_one_row)
      .reset_index(drop=True)
)

# A bare expression in the middle of a cell is never rendered, so the preview
# is displayed explicitly.
display(
    df_target[["base_pdb_id"] + [c for c in ["h3_seq", "h3_len"] if c in df_target.columns]].head()
)

# Candidate feature columns; only those actually present in df_target are kept.
# NOTE(review): a later cell rebuilds feature_panel without "bound_state", so
# that later definition is the one in effect downstream.
feature_panel = [
    "h3_len", "h3_entropy", "h3_num_unique",
    "h3_net_charge_pH7", "h3_kd_mean", "h3_frac_hydrophobic",
    "h3_frac_gly_pro", "h3_frac_aromatic",
    "h3_has_cys", "h3_has_cys_pair", "h3_has_glyco_motif",
    "h3_p_count", "h3_p_max_run",
    "h3_kmer3_rarity_dataset",
    "bound_state"
]

feature_panel = [c for c in feature_panel if c in df_target.columns]
print("Using features:", feature_panel)

Using features: ['h3_len', 'h3_entropy', 'h3_num_unique', 'h3_net_charge_pH7', 'h3_kd_mean', 'h3_frac_hydrophobic', 'h3_frac_gly_pro', 'h3_frac_aromatic', 'h3_has_cys', 'h3_has_cys_pair', 'h3_has_glyco_motif', 'h3_p_count', 'h3_p_max_run', 'h3_kmer3_rarity_dataset', 'bound_state']


  df_target = df.groupby("base_pdb_id", as_index=False).apply(pick_one_row).reset_index(drop=True)


In [3]:
# Feature columns used by the per-target cards and comparison tables below.
# This rebinds feature_panel: the candidate list here has no "bound_state"
# entry, and names missing from df_target are dropped.
feature_panel = [
    name
    for name in (
        "h3_len",
        "h3_entropy",
        "h3_num_unique",
        "h3_net_charge_pH7",
        "h3_kd_mean",
        "h3_frac_hydrophobic",
        "h3_frac_gly_pro",
        "h3_frac_aromatic",
        "h3_has_cys",
        "h3_has_cys_pair",
        "h3_has_glyco_motif",
        "h3_p_count",
        "h3_p_max_run",
        "h3_kmer3_rarity_dataset",
    )
    if name in df_target.columns
]
print("Using features:", feature_panel)

Using features: ['h3_len', 'h3_entropy', 'h3_num_unique', 'h3_net_charge_pH7', 'h3_kd_mean', 'h3_frac_hydrophobic', 'h3_frac_gly_pro', 'h3_frac_aromatic', 'h3_has_cys', 'h3_has_cys_pair', 'h3_has_glyco_motif', 'h3_p_count', 'h3_p_max_run', 'h3_kmer3_rarity_dataset']


In [4]:
def feature_card(pdb_id: str):
    """Print and display the feature panel of one target with dataset percentiles.

    Looks up ``pdb_id`` (stripped, upper-cased) in the notebook-level
    ``df_target`` frame and, for every column in ``feature_panel``, shows the
    target's value next to the percentage of dataset values that are strictly
    smaller than it.

    Parameters
    ----------
    pdb_id : str
        Base PDB identifier; compared against ``df_target["base_pdb_id"]``.

    Raises
    ------
    ValueError
        If no row of ``df_target`` carries that identifier.

    Notes
    -----
    The percentile uses a strict ``<`` comparison, so the smallest value of a
    column (e.g. ``False`` for a boolean flag) scores 0.0.
    """
    pid = str(pdb_id).strip().upper()
    matches = df_target[df_target["base_pdb_id"] == pid]
    if matches.empty:
        raise ValueError(f"{pid} not found in df_target. Available example IDs: {df_target['base_pdb_id'].head(10).tolist()}")
    row = matches.iloc[0]

    # These columns are coerced to numbers even when their dtype is object;
    # any other panel column is ranked only if pandas reports a numeric dtype.
    always_numeric = {
        "h3_len", "h3_entropy", "h3_num_unique", "h3_net_charge_pH7", "h3_kd_mean", "h3_frac_hydrophobic",
        "h3_frac_gly_pro", "h3_frac_aromatic", "h3_p_count", "h3_p_max_run", "h3_kmer3_rarity_dataset",
    }
    numeric_cols = [
        c for c in feature_panel
        if c in always_numeric or pd.api.types.is_numeric_dtype(df_target[c])
    ]

    perc = {}
    for c in numeric_cols:
        # Coerce just this column instead of copying the whole frame per call.
        population = pd.to_numeric(df_target[c], errors="coerce").dropna().to_numpy(dtype=float)
        value = pd.to_numeric(row.get(c, np.nan), errors="coerce")
        # pd.isna first: np.isfinite cannot handle pd.NA / None.
        if population.size == 0 or pd.isna(value) or not np.isfinite(float(value)):
            perc[c] = np.nan
        else:
            perc[c] = float((population < float(value)).mean() * 100.0)

    print("="*90)
    print("TARGET:", pid)

    if "h3_seq" in df_target.columns:
        raw_seq = row.get("h3_seq", "")
        # A missing sequence must print as empty, not as the 3-character "nan".
        seq = "" if pd.isna(raw_seq) else str(raw_seq)
        print("H3 seq:", seq)
        print("H3 length:", len(seq))

    # Non-ranked features keep a blank percentile cell.
    out_rows = [
        {"feature": c, "value": row.get(c, np.nan), "percentile_vs_dataset": perc[c] if c in perc else ""}
        for c in feature_panel
    ]

    display(pd.DataFrame(out_rows))

In [5]:
def is_af3_method(m: str) -> bool:
    """Return True when a method label names AlphaFold 3.

    The label is stringified, stripped and upper-cased, then checked for any of
    the substrings "AF3", "ALPHAFOLD3" or "ALPHAFOLD 3".
    """
    label = str(m).strip().upper()
    return any(tag in label for tag in ("AF3", "ALPHAFOLD3", "ALPHAFOLD 3"))

def show_target(target_id: str):
    """Display one row per prediction method for a target, plus a ctx-vs-local note.

    ``target_id`` is stripped and upper-cased; the text before its first
    underscore is the base PDB ID used to filter the notebook-level
    ``all_methods`` frame. When the ID carries a suffix (``"8SIT_2"``) and an
    AF3 row with exactly that ``id_norm`` exists, only that AF3 row is kept
    alongside the non-AF3 rows. Within each method the row with the lowest
    ``rmsd_h3_ctx`` (ties broken by ``id_norm``) is shown.

    Prints a message and returns ``None`` when the base ID has no rows.

    NOTE(review): ``all_methods`` and ``interpret_ctx_vs_local`` are not defined
    in the cells visible here; they must exist in the kernel before this runs.
    """
    tid = str(target_id).strip().upper()
    base = tid.split("_")[0]

    d = all_methods[all_methods["base_pdb_id"] == base].copy()
    if d.empty:
        print(f"No rows found for base_pdb_id={base}")
        return

    has_method = "method" in d.columns
    has_id_norm = "id_norm" in d.columns

    if "_" in tid and has_id_norm and has_method:
        af3_mask = d["method"].apply(is_af3_method)
        d_af3_exact = d[af3_mask & (d["id_norm"] == tid)]
        if not d_af3_exact.empty:
            d = pd.concat([d[~af3_mask], d_af3_exact], ignore_index=True)

    if "rmsd_h3_ctx" in d.columns:
        d["rmsd_h3_ctx"] = pd.to_numeric(d["rmsd_h3_ctx"], errors="coerce")

    # Sort only by the keys that exist: the optional-column guard above would be
    # pointless if the sort then raised KeyError on the same missing columns.
    sort_keys = [c for c in ("method", "rmsd_h3_ctx", "id_norm") if c in d.columns]
    if sort_keys:
        d = d.sort_values(sort_keys, ascending=True)

    # After the sort, the first row of each method is its best (lowest-RMSD) one.
    if has_method:
        d = d.drop_duplicates(subset=["method"], keep="first")

    cols = [
        "method","id","label",
        "rmsd_fv_noh3_ctx","rmsd_fv_all_ctx","rmsd_h3_ctx","rmsd_h3_local",
        "pred_conf_h3_bfac_mean","pred_conf_fv_bfac_mean",
        "best_model","ranking_score","ptm","iptm",
        "chain_map","error",
        "h3_seq","n_h3_atoms"
    ]
    cols = [c for c in cols if c in d.columns]

    # Round a copy for display so the interpretation below uses unrounded values.
    disp = d[cols].copy()
    for c in ["rmsd_fv_noh3_ctx","rmsd_fv_all_ctx","rmsd_h3_ctx","rmsd_h3_local","ranking_score","ptm","iptm"]:
        if c in disp.columns:
            disp[c] = pd.to_numeric(disp[c], errors="coerce").map(lambda x: round(x, 3) if np.isfinite(x) else x)

    print(f"Target base PDB: {base}")
    display(disp)

    print("\nInterpretation (ctx vs local):")
    for _, r in d.iterrows():
        ctx = r.get("rmsd_h3_ctx", np.nan)
        local = r.get("rmsd_h3_local", np.nan)
        print(f"- {r.get('method','?')}: {interpret_ctx_vs_local(ctx, local)}")

In [6]:
# Result tables: R_IGAB holds the IGFold / ABodyBuilder2 rows and R_AF3 the
# AlphaFold3 rows (per the method listing printed further down this cell).
R_IGAB = "results_backbone_fvnoh3fit.csv"
R_AF3  = "results_af3_backbone_fvnoh3fit.csv"

igab, af3 = (pd.read_csv(csv_path) for csv_path in (R_IGAB, R_AF3))

def base_id(series):
    """Map each identifier to its base PDB ID.

    Values are stringified, stripped and upper-cased; the result is the text
    before the first underscore (the whole value when there is none).
    """
    return (
        series.astype(str)
              .str.strip()
              .str.upper()
              .str.split("_")
              .str.get(0)
    )

# Give both result tables the same derived columns:
#   id_norm      - stripped, upper-cased "id"
#   base_pdb_id  - id_norm up to its first underscore
# and coerce whichever RMSD columns are present to numbers (bad cells -> NaN).
for results in (igab, af3):
    results["id_norm"] = results["id"].astype(str).str.strip().str.upper()
    results["base_pdb_id"] = base_id(results["id_norm"])
    for rmsd_col in ("rmsd_h3_ctx", "rmsd_h3_local", "rmsd_fv_noh3_ctx", "rmsd_fv_all_ctx"):
        if rmsd_col in results.columns:
            results[rmsd_col] = pd.to_numeric(results[rmsd_col], errors="coerce")

def ok_rows(df):
    """Return a copy of ``df`` restricted to rows without an error message.

    A row counts as error-free when its ``error`` cell is missing or contains
    only whitespace. Frames without an ``error`` column are copied unchanged.
    """
    if "error" not in df.columns:
        return df.copy()
    messages = df["error"].fillna("").astype(str).str.strip()
    return df.loc[messages == ""].copy()

# Work only with predictions that were evaluated without an error.
igab_ok = ok_rows(igab)
af3_ok  = ok_rows(af3)

print("IGAB methods:", sorted(igab_ok["method"].astype(str).unique())[:50])
print("AF3 methods:", sorted(af3_ok["method"].astype(str).unique())[:50])

M_IGFOLD = "IGFold"
M_ABB2   = "ABodyBuilder2"

# One row per target, one column per method, holding rmsd_h3_ctx.
# NOTE(review): aggfunc="first" keeps the first non-null value if a
# (target, method) pair occurs more than once - confirm that pairs are unique.
pv = igab_ok.pivot_table(
    index="base_pdb_id",
    columns="method",
    values="rmsd_h3_ctx",
    aggfunc="first"
)

need = [M_IGFOLD, M_ABB2]
missing_cols = [m for m in need if m not in pv.columns]
if missing_cols:
    raise ValueError(
        f"Expected methods {need} not found in pivot columns. "
        f"Missing={missing_cols}. Available={list(pv.columns)}"
    )

# Only targets scored by both methods can be compared.
pv = pv.dropna(subset=need, how="any").copy()

# Positive delta_signed means ABodyBuilder2 has the larger H3 RMSD.
pv["delta_abs"]    = (pv[M_ABB2] - pv[M_IGFOLD]).abs()
pv["delta_signed"] = (pv[M_ABB2] - pv[M_IGFOLD])

delta_tbl = pv.reset_index().sort_values("delta_abs", ascending=False)

DELTA_CUTOFF = 2.0
sel = delta_tbl[delta_tbl["delta_abs"] >= DELTA_CUTOFF].copy()

# Best (lowest rmsd_h3_ctx) AF3 prediction per target, taken as a WHOLE row.
# groupby().first() would instead return the first non-null value of each
# column separately and could stitch one "row" together from several
# predictions whenever the best one has a missing cell.
af3_best = (
    af3_ok.sort_values("rmsd_h3_ctx", ascending=True)
          .drop_duplicates(subset="base_pdb_id", keep="first")
)

af3_cols = ["base_pdb_id","id","pred_pdb","rmsd_h3_ctx","rmsd_h3_local","rmsd_fv_noh3_ctx","rmsd_fv_all_ctx"]
af3_cols = [c for c in af3_cols if c in af3_best.columns]
af3_best = af3_best[af3_cols].copy()

af3_best = af3_best.rename(columns={
    "id":"af3_id",
    "pred_pdb":"af3_pred",
    "rmsd_h3_ctx":"af3_h3_ctx",
    "rmsd_h3_local":"af3_h3_local",
    "rmsd_fv_noh3_ctx":"af3_fv_noh3_ctx",
    "rmsd_fv_all_ctx":"af3_fv_all_ctx",
})

# af3_best has one row per target, so the left join must not multiply rows;
# validate= turns any violation into an immediate error.
sel = sel.merge(af3_best, on="base_pdb_id", how="left", validate="many_to_one")

# The label follows DELTA_CUTOFF ("2" for 2.0) instead of being hard-coded.
print(f"n delta>={DELTA_CUTOFF:g}:", len(sel))
display(sel.sort_values("delta_abs", ascending=False).head(40))

IGAB methods: ['ABodyBuilder2', 'IGFold']
AF3 methods: ['AlphaFold3']
n delta>=2: 40


Unnamed: 0,base_pdb_id,ABodyBuilder2,IGFold,delta_abs,delta_signed,af3_id,af3_pred,af3_h3_ctx,af3_h3_local,af3_fv_noh3_ctx,af3_fv_all_ctx
0,9GP2,8.820386,3.034516,5.78587,5.78587,9GP2,af3_output_pdb/9gp2/fold_9gp2_model_0.pdb,1.872975,1.344418,0.887079,0.958524
1,7SL5,12.891693,7.902927,4.988766,4.988766,7SL5,af3_output_pdb/7sl5/fold_7sl5_model_0.pdb,10.033195,3.34561,0.931789,3.236063
2,9BDI,8.638313,3.857547,4.780766,4.780766,9BDI,af3_output_pdb/9bdi/fold_9bdi_model_0.pdb,3.370812,0.862255,0.554973,1.112175
3,7TTM,6.581204,2.72968,3.851524,3.851524,7TTM,af3_output_pdb/7ttm/fold_7ttm_model_0.pdb,4.643055,1.556689,0.740228,1.51617
4,9ML8,10.169141,6.520046,3.649095,3.649095,9ML8,af3_output_pdb/9ml8/fold_9ml8_model_0.pdb,1.900328,0.783667,2.374316,2.343899
5,8T9Y,3.147281,6.778035,3.630754,-3.630754,8T9Y,af3_output_pdb/8t9y/fold_8t9y_model_0.pdb,5.745231,4.395714,1.28323,2.072424
6,9ML9,6.932764,3.351221,3.581543,3.581543,9ML9,af3_output_pdb/9ml9/fold_9ml9_model_0.pdb,0.640337,0.465056,0.631488,0.632274
7,8SIT,6.461388,2.989526,3.471862,3.471862,8SIT_2,af3_output_pdb/8sit_2/fold_8sit_model_0.pdb,2.837694,1.732381,0.944952,1.213175
8,8HRD,5.652535,2.196168,3.456367,3.456367,8HRD,af3_output_pdb/8hrd/fold_8hrd_model_0.pdb,4.301584,0.899998,0.697309,1.366124
9,7T7B,6.505959,3.049807,3.456152,3.456152,7T7B,af3_output_pdb/7t7b/fold_7t7b_model_0.pdb,5.90988,3.304038,0.615023,1.821687


In [7]:
# Side-by-side feature comparison of the two case-study targets.
targets = ["9GP2", "7SL5"]
pid = [t.strip().upper() for t in targets]

# Sequence first, then every panel feature that df_target actually has.
cols = ["base_pdb_id"] + [c for c in ["h3_seq"] + feature_panel if c in df_target.columns]
comp = (
    df_target.loc[df_target["base_pdb_id"].isin(pid), cols]
             .set_index("base_pdb_id")
)
comp

Unnamed: 0_level_0,h3_seq,h3_len,h3_entropy,h3_num_unique,h3_net_charge_pH7,h3_kd_mean,h3_frac_hydrophobic,h3_frac_gly_pro,h3_frac_aromatic,h3_has_cys,h3_has_cys_pair,h3_has_glyco_motif,h3_p_count,h3_p_max_run,h3_kmer3_rarity_dataset
base_pdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
7SL5,GLDVLRFLDLSTPSGERLDAFDI,23,3.323143,12,-3.0,0.169565,0.434783,0.130435,0.086957,False,False,False,1,1,3.192689
9GP2,LYYDYGDAMDY,11,2.299896,6,-3.0,-0.781818,0.636364,0.090909,0.363636,False,False,False,0,0,2.599282


In [22]:
feature_card("9GP2")   # AF3 is the most accurate method on this target: rmsd_h3_ctx 1.87 vs 3.03 (IGFold) and 8.82 (ABodyBuilder2)

TARGET: 9GP2
H3 seq: LYYDYGDAMDY
H3 length: 11


Unnamed: 0,feature,value,percentile_vs_dataset
0,h3_len,11,49.852941
1,h3_entropy,2.299896,26.764706
2,h3_num_unique,6,27.647059
3,h3_net_charge_pH7,-3.0,2.205882
4,h3_kd_mean,-0.781818,29.117647
5,h3_frac_hydrophobic,0.636364,88.382353
6,h3_frac_gly_pro,0.090909,36.470588
7,h3_frac_aromatic,0.363636,79.117647
8,h3_has_cys,False,0.0
9,h3_has_cys_pair,False,0.0


In [23]:
feature_card("7SL5")   # AF3 also fails on this target: rmsd_h3_ctx 10.03 vs 7.90 (IGFold) and 12.89 (ABodyBuilder2)

TARGET: 7SL5
H3 seq: GLDVLRFLDLSTPSGERLDAFDI
H3 length: 23


Unnamed: 0,feature,value,percentile_vs_dataset
0,h3_len,23,99.558824
1,h3_entropy,3.323143,93.088235
2,h3_num_unique,12,95.882353
3,h3_net_charge_pH7,-3.0,2.205882
4,h3_kd_mean,0.169565,85.147059
5,h3_frac_hydrophobic,0.434783,48.823529
6,h3_frac_gly_pro,0.130435,52.205882
7,h3_frac_aromatic,0.086957,24.852941
8,h3_has_cys,False,0.0
9,h3_has_cys_pair,False,0.0


### References

[1] Brennan Abanades, Wing Ki Wong, Fergus Boyles, and Charlotte M. Deane. 2023.
ImmuneBuilder: Deep-Learning models for predicting the structures of immune proteins.
Communications Biology 6, 1 (2023), 575. https://doi.org/10.1038/s42003-023-04927-7

[2] Josh Abramson, Jonas Adler, Jack Dunger, et al. 2024.
Accurate structure prediction of biomolecular interactions with AlphaFold 3.
Nature 630 (2024), 493–500. https://doi.org/10.1038/s41586-024-07487-w

[3] Sharmila Anishetty, Gautam Pennathur, and Raghothama Anishetty. 2002.
Tripeptide Analysis of Protein Structures.
BMC Structural Biology 2 (2002), 9. https://doi.org/10.1186/1472-6807-2-9

[4] R. Anjana et al. 2012.
Aromatic–Aromatic Interactions in Structures of Proteins and Protein–DNA Complexes.
Bioinformation 8, 24 (2012), 1220–1224. https://doi.org/10.6026/97320630081220

[5] David J. Barlow and Janet M. Thornton. 1988.
Helix geometry in proteins.
Journal of Molecular Biology 201, 3 (1988), 601–619. https://doi.org/10.1016/0022-2836(88)90641-9

[6] Thomas M. Cover and Joy A. Thomas. 2006.
Elements of Information Theory (2nd ed.). Wiley.

[7] Janez Demšar. 2006.
Statistical Comparisons of Classifiers over Multiple Data Sets.
Journal of Machine Learning Research 7 (2006), 1–30.

[8] Jason E. Donald, Daniel W. Kulp, and William F. DeGrado. 2011.
Salt bridges: geometrically specific, designable interactions.
Proteins 79, 3 (2011), 898–915. https://doi.org/10.1002/prot.22927

[9] Rob J. Hyndman and Yanan Fan. 1996.
Sample Quantiles in Statistical Packages.
The American Statistician 50, 4 (1996), 361–365. https://doi.org/10.2307/2684934

[10] Daniel T. Infield et al. 2021.
Cation–π Interactions and their Functional Roles in Membrane Proteins.
Journal of Molecular Biology 433, 17 (2021), 167035. https://doi.org/10.1016/j.jmb.2021.167035

[11] J. Jacob, H. Duclohier, and David S. Cafiso. 1999.
The Role of Proline and Glycine in Determining the Backbone Flexibility of a Channel-Forming Peptide.
Biophysical Journal 76, 3 (1999), 1367–1376. https://doi.org/10.1016/S0006-3495(99)77298-X

[12] Wolfgang Kabsch. 1976.
A Solution for the Best Rotation to Relate Two Sets of Vectors.
Acta Crystallographica Section A 32, 5 (1976), 922–923. https://doi.org/10.1107/S0567739476001873

[13] Jack Kyte and Russell F. Doolittle. 1982.
A Simple Method for Displaying the Hydropathic Character of a Protein.
Journal of Molecular Biology 157, 1 (1982), 105–132. https://doi.org/10.1016/0022-2836(82)90515-0

[14] Tianyu Li et al. 2015.
Rigidity Emerges during Antibody Evolution in Three Distinct Antibody Systems.
PLoS Computational Biology 11, 7 (2015), e1004327. https://doi.org/10.1371/journal.pcbi.1004327

[15] Henry B. Mann and Donald R. Whitney. 1947.
On a Test of Whether One of Two Random Variables Is Stochastically Larger than the Other.
The Annals of Mathematical Statistics 18, 1 (1947), 50–60. https://doi.org/10.1214/aoms/1177730491

[16] Claire Marks and Charlotte M. Deane. 2017.
Antibody H3 Structure Prediction.
Computational and Structural Biotechnology Journal 15 (2017), 222–231. https://doi.org/10.1016/j.csbj.2017.01.010

[17] Gregory B. McGaughey, Marc Gagne, and Anthony K. Rappé. 1998.
π-Stacking interactions: Alive and well in proteins.
Journal of Biological Chemistry 273, 25 (1998), 15458–15463. https://doi.org/10.1074/jbc.273.25.15458

[18] Jiangbo Miao, Judith Klein-Seetharaman, and Hagai Meirovitch. 2004.
The Optimal Fraction of Hydrophobic Residues Required to Ensure Protein Collapse.
Journal of Molecular Biology 344, 3 (2004), 797–811. https://doi.org/10.1016/j.jmb.2004.09.061

[19] Meritxell Olivella et al. 2002.
Influence of the environment in the conformation of alpha-helices.
Biophysical Journal 82, 6 (2002), 3207–3213. https://doi.org/10.1016/S0006-3495(02)75663-4

[20] Hung-Pin Peng et al. 2022.
Antibody CDR amino acids underlying the functionality of antibody repertoires.
Scientific Reports 12 (2022), 12555. https://doi.org/10.1038/s41598-022-16841-9

[21] Cristian Regep et al. 2017.
The H3 loop of antibodies shows unique structural characteristics.
Proteins 85, 7 (2017), 1311–1318. https://doi.org/10.1002/prot.25291

[22] Bernard Rosner, Robert J. Glynn, and Mei-Ling T. Lee. 2006.
The Wilcoxon signed rank test for paired comparisons of clustered data.
Biometrics 62, 1 (2006), 185–192. https://doi.org/10.1111/j.1541-0420.2005.00389.x

[23] Jeffrey A. Ruffolo, Lee-Shin Chu, Sai Pooja Mahajan, and Jeffrey J. Gray. 2023.
Fast, accurate antibody structure prediction from deep learning on massive set of natural antibodies.
Nature Communications 14, 1 (2023), 2389. https://doi.org/10.1038/s41467-023-38063-x

[24] Amandeep K. Sangha et al. 2017.
Role of Non-local Interactions between CDR Loops in Binding Affinity.
Structure 25, 12 (2017), 1820–1828.e2. https://doi.org/10.1016/j.str.2017.10.005

[25] Claude E. Shannon. 1948.
A Mathematical Theory of Communication.
Bell System Technical Journal 27 (1948), 379–423, 623–656.

[26] Wouter van Loon. 2017.
The Power of the Benjamini–Hochberg Procedure. Master’s Thesis, Leiden University.

[27] U. Vignesh et al. 2024.
Ensemble Deep Learning Model for Protein Secondary Structure Prediction Using NLP Metrics and Explainable AI.
Results in Engineering 24 (2024), 103435. https://doi.org/10.1016/j.rineng.2024.103435

[28] Frank Wilcoxon. 1945.
Individual Comparisons by Ranking Methods.
Biometrics Bulletin 1, 6 (1945), 80–83. https://doi.org/10.2307/3001968

[29] Hao Xu et al. 2025.
In-Depth Study of Low-Complexity Domains.
Cells 14, 22 (2025), 1752. https://doi.org/10.3390/cells14221752