We gave AlphaFold3 two subsets of antibody targets:
1. 23 disagreement cases, defined here as a difference of at least 2 Å between the ABodyBuilder2 and IgFold CDR-H3 RMSDs (tbl_disagree)
2. 25 difficult/hard cases, defined as a CDR-H3 RMSD ≥ 3.0 Å for both IgFold and ABodyBuilder2 (tbl_hard)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input files (relative paths).
# Target list: the columns label, pdb_id and reason are read from it below.
AF3_TARGETS = "af3_export/af3_targets.csv"
# RMSD results of the antibody-specific methods (columns used: id, method, rmsd_h3_ctx).
R_IGAB = "results_backbone_fvnoh3fit.csv"
# RMSD results of the AlphaFold3 predictions.
R_AF3  = "results_af3_backbone_fvnoh3fit.csv"
# Per-AF3-id metadata; must provide id, label_igab, reason and base_pdb_id.
PAIRS_AF3 = "pairs_af3.csv"

# Values of the "method" column that identify the two antibody-specific predictors;
# after pivoting they are also the names of the per-method RMSD columns.
M_IGFOLD = "IGFold"
M_ABB2   = "ABodyBuilder2"

In [3]:
def ok_rows(df):
    """Return a copy of ``df`` containing only rows without an error message.

    A row counts as error-free when its "error" value is missing or, after
    conversion to text and stripping of surrounding whitespace, empty.
    Frames without an "error" column are copied unchanged.
    """
    if "error" not in df.columns:
        return df.copy()

    error_text = df["error"].fillna("").astype(str).str.strip()
    is_clean = error_text == ""
    return df.loc[is_clean].copy()

def norm_str(s: pd.Series) -> pd.Series:
    """Normalise identifiers for joining: cast to text, trim, upper-case.

    Every value is first converted with ``astype(str)``, so non-string
    entries are turned into their text form before trimming.
    """
    as_text = s.astype(str)
    trimmed = as_text.str.strip()
    return trimmed.str.upper()

def add_triage(df: pd.DataFrame) -> pd.DataFrame:
    """Classify each row by where the AF3 RMSD falls relative to IgFold/ABB2.

    Works on a copy of ``df`` and adds three columns:

    * ``best_igab``    - row-wise minimum of the IgFold and ABodyBuilder2 columns
    * ``worst_igab``   - row-wise maximum of the same two columns
    * ``af3_category`` - "AF3_better_than_both", "AF3_between_methods",
      "AF3_worse_than_both", or "AF3_missing" when no condition matches
      (in particular when ``af3_h3_ctx`` is NaN)
    """
    out = df.copy()

    igab_rmsd = out[[M_IGFOLD, M_ABB2]]
    out["best_igab"] = igab_rmsd.min(axis=1)
    out["worst_igab"] = igab_rmsd.max(axis=1)

    af3_rmsd = out["af3_h3_ctx"]
    has_af3 = af3_rmsd.notna()

    # np.select takes the first matching condition, so the second entry only
    # applies to rows that are not already strictly better than both methods.
    conditions = [
        has_af3 & (af3_rmsd < out["best_igab"]),
        has_af3 & (af3_rmsd <= out["worst_igab"]),
        has_af3 & (af3_rmsd > out["worst_igab"]),
    ]
    labels = ["AF3_better_than_both", "AF3_between_methods", "AF3_worse_than_both"]

    out["af3_category"] = np.select(conditions, labels, default="AF3_missing")
    return out

In [4]:
# AF3 target list: one row per distinct (label, PDB id, selection reason)
tgt = (
    pd.read_csv(AF3_TARGETS)
      .assign(
          label_norm=lambda d: norm_str(d["label"]),
          base_pdb_id=lambda d: norm_str(d["pdb_id"]),
      )
      .loc[:, ["label_norm", "base_pdb_id", "reason"]]
      .drop_duplicates()
)

# ABB2 + IgFold RMSD results, restricted to rows without an error message
igab = pd.read_csv(R_IGAB)
igab_ok = ok_rows(igab)

ig = (
    igab_ok.loc[igab_ok["method"].isin([M_IGFOLD, M_ABB2])]
           .assign(label_norm=lambda d: norm_str(d["id"]))
)

# One row per label with one H3 RMSD column per method; labels that lack
# either method are dropped, then the ABB2 - IgFold differences are added
# and only labels present in the AF3 target list are kept.
pv_label = (
    ig.pivot_table(index="label_norm", columns="method", values="rmsd_h3_ctx", aggfunc="first")
      .dropna(subset=[M_IGFOLD, M_ABB2])
      .reset_index()
      .assign(
          delta_abs=lambda d: (d[M_ABB2] - d[M_IGFOLD]).abs(),
          delta_signed=lambda d: d[M_ABB2] - d[M_IGFOLD],
      )
      .merge(tgt, on="label_norm", how="inner")
)

# Split into the two cohorts by the selection reason recorded in the target list
tbl_disagree = pv_label.loc[pv_label["reason"] == "large_ABB2_vs_IgFold_delta_or_class_mismatch"].copy()
tbl_hard     = pv_label.loc[pv_label["reason"] == "hard_for_both_high_median_h3_ctx"].copy()

print("tbl_disagree rows:", len(tbl_disagree))
print("tbl_hard rows:", len(tbl_hard))

tbl_disagree rows: 23
tbl_hard rows: 25


In [5]:
# AF3 RMSD results
# AF3 RMSD results, restricted to rows without an error message
af3 = pd.read_csv(R_AF3)
af3_ok = ok_rows(af3)
af3_ok["id_norm"] = norm_str(af3_ok["id"])

# AF3 pairs: per-AF3-id metadata (label_igab, reason, base_pdb_id)
pairs_af3 = pd.read_csv(PAIRS_AF3)
pairs_af3["id_norm"] = norm_str(pairs_af3["id"])

needed = ["id_norm", "label_igab", "reason", "base_pdb_id"]
missing = [c for c in needed if c not in pairs_af3.columns]
if missing:
    raise RuntimeError(f"pairs_af3.csv is missing columns: {missing}. Did you run the updated make_pairs_af3.py?")

# attach cohort label to each AF3 RMSD row
af3_ok = af3_ok.merge(
    pairs_af3[needed],
    on="id_norm",
    how="left"
)

print("AF3 rows:", len(af3_ok))
print("Missing label_igab in AF3:", int(af3_ok["label_igab"].isna().sum()))

# label_igab is later joined against label_norm, which is stripped and
# upper-cased by norm_str; normalise it the same way so that case or
# whitespace differences cannot turn a match into "AF3_missing".
# Missing labels stay NaN instead of becoming a literal string.
af3_ok["label_igab"] = norm_str(af3_ok["label_igab"]).where(af3_ok["label_igab"].notna())

# Pick the best AF3 row per label_igab (lowest AF3 H3 ctx RMSD).
# Usually there is one row per label; this only matters if duplicates exist.
# drop_duplicates keeps one *whole* row per label, whereas groupby().first()
# takes the first non-null value of each column independently and could mix
# fields from different AF3 rows. Rows without a label are excluded, and the
# result is ordered by label with a fresh index.
af3_best = (
    af3_ok.dropna(subset=["label_igab"])
          .sort_values("rmsd_h3_ctx", ascending=True)
          .drop_duplicates(subset="label_igab", keep="first")
          .sort_values("label_igab")
          .reset_index(drop=True)
          .rename(columns={
              "id": "af3_id",
              "pred_pdb": "af3_pred",
              "rmsd_h3_ctx": "af3_h3_ctx",
              "rmsd_h3_local": "af3_h3_local",
              "rmsd_fv_noh3_ctx": "af3_fv_noh3_ctx",
              "rmsd_fv_all_ctx": "af3_fv_all_ctx",
          })
)

af3_best = af3_best[["label_igab","af3_id","af3_pred","af3_h3_ctx","af3_h3_local","af3_fv_noh3_ctx","af3_fv_all_ctx"]].copy()

AF3 rows: 49
Missing label_igab in AF3: 0


In [6]:
# Merge AF3 by label
def _attach_af3(tbl: pd.DataFrame) -> pd.DataFrame:
    """Left-join the per-label AF3 results onto a cohort table and triage it.

    Columns left over from a previous run of this cell (the AF3 columns and
    the triage columns) are dropped first, so re-running the cell does not
    create ``_x``/``_y`` suffixed duplicates that would break ``add_triage``.
    """
    derived = list(af3_best.columns) + ["best_igab", "worst_igab", "af3_category"]
    stale = [c for c in derived if c in tbl.columns]
    merged = tbl.drop(columns=stale).merge(
        af3_best,
        left_on="label_norm", right_on="label_igab",
        how="left"
    )
    return add_triage(merged)


# Merge AF3 by label and classify each target
tbl_disagree = _attach_af3(tbl_disagree)
tbl_hard     = _attach_af3(tbl_hard)

print("Disagree triage:\n", tbl_disagree["af3_category"].value_counts(dropna=False))
print("Hard triage:\n", tbl_hard["af3_category"].value_counts(dropna=False))

Disagree triage:
 af3_category
AF3_better_than_both    9
AF3_between_methods     8
AF3_worse_than_both     5
AF3_missing             1
Name: count, dtype: int64
Hard triage:
 af3_category
AF3_worse_than_both     10
AF3_better_than_both     7
AF3_between_methods      6
AF3_missing              2
Name: count, dtype: int64


In [7]:
# Columns shown for the hard cohort, in display order
hard_cols = [
    "label_norm", "base_pdb_id",
    "abb2_h3_ctx", "igfold_h3_ctx", "af3_h3_ctx",
    "igab_delta_abs",
    "af3_category",
    "af3_id",
]

hard_table = tbl_hard.rename(columns={
    M_IGFOLD: "igfold_h3_ctx",
    M_ABB2: "abb2_h3_ctx",
    "delta_abs": "igab_delta_abs",
})

# Group by AF3 category (alphabetical), largest IgFold/ABB2 gap first within each
hard_table = (
    hard_table[hard_cols]
    .sort_values(by=["af3_category", "igab_delta_abs"], ascending=[True, False])
    .reset_index(drop=True)
)

hard_table

Unnamed: 0,label_norm,base_pdb_id,abb2_h3_ctx,igfold_h3_ctx,af3_h3_ctx,igab_delta_abs,af3_category,af3_id
0,9ML8_EF,9ML8,10.169141,6.520046,1.900328,3.649095,AF3_better_than_both,9ML8
1,7QT3_AB,7QT3,6.36419,8.415956,6.038097,2.051766,AF3_better_than_both,7QT3
2,7X8T_CB,7X8T,6.930013,8.149594,0.61741,1.219581,AF3_better_than_both,7X8T
3,8D36_HL,8D36,7.400057,8.550723,3.484241,1.150666,AF3_better_than_both,8D36
4,9BJG_HL,9BJG,10.692199,9.571576,1.263419,1.120623,AF3_better_than_both,9BJG
5,7UM3_IM,7UM3,8.261278,7.927745,7.070746,0.333533,AF3_better_than_both,7UM3_2
6,9JD1_BA,9JD1,7.709321,7.815595,4.962937,0.106274,AF3_better_than_both,9JD1
7,7SL5_AB,7SL5,12.891693,7.902927,10.033195,4.988766,AF3_between_methods,7SL5
8,7TOW_AB,7TOW,8.77651,6.41397,7.612217,2.36254,AF3_between_methods,7TOW
9,9MQV_HL,9MQV,8.763115,6.771911,6.852546,1.991204,AF3_between_methods,9MQV


A case is considered truly difficult to predict (hard) for all three models if the RMSD of both the IgFold and the ABodyBuilder2 prediction is ≥ 3.0 Å and AlphaFold3 is worse than both antibody-specific methods.

* In the table above, this corresponds to: af3_category == "AF3_worse_than_both" (10 cases)
* Total evaluable cases excluding cases where AF3 is missing = 25 − 2 = 23

→ 10 / 23 ≈ 43.5% of the evaluable hard cases were truly difficult to predict

In [8]:
# Columns shown for the disagreement cohort, in display order
disagree_cols = [
    "label_norm", "base_pdb_id",
    "abb2_h3_ctx", "igfold_h3_ctx", "delta_abs", "delta_signed",
    "af3_h3_ctx",
    "af3_category",
    "af3_id",
]

disagree_table = tbl_disagree.rename(columns={
    M_IGFOLD: "igfold_h3_ctx",
    M_ABB2: "abb2_h3_ctx",
})

# Largest IgFold/ABB2 disagreement first
disagree_table = (
    disagree_table[disagree_cols]
    .sort_values(by="delta_abs", ascending=False)
    .reset_index(drop=True)
)

disagree_table

Unnamed: 0,label_norm,base_pdb_id,abb2_h3_ctx,igfold_h3_ctx,delta_abs,delta_signed,af3_h3_ctx,af3_category,af3_id
0,9GP2_HL,9GP2,8.820386,3.034516,5.78587,5.78587,1.872975,AF3_better_than_both,9GP2
1,9BDI_CD,9BDI,8.638313,3.857547,4.780766,4.780766,3.370812,AF3_better_than_both,9BDI
2,7TTM_HL,7TTM,6.581204,2.72968,3.851524,3.851524,4.643055,AF3_between_methods,7TTM
3,8T9Y_HL,8T9Y,3.147281,6.778035,3.630754,-3.630754,5.745231,AF3_between_methods,8T9Y
4,9ML9_HL,9ML9,6.932764,3.351221,3.581543,3.581543,0.640337,AF3_better_than_both,9ML9
5,8SIT_QS,8SIT,6.461388,2.989526,3.471862,3.471862,2.837694,AF3_better_than_both,8SIT_2
6,8HRD_DF,8HRD,5.652535,2.196168,3.456367,3.456367,4.301584,AF3_between_methods,8HRD
7,7T7B_HL,7T7B,6.505959,3.049807,3.456152,3.456152,5.90988,AF3_between_methods,7T7B
8,7TP3_HL,7TP3,6.670439,3.433611,3.236828,3.236828,4.624733,AF3_between_methods,7TP3
9,8G8D_AB,8G8D,6.012437,2.830274,3.182163,3.182163,2.100694,AF3_better_than_both,8G8D


In the disagreement set, the differences between the predictions of the two antibody-specific models are large, with absolute RMSD differences reaching up to 5.79 Å (e.g. 9GP2). Importantly, the signed differences show that neither predictor systematically dominates: some targets favor IgFold, while others favor ABodyBuilder2. This behavior indicates that these disagreements arise from method-specific inductive biases rather than random noise.

AlphaFold3 provides additional context for interpreting these disagreements. Among the 23 disagreement cases, AlphaFold3 produced a lower CDR-H3 RMSD than both IgFold and ABodyBuilder2 in 9 instances (e.g. 9GP2, 9BDI, 9ML9, 8G8D, 8VDL, 9SG2, 8BYU, 8P6H). This pattern suggests that, for these targets, the disagreement reflects limitations of the antibody-specific predictors rather than intrinsic structural ambiguity.

Additionally, several disagreement cases fall into the category AF3_worse_than_both (e.g. 8T58, 8FA7, 8TDX, 7X8P, 8QYA). In these instances, all three models struggle, implying that the underlying CDR-H3 conformations may be genuinely difficult to resolve.

In [9]:
# List the targets of each cohort that have no AF3 result
for title, table in (
    ("Hard AF3_missing:", hard_table),
    ("Disagree AF3_missing:", disagree_table),
):
    print(title)
    is_missing = table["af3_category"] == "AF3_missing"
    display(table.loc[is_missing, ["label_norm", "base_pdb_id"]])

    if title == "Hard AF3_missing:":
        print()

Hard AF3_missing:


Unnamed: 0,label_norm,base_pdb_id
13,7UXL_FE,7UXL
14,7UM3_HL,7UM3


Disagree AF3_missing:


Unnamed: 0,label_norm,base_pdb_id
14,8SIT_MN,8SIT


A small number of targets lack AlphaFold3 predictions due to representative-selection constraints at the PDB level. Specifically, two hard targets (7UXL_FE, 7UM3_HL) and one disagreement target (8SIT_MN) fall into the AF3_missing category. These omissions are technical rather than biological in origin and do not affect the interpretation of predictor behavior for the remaining cases.

Comparing the disagreement set (23 targets) with the hard set (25 targets) reveals that predictor disagreement is neither necessary nor sufficient for intrinsic difficulty. Some hard targets show agreement between IgFold and ABodyBuilder2 yet remain inaccurate, while some disagreement cases are readily resolved by AlphaFold3. This distinction reinforces the motivation for separating method disagreement from sequence-intrinsic difficulty when analyzing antibody structure prediction performance.

### References

[1] Brennan Abanades, Wing Ki Wong, Fergus Boyles, and Charlotte M. Deane. 2023.
ImmuneBuilder: Deep-Learning models for predicting the structures of immune proteins.
Communications Biology 6, 1 (2023), 575. https://doi.org/10.1038/s42003-023-04927-7

[2] Josh Abramson, Jonas Adler, Jack Dunger, et al. 2024.
Accurate structure prediction of biomolecular interactions with AlphaFold 3.
Nature 630 (2024), 493–500. https://doi.org/10.1038/s41586-024-07487-w

[3] Sharmila Anishetty, Gautam Pennathur, and Raghothama Anishetty. 2002.
Tripeptide Analysis of Protein Structures.
BMC Structural Biology 2 (2002), 9. https://doi.org/10.1186/1472-6807-2-9

[4] R. Anjana et al. 2012.
Aromatic–Aromatic Interactions in Structures of Proteins and Protein–DNA Complexes.
Bioinformation 8, 24 (2012), 1220–1224. https://doi.org/10.6026/97320630081220

[5] David J. Barlow and Janet M. Thornton. 1988.
Helix geometry in proteins.
Journal of Molecular Biology 201, 3 (1988), 601–619. https://doi.org/10.1016/0022-2836(88)90641-9

[6] Thomas M. Cover and Joy A. Thomas. 2006.
Elements of Information Theory (2nd ed.). Wiley.

[7] Janez Demšar. 2006.
Statistical Comparisons of Classifiers over Multiple Data Sets.
Journal of Machine Learning Research 7 (2006), 1–30.

[8] Jason E. Donald, Daniel W. Kulp, and William F. DeGrado. 2011.
Salt bridges: geometrically specific, designable interactions.
Proteins 79, 3 (2011), 898–915. https://doi.org/10.1002/prot.22927

[9] Rob J. Hyndman and Yanan Fan. 1996.
Sample Quantiles in Statistical Packages.
The American Statistician 50, 4 (1996), 361–365. https://doi.org/10.2307/2684934

[10] Daniel T. Infield et al. 2021.
Cation–π Interactions and their Functional Roles in Membrane Proteins.
Journal of Molecular Biology 433, 17 (2021), 167035. https://doi.org/10.1016/j.jmb.2021.167035

[11] J. Jacob, H. Duclohier, and David S. Cafiso. 1999.
The Role of Proline and Glycine in Determining the Backbone Flexibility of a Channel-Forming Peptide.
Biophysical Journal 76, 3 (1999), 1367–1376. https://doi.org/10.1016/S0006-3495(99)77298-X

[12] Wolfgang Kabsch. 1976.
A Solution for the Best Rotation to Relate Two Sets of Vectors.
Acta Crystallographica Section A 32, 5 (1976), 922–923. https://doi.org/10.1107/S0567739476001873

[13] Jack Kyte and Russell F. Doolittle. 1982.
A Simple Method for Displaying the Hydropathic Character of a Protein.
Journal of Molecular Biology 157, 1 (1982), 105–132. https://doi.org/10.1016/0022-2836(82)90515-0

[14] Tianyu Li et al. 2015.
Rigidity Emerges during Antibody Evolution in Three Distinct Antibody Systems.
PLoS Computational Biology 11, 7 (2015), e1004327. https://doi.org/10.1371/journal.pcbi.1004327

[15] Henry B. Mann and Donald R. Whitney. 1947.
On a Test of Whether One of Two Random Variables Is Stochastically Larger than the Other.
The Annals of Mathematical Statistics 18, 1 (1947), 50–60. https://doi.org/10.1214/aoms/1177730491

[16] Claire Marks and Charlotte M. Deane. 2017.
Antibody H3 Structure Prediction.
Computational and Structural Biotechnology Journal 15 (2017), 222–231. https://doi.org/10.1016/j.csbj.2017.01.010

[17] Gregory B. McGaughey, Marc Gagne, and Anthony K. Rappé. 1998.
π-Stacking interactions: Alive and well in proteins.
Journal of Biological Chemistry 273, 25 (1998), 15458–15463. https://doi.org/10.1074/jbc.273.25.15458

[18] Jiangbo Miao, Judith Klein-Seetharaman, and Hagai Meirovitch. 2004.
The Optimal Fraction of Hydrophobic Residues Required to Ensure Protein Collapse.
Journal of Molecular Biology 344, 3 (2004), 797–811. https://doi.org/10.1016/j.jmb.2004.09.061

[19] Meritxell Olivella et al. 2002.
Influence of the environment in the conformation of alpha-helices.
Biophysical Journal 82, 6 (2002), 3207–3213. https://doi.org/10.1016/S0006-3495(02)75663-4

[20] Hung-Pin Peng et al. 2022.
Antibody CDR amino acids underlying the functionality of antibody repertoires.
Scientific Reports 12 (2022), 12555. https://doi.org/10.1038/s41598-022-16841-9

[21] Cristian Regep et al. 2017.
The H3 loop of antibodies shows unique structural characteristics.
Proteins 85, 7 (2017), 1311–1318. https://doi.org/10.1002/prot.25291

[22] Bernard Rosner, Robert J. Glynn, and Mei-Ling T. Lee. 2006.
The Wilcoxon signed rank test for paired comparisons of clustered data.
Biometrics 62, 1 (2006), 185–192. https://doi.org/10.1111/j.1541-0420.2005.00389.x

[23] Jeffrey A. Ruffolo, Jeremias Sulam, and Jeffrey J. Gray. 2023.
Fast, accurate antibody structure prediction from deep learning on massive set of natural antibodies.
Nature Communications 14, 1 (2023), 2389. https://doi.org/10.1038/s41467-023-38063-x

[24] Amandeep K. Sangha et al. 2017.
Role of Non-local Interactions between CDR Loops in Binding Affinity.
Structure 25, 12 (2017), 1820–1828.e2. https://doi.org/10.1016/j.str.2017.10.005

[25] Claude E. Shannon. 1948.
A Mathematical Theory of Communication.
Bell System Technical Journal 27 (1948), 379–423, 623–656.

[26] Wouter van Loon. 2017.
The Power of the Benjamini–Hochberg Procedure. Master’s Thesis, Leiden University.

[27] U. Vignesh et al. 2024.
Ensemble Deep Learning Model for Protein Secondary Structure Prediction Using NLP Metrics and Explainable AI.
Results in Engineering 24 (2024), 103435. https://doi.org/10.1016/j.rineng.2024.103435

[28] Frank Wilcoxon. 1945.
Individual Comparisons by Ranking Methods.
Biometrics Bulletin 1, 6 (1945), 80–83. https://doi.org/10.2307/3001968

[29] Hao Xu et al. 2025.
In-Depth Study of Low-Complexity Domains.
Cells 14, 22 (2025), 1752. https://doi.org/10.3390/cells14221752