In [4]:
import pandas as pd
import numpy as np

# Select rows where the framework RMSD is low but the H3 RMSD is high,
# i.e. rmsd_fv_noh3_ctx <= FV_GOOD and rmsd_h3_ctx >= H3_BAD, then show the
# worst H3 cases first.

# --- Configuration -----------------------------------------------------------
H3_FEATURES = "h3_features.csv"
FV_GOOD = 3.0  # inclusive upper bound applied to rmsd_fv_noh3_ctx
H3_BAD  = 3.0  # inclusive lower bound applied to rmsd_h3_ctx

# --- Load --------------------------------------------------------------------
df = pd.read_csv(H3_FEATURES)

# These columns are indexed unconditionally below. Check them once, up front,
# so a malformed CSV produces a single readable error instead of a bare
# KeyError from the middle of the cell.
REQUIRED_COLS = ["id", "error", "rmsd_fv_noh3_ctx", "rmsd_h3_ctx"]
missing_cols = [name for name in REQUIRED_COLS if name not in df.columns]
if missing_cols:
    raise KeyError(
        "%s is missing required column(s): %s" % (H3_FEATURES, ", ".join(missing_cols))
    )

# --- Normalise identifiers ---------------------------------------------------
# id_norm: id as a stripped, upper-cased string.
# base_pdb_id: the part of id_norm before the first underscore
# (e.g. "9ECX_CD" -> "9ECX").
df["id_norm"] = df["id"].astype(str).str.strip().str.upper()
df["base_pdb_id"] = df["id_norm"].str.split("_").str[0]

# --- Coerce RMSD columns to numeric ------------------------------------------
# errors="coerce" turns unparseable cells into NaN; those rows are dropped by
# the finiteness test in the mask. The two columns not listed in
# REQUIRED_COLS are optional and skipped when absent.
for c in ["rmsd_fv_noh3_ctx", "rmsd_h3_ctx", "rmsd_fv_all_ctx", "rmsd_h3_local"]:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# --- Keep rows without a recorded error --------------------------------------
# A row counts as OK when its `error` cell is missing or blank after stripping.
df_ok = df[df["error"].fillna("").astype(str).str.strip() == ""].copy()

# --- Good framework / bad H3 selection ---------------------------------------
# np.isfinite rejects NaN and +/-inf, which the plain comparisons alone would
# not fully exclude (e.g. -inf <= FV_GOOD, +inf >= H3_BAD).
mask = (
    np.isfinite(df_ok["rmsd_fv_noh3_ctx"]) & (df_ok["rmsd_fv_noh3_ctx"] <= FV_GOOD) &
    np.isfinite(df_ok["rmsd_h3_ctx"])      & (df_ok["rmsd_h3_ctx"]      >= H3_BAD)
)

hard_h3 = df_ok[mask].copy()

# --- Report ------------------------------------------------------------------
print("OK rows:", len(df_ok))
print("goodFV<=%.1f & badH3>=%.1f rows:" % (FV_GOOD, H3_BAD), len(hard_h3))
print("unique targets:", hard_h3["base_pdb_id"].nunique())

# Display only the columns that are actually present, ordered by largest
# rmsd_h3_ctx first and, within ties, smallest rmsd_fv_noh3_ctx first.
cols = ["base_pdb_id","id","method","label","rmsd_fv_noh3_ctx","rmsd_fv_all_ctx","rmsd_h3_ctx","rmsd_h3_local","h3_len","h3_seq"]
cols = [c for c in cols if c in hard_h3.columns]
hard_h3[cols].sort_values(["rmsd_h3_ctx","rmsd_fv_noh3_ctx"], ascending=[False, True]).head(50)

OK rows: 1138
goodFV<=3.0 & badH3>=3.0 rows: 560
unique targets: 324


Unnamed: 0,base_pdb_id,id,method,label,rmsd_fv_noh3_ctx,rmsd_fv_all_ctx,rmsd_h3_ctx,rmsd_h3_local,h3_len,h3_seq
940,9ECX,9ECX_CD,ABodyBuilder2,9ECX_CD,1.134066,4.603998,15.899359,6.164166,19,AGAITGTPRNFYYYYGMDV
245,9ECX,9ECX_CD,IGFold,9ECX_CD,1.176787,4.431633,15.230512,6.571091,19,AGAITGTPRNFYYYYGMDV
1115,7SL5,7SL5_AB,ABodyBuilder2,7SL5_AB,0.952237,4.100394,12.891693,6.856882,23,GLDVLRFLDLSTPSGERLDAFDI
140,7U0E,7U0E_AB,IGFold,7U0E_AB,1.153402,3.543406,12.430763,4.248468,17,VGKDDDVLTGGNKYFDH
176,8VBM,8VBM_HL,IGFold,8VBM_HL,0.97148,2.90954,12.219879,6.334798,11,VHQQTRKSCDG
65,8VBK,8VBK_HL,IGFold,8VBK_HL,0.995232,2.920471,12.207476,6.632258,11,VHQQTRKGCDG
837,7U0E,7U0E_AB,ABodyBuilder2,7U0E_AB,1.053593,3.124251,10.916466,4.324964,17,VGKDDDVLTGGNKYFDH
905,9BJG,9BJG_HL,ABodyBuilder2,9BJG_HL,0.76187,3.19682,10.692199,6.216907,20,DRFRGGYNYPSDIYSHAPDH
876,9ML8,9ML8_EF,ABodyBuilder2,9ML8_EF,2.249785,3.472338,10.169141,4.441364,17,GVYFYAGHFVIMRYFVL
209,9BJG,9BJG_HL,IGFold,9BJG_HL,0.814149,2.893209,9.571576,3.701567,20,DRFRGGYNYPSDIYSHAPDH


In [5]:
# Tally the selected rows by the value in their `method` column;
# dropna=False keeps a separate count for rows whose method is missing.
method_counts = hard_h3["method"].value_counts(dropna=False)
method_counts

method
ABodyBuilder2    305
IGFold           255
Name: count, dtype: int64