In [7]:
import pandas as pd
import numpy as np

# Submission CSVs to ensemble; every file must contain ID_COL and PRED_COL.
FILES = [
    "submission_member1.csv",
    "submission2_dedup.csv",
    "submission3.csv",
    "submission4.csv",
]

ID_COL = "id"       # key column used to align the submissions
PRED_COL = "binds"  # prediction column read from every submission

# One two-column frame (ID_COL, <model_name>) per file, in FILES order.
dfs = []
for path in FILES:
    df = pd.read_csv(path)
    # Tolerate stray whitespace around header names.
    df.columns = [c.strip() for c in df.columns]

    if ID_COL not in df.columns or PRED_COL not in df.columns:
        raise ValueError(f"{path}: must contain columns '{ID_COL}' and '{PRED_COL}'. Got: {df.columns.tolist()}")

    # An empty file has nothing to ensemble; fail here with a clear message.
    if df.empty:
        raise ValueError(f"{path}: file contains no rows")

    if not df[ID_COL].is_unique:
        dup = df[df[ID_COL].duplicated()][ID_COL].head(5).tolist()
        raise ValueError(f"{path}: '{ID_COL}' has duplicates. Example duplicated ids: {dup}")

    # Check every value rather than only min/max: pandas min()/max() skip NaN
    # by default, so a column containing NaN would otherwise pass unnoticed.
    if not np.isfinite(df[PRED_COL]).all():
        raise ValueError(f"{path}: non-finite predictions in '{PRED_COL}'")

    # Out-of-range values are only reported, not rejected.
    mn, mx = df[PRED_COL].min(), df[PRED_COL].max()
    if mn < 0 or mx > 1:
        print(f"Warning: {path} '{PRED_COL}' out of [0,1] range: min={mn}, max={mx}")

    # Rename the prediction column after the file so the merged frame has one
    # column per model.
    model_name = path.replace(".csv", "")
    df = df[[ID_COL, PRED_COL]].rename(columns={PRED_COL: model_name})
    dfs.append(df)

print("Loaded files:")
for d in dfs:
    print(" -", d.columns[1], "| rows:", len(d))

Loaded files:
 - submission_member1 | rows: 1674896
 - submission2_dedup | rows: 1674896
 - submission3 | rows: 1674896
 - submission4 | rows: 1674896


In [8]:
if not dfs:
    raise ValueError("No submissions loaded; nothing to merge.")

# Work on a copy so the ensemble columns added to `merged` never leak back
# into dfs[0] (which would happen when only one file is loaded).
merged = dfs[0].copy()
for other in dfs[1:]:
    # validate= makes pandas raise if either side has repeated ids, so the
    # join can never silently multiply rows.
    merged = merged.merge(other, on=ID_COL, how="inner", validate="one_to_one")

n_expected = len(dfs[0])
# An inner join keeps only ids present in every file. With unique ids per file,
# the merged length equals every file's length only when all id sets are
# identical. Comparing against the first file alone would miss extra ids that
# appear only in later files.
if any(len(d) != len(merged) for d in dfs):
    print(f"\n[!] Merge size differs: merged={len(merged)} vs first_file={n_expected}")
    base_ids = set(dfs[0][ID_COL])
    for other in dfs[1:]:
        other_ids = set(other[ID_COL])
        print(f"Missing in {other.columns[1]} relative to base: {len(base_ids - other_ids)}")
        print(f"Extra in {other.columns[1]} relative to base: {len(other_ids - base_ids)}")
    raise ValueError("Submissions do not align by 'id' (different test files or corrupted).")

# Every non-id column of the freshly merged frame is one model's predictions.
pred_cols = [c for c in merged.columns if c != ID_COL]

# Simple ensemble: unweighted arithmetic mean of the model predictions.
merged["binds_mean"] = merged[pred_cols].mean(axis=1)

In [9]:
def rank_avg(df_preds: pd.DataFrame) -> np.ndarray:
    """Rank-average the columns of ``df_preds`` and rescale to roughly [0, 1].

    Each column is ranked independently (ties receive the average rank), the
    ranks are averaged across columns per row, and the averaged ranks are
    min-max scaled.

    Parameters
    ----------
    df_preds : pd.DataFrame
        One column of scores per model, one row per sample.

    Returns
    -------
    np.ndarray
        1-D float array with one value per row. The smallest averaged rank
        maps to 0.0; the largest maps to just under 1.0 because of the small
        epsilon in the denominator. If all averaged ranks are equal, every
        value is 0.0. A zero-row input yields an empty array.

    Raises
    ------
    ValueError
        If ``df_preds`` has no columns (there would be nothing to average).
    """
    n_models = df_preds.shape[1]
    if n_models == 0:
        raise ValueError("rank_avg needs at least one prediction column")
    if len(df_preds) == 0:
        # min()/max() of an empty array would raise; nothing to rank anyway.
        return np.zeros(0, dtype=float)

    # Rank all columns at once (column-wise), then average across models.
    ranks = df_preds.rank(method="average").to_numpy(dtype=float).mean(axis=1)

    lo, hi = ranks.min(), ranks.max()
    # The epsilon keeps the division defined when every averaged rank is equal.
    return (ranks - lo) / (hi - lo + 1e-12)

In [10]:
# Rank-average ensemble over the same model columns used for the mean.
merged["binds_rankavg"] = merged[pred_cols].pipe(rank_avg)

In [11]:
# Submission frames: the id plus one ensemble score, renamed back to PRED_COL.
out_mean = merged[[ID_COL, "binds_mean"]].rename(columns={"binds_mean": PRED_COL})
out_rank = merged[[ID_COL, "binds_rankavg"]].rename(columns={"binds_rankavg": PRED_COL})

# Name the outputs after the number of models actually ensembled, so the
# filename cannot drift from the contents when FILES changes.
n_models = len(pred_cols)
out_mean.to_csv(f"submission_ensemble_mean_{n_models}_models.csv", index=False)
out_rank.to_csv(f"submission_ensemble_rankavg_{n_models}_models.csv", index=False)

# Pairwise Pearson correlation between the individual models' predictions.
# NOTE(review): the output recorded for this cell shows submission2_dedup and
# submission3 correlating at 1.000000 with identical correlations to the other
# models. That suggests the two files hold the same predictions, which would
# count that model twice in both ensembles; confirm and drop one if so.
corr = merged[pred_cols].corr()
print("\nModel correlation (Pearson):")
print(corr)



Model correlation (Pearson):
                    submission_member1  submission2_dedup  submission3  \
submission_member1            1.000000           0.581731     0.581731   
submission2_dedup             0.581731           1.000000     1.000000   
submission3                   0.581731           1.000000     1.000000   
submission4                   0.705325           0.479336     0.479336   

                    submission4  
submission_member1     0.705325  
submission2_dedup      0.479336  
submission3            0.479336  
submission4            1.000000  


In [1]:
import pandas as pd
import numpy as np

# Submission CSVs to ensemble; every file must contain ID_COL and PRED_COL.
FILES = [
    "submission_member1.csv",
    "submission2_dedup.csv",
    # Excluded from this run:
    # "submission3.csv",
    # "submission4.csv",
    "submission_gnn_kfold.csv"
]

ID_COL = "id"       # key column used to align the submissions
PRED_COL = "binds"  # prediction column read from every submission

# One two-column frame (ID_COL, <model_name>) per file, in FILES order.
dfs = []
for path in FILES:
    df = pd.read_csv(path)
    # Tolerate stray whitespace around header names.
    df.columns = [c.strip() for c in df.columns]

    if ID_COL not in df.columns or PRED_COL not in df.columns:
        raise ValueError(f"{path}: must contain columns '{ID_COL}' and '{PRED_COL}'. Got: {df.columns.tolist()}")

    # An empty file has nothing to ensemble; fail here with a clear message.
    if df.empty:
        raise ValueError(f"{path}: file contains no rows")

    if not df[ID_COL].is_unique:
        dup = df[df[ID_COL].duplicated()][ID_COL].head(5).tolist()
        raise ValueError(f"{path}: '{ID_COL}' has duplicates. Example duplicated ids: {dup}")

    # Check every value rather than only min/max: pandas min()/max() skip NaN
    # by default, so a column containing NaN would otherwise pass unnoticed.
    if not np.isfinite(df[PRED_COL]).all():
        raise ValueError(f"{path}: non-finite predictions in '{PRED_COL}'")

    # Out-of-range values are only reported, not rejected.
    mn, mx = df[PRED_COL].min(), df[PRED_COL].max()
    if mn < 0 or mx > 1:
        print(f"Warning: {path} '{PRED_COL}' out of [0,1] range: min={mn}, max={mx}")

    # Rename the prediction column after the file so the merged frame has one
    # column per model.
    model_name = path.replace(".csv", "")
    df = df[[ID_COL, PRED_COL]].rename(columns={PRED_COL: model_name})
    dfs.append(df)

print("Loaded files:")
for d in dfs:
    print(" -", d.columns[1], "| rows:", len(d))

Loaded files:
 - submission_member1 | rows: 1674896
 - submission2_dedup | rows: 1674896
 - submission_gnn_kfold | rows: 1674896


In [2]:
if not dfs:
    raise ValueError("No submissions loaded; nothing to merge.")

# Work on a copy so the ensemble columns added to `merged` never leak back
# into dfs[0] (which would happen when only one file is loaded).
merged = dfs[0].copy()
for other in dfs[1:]:
    # validate= makes pandas raise if either side has repeated ids, so the
    # join can never silently multiply rows.
    merged = merged.merge(other, on=ID_COL, how="inner", validate="one_to_one")

n_expected = len(dfs[0])
# An inner join keeps only ids present in every file. With unique ids per file,
# the merged length equals every file's length only when all id sets are
# identical. Comparing against the first file alone would miss extra ids that
# appear only in later files.
if any(len(d) != len(merged) for d in dfs):
    print(f"\n[!] Merge size differs: merged={len(merged)} vs first_file={n_expected}")
    base_ids = set(dfs[0][ID_COL])
    for other in dfs[1:]:
        other_ids = set(other[ID_COL])
        print(f"Missing in {other.columns[1]} relative to base: {len(base_ids - other_ids)}")
        print(f"Extra in {other.columns[1]} relative to base: {len(other_ids - base_ids)}")
    raise ValueError("Submissions do not align by 'id' (different test files or corrupted).")

# Every non-id column of the freshly merged frame is one model's predictions.
pred_cols = [c for c in merged.columns if c != ID_COL]

# Simple ensemble: unweighted arithmetic mean of the model predictions.
merged["binds_mean"] = merged[pred_cols].mean(axis=1)

In [3]:
def rank_avg(df_preds: pd.DataFrame) -> np.ndarray:
    """Rank-average the columns of ``df_preds`` and rescale to roughly [0, 1].

    Each column is ranked independently (ties receive the average rank), the
    ranks are averaged across columns per row, and the averaged ranks are
    min-max scaled.

    Parameters
    ----------
    df_preds : pd.DataFrame
        One column of scores per model, one row per sample.

    Returns
    -------
    np.ndarray
        1-D float array with one value per row. The smallest averaged rank
        maps to 0.0; the largest maps to just under 1.0 because of the small
        epsilon in the denominator. If all averaged ranks are equal, every
        value is 0.0. A zero-row input yields an empty array.

    Raises
    ------
    ValueError
        If ``df_preds`` has no columns (there would be nothing to average).
    """
    n_models = df_preds.shape[1]
    if n_models == 0:
        raise ValueError("rank_avg needs at least one prediction column")
    if len(df_preds) == 0:
        # min()/max() of an empty array would raise; nothing to rank anyway.
        return np.zeros(0, dtype=float)

    # Rank all columns at once (column-wise), then average across models.
    ranks = df_preds.rank(method="average").to_numpy(dtype=float).mean(axis=1)

    lo, hi = ranks.min(), ranks.max()
    # The epsilon keeps the division defined when every averaged rank is equal.
    return (ranks - lo) / (hi - lo + 1e-12)

In [4]:
# Rank-average ensemble over the same model columns used for the mean.
merged["binds_rankavg"] = merged[pred_cols].pipe(rank_avg)

In [5]:
# Submission frames: the id plus one ensemble score, renamed back to PRED_COL.
out_mean = merged[[ID_COL, "binds_mean"]].rename(columns={"binds_mean": PRED_COL})
out_rank = merged[[ID_COL, "binds_rankavg"]].rename(columns={"binds_rankavg": PRED_COL})

# Name the outputs after the number of models actually ensembled. The previous
# hard-coded "4_models" suffix mislabelled this three-model run and wrote over
# the files produced by the four-model run.
n_models = len(pred_cols)
out_mean.to_csv(f"submission_ensemble_mean_{n_models}_models.csv", index=False)
out_rank.to_csv(f"submission_ensemble_rankavg_{n_models}_models.csv", index=False)

# Pairwise Pearson correlation between the individual models' predictions.
corr = merged[pred_cols].corr()
print("\nModel correlation (Pearson):")
print(corr)


Model correlation (Pearson):
                      submission_member1  submission2_dedup  \
submission_member1              1.000000           0.581731   
submission2_dedup               0.581731           1.000000   
submission_gnn_kfold            0.705237           0.450031   

                      submission_gnn_kfold  
submission_member1                0.705237  
submission2_dedup                 0.450031  
submission_gnn_kfold              1.000000  
