In [15]:
import pandas as pd
import numpy as np

In [16]:
# Load the per-complex weight table; the path is relative to the notebook's
# working directory.
input_csv = "complex_median_weights.csv"
df = pd.read_csv(input_csv)

In [17]:
# Preview the first 10 rows of the freshly loaded table (original upper-case headers).
df.head(10)

Unnamed: 0,ASSEMBLY_STR,PDB_COMPLEX_ID,ASSEMBLY_NAME,ASSEMBLY_TYPE,ASSEMBLIES,NUM_ASSEMBLIES,NUM_COMPONENTS,MEDIAN_WT,MAX_WT
0,A0A010_2,PDB-CPX-100020,MoeN5,homomeric,"5b00_1,5b00_2,5b01_1,5b01_2,5b01_3,5b01_4,5b01_5",7,2,64.52,65.18
1,A0A011_2,PDB-CPX-100021,MoeO5,homomeric,"3vk5_1,3vka_1,3vkb_1,3vkc_1,3vkd_1",5,2,63.15,63.56
2,A0A022MQ12_2,PDB-CPX-100028,Amidohydrolase-related domain-containing protein,homomeric,"6sj0_1,6sj1_1,6sj2_1,6sj3_1,6sj4_1",5,2,113.78,114.64
3,A0A022MRT4_2,PDB-CPX-100030,AMP-dependent synthetase/ligase domain-contain...,homomeric,"6six_1,6siy_1,6siz_1",3,2,98.65,98.84
4,A0A023DFE8_2,PDB-CPX-100031,Metallo-beta-lactamase domain-containing protein,homomeric,"6n9i_1,6n9i_2,6n9q_1,9ayt_1,9ayt_2,9b2i_1,9b2i...",16,2,70.4,71.83
5,"A0A023FDY8_1,P80098_1",PDB-CPX-100035,Evasin P974 and C-C motif chemokine 7,heteromeric,"7s58_1,7s58_2,7s58_3,7s58_4",4,2,18.71,18.71
6,A0A023GPI8_4,PDB-CPX-100040,Lectin alpha chain,homomeric,"4k1y_1,4k1z_1,4k20_1,4k21_1",4,4,104.87,105.35
7,A0A023PFZ0_3,PDB-CPX-100059,Fusion glycoprotein F0,homomeric,"8v5a_1,8v5k_1,8v62_1",3,3,245.4,246.78
8,A0A023UGN9_2,PDB-CPX-100064,Beta-galactosidase,homomeric,"6etz_1,6h1p_1,6se8_1,6se9_1,6sea_1,6seb_1,6sec...",17,2,221.55,225.74
9,A0A023X3Z4_2,PDB-CPX-100068,RsiG-like domain-containing protein,homomeric,"7lq2_1,7lq2_2,7lq2_3,7lq3_1,7lq3_2",5,2,19.54,19.73


In [18]:
# Map the upper-case source headers onto the snake_case names used by the
# rest of the notebook. Columns not listed here keep their original name.
column_renames = {
    "ASSEMBLY_STR": "asm_str",
    "PDB_COMPLEX_ID": "complex_id",
    "ASSEMBLY_NAME": "asm_name",
    "ASSEMBLY_TYPE": "asm_type",
    "ASSEMBLIES": "assemblies",
    "NUM_ASSEMBLIES": "num_assemblies",
    "NUM_COMPONENTS": "num_components",
    "MEDIAN_WT": "median_mw_kda",
    "MAX_WT": "max_mw_kda",
}
df = df.rename(columns=column_renames)

In [19]:
# Spot-check that the renamed (snake_case) column headers took effect.
df.head(5)

Unnamed: 0,asm_str,complex_id,asm_name,asm_type,assemblies,num_assemblies,num_components,median_mw_kda,max_mw_kda
0,A0A010_2,PDB-CPX-100020,MoeN5,homomeric,"5b00_1,5b00_2,5b01_1,5b01_2,5b01_3,5b01_4,5b01_5",7,2,64.52,65.18
1,A0A011_2,PDB-CPX-100021,MoeO5,homomeric,"3vk5_1,3vka_1,3vkb_1,3vkc_1,3vkd_1",5,2,63.15,63.56
2,A0A022MQ12_2,PDB-CPX-100028,Amidohydrolase-related domain-containing protein,homomeric,"6sj0_1,6sj1_1,6sj2_1,6sj3_1,6sj4_1",5,2,113.78,114.64
3,A0A022MRT4_2,PDB-CPX-100030,AMP-dependent synthetase/ligase domain-contain...,homomeric,"6six_1,6siy_1,6siz_1",3,2,98.65,98.84
4,A0A023DFE8_2,PDB-CPX-100031,Metallo-beta-lactamase domain-containing protein,homomeric,"6n9i_1,6n9i_2,6n9q_1,9ayt_1,9ayt_2,9b2i_1,9b2i...",16,2,70.4,71.83


In [20]:
# Force the two binning inputs to numeric dtype. With errors="coerce",
# any value that cannot be parsed becomes NaN instead of raising.
for numeric_col in ("num_assemblies", "median_mw_kda"):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

In [21]:
# Discard rows missing the identifier or either binning input; .copy()
# detaches the result from the frame it was filtered from.
required_cols = ["complex_id", "num_assemblies", "median_mw_kda"]
df = df.dropna(subset=required_cols).copy()

In [22]:
# ---------------------------------------------------------------
# Bin labels: A_BIN (from num_assemblies) and MW_BIN (from median_mw_kda)
# ---------------------------------------------------------------
# Every bin is half-open: the left edge is included, the right edge is not.
# Values below the first edge (num_assemblies < 3, median_mw_kda < 0) fall
# outside all bins and receive NaN.

# A bins (compute-aware): [3-4], [5-8], [9-20], [21-100], [>=101]
a_edges = [3, 5, 9, 21, 101, np.inf]
a_labels = ["A1_3-4", "A2_5-8", "A3_9-20", "A4_21-100", "A5_101+"]

# MW bins: [0, 60), [60, 120), [120, 300), [300, inf).
# Note that exactly 60 / 120 / 300 land in the *higher* bin, so a value of
# exactly 300 is labelled "MW4_>300".
mw_edges = [0, 60, 120, 300, np.inf]
mw_labels = ["MW1_<60", "MW2_60-120", "MW3_120-300", "MW4_>300"]


def _cut_left_closed(values, edges, labels):
    """Bin `values` into labelled intervals that include their left edge."""
    return pd.cut(values, bins=edges, labels=labels, right=False)


df["A_BIN"] = _cut_left_closed(df["num_assemblies"], a_edges, a_labels)
df["MW_BIN"] = _cut_left_closed(df["median_mw_kda"], mw_edges, mw_labels)

In [23]:
# Spot-check the new A_BIN / MW_BIN columns against their source values.
df.head(5)

Unnamed: 0,asm_str,complex_id,asm_name,asm_type,assemblies,num_assemblies,num_components,median_mw_kda,max_mw_kda,A_BIN,MW_BIN
0,A0A010_2,PDB-CPX-100020,MoeN5,homomeric,"5b00_1,5b00_2,5b01_1,5b01_2,5b01_3,5b01_4,5b01_5",7,2,64.52,65.18,A2_5-8,MW2_60-120
1,A0A011_2,PDB-CPX-100021,MoeO5,homomeric,"3vk5_1,3vka_1,3vkb_1,3vkc_1,3vkd_1",5,2,63.15,63.56,A2_5-8,MW2_60-120
2,A0A022MQ12_2,PDB-CPX-100028,Amidohydrolase-related domain-containing protein,homomeric,"6sj0_1,6sj1_1,6sj2_1,6sj3_1,6sj4_1",5,2,113.78,114.64,A2_5-8,MW2_60-120
3,A0A022MRT4_2,PDB-CPX-100030,AMP-dependent synthetase/ligase domain-contain...,homomeric,"6six_1,6siy_1,6siz_1",3,2,98.65,98.84,A1_3-4,MW2_60-120
4,A0A023DFE8_2,PDB-CPX-100031,Metallo-beta-lactamase domain-containing protein,homomeric,"6n9i_1,6n9i_2,6n9q_1,9ayt_1,9ayt_2,9b2i_1,9b2i...",16,2,70.4,71.83,A3_9-20,MW2_60-120


In [24]:
# Write "complexes_with_bins.csv" (the main formatted table): the complex
# identifier, the two binning inputs, and the two bin labels. The row index
# is not written.
out_with_bins = "complexes_with_bins.csv"
export_cols = ["complex_id", "num_assemblies", "median_mw_kda", "A_BIN", "MW_BIN"]
df_out = df.loc[:, export_cols].copy()
df_out.to_csv(out_with_bins, index=False)

In [25]:
# Preview the first 10 rows of the exported table.
df_out.head(10)

Unnamed: 0,complex_id,num_assemblies,median_mw_kda,A_BIN,MW_BIN
0,PDB-CPX-100020,7,64.52,A2_5-8,MW2_60-120
1,PDB-CPX-100021,5,63.15,A2_5-8,MW2_60-120
2,PDB-CPX-100028,5,113.78,A2_5-8,MW2_60-120
3,PDB-CPX-100030,3,98.65,A1_3-4,MW2_60-120
4,PDB-CPX-100031,16,70.4,A3_9-20,MW2_60-120
5,PDB-CPX-100035,4,18.71,A1_3-4,MW1_<60
6,PDB-CPX-100040,4,104.87,A1_3-4,MW2_60-120
7,PDB-CPX-100059,3,245.4,A1_3-4,MW3_120-300
8,PDB-CPX-100064,17,221.55,A3_9-20,MW3_120-300
9,PDB-CPX-100068,5,19.54,A2_5-8,MW1_<60


In [26]:
# Create cell counts table (how many complexes per (A_BIN, MW_BIN) cell).
# A_BIN and MW_BIN are categorical, so `observed` is passed explicitly:
# relying on the implicit default triggers a pandas FutureWarning, and the
# default is scheduled to change. observed=False keeps every bin combination
# in the table, including cells that contain zero complexes.
cell_counts = (
    df_out.groupby(["A_BIN", "MW_BIN"], observed=False)
          .size()
          .reset_index(name="n_complexes")
          .sort_values(["A_BIN", "MW_BIN"])
)
cell_counts.to_csv("cell_counts.csv", index=False)

  df_out.groupby(["A_BIN", "MW_BIN"])


In [27]:
# Display the per-cell complex counts.
cell_counts

Unnamed: 0,A_BIN,MW_BIN,n_complexes
0,A1_3-4,MW1_<60,2688
1,A1_3-4,MW2_60-120,2223
2,A1_3-4,MW3_120-300,1435
3,A1_3-4,MW4_>300,639
4,A2_5-8,MW1_<60,1470
5,A2_5-8,MW2_60-120,1340
6,A2_5-8,MW3_120-300,840
7,A2_5-8,MW4_>300,333
8,A3_9-20,MW1_<60,701
9,A3_9-20,MW2_60-120,669


In [28]:
# Choose the "median complex" in each (A_BIN, MW_BIN) cell
# Definition: sort by median_mw_kda, pick the middle row (upper median)
def pick_median_complex(group: pd.DataFrame) -> pd.DataFrame:
    """Return the row of `group` whose median_mw_kda is the (upper) median.

    Parameters
    ----------
    group : pd.DataFrame
        Rows belonging to one bin cell; must contain a "median_mw_kda" column.

    Returns
    -------
    pd.DataFrame
        A one-row frame (index reset to 0) holding the middle row after
        sorting by "median_mw_kda" ascending. For an even number of rows the
        upper of the two middle rows is chosen. An empty `group` yields an
        empty frame with the same columns.
    """
    g = group.sort_values("median_mw_kda", ascending=True).reset_index(drop=True)
    if g.empty:
        # No rows to choose from: return the empty frame instead of letting
        # .iloc[[0]] raise IndexError.
        return g
    idx = len(g) // 2   # upper median for even n
    return g.iloc[[idx]]

# Pick one representative complex per non-empty (A_BIN, MW_BIN) cell.
# The groups are iterated explicitly rather than via groupby.apply:
# apply operating on the grouping columns is deprecated in recent pandas, and
# once they are excluded A_BIN / MW_BIN would vanish from the output.
# observed=True restricts iteration to bin combinations that actually occur,
# so pick_median_complex never receives an empty cell.
# The `or [...]` fallback keeps pd.concat from raising when df_out has no
# groups at all; the result is then an empty frame with df_out's columns.
median_complexes = pd.concat(
    [
        pick_median_complex(cell)
        for _, cell in df_out.groupby(["A_BIN", "MW_BIN"], observed=True)
    ]
    or [df_out.iloc[0:0]],
    ignore_index=True,
)

median_complexes.to_csv("phase1_median_complexes.csv", index=False)

  df_out.groupby(["A_BIN", "MW_BIN"], group_keys=False)
  .apply(pick_median_complex)
