In [1]:
import pandas as pd
import os
from tqdm.auto import tqdm
from pdb_processor import PDBProcessor
from Bio.PDB import PDBParser

In [2]:
# Read the PPB-Affinity spreadsheet, then discard the "Unnamed: 0" column
# before previewing the first rows.
df = pd.read_excel("PPB-Affinity.xlsx")
df = df.drop("Unnamed: 0", axis=1)
df.head()

Unnamed: 0,Source Data Set,Complex ID,PDB,Mutations,Ligand Chains,Receptor Chains,Ligand Name,Receptor Name,KD(M),Affinity Method,Structure Method,Temperature(K),Resolution(Å),PDB PubMed ID,PDB Release Date,Affinity PubMed ID,Affinity Release Date,Subgroup
0,SKEMPI v2.0,"1A22:A, B::PMID=7504735",1A22,,A,B,Human growth hormone,hGH binding protein,9e-10,SPR,X-RAY DIFFRACTION,298,2.6,9571026.0,1998-04-29,7504735,1993 Dec 5,
1,SKEMPI v2.0,"1A4Y:A, B::PMID=9050852",1A4Y,,A,B,Ribonuclease inhibitor,Angiogenin,5e-16,Other,X-RAY DIFFRACTION,298,2.0,9311977.0,1998-10-14,9050852,1997 Mar 4,
2,SKEMPI v2.0,"1ACB:E, I::PMID=9048543",1ACB,,E,I,Bovine alpha-chymotrypsin,Eglin c,1.49e-12,IASP,X-RAY DIFFRACTION,294,2.0,1583684.0,1993-10-31,9048543,1997 Feb 18,
3,SKEMPI v2.0,"1AHW:A, B, C::PMID=9480775",1AHW,,"A, B",C,Immunoglobulin fab 5G9,Tissue factor,3.4e-09,IASP,X-RAY DIFFRACTION,298(assumed),3.0,9480775.0,1998-02-25,9480775,1998 Feb 6,
4,SKEMPI v2.0,"1AK4:A, D::PMID=9223641",1AK4,,A,D,Cyclophilin A,HIV-1 capsid protein,1.2e-05,SPR,X-RAY DIFFRACTION,298(assumed),2.36,8980234.0,1997-10-15,9223641,1997 Jun 27,


# Clean the data from wrong entries

The following entries have at least one mutation in which the specified
position contains an amino acid that is different from the wild-type amino
acid specified in the mutation itself. We don't know why this is the case, but
we will drop these entries for safety.

In [3]:
# Complex IDs whose mutation string names a wild-type residue that does not
# match the residue found at that position.
ids_to_drop = [
    # 3QIW: mutation C_K8E expects K at position 8 of chain C, but the
    # wild-type residue there is L.
    "3QIW:A, B, C, D, E:C_K8E:PMID=21490152",
    # 2P5E: mutation E_G50A expects G at position 50 of chain E, but the
    # wild-type residue there is I.
    "2P5E:A, C, D, E:D_G97D, D_Q51T, D_S52P, D_S53W, D_S96L, D_S99T, D_T95L, E_A51I, E_G50A, E_G52Q, E_I53T, E_V95L:PMID=17644531",
    # 3QIU: mutation C_K8R expects K at position 8 of chain C, but the
    # wild-type residue there is L.
    "3QIU:A, B, C, D, E:C_K8R:PMID=21490152",
]
# Keep only the rows that are not on the list, then renumber the index.
df = df.loc[~df["Complex ID"].isin(ids_to_drop)]
df = df.reset_index(drop=True)

## Chain Validation and Missing Chains Detection
This section validates the presence of ligand and receptor chains specified in the data against actual PDB files.

**Chain validation and correction steps (for each ligand and receptor in each row):**
1. Check if the chain is present in the PDB file.
2. If the chain is not present, check whether the chain ID is lowercase and its uppercase version is also listed among the chains of the same row. If so, remove the lowercase ID from the corresponding list of chains so that only the uppercase version remains.
3. If the second check fails, drop the row.
4. If a chain ID is an empty string (for example, the first entry in ', C, D'), drop the row.


In [4]:
# Validate the ligand/receptor chain IDs of every row against the chains that
# are actually present in the corresponding PDB file.
#
# Results:
#   * complex_ids_with_missing_chains maps a Complex ID to a list of
#     "Missing <ligand|receptor> chain <id>" messages.
#   * df is updated in place for rows where a lowercase chain ID merely
#     duplicates an uppercase ID that is listed for the same row: the
#     lowercase ID is removed from the stored chain list.
parser = PDBParser(QUIET=True)
complex_ids_with_missing_chains = dict()
# Chain IDs per PDB file path, so a structure shared by several rows is parsed
# only once.
chain_ids_by_path = {}

for i, row in tqdm(df.iterrows(), total=len(df)):
    pdb_id = row["PDB"]
    complex_id = row["Complex ID"]
    source_data = row["Source Data Set"]

    # File naming differs between the source data sets.
    if source_data in ["SKEMPI v2.0", "Affinity Benchmark v5.5", "ATLAS"]:
        pdb_path = os.path.join("PDB", source_data, f"{pdb_id}.pdb")
    elif source_data == "SAbDab":
        pdb_path = os.path.join(
            "PDB", source_data, f"{pdb_id.lower()}.pdb"
        )
    elif source_data == "PDBbind v2020":
        pdb_path = os.path.join(
            "PDB", source_data, f"{pdb_id.lower()}.ent.pdb"
        )
    else:
        raise ValueError(f"Unknown source data set: {source_data}")

    if pdb_path not in chain_ids_by_path:
        structure = parser.get_structure(pdb_id, pdb_path)
        chain_ids_by_path[pdb_path] = {
            chain.id for chain in structure.get_chains()
        }
    chains_in_pdb = chain_ids_by_path[pdb_path]

    # Ligand chains are checked before receptor chains so the messages keep
    # that order within each complex.
    for role, column in (
        ("ligand", "Ligand Chains"),
        ("receptor", "Receptor Chains"),
    ):
        listed_chains = row[column].replace(" ", "").split(",")
        kept_chains = []
        for chain in listed_chains:
            if chain in chains_in_pdb:
                kept_chains.append(chain)
            elif chain.islower() and chain.upper() in listed_chains:
                # Redundant lowercase ID: its uppercase form is listed too,
                # so drop it instead of reporting it as missing.
                continue
            else:
                # Genuinely missing (this includes empty IDs such as the
                # first entry of ", C, D").
                kept_chains.append(chain)
                complex_ids_with_missing_chains.setdefault(
                    complex_id, []
                ).append(f"Missing {role} chain {chain}")
        # Write back once per column, from the cleaned list, so that several
        # removals in one row are all applied and the position of the removed
        # ID within the original string does not matter.
        if len(kept_chains) != len(listed_chains):
            df.loc[i, column] = ", ".join(kept_chains)

print(
    "Found total of",
    len(complex_ids_with_missing_chains),
    "complexes with missing chains. Listing the complexes with missing chains:",
)

for complex_id, missing_chains in complex_ids_with_missing_chains.items():
    print(complex_id, missing_chains)

  0%|          | 0/12059 [00:00<?, ?it/s]

Found total of 11 complexes with missing chains. Listing the complexes with missing chains:
5Y9J:A, B, C, H, L::PMID=Shin et al., 2018 ['Missing receptor chain B', 'Missing receptor chain C']
3LVK:A, B, C::PMID=23281480 ['Missing ligand chain C']
3WD5:A, B, C, H, L::PMID=Hu et al., 2013 ['Missing receptor chain B', 'Missing receptor chain C']
1DEE:, C, D, G::PMID=10805799 ['Missing ligand chain ']
2WUB:A, H, J::PMID=nan ['Missing ligand chain J']
3PL6:A, B, C, D, E::PMID=21199956 ['Missing receptor chain E']
4EEF:, A, B, G::PMID=22634563 ['Missing ligand chain ']
5KVE:E, H, L::PMID=27475895 ['Missing ligand chain H']
6B0G:, C, D, E::PMID=29146922 ['Missing receptor chain ']
6O07:A, C, D::PMID=31155310 ['Missing receptor chain D']
4L3E:A, B, C, D, E:D_D26Y, E_L98W:PMID=24550723 ['Missing ligand chain B']


In [5]:
# Drop every complex that was flagged as having missing chains. The index is
# rebuilt, as is done after the earlier clean-up filter, so the result is a
# standalone, contiguously indexed frame before new columns are attached.
df = df[
    ~df["Complex ID"].isin(list(complex_ids_with_missing_chains.keys()))
].reset_index(drop=True)

In [6]:
def _dedupe_chain_ids(chains):
    """Return ``chains`` without repeats or redundant lowercase IDs.

    A lowercase ID is dropped when its uppercase form is also in ``chains``.
    First-seen order is preserved, so the order of the joined sequences is
    the same on every run (a ``set`` round-trip would make it depend on
    string hash randomisation).
    """
    unique_chains = list(dict.fromkeys(chains))
    return [
        chain
        for chain in unique_chains
        if not (chain.islower() and chain.upper() in unique_chains)
    ]


# Build the ligand and receptor sequence strings for every remaining row and
# write the augmented table to "PPB_Affinity_processed.csv".
ligand_sequences = []
receptor_sequences = []
pp = PDBProcessor()
for _, row in tqdm(df.iterrows(), total=len(df)):
    pdb_id = row["PDB"]
    complex_id = row["Complex ID"]
    source_data = row["Source Data Set"]

    # Mutations are stored as a comma-separated string; a missing value means
    # there are no mutations for this row.
    mutations = row["Mutations"]
    if pd.isna(mutations):
        mutations = None
    else:
        mutations = mutations.replace(" ", "").split(",")

    ligand_chains = row["Ligand Chains"].replace(" ", "").split(",")
    receptor_chains = row["Receptor Chains"].replace(" ", "").split(",")

    # File naming differs between the source data sets.
    if source_data in ["SKEMPI v2.0", "Affinity Benchmark v5.5", "ATLAS"]:
        pdb_path = os.path.join("PDB", source_data, f"{pdb_id}.pdb")
    elif source_data == "SAbDab":
        pdb_path = os.path.join("PDB", source_data, f"{pdb_id.lower()}.pdb")
        # For SAbDab rows, drop repeated IDs and lowercase IDs whose
        # uppercase form is listed as well.
        ligand_chains = _dedupe_chain_ids(ligand_chains)
        receptor_chains = _dedupe_chain_ids(receptor_chains)
    elif source_data == "PDBbind v2020":
        pdb_path = os.path.join(
            "PDB", source_data, f"{pdb_id.lower()}.ent.pdb"
        )
    else:
        raise ValueError(f"Unknown source data set: {source_data}")

    if pdb_id == "3QIB" and mutations is not None and source_data == "ATLAS":
        for idx, mutation in enumerate(mutations):
            # all mutations associated with 3QIB, ATLAS, and chain C are off by
            # one residue. This happened in multiple entries in the dataset,
            # so we correct it here.
            if mutation.startswith("C_"):
                # "C_K8E" -> residue number "8" (strip wild-type and mutant
                # letters), then shift it by one.
                mutation_loc = mutation.split("_")[1][1:-1]
                mutations[idx] = mutation.replace(
                    mutation_loc, str(int(mutation_loc) + 1)
                )

    # NOTE(review): PDBProcessor is project code not shown here; the result is
    # indexed by chain ID below, so it is presumably a mapping from chain ID
    # to sequence string - confirm against pdb_processor.
    protein_seqs = pp.pdb_to_processed_seqs(
        pdb_id=pdb_id,
        pdb_path=pdb_path,
        chains=ligand_chains + receptor_chains,
        mutations=mutations,
        recover_missing_residues=True,
        remove_unk_residues=True,
    )
    # One comma-separated string per side, in the order the chains are listed.
    ligand_seq = ",".join([protein_seqs[chain] for chain in ligand_chains])
    receptor_seq = ",".join([protein_seqs[chain] for chain in receptor_chains])
    ligand_sequences.append(ligand_seq)
    receptor_sequences.append(receptor_seq)

df["Ligand Sequences"] = ligand_sequences
df["Receptor Sequences"] = receptor_sequences
df.to_csv("PPB_Affinity_processed.csv", index=False)

  0%|          | 0/12048 [00:00<?, ?it/s]

