In [1]:
# Install dependencies
!pip install -q pyarrow &> /dev/null


In [2]:
from aiondata import BindingAffinity
import polars as pl


# Load BindingDB into a Polars DataFrame
df = BindingAffinity().to_df()

In [3]:

# Count the number of NULL values
null_count = df.select(pl.col('PDB ID').is_null().sum()).to_numpy()[0, 0]
# Count the number of non-NULL values
non_null_count = df.select(pl.col('PDB ID').is_not_null().sum()).to_numpy()[0, 0]

print(f"Number of NULL values: {null_count}")
print(f"Number of non-NULL values: {non_null_count}")


Number of NULL values: 559044
Number of non-NULL values: 32399


In [4]:

# Filter out rows with missing PDB IDs
cleaned_df = df.filter(pl.col('PDB ID').is_not_null())

#make csv from cleaned_df 
cleaned_df.write_csv('../data/cleaned_bindingdb.csv')

# change to pandas
cleaned_dfp = cleaned_df.to_pandas()
cleaned_dfp



Unnamed: 0,PDB ID,SMILES,Sequence,Ki (nM),IC50 (nM),Kd (nM),EC50 (nM),pH,Temp C,Organism,Source
0,3H1P,[H]OC(=O)C([H])([H])C([H])([H])[C@@]([H])(C(=O...,MENTENSVDSKSIKNLEPKIIHGSESMDSGISLDNSYKMDYPEMGL...,0.230,,,,7.5,25.0,Homo sapiens,Curated from the literature by BindingDB
1,"6DIF,6DIL,3SPK,2O4P,2O4L,2O4N,1D4S,1D4Y,4NJU",[H]c1nc(S(=O)(=O)N([H])c2c([H])c([H])c([H])c([...,PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,0.008,,,,5.0,22.0,Human immunodeficiency virus 1,Curated from the literature by BindingDB
2,4Y6K,[H]O[C@@]([H])(C([H])([H])[C@]([H])(C(=O)N([H]...,PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,,0.6,,,5.5,30.0,Human immunodeficiency virus 1,Curated from the literature by BindingDB
3,1HVH,[H]OC([H])([H])c1c([H])c([H])c(C([H])([H])N2C(...,PQVTLWQRPLVTIKIGGQLKEALLDTGADDTVLEEMSLPGRWKPKM...,11.000,,,,5.5,37.0,Human immunodeficiency virus 1,Curated from the literature by BindingDB
4,"4EIP,2R0P",[H]c1c([H])c([H])c2c(c1[H])c1c3c(c4c5c([H])c([...,MGNAAAAKKGSEQESVKEFLAKAKEDFLKKWENPAQNTAHLDQFER...,,25250.0,,,7.4,30.0,Bos taurus,Curated from the literature by BindingDB
...,...,...,...,...,...,...,...,...,...,...,...
32394,5NB7,[H]c1c(Br)nc(N([H])C(=O)[C@@]2([H])N(C(=O)C([H...,MHSWERLAVLVLLGAAACAAPPRGRILGGREAEAHARPYMASVQLN...,,6.0,,,,,,ChEMBL
32395,"5MO4,3CS9,3GP0",[H]c1nc(N([H])c2c([H])c(C(=O)N([H])c3c([H])c(-...,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,,30.0,,,,,Homo sapiens,ChEMBL
32396,"6DT6,3I7Z,1VOM,2V26,6PHS,1DFL,2HY3,3RNT,5I0I,3...",O=[V]([O-])([O-])[O-],MEMEKEFEQIDKSGSWAAIYQDIRHEASDFPCRVAKLPKNKNRNRY...,,2940.0,,,,,Homo sapiens,ChEMBL
32397,"6DT6,3I7Z,1VOM,2V26,6PHS,1DFL,2HY3,3RNT,5I0I,3...",O=[V]([O-])([O-])[O-],MPTTIEREFEELDTQRRWQPLYLEIRNESHDYPHRVAKFPENRNRN...,,1960.0,,,,,Homo sapiens,ChEMBL


In [5]:
#reverse the true and false values
multiPDF=cleaned_dfp['PDB ID'].str.contains(',').value_counts()
print(f"Number of rows with multiple PDB IDs: {multiPDF}")



Number of rows with multiple PDB IDs: PDB ID
True     19310
False    13089
Name: count, dtype: int64


In [6]:
# Create a frequency table of the 'Source' column
frequency_table_pd = cleaned_dfp['Source'].value_counts().reset_index()
print(frequency_table_pd)
# Write the pandas DataFrame to a CSV file
frequency_table_pd.to_csv('../data/source_frequency.csv', index=False)



# Create a frequency table of the 'organism' column
frequency_table_pd = cleaned_dfp['Organism'].value_counts().reset_index()
print(frequency_table_pd)
# Write the pandas DataFrame to a CSV file
frequency_table_pd.to_csv('../data/organism_frequency.csv', index=False)



                                     Source  count
0                                    ChEMBL  27272
1  Curated from the literature by BindingDB   1901
2                                   PDSP Ki   1360
3                                 US Patent   1104
4                                   PubChem    610
5                                       D3R    101
6                                      CSAR     41
7                                      WIPO      8
8               Taylor Research Group, UCSD      2
                                             Organism  count
0                                        Homo sapiens  21038
1                                   Rattus norvegicus   1003
2                                              Rattus    676
3                      Human immunodeficiency virus 1    424
4                                          Bos taurus    324
..                                                ...    ...
70                                     Macaca mulatta      1
71

In [8]:






# Ensure 'PDB ID' column contains strings
cleaned_dfp['PDB ID'] = cleaned_dfp['PDB ID'].astype(str)

# Remove brackets and split the 'PDB ID' column by commas
cleaned_dfp['PDB ID'] = cleaned_dfp['PDB ID'].str.strip('[]').str.split(',')

# Explode the DataFrame to expand lists into rows
df_expanded = cleaned_dfp.explode('PDB ID')

# Reset index if needed
df_expanded.reset_index(drop=True, inplace=True)

#make csv from df_expanded
df_expanded.to_csv('../data/expanded_bindingdb.csv', index=False)

In [None]:

# Get reference_sequence_identity using the PDBHandler
from aiondata import protein_structure
ph=protein_structure.PDBHandler()
js=ph.get_pdb_info("4Y6K")
js

In [None]:
# Get the reference_sequence_identity for the first 100 PDB IDs
ident={}
for f in df_expanded['PDB ID'][0:100]:
    js=ph.get_pdb_info(f)
    if "rcsb_binding_affinity" not in js:
        continue
    for i in js["rcsb_binding_affinity"]:
        if i["provenance_code"]=="BindingDB":
            ident[f]=i["reference_sequence_identity"]
            break
print(ident)



