### Load data from "PDBbind2019PLdata.csv" and filter out the rows with invalid ligands. A ligandName is invalid if it satisfies any of the following conditions:
1. It cannot be found in ligand_dict.
2. It cannot be converted to ECFP successfully.

In [10]:
import json
import pandas as pd
import numpy as np

from rdkit.Chem import AllChem as Chem
from rdkit.Chem import MolFromSmiles  

In [21]:

## Filter out invalid ligandName which cannot be found in ligand_dict or cannot be converted to ECFP
print("Filter out invalid ligandName which cannot be found in ligand_dict or cannot be converted to ECFP")
with open('Ligand_Dict.json') as f:
    ligand_dict = json.load(f)
    
file_path = "PDBbind2019PLdata.csv"
pdb_data = pd.read_csv(file_path)
print("Original data size = ", pdb_data.shape)
invalid_row_indices = []
for i, ligandName in enumerate(pdb_data['ligandName']):
  #Filter out the rows with ligandName not found in ligand_dict 
  if ligandName not in ligand_dict:
    invalid_row_indices.append(i)
    continue

  # Filter out invalid ligandName which cannot be converted to ECFP
  smile = ligand_dict[ligandName]
  try:
      molecule = MolFromSmiles(smile)
      ECFP2 = Chem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=NBITS).ToBitString()
      ECFP2arr = np.array(list(map(int, ECFP2)))
  except:
      invalid_row_indices.append(i)

print("len(invalid_row_indices) = ", len(invalid_row_indices))
pdb_data = pdb_data.drop(invalid_row_indices)
print("Current data size: ", pdb_data.shape)


Current data size:  (15128, 3)

len(invalid_row_indices) =  15128


KeyError: '[    3    26    27 ... 15085 15107 15108] not found in axis'

## Store into file 'PDBbind2019PLdata_cleaned.csv'

In [22]:
# Persist the current `pdb_data` frame as CSV in the working directory.
# `index` is left at its default, so the row index is written as the first column.
pdb_data.to_csv(path_or_buf='PDBbind2019PLdata_cleaned.csv')

In [23]:
# Try reading: load the CSV and expose three of its columns as NumPy arrays.
file_path = "../Data/Unused/PDBbind2019PLdata.csv"
pdb_data = pd.read_csv(file_path)
print("pdb_data.shape:", pdb_data.shape)

# One array per column, unpacked in the same order as the column names listed
pdb_proteinName, pdb_affinity, pdb_ligandName = (
    np.array(pdb_data[column_name])
    for column_name in ('proteinName', 'affinity', 'ligandName')
)

# Report the length of each extracted array
print("\npdb_proteinName.shape:", pdb_proteinName.shape)
print("pdb_affinity.shape:", pdb_affinity.shape)
print("pdb_ligandName.shape:", pdb_ligandName.shape)

pdb_data.shape: (17679, 3)

pdb_proteinName.shape: (17679,)
pdb_affinity.shape: (17679,)
pdb_ligandName.shape: (17679,)
