In [None]:
# import necessary packages
import pandas as pd
import rdkit
from rdkit import Chem
from standardiser import standardise
import logging
from rdkit.Chem import Descriptors

In [None]:
# Element symbols treated as metals. The cleaning step further down discards
# every compound whose SMILES matches one of these symbols.
METAL_ELEMENTS = (
    "Li Be Na Mg Al K Ca Sc Ti V Cr Mn Fe Co "
    "Ni Cu Zn Ga Ge Rb Sr Y Zr Nb Mo Tc Ru Rh "
    "Pd Ag Cd In Sn Sb Cs Ba Lu Hf Ta W Re Os "
    "Ir Pt Au Hg Tl Pb Bi Po Fr Ra Lr Ho"
).split()

In [None]:
# Read the raw activity table (CSV file downloaded from the database) into `df`.
raw_csv_path = 'PIM1.csv'
df = pd.read_csv(raw_csv_path)

In [None]:
# Discard every record that has no value in the 'Smiles' column.
df = df.dropna(subset=["Smiles"])

In [None]:
# delete molecules with metal element
#
# Metals are not part of the SMILES "organic subset", so a metal atom is always
# written as a bracket atom, e.g. "[Na+]" or "[99Tc]". A bare substring search
# for the symbol gives false positives on ordinary organic SMILES: "Sc" matches
# the sulfur + aromatic carbon in "CSc1ccccc1", "Sn" matches S + aromatic n, etc.
# The pattern is therefore anchored to "[" followed by an optional isotope
# number, and the negative lookahead stops a one-letter symbol from matching a
# longer one (e.g. "Y" must not match "[Yb]").
for metal in METAL_ELEMENTS:
    metal_atom_pattern = r'\[\d*' + metal + r'(?![a-z])'
    df = df[~df['Smiles'].str.contains(metal_atom_pattern, regex=True)]
    # progress report: remaining table shape after removing this metal
    print("no {}".format(metal),df.shape)

In [None]:
# standardize molecules
#
# Each SMILES is parsed, given explicit hydrogens, passed through
# standardise.run() and written back as the SMILES of the returned molecule.
# Rows whose SMILES RDKit cannot parse are logged and removed: MolFromSmiles
# returns None for them, and passing None on to AddHs would raise an error the
# except clause below does not catch, aborting the whole cell.
# NOTE(review): rows that raise StandardiseException are only logged and keep
# their original, un-standardised SMILES — confirm this is intended.
unparsable_rows = []
for i in df.index:
    smi = df.loc[i, 'Smiles']
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        logging.warning("could not parse SMILES %r (row %s); row removed", smi, i)
        unparsable_rows.append(i)
        continue
    try:
        mol = Chem.AddHs(mol)
        parent = standardise.run(mol)
        mol_ok_smi = Chem.MolToSmiles(parent)
        df.loc[i, 'Smiles'] = mol_ok_smi
    except standardise.StandardiseException as e:
        # `.message` is not a standard attribute on Python 3 exceptions; fall
        # back to str(e) if this exception class does not provide it.
        logging.warning(getattr(e, 'message', str(e)))
df = df.drop(index=unparsable_rows)

In [None]:
# Collapse rows that are identical in every column, keeping the first occurrence.
df = df.drop_duplicates()

In [None]:
# Subset of rows whose 'Standard Type' is exactly "IC50".
df1 = df.loc[df['Standard Type'] == "IC50"]

In [None]:
# Subset of rows whose 'Standard Type' is exactly "Ki".
df3 = df.loc[df['Standard Type'] == "Ki"]

In [None]:
# Subset of rows whose 'Standard Type' is exactly "Kd".
df4 = df.loc[df['Standard Type'] == "Kd"]

In [None]:
# Stack the IC50, Ki and Kd subsets (in that order) back into one table.
frames = [df1, df3, df4]
df = pd.concat(objs=frames)

In [None]:
# Keep only rows whose 'Standard Relation' is the literal text '=' including its
# single quotes (not necessary if the dataset is small scale).
df = df.loc[df['Standard Relation'] == "'='"]

In [None]:
# Keep only rows whose 'Assay Type' is "B" (B = binding, F = functional).
df = df.loc[df['Assay Type'] == "B"]

In [None]:
# Remove rows with a null in any of the columns the later steps rely on.
required_columns = ['Smiles', 'Standard Units', 'Molecule ChEMBL ID', 'Standard Value']
df = df.dropna(subset=required_columns)

In [None]:
# Compute the molecular weight of every SMILES with RDKit and store it in a
# new 'molecular_weight' column.
molweight = [Descriptors.MolWt(Chem.MolFromSmiles(smiles)) for smiles in df['Smiles']]
df['molecular_weight'] = molweight

In [None]:
# Keep only molecules whose molecular weight is at most 1000.
df = df.loc[df['molecular_weight'].le(1000)]

In [None]:
# Compute RDKit's MolLogP for every SMILES and store it in a new 'logP' column.
logP = [Descriptors.MolLogP(Chem.MolFromSmiles(smiles)) for smiles in df['Smiles']]
df['logP'] = logP

In [None]:
# units conversion (convert all units to nanomole, nM)
#
# Rows already in nM are kept unchanged. Rows in ug.mL-1 are converted with
#   value / molecular_weight * 1e6
# and relabelled as nM. Rows in any other unit are not carried over.
df['molecular_weight']=df['molecular_weight'].astype("float64")
df2 = df[df['Standard Units'].isin(["nM"])]
# .copy() so the assignments below modify an independent frame rather than a
# slice of `df` (avoids SettingWithCopyWarning and lost writes).
df3 = df[df['Standard Units'].isin(["ug.mL-1"])].copy()
df3['Standard Value'] = df3['Standard Value']/df3['molecular_weight']*1000000
df3['Standard Units'] = "nM"
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result.
df = pd.concat([df2, df3])

In [None]:
# Average 'Standard Value' over all rows that share a 'Molecule ChEMBL ID' and
# attach that per-compound mean to every row as 'standard_value_mean'.
values_by_compound = df.groupby('Molecule ChEMBL ID')['Standard Value']
df_mean = values_by_compound.mean()
print('df',df_mean.shape)

# lookup table: ChEMBL ID -> mean value
df_mean_dict = df_mean.to_dict()
df['standard_value_mean'] = df['Molecule ChEMBL ID'].apply(
    lambda chembl_id: df_mean_dict[chembl_id]
)

In [None]:
# Drop the per-measurement 'Standard Value' column, keep the first row for each
# 'Molecule ChEMBL ID', and renumber the rows (the previous index is kept as an
# 'index' column).
df = (
    df.drop(columns='Standard Value')
      .drop_duplicates(subset=['Molecule ChEMBL ID'])
      .reset_index()
)

In [None]:
# Turn the mean activity value into a binary label: 1 when the mean is at most
# 10000, 0 when it is above 10000. Then reduce the table to two columns named
# SMILES and LABEL.
is_active = df['standard_value_mean'] <= 10000
is_inactive = df['standard_value_mean'] > 10000
df.loc[is_active, 'standard_value_mean'] = 1
df.loc[is_inactive, 'standard_value_mean'] = 0
df = df.reindex(columns=['Smiles', 'standard_value_mean'])
df.columns = ['SMILES', 'LABEL']

In [None]:
# Write the cleaned table to 'PIM1_data_washed.csv'.
washed_csv_path = 'PIM1_data_washed.csv'
df.to_csv(washed_csv_path, index=None)