In [16]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors



In [55]:
df_curatedSol = pd.read_csv('../Data/curated-solubility-dataset.csv')
df_biogen = pd.read_csv('../Data/ADME_public_set_3521.csv')
df_bioavailability = pd.read_csv('../Data/11095_2013_1222_MOESM2_ESM.csv', sep=';')
df_ESOL = pd.read_csv('../Data/ESOL_data.csv')
df_Water_Octanol = pd.read_csv('../Data/Water_Octanol_distribution_ph7_4.csv')
df_Bold_brain_barrier = pd.read_csv('../Data/Blood-brain_barrier_binary.csv')
df_Kinetic_solubility = pd.read_csv('../Data/KineticAqueousSolubility.csv')
df_permeability = pd.read_csv('../Data/Permeability_ph7_4.csv')
df_PAMPA_ph7_4 = pd.read_csv('../Data/PAMPA_ph_7_4.csv')

In [98]:
#df_curatedSol.head()
#df_bioavailability.head() #995
#df_biogen.head() #3521
#df_ESOL.head() #1128
#df_Water_Octanol.head()#4200
df_Bold_brain_barrier.head() #2050
#df_Kinetic_solubility.shape #2532
#df_permeability.shape #2530
#df_PAMPA_ph7_4.shape #2532

Unnamed: 0,num,name,p_np,smiles
0,1,Propanolol,1,[Cl].CC(C)NCC(O)COc1cccc2ccccc12
1,2,Terbutylchlorambucil,1,C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2,3,40730,1,c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3,4,24,1,C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4,5,cloxacillin,1,Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...


In [99]:
canonicalize(df_Bold_brain_barrier, 'smiles')
add_Lipinski_descriptors(df_Bold_brain_barrier, 'smiles')

[12:27:09] Explicit valence for atom # 1 N, 4, is greater than permitted


ArgumentError: Python argument types in
    rdkit.Chem.rdmolfiles.MolToSmiles(NoneType)
did not match C++ signature:
    MolToSmiles(RDKit::ROMol mol, bool isomericSmiles=True, bool kekuleSmiles=False, int rootedAtAtom=-1, bool canonical=True, bool allBondsExplicit=False, bool allHsExplicit=False, bool doRandom=False)
    MolToSmiles(RDKit::ROMol mol, RDKit::SmilesWriteParams params)

In [97]:
df_Bold_brain_barrier.head()


Unnamed: 0,CMPD_CHEMBLID,exp,smiles,MolW(Da),NumHAcceptors,NumHDonors,LogP,Lipinski_rule
0,CHEMBL596271,3.54,Cn1c(CN2CCN(c3ccc(Cl)cc3)CC2)nc2ccccc21,340.858,4,0,3.5489,1
1,CHEMBL1951080,-1.18,COc1cc(OC)c(S(=O)(=O)N2c3ccccc3CCC2C)cc1NC(=O)...,494.591,7,2,2.9901,1
2,CHEMBL1771,3.69,COC(=O)[C@H](c1ccccc1Cl)N1CCc2sccc2C1,321.829,4,0,3.6739,1
3,CHEMBL234951,3.37,O=C(NC1Cc2ccccc2N(C[C@@H](O)CO)C1=O)c1cc2cc(Cl...,419.89,5,4,1.9237,1
4,CHEMBL565079,3.1,Cc1cccc(C[C@H](NC(=O)c2cc(C(C)(C)C)nn2C)C(=O)N...,381.48,5,2,2.0069,1


In [59]:
def canonicalize(Dataframe: pd.DataFrame, column_name: str):
    
    """Canonicalizes the SMILES from Dataframe. A column called 'SMILES' is requiered

    Args: 
        Dataframe with 'SMILES' column contaning smiles. 
    """
    
    Dataframe[column_name] = Dataframe[column_name].apply(lambda x: Chem.MolToSmiles(Chem.MolFromSmiles(x))) #canonicalize smiles from a Dataframe                                          
    

In [60]:
def add_Lipinski_descriptors(Dataframe: pd.DataFrame, column_name: str):

   """This function takes a dataframe and adds the Lipinski descriptors 
      (Molecular weight in Dalton, Number of H-acceptors and donors and the 
      logP value) to it. It also adds a column witch says if the molecule follows 
      the Lipinski's rule of five. (1= yes, 0= no)
    
   Args: 
      Pandas Dataframe with a column witch contains the SMILES strings.
   """
   Dataframe['MolW(Da)'] = Dataframe[column_name].apply(lambda x: Chem.Descriptors.MolWt(Chem.MolFromSmiles(x)))

   Dataframe['NumHAcceptors'] = Dataframe[column_name].apply(lambda x: Chem.Descriptors.NumHAcceptors(Chem.MolFromSmiles(x)))

   Dataframe['NumHDonors'] = Dataframe[column_name].apply(lambda x: Chem.Descriptors.NumHDonors(Chem.MolFromSmiles(x)))

   Dataframe['LogP'] = Dataframe[column_name].apply(lambda x: Chem.Descriptors.MolLogP(Chem.MolFromSmiles(x)))

   Dataframe['Lipinski_rule'] = Dataframe.apply(lambda x: 1 if x['MolW(Da)'] <= 500 and x['NumHAcceptors'] <= 10 and x['NumHDonors'] <= 5 and x['LogP'] <= 5 else 0, axis=1)



