In [38]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
import math



In [73]:
# Load the datasets from the ../Data directory, one DataFrame per CSV file.
# NOTE(review): several file names look misspelled ('Bioavailibility',
# 'Oral adsorbtion'); presumably they match the files on disk - verify before
# renaming anything here.
df_curatedSol = pd.read_csv('../Data/CuratedSol.csv')
df_biogen = pd.read_csv('../Data/Biogen.csv')
df_bioavailability = pd.read_csv('../Data/Bioavailibility.csv')
df_ESOL = pd.read_csv('../Data/ESOL.csv')
df_Water_Octanol = pd.read_csv('../Data/Water_Octanol.csv')
# NOTE(review): 'Bold' in the variable name looks like a typo for 'Blood'; it is
# left unchanged because other cells may reference this name.
df_Bold_brain_barrier = pd.read_csv('../Data/Blood-brain_barrier_binary.csv')
df_Kinetic_solubility = pd.read_csv('../Data/KineticAqueousSolubility.csv')
df_permeability = pd.read_csv('../Data/Permeability_ph7_4.csv')
df_PAMPA_ph7_4 = pd.read_csv('../Data/PAMPA_ph_7_4.csv')
# Disabled load, kept for reference (the reason it was disabled is not recorded).
#df_Pharma_drugs = pd.read_csv('../Data/Pharmaceutical Drugs_Dataset.csv')
df_oral_adsorption = pd.read_csv('../Data/Oral adsorbtion.csv')
df_binary_bioavailibility = pd.read_csv('../Data/Binary_Bioavailibility.csv')
df_water_sol_Ochem = pd.read_csv('../Data/Water_Sol_OCHEM.csv')

In [76]:
# Quick look at one dataset at a time: un-comment exactly one line to preview it.
# The trailing numbers appear to be row counts noted by the author - TODO confirm.
#df_curatedSol.head()
#df_bioavailability.head() #995
#df_biogen.head() #3521
#df_ESOL.head() #1128
#df_Water_Octanol.head()#4200
#df_Bold_brain_barrier.head() #2050
#df_Kinetic_solubility.head() #2532
#df_permeability.head() #2530
#df_PAMPA_ph7_4.head() #2532
#df_oral_adsorption.shape
#df_binary_bioavailibility.head(20)
# NOTE(review): head(100) renders a long table; a smaller preview may be enough.
df_water_sol_Ochem.head(100) 


Unnamed: 0,SMILES,CASRN,EXTERNALID,N,NAME,ARTICLEID,PUBMEDID,PAGE,TABLE,Water solubility,...,UNIT {Water solubility}.1,Dataset,Temperature,UNIT {Temperature},Ionic strength,UNIT {Ionic strength},comment (chemical),source,pH,UNIT {pH}
0,CC(N)=O,60-35-5,-,1,acetamide,A64,11749573,-,-,1.58,...,-log(M),Train,,-,,-,,,,-
1,CNN,60-34-4,-,2,methyl hydrazine,A64,11749573,-,-,1.34,...,-log(M),Train,,-,,-,,,,-
2,CC(O)=O,64-19-7,-,3,acetic acid,A64,11749573,-,-,1.22,...,-log(M),Train,,-,,-,,,,-
3,C1CCCN1,123-75-1,-,4,pyrrolidine,A64,11749573,-,-,1.15,...,-log(M),Train,,-,,-,,,,-
4,NC(NO)=O,127-07-1,-,5,hydroxyurea,A64,11749573,-,-,1.12,...,-log(M),Train,,-,,-,,,,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,O=C(C1=CC=CN=C1)OC,93-60-7,-,101,methyl nicotinate,A64,11749573,-,-,-0.46,...,-log(M),Train,,-,,-,,,,-
96,CC1=NC=NC2=C1N=CC=N2,2432-21-5,-,102,4-methylpteridine,A64,11749573,-,-,-0.47,...,-log(M),Train,,-,,-,,,,-
97,NCC(O)C1=CC=CC=C1,7568-93-6,-,103,phenylethanolamine,A64,11749573,-,-,-0.48,...,-log(M),Train,,-,,-,,,,-
98,O=C(C(CO)C1=CC=CC=C1)OC2CC3C4OC4C(C2)N3C,51-34-3,-,104,Scopolamine,A64,11749573,-,-,-0.48,...,-log(M),Train,,-,,-,,,,-


In [101]:
# Write the frames back to disk without the index column.
# NOTE(review): these are the same paths the frames were read from in the
# loading cell, so running this cell overwrites the input files with whatever
# columns the frames currently hold. Confirm that this is intended; writing to a
# separate output location would keep the raw data recoverable.
df_bioavailability.to_csv('../Data/Bioavailibility.csv', index=False)
df_biogen.to_csv('../Data/Biogen.csv', index=False)
df_ESOL.to_csv('../Data/ESOL.csv', index=False)
df_Water_Octanol.to_csv('../Data/Water_Octanol.csv', index=False)
df_curatedSol.to_csv('../Data/CuratedSol.csv', index=False)

In [39]:
def canonicalize(Dataframe: pd.DataFrame, column_name: str):
    """Replace the SMILES in ``Dataframe[column_name]`` with RDKit canonical SMILES.

    The column is overwritten in place.

    Args:
        Dataframe: Frame holding the SMILES strings; it is modified in place.
        column_name: Name of the column that contains the SMILES strings.

    Returns:
        The same ``Dataframe`` object, so the call can be chained or displayed.

    Notes:
        Entries that are not strings (for example NaN) or that RDKit cannot
        parse are replaced by ``None`` instead of aborting the whole column.
    """

    def _canonical_smiles(smiles):
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None for unparsable input and MolToSmiles(None)
        # raises, so the failure case has to be handled explicitly.
        return Chem.MolToSmiles(mol) if mol is not None else None

    Dataframe[column_name] = Dataframe[column_name].apply(_canonical_smiles)
    return Dataframe
    

In [60]:
def add_Lipinski_descriptors(Dataframe: pd.DataFrame, column_name: str):
    """Add the Lipinski descriptors and a rule-of-five flag to ``Dataframe``.

    The following columns are added (or overwritten) in place:

    * ``'MolW(Da)'``       - molecular weight in Dalton
    * ``'NumHAcceptors'``  - number of hydrogen-bond acceptors
    * ``'NumHDonors'``     - number of hydrogen-bond donors
    * ``'LogP'``           - logP as computed by RDKit (``MolLogP``)
    * ``'Lipinski_rule'``  - 1 if MolW <= 500, acceptors <= 10, donors <= 5 and
      LogP <= 5 all hold, otherwise 0

    Args:
        Dataframe: Frame with a column of SMILES strings; modified in place.
        column_name: Name of the column which contains the SMILES strings.

    Returns:
        The same ``Dataframe`` object, so the call can be chained or displayed.

    Notes:
        Rows whose SMILES is missing or cannot be parsed by RDKit get NaN
        descriptors and therefore ``Lipinski_rule`` 0. Filter on
        ``Dataframe['MolW(Da)'].notna()`` if that distinction matters.
    """
    descriptor_columns = ['MolW(Da)', 'NumHAcceptors', 'NumHDonors', 'LogP']
    missing = (float('nan'),) * len(descriptor_columns)

    def _descriptors(smiles):
        # Parse each SMILES once and derive all four descriptors from that molecule.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            return missing
        return (
            Descriptors.MolWt(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.MolLogP(mol),
        )

    rows = [_descriptors(smiles) for smiles in Dataframe[column_name]]
    descriptors = pd.DataFrame(rows, columns=descriptor_columns)
    for name in descriptor_columns:
        # Assign by position so an arbitrary (even non-unique) index is respected.
        Dataframe[name] = descriptors[name].to_numpy()

    # Comparisons against NaN are False, so rows without descriptors end up 0.
    within_limits = (
        (Dataframe['MolW(Da)'] <= 500)
        & (Dataframe['NumHAcceptors'] <= 10)
        & (Dataframe['NumHDonors'] <= 5)
        & (Dataframe['LogP'] <= 5)
    )
    Dataframe['Lipinski_rule'] = within_limits.astype('int64')
    return Dataframe





In [40]:
def convert_solubility(Dataframe: pd.DataFrame, column_solubility: str, coulmn_Mw: str):
    """Convert solubility values from log10(ug/L) to log10(mol/L).

    1 ug/L equals 1e-6 g/L, and dividing by the molar mass (g/mol) gives mol/L,
    so ``log10(mol/L) = log10(ug/L) - 6 - log10(Mw)``.

    Args:
        Dataframe: Frame with the solubility and molecular-weight columns;
            modified in place.
        column_solubility: Name of the column with solubility in log10(ug/L).
        coulmn_Mw: Name of the column with the molecular weight in g/mol.

    Returns:
        The same ``Dataframe`` with an added ``'Solubility_log(mol/L)'`` column.

    Raises:
        ValueError: If a molecular weight is zero or negative (``math.log10``).
    """
    # NOTE(review): a later cell defines another `convert_solubility` (offset 3,
    # i.e. ug/mL input). When the notebook runs top to bottom that later
    # definition shadows this one - consider giving the two distinct names.
    log_molar_mass = Dataframe[coulmn_Mw].map(math.log10)
    Dataframe['Solubility_log(mol/L)'] = Dataframe[column_solubility] - 6 - log_molar_mass

    return Dataframe

In [45]:
def convert_solubility(Dataframe: pd.DataFrame, column_solubility: str, column_Mw: str):
    """
    Converts the solubility values from a DataFrame from log10(ug/mL) to log10(mol/L).

    1 ug/mL equals 1e-3 g/L, and dividing by the molar mass (g/mol) gives mol/L,
    so ``log10(mol/L) = log10(ug/mL) - 3 - log10(Mw)``. For input in log10(ug/L)
    the offset would have to be 6 instead of 3.

    Args:
        Dataframe (pd.DataFrame): DataFrame with the solubility values; modified in place.
        column_solubility (str): Column name containing solubility values in log10(ug/mL).
        column_Mw (str): Column name containing molecular weight (Mw) values in g/mol.

    Returns:
        pd.DataFrame: The same DataFrame with an additional
        'Solubility_log(mol/L)' column. Rows with a missing solubility or
        molecular weight get NaN.

    Raises:
        ValueError: If a molecular weight is zero or negative (``math.log10``).
    """
    # Column-wise arithmetic instead of a row-wise DataFrame.apply: same values,
    # and it also works for a frame without rows.
    log_molar_mass = Dataframe[column_Mw].map(math.log10)
    Dataframe['Solubility_log(mol/L)'] = Dataframe[column_solubility] - 3 - log_molar_mass

    return Dataframe

In [49]:
def calc_Solubility_molL(Dataframe: pd.DataFrame, coulmn_name: str):
    """Add a 'Solubility(mol/L)' column holding 10 raised to a log-solubility column.

    Args:
        Dataframe: Frame with a column of solubility values in log(mol/L);
            the new column is added to this frame in place.
        coulmn_name: Name of the column with the log(mol/L) values.

    Returns:
        The same ``Dataframe`` including the new 'Solubility(mol/L)' column.
    """

    def _antilog10(log_value):
        # Undo the base-10 logarithm for a single entry.
        return 10 ** log_value

    log_values = Dataframe[coulmn_name]
    Dataframe['Solubility(mol/L)'] = log_values.apply(_antilog10)
    return Dataframe

In [58]:
# Derive molar solubility for the Biogen set: first log(mol/L), then mol/L.
# Both helpers add their column to df_biogen in place; the return values are unused.
# NOTE(review): this needs a 'MolW(Da)' column on df_biogen, but no visible cell
# adds it - presumably from an earlier add_Lipinski_descriptors run or from the
# saved CSV. Verify that it survives Restart & Run All.
convert_solubility(df_biogen, 'LOG SOLUBILITY PH 6.8 (ug/mL)', 'MolW(Da)')
calc_Solubility_molL(df_biogen, 'Solubility_log(mol/L)')

df_biogen.head(30)

Unnamed: 0,Internal ID,Vendor ID,SMILES,CollectionName,LOG HLM_CLint (mL/min/kg),LOG MDR1-MDCK ER (B-A/A-B),LOG SOLUBILITY PH 6.8 (ug/mL),LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound),LOG PLASMA PROTEIN BINDING (RAT) (% unbound),LOG RLM_CLint (mL/min/kg),MolW(Da),NumHAcceptors,NumHDonors,LogP,Lipinski_rule,Solubility_log(mol/L),Solubility(mol/L)
0,Mol1,317714313,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,emolecules,0.675687,1.493167,0.089905,0.991226,0.518514,1.392169,434.435,9,3,1.9007,1,-5.54802,3e-06
1,Mol2,324056965,CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC(=O)c1cccc(C(F)F)n1,emolecules,0.675687,1.04078,0.550228,0.099681,0.268344,1.02792,418.444,6,2,4.1809,1,-5.071409,8e-06
2,Mol3,304005766,CN(c1ncc(F)cn1)[C@H]1CCCNC1,emolecules,0.675687,-0.358806,,2.0,2.0,1.02792,210.256,4,1,0.8039,1,,
3,Mol4,194963090,CC(C)(Oc1ccc(-c2cnc(N)c(-c3ccc(Cl)cc3)c2)cc1)C...,emolecules,0.675687,1.026662,1.657056,-1.158015,-1.403403,1.02792,382.847,4,2,4.8932,1,-3.925969,0.000119
4,Mol5,324059015,CC(C)(O)CCn1cc2cc(NC(=O)c3cccc(C(F)(F)F)n3)c(C...,emolecules,0.99638,1.010597,,1.015611,1.092264,1.629093,450.461,6,3,4.0908,1,,
5,Mol6,316230505,CC#CC(=O)N[C@H]1CCCN(c2c(F)cc(C(N)=O)c3[nH]c(C...,emolecules,1.397349,0.860626,1.033424,0.163857,-0.639533,2.376374,370.428,3,3,2.13104,1,-4.53528,2.9e-05
6,Mol7,299985775,CC(C)NCC(O)COc1cccc2ccccc12,emolecules,1.327232,-0.162401,,,1.347759,3.318276,259.349,3,2,2.5775,1,,
7,Mol8,538570,COc1ccc(Cl)cc1C(=O)NCCc1ccc(S(=O)(=O)NC(=O)NC2...,emolecules,1.458063,2.091561,,-0.917215,-0.943095,2.728062,494.013,5,3,3.6417,1,,
8,Mol9,EN300-97039,C=CC(=O)N1CCC[C@@H](n2nc(-c3ccc(Oc4ccccc4)cc3)...,enamineBB_pmc,2.7565,0.72516,0.93399,0.62603,-0.35853,3.2433,440.507,7,1,4.2173,1,-4.709963,2e-05
9,Mol10,44811418,CC(C)NC(=O)COc1cccc(-c2nc(Nc3ccc4[nH]ncc4c3)c3...,emolecules,1.630824,1.301829,,-1.384078,-0.504456,3.192741,452.518,6,3,4.8201,1,,


In [72]:
# Spot check: show the 'Name' entry of the row with index label 830.
df_bioavailability.loc[830, 'Name']

'Selegiline (-)-Deprenil'