In [1]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import Descriptors
import math



## Data cleaning 
In this chapter we check which of the datasets we have are useful for our purpose.

In [171]:
# Load every candidate dataset from ../Data into its own DataFrame.
# NOTE: CuratedSol.csv, Biogen.csv, ESOL.csv and OCHEM_Sol.csv are written back
# to these same paths by the preparation cells further down, so after a first
# run the files already contain the added descriptor/solubility columns.
df_curatedSol = pd.read_csv('../Data/CuratedSol.csv')
df_biogen = pd.read_csv('../Data/Biogen.csv')
df_bioavailability = pd.read_csv('../Data/Bioavailibility.csv')
df_ESOL = pd.read_csv('../Data/ESOL.csv')
df_Water_Octanol = pd.read_csv('../Data/Water_Octanol.csv')
df_Bold_brain_barrier = pd.read_csv('../Data/Blood-brain_barrier_binary.csv')
df_Kinetic_solubility = pd.read_csv('../Data/KineticAqueousSolubility.csv')
df_permeability = pd.read_csv('../Data/Permeability_ph7_4.csv')
df_PAMPA_ph7_4 = pd.read_csv('../Data/PAMPA_ph_7_4.csv')
# The pharmaceutical-drugs dataset is currently not loaded.
#df_Pharma_drugs = pd.read_csv('../Data/Pharmaceutical Drugs_Dataset.csv')
df_oral_adsorption = pd.read_csv('../Data/Oral adsorbtion.csv')
df_binary_bioavailibility = pd.read_csv('../Data/Binary_Bioavailibility.csv')
df_OCHEM = pd.read_csv('../Data/OCHEM_Sol.csv')

In [135]:
# Quick look at one dataset at a time: uncomment the frame to inspect.
# The trailing numbers appear to be row counts (they match the shapes printed
# further down for Biogen and ESOL).
#df_curatedSol.head()
#df_bioavailability.head() #995
df_biogen.head() #3521
#df_ESOL.head() #1128
#df_Water_Octanol.head()#4200
#df_Bold_brain_barrier.head() #2050
#df_Kinetic_solubility.head() #2532
#df_permeability.head() #2530
#df_PAMPA_ph7_4.head() #2532
#df_oral_adsorption.shape
#df_binary_bioavailibility.head(20)
# NOTE(review): df_water_sol_Ochem is not defined in the visible cells;
# df_OCHEM (loaded from OCHEM_Sol.csv) may be what is meant - confirm.
#df_water_sol_Ochem.head(100)


Unnamed: 0,Internal ID,Vendor ID,SMILES,CollectionName,LOG HLM_CLint (mL/min/kg),LOG MDR1-MDCK ER (B-A/A-B),LOG SOLUBILITY PH 6.8 (ug/mL),LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound),LOG PLASMA PROTEIN BINDING (RAT) (% unbound),LOG RLM_CLint (mL/min/kg),MolW(Da),NumHAcceptors,NumHDonors,LogP,Lipinski_rule,Solubility_log(mol/L),Solubility(mol/L)
0,Mol1,317714313,CNc1cc(Nc2cccn(-c3ccccn3)c2=O)nn2c(C(=O)N[C@@H...,emolecules,0.675687,1.493167,0.089905,0.991226,0.518514,1.392169,434.435,9,3,1.9007,1,-5.54802,3e-06
1,Mol2,324056965,CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC(=O)c1cccc(C(F)F)n1,emolecules,0.675687,1.04078,0.550228,0.099681,0.268344,1.02792,418.444,6,2,4.1809,1,-5.071409,8e-06
2,Mol3,304005766,CN(c1ncc(F)cn1)[C@H]1CCCNC1,emolecules,0.675687,-0.358806,,2.0,2.0,1.02792,210.256,4,1,0.8039,1,,
3,Mol4,194963090,CC(C)(Oc1ccc(-c2cnc(N)c(-c3ccc(Cl)cc3)c2)cc1)C...,emolecules,0.675687,1.026662,1.657056,-1.158015,-1.403403,1.02792,382.847,4,2,4.8932,1,-3.925969,0.000119
4,Mol5,324059015,CC(C)(O)CCn1cc2cc(NC(=O)c3cccc(C(F)(F)F)n3)c(C...,emolecules,0.99638,1.010597,,1.015611,1.092264,1.629093,450.461,6,3,4.0908,1,,


## Functions for data preparation

In [5]:
def canonicalize(Dataframe: pd.DataFrame, column_name: str):
    """Replace the SMILES in ``column_name`` with their RDKit canonical form.

    The DataFrame is modified in place; nothing is returned.

    Args:
        Dataframe: DataFrame that holds the SMILES strings.
        column_name: Name of the column containing the SMILES strings.

    Raises:
        ValueError: If an entry is missing, is not a string, or cannot be
            parsed by RDKit. The column is left untouched in that case.
    """

    def _to_canonical(smiles):
        # Non-string cells (e.g. NaN from a missing value) cannot be parsed.
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles returns None (instead of raising) for invalid SMILES.
        return None if mol is None else Chem.MolToSmiles(mol)

    original = Dataframe[column_name]
    canonical = original.apply(_to_canonical)

    failed = canonical.isna()
    if failed.any():
        # Fail with the offending values instead of an opaque RDKit argument
        # error, and before the column is overwritten.
        examples = original[failed].head(5).tolist()
        raise ValueError(
            f"{int(failed.sum())} value(s) in column {column_name!r} are not "
            f"valid SMILES, e.g. {examples!r}"
        )

    Dataframe[column_name] = canonical
    

In [6]:
def add_Lipinski_descriptors(Dataframe: pd.DataFrame, column_name: str):
    """Add the Lipinski descriptors and a rule-of-five flag to a DataFrame.

    The following columns are written in place:

    * 'MolW(Da)'       - molecular weight in Dalton
    * 'NumHAcceptors'  - number of H-bond acceptors
    * 'NumHDonors'     - number of H-bond donors
    * 'LogP'           - logP value
    * 'Lipinski_rule'  - 1 if MolW <= 500, acceptors <= 10, donors <= 5 and
                         LogP <= 5, otherwise 0

    Args:
        Dataframe: DataFrame with a column that contains SMILES strings.
        column_name: Name of the column that contains the SMILES strings.

    Raises:
        ValueError: If an entry is missing, is not a string, or cannot be
            parsed by RDKit. No column is added in that case.
    """
    smiles_values = Dataframe[column_name].tolist()

    # Parse every SMILES once and reuse the molecule for all four descriptors.
    mols = [
        Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        for smiles in smiles_values
    ]

    unparsable = [s for s, mol in zip(smiles_values, mols) if mol is None]
    if unparsable:
        raise ValueError(
            f"{len(unparsable)} value(s) in column {column_name!r} are not "
            f"valid SMILES, e.g. {unparsable[:5]!r}"
        )

    def _descriptor_values(descriptor, dtype):
        # Explicit dtype keeps the column type stable even for an empty frame.
        return pd.Series([descriptor(mol) for mol in mols], dtype=dtype).to_numpy()

    Dataframe['MolW(Da)'] = _descriptor_values(Descriptors.MolWt, 'float64')
    Dataframe['NumHAcceptors'] = _descriptor_values(Descriptors.NumHAcceptors, 'int64')
    Dataframe['NumHDonors'] = _descriptor_values(Descriptors.NumHDonors, 'int64')
    Dataframe['LogP'] = _descriptor_values(Descriptors.MolLogP, 'float64')

    # Column-wise comparison instead of a row-wise apply.
    follows_rule = (
        (Dataframe['MolW(Da)'] <= 500)
        & (Dataframe['NumHAcceptors'] <= 10)
        & (Dataframe['NumHDonors'] <= 5)
        & (Dataframe['LogP'] <= 5)
    )
    Dataframe['Lipinski_rule'] = follows_rule.astype('int64')

In [14]:
def convert_solubility(Dataframe: pd.DataFrame, column_solubility: str, column_Mw: str):
    """
    Converts solubility values of a DataFrame from log10(ug/mL) to log10(mol/L).

    The conversion is log10(S[mol/L]) = log10(S[ug/mL]) - 3 - log10(M):
    1 ug/mL equals 1e-3 g/L (hence the -3), and dividing by the molar mass M
    in g/mol subtracts log10(M).

    Args:
        Dataframe (pd.DataFrame): DataFrame with the solubility values.
        column_solubility (str): Column name containing solubility values in log10(ug/mL).
        column_Mw (str): Column name containing molecular weight (Mw) values in g/mol.

    Returns:
        pd.DataFrame: The same DataFrame object, with the additional column
        'Solubility_log(mol/L)' written in place.
    """
    log_ug_per_mL = Dataframe[column_solubility]
    # math.log10 is applied per element; missing weights (NaN) stay NaN.
    log_molar_mass = Dataframe[column_Mw].apply(math.log10)

    # Column-wise arithmetic in the same order as the scalar formula; unlike a
    # row-wise DataFrame.apply this also works for a frame without rows.
    Dataframe['Solubility_log(mol/L)'] = log_ug_per_mL - 3 - log_molar_mass

    return Dataframe

In [8]:
def calc_Solubility_molL(Dataframe: pd.DataFrame, coulmn_name: str):
    """Add a 'Solubility(mol/L)' column computed as 10 ** log-solubility.

    Args:
        Dataframe: DataFrame with a column of solubility values in log(mol/L).
        coulmn_name: Name of that log(mol/L) column.

    Returns:
        The same DataFrame object, with 'Solubility(mol/L)' written in place.
    """

    def _antilog10(log_value):
        # Inverse of log10: turns log(mol/L) back into mol/L.
        return 10**log_value

    log_solubility = Dataframe[coulmn_name]
    Dataframe['Solubility(mol/L)'] = log_solubility.apply(_antilog10)
    return Dataframe

In [182]:
def drop_analysis(
    Dataframe: pd.DataFrame,
    smiles_column: str = "SMILES",
    solubility_column: str = 'Solubility(mol/L)',
):
    """Drop rows without solubility data and duplicated SMILES, and report the counts.

    Rows with a missing value in ``solubility_column`` are removed first; of the
    remaining rows only the first occurrence of each SMILES is kept. The input
    DataFrame itself is not modified.

    The printed "missing" and "duplicates" counts are each computed on the full
    input frame, so they can overlap and need not add up to the total.

    Args:
        Dataframe: DataFrame with a SMILES column and a solubility column.
        smiles_column: Column used to detect duplicates (default "SMILES").
        solubility_column: Column whose missing values cause a row to be
            dropped (default 'Solubility(mol/L)').

    Returns:
        A new DataFrame without missing solubility values and without
        duplicated SMILES.
    """
    n_rows = Dataframe.shape[0]

    # Report on the same column that is actually used for dropping, so the
    # printed number always matches the rows that are removed.
    with_solubility = Dataframe.dropna(subset=[solubility_column])
    cleaned = with_solubility.drop_duplicates(subset=smiles_column, keep='first')

    n_missing = n_rows - with_solubility.shape[0]
    n_duplicates = int(Dataframe.duplicated(subset=smiles_column, keep='first').sum())
    n_total = n_rows - cleaned.shape[0]
    print(f"dropped (missing SOL data): {n_missing}\ndropped (duplicates): {n_duplicates}\ndropped (total): {n_total}")

    return cleaned

# Preparing data for combining
Here we canonicalize the SMILES strings, add the Lipinski descriptors, and add the solubility in mol/L to each DataFrame. The water solubility of all compounds is given as log(mol/L).

In [169]:
# Prepare the OCHEM data. SMILES are canonicalized first, so the descriptors
# are computed from the canonical strings.
canonicalize(df_OCHEM, 'SMILES')
add_Lipinski_descriptors(df_OCHEM, 'SMILES')
# Adds the linear 'Solubility(mol/L)' column from the log(mol/L) column.
calc_Solubility_molL(df_OCHEM, 'Solubility_log(mol/L)')
# Save as CSV. NOTE: this is the same path df_OCHEM was read from, so the
# input file is overwritten with the enriched frame.
df_OCHEM.to_csv('../Data/OCHEM_Sol.csv', index=False)
df_OCHEM.shape #5000

(5000, 13)

In [170]:
# Prepare the ESOL data. SMILES are canonicalized first, so the descriptors
# are computed from the canonical strings.
canonicalize(df_ESOL, 'SMILES')
add_Lipinski_descriptors(df_ESOL, 'SMILES')
# Adds the linear 'Solubility(mol/L)' column from the log(mol/L) column.
calc_Solubility_molL(df_ESOL, 'Solubility_log(mol/L)')
# NOTE: this is the same path df_ESOL was read from, so the input file is
# overwritten with the enriched frame.
df_ESOL.to_csv('../Data/ESOL.csv', index=False)
df_ESOL.shape #1128

(1128, 16)

In [172]:
# Prepare the Biogen data. SMILES are canonicalized first, so the descriptors
# are computed from the canonical strings.
canonicalize(df_biogen, 'SMILES')
add_Lipinski_descriptors(df_biogen, 'SMILES')
# NOTE(review): convert_solubility is not called in the visible cells; the
# 'Solubility_log(mol/L)' column used here is presumably already stored in
# Biogen.csv from an earlier run - verify before running on a fresh file.
calc_Solubility_molL(df_biogen, 'Solubility_log(mol/L)')
# NOTE: this is the same path df_biogen was read from, so the input file is
# overwritten with the enriched frame.
df_biogen.to_csv('../Data/Biogen.csv', index=False)
df_biogen.shape #3521

(3521, 17)

In [173]:
# Prepare the curated solubility data. SMILES are canonicalized first, so the
# descriptors are computed from the canonical strings.
canonicalize(df_curatedSol, 'SMILES')
add_Lipinski_descriptors(df_curatedSol, 'SMILES')
# Adds the linear 'Solubility(mol/L)' column from the log(mol/L) column.
calc_Solubility_molL(df_curatedSol, 'Solubility_log(mol/L)')
# NOTE: this is the same path df_curatedSol was read from, so the input file
# is overwritten with the enriched frame.
df_curatedSol.to_csv('../Data/CuratedSol.csv', index=False)
df_curatedSol.shape #9982



(9982, 30)

## Combining data
Combining all the solubility data (ESOL, curated, Biogen, OCHEM) into one big DataFrame which contains SMILES, solubility in log(mol/L), Mw and the Lipinski descriptors.

In [183]:
# Columns used both as join keys and as the columns kept after the merge.
merge_columns = ['SMILES', 'Solubility_log(mol/L)', 'Solubility(mol/L)', 'MolW(Da)', 'NumHAcceptors', 'NumHDonors', 'LogP', 'Lipinski_rule']

# Outer join keeps the rows of both frames; two rows are combined only when
# all key columns are equal.
# NOTE(review): pandas documents that how='outer' sorts the join keys, so which
# duplicate drop_analysis keeps as "first" may depend on that order - confirm.
df_merged_1 = (
    pd.merge(df_curatedSol, df_biogen, on=merge_columns, how='outer')
    .filter(merge_columns)
)
df_merged_1_cleand = drop_analysis(df_merged_1)

dropped (missing SOL data): 1348
dropped (duplicates): 23
dropped (total): 1356


Here we saw that the Biogen dataset has a lot of missing water solubility data.

In [184]:
# Columns used both as join keys and as the columns kept after the merge.
merge_columns = ['SMILES', 'Solubility_log(mol/L)', 'Solubility(mol/L)', 'MolW(Da)', 'NumHAcceptors', 'NumHDonors', 'LogP', 'Lipinski_rule']

# Outer join of the cleaned curated+Biogen data with ESOL; two rows are
# combined only when all key columns are equal.
df_merged_2 = (
    pd.merge(df_merged_1_cleand, df_ESOL, on=merge_columns, how='outer')
    .filter(merge_columns)
)
df_merged_2_cleand = drop_analysis(df_merged_2)

dropped (missing SOL data): 0
dropped (duplicates): 1035
dropped (total): 1035


Here we saw that adding the ESOL data introduced many duplicate SMILES (1035 rows were dropped).

In [185]:
# Columns used both as join keys and as the columns kept after the merge.
merge_columns = ['SMILES', 'Solubility_log(mol/L)', 'Solubility(mol/L)', 'MolW(Da)', 'NumHAcceptors', 'NumHDonors', 'LogP', 'Lipinski_rule']

# Outer join of the data merged so far with OCHEM; two rows are combined only
# when all key columns are equal.
df_merged_3 = (
    pd.merge(df_merged_2_cleand, df_OCHEM, on=merge_columns, how='outer')
    .filter(merge_columns)
)
df_merged_3_cleand = drop_analysis(df_merged_3)

dropped (missing SOL data): 0
dropped (duplicates): 4173
dropped (total): 4173


Here we saw that most of the OCHEM SMILES were already present in the other datasets.

In [186]:
# Persist the final combined and de-duplicated solubility table (without the index).
df_merged_3_cleand.to_csv('../Data/Merged_solubility.csv', index=False)