# Chem data sanitization

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from rdkit import Chem
from rdkit.Chem import SaltRemover
from mordred import Calculator, descriptors

## Load the Dataset

In [None]:
# Read the raw GSK3 dataset file from disk into a pandas DataFrame
df_GSK3 = pd.read_csv(filepath_or_buffer="./datasets/GSK3_JNK3/GSK3/all.txt")

Saving all.txt to all.txt


## Database curation from a chemical point of view


In [None]:
# Function to obtain the canonical SMILES from a SMILES string
def get_canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for a single SMILES string.

    Args:
        smiles: SMILES string describing one compound.

    Returns:
        The string produced by ``Chem.MolToSmiles`` for the parsed molecule.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Convert the SMILES string to a molecule
    mol = Chem.MolFromSmiles(smiles)

    # MolFromSmiles reports a parse failure by returning None instead of
    # raising; fail here with the offending input rather than letting
    # MolToSmiles(None) raise an uninformative argument error.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Convert the molecule to a canonical SMILES string
    return Chem.MolToSmiles(mol)

# Canonicalize each entry of the "smiles" column of the GSK3 DataFrame
canonical_df_GSK3 = list(map(get_canonical_smiles, df_GSK3["smiles"]))

# Store the results as the first column of the DataFrame, named canonical_smiles
df_GSK3.insert(0, "canonical_smiles", canonical_df_GSK3)

In [None]:
# Salt and solvent removal
# Shared salt remover, built once and reused for every compound instead of
# being re-created on each call. Salt definitions:
# https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt
_SALT_REMOVER = SaltRemover.SaltRemover()


def salt_solvent_remover(smiles):
    """Return the SMILES of a compound after stripping salts and solvents.

    Args:
        smiles: SMILES string describing one compound.

    Returns:
        The SMILES string of the molecule returned by ``StripMol``.

    Raises:
        ValueError: If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Convert the SMILES string to a molecule
    mol = Chem.MolFromSmiles(smiles)

    # MolFromSmiles returns None on a parse failure; report the offending
    # input instead of failing later inside StripMol.
    if mol is None:
        raise ValueError(f"Could not parse SMILES string: {smiles!r}")

    # Remove salts and solvents from the molecule
    stripped_mol = _SALT_REMOVER.StripMol(mol)

    # Convert the stripped molecule to the stripped SMILES
    return Chem.MolToSmiles(stripped_mol)

# Strip salts and solvents from each canonical SMILES of the GSK3 DataFrame
stripped_df_GSK3 = list(map(salt_solvent_remover, df_GSK3["canonical_smiles"]))

# Keep the cleaned structures in a new column called stripped_smiles
df_GSK3["stripped_smiles"] = stripped_df_GSK3

In [None]:
# Duplicate removal: keep only the first row seen for each stripped SMILES
df_GSK3_without_duplicates = df_GSK3.drop_duplicates(
    subset="stripped_smiles",
    keep="first",
)
df_GSK3 = df_GSK3_without_duplicates

In [None]:
# Write the de-duplicated DataFrame to dfA_GSK3.csv, leaving out the index
df_GSK3.to_csv(path_or_buf="dfA_GSK3.csv", index=False)

## Descriptors



In [None]:
# Read dfA_GSK3.csv back from disk into a fresh pandas DataFrame
dfA_GSK3 = pd.read_csv(filepath_or_buffer="dfA_GSK3.csv")

In [None]:
# Row labels of the compounds that produce warnings when their descriptors are computed
rows_with_warnings = [
    2378, 8075, 13886, 20182, 22864, 28373, 29626,
    42596, 42283, 44180, 44296, 46363, 47018, 52572,
]
dfA_GSK3 = dfA_GSK3.drop(index=rows_with_warnings)

# Write the filtered DataFrame to dfB_GSK3.csv, leaving out the index
dfA_GSK3.to_csv(path_or_buf="dfB_GSK3.csv", index=False)

In [None]:
# Selection of descriptors types to compute: 0-1-2D
# Function to calculate descriptors for an array of SMILES strings
def calculate_descriptors(smiles):
    """Compute the non-3D Mordred descriptors for a collection of SMILES strings.

    Args:
        smiles: Iterable of SMILES strings, one per compound.

    Returns:
        The DataFrame returned by ``Calculator.pandas`` for the parsed
        molecules.

    Raises:
        ValueError: If any SMILES string cannot be parsed by RDKit.
    """
    # Import the descriptor modules under a local alias. The notebook later
    # rebinds the global name `descriptors` to the resulting table, so relying
    # on the global would pass a DataFrame to Calculator on any later call.
    from mordred import descriptors as descriptor_modules

    # Create descriptor calculator with all descriptors except 3D ones
    calc = Calculator(descriptor_modules, ignore_3D=True)

    # Materialize the input so it can be traversed twice (parse + validation)
    smiles = list(smiles)

    # Obtain an array of molecules from their SMILES strings
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]

    # MolFromSmiles returns None on a parse failure; report those inputs
    # explicitly instead of failing inside the descriptor calculation.
    invalid = [smi for smi, mol in zip(smiles, mols) if mol is None]
    if invalid:
        raise ValueError(
            f"Could not parse {len(invalid)} SMILES string(s), e.g. {invalid[:5]!r}"
        )

    # Calculate descriptors for each molecule
    return calc.pandas(mols)

In [None]:
# Read dfB_GSK3.csv from disk into a new pandas DataFrame
dfB_GSK3 = pd.read_csv(filepath_or_buffer="dfB_GSK3.csv")
# Compute the descriptor table for the stripped SMILES string of every compound.
# NOTE: this rebinds the name `descriptors`, which until this point referred to
# the module imported from mordred at the top of the notebook.
descriptors = calculate_descriptors(smiles=dfB_GSK3['stripped_smiles'])

100%|██████████| 52690/52690 [3:38:27<00:00,  4.02it/s]


In [None]:
# Display the computed descriptor table as the cell output
descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,16.239595,14.261387,0,0,25.730612,2.543796,4.931079,25.730612,1.286531,3.980269,...,10.169576,70.560173,271.106925,8.215361,732,31,114.0,140.0,5.388889,4.250000
1,19.787059,15.693655,0,0,34.072053,2.453043,4.796035,34.072053,1.310464,4.173851,...,9.976599,74.472558,369.114712,8.202549,1769,37,130.0,150.0,7.777778,6.000000
2,23.282153,16.292753,0,1,39.256295,2.448552,4.811904,39.256295,1.353665,4.321034,...,10.292383,78.683446,386.196743,7.572485,2575,45,158.0,187.0,6.750000,6.277778
3,17.556349,14.148622,0,0,30.676848,2.451938,4.794928,30.676848,1.333776,4.058952,...,9.897017,70.969119,327.104148,8.177604,1219,33,116.0,135.0,6.416667,5.333333
4,27.301783,20.975227,0,0,44.680099,2.468249,4.853648,44.680099,1.276574,4.480914,...,10.466156,85.490995,466.211724,7.642815,3927,55,184.0,215.0,11.673611,7.638889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52685,13.463538,11.148546,0,0,22.505019,2.427625,4.751591,22.505019,1.323825,3.784034,...,9.683402,63.766111,241.067368,8.609549,517,23,90.0,105.0,4.305556,3.722222
52686,19.787059,16.067723,0,0,34.396121,2.362730,4.725461,34.396121,1.322928,4.169040,...,9.935325,60.385203,363.129300,7.726155,1693,37,128.0,146.0,7.166667,6.000000
52687,18.470305,17.053512,0,0,30.746578,2.496125,4.977714,30.746578,1.229863,4.117623,...,10.062754,72.565054,362.093643,8.420782,1381,39,122.0,143.0,9.638889,5.916667
52688,20.701015,17.413017,0,0,34.420895,2.486058,4.827938,34.420895,1.274848,4.216229,...,10.109648,77.729277,370.152872,7.554140,1958,40,138.0,162.0,9.250000,6.083333


In [None]:
# Coerce every column to a numeric dtype; entries that cannot be converted become NaN
descriptors = descriptors.apply(lambda column: pd.to_numeric(column, errors='coerce'))
# info() prints its own summary; its return value is then printed as well
info_result = descriptors.info()
print(info_result)

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 52690 entries, 0 to 52689
Columns: 1613 entries, ABC to mZagreb2
dtypes: bool(2), float64(1301), int64(310)
memory usage: 647.7 MB
None


In [None]:
# Delete highly correlated descriptors
# Absolute value of the correlation between every pair of descriptors
corr_matrix = descriptors.corr().abs()

# Largest absolute correlation accepted between two descriptors
threshold = 0.98

# Keep only the strict upper triangle (k=1 excludes the diagonal) so that each
# pair is examined once; every other cell becomes NaN and never exceeds the threshold
upper_matrix = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Columns whose correlation with at least one earlier column exceeds the threshold
to_drop = upper_matrix.columns[(upper_matrix > threshold).any()].tolist()

# Drop selected columns
descriptors.drop(to_drop, inplace=True, axis=1)
descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpMax_A,SpDiam_A,SpMAD_A,VE1_A,VE2_A,VE3_A,...,JGI7,JGI8,JGI9,JGI10,JGT10,TopoShapeIndex,SRW05,TSRW10,WPol,mZagreb1
0,16.239595,14.261387,0,0,2.543796,4.931079,1.286531,4.011757,0.200588,2.082377,...,0.015997,0.012915,0.000000,0.000000,0.392766,0.600000,3.044522,70.560173,31,5.388889
1,19.787059,15.693655,0,0,2.453043,4.796035,1.310464,4.083236,0.157048,2.362401,...,0.014058,0.011896,0.005652,0.007034,0.370310,1.000000,2.397895,74.472558,37,7.777778
2,23.282153,16.292753,0,1,2.448552,4.811904,1.353665,4.118507,0.142017,2.480202,...,0.010417,0.007918,0.003800,0.005644,0.343392,0.888889,2.397895,78.683446,45,6.750000
3,17.556349,14.148622,0,0,2.451938,4.794928,1.333776,4.008801,0.174296,2.221401,...,0.010799,0.010154,0.007788,0.008833,0.345820,1.000000,2.397895,70.969119,33,6.416667
4,27.301783,20.975227,0,0,2.468249,4.853648,1.276574,4.611107,0.131746,2.781231,...,0.013473,0.013338,0.007813,0.007576,0.459354,0.888889,2.397895,85.490995,55,11.673611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52685,13.463538,11.148546,0,0,2.427625,4.751591,1.323825,3.587896,0.211053,1.808194,...,0.012156,0.007407,0.015625,0.000000,0.360525,0.800000,2.397895,63.766111,23,4.305556
52686,19.787059,16.067723,0,0,2.362730,4.725461,1.322928,4.512245,0.173548,2.462306,...,0.008198,0.004625,0.003886,0.004427,0.308286,1.000000,0.000000,60.385203,37,7.166667
52687,18.470305,17.053512,0,0,2.496125,4.977714,1.229863,4.076677,0.163067,2.321573,...,0.015935,0.007788,0.008086,0.005000,0.435791,1.000000,2.397895,72.565054,39,9.638889
52688,20.701015,17.413017,0,0,2.486058,4.827938,1.274848,4.135254,0.153158,2.412800,...,0.015506,0.010719,0.008047,0.006326,0.436836,0.875000,3.044522,77.729277,40,9.250000


In [None]:
# Delete constant descriptors
# A column counts as constant when each of its entries equals its first-row entry
first_row = descriptors.iloc[0]
is_constant = descriptors.eq(first_row).all()
list_constantes = descriptors.columns[is_constant].tolist()
# Remove the constant columns in place
descriptors.drop(columns=list_constantes, inplace=True)
descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpMax_A,SpDiam_A,SpMAD_A,VE1_A,VE2_A,VE3_A,...,JGI7,JGI8,JGI9,JGI10,JGT10,TopoShapeIndex,SRW05,TSRW10,WPol,mZagreb1
0,16.239595,14.261387,0,0,2.543796,4.931079,1.286531,4.011757,0.200588,2.082377,...,0.015997,0.012915,0.000000,0.000000,0.392766,0.600000,3.044522,70.560173,31,5.388889
1,19.787059,15.693655,0,0,2.453043,4.796035,1.310464,4.083236,0.157048,2.362401,...,0.014058,0.011896,0.005652,0.007034,0.370310,1.000000,2.397895,74.472558,37,7.777778
2,23.282153,16.292753,0,1,2.448552,4.811904,1.353665,4.118507,0.142017,2.480202,...,0.010417,0.007918,0.003800,0.005644,0.343392,0.888889,2.397895,78.683446,45,6.750000
3,17.556349,14.148622,0,0,2.451938,4.794928,1.333776,4.008801,0.174296,2.221401,...,0.010799,0.010154,0.007788,0.008833,0.345820,1.000000,2.397895,70.969119,33,6.416667
4,27.301783,20.975227,0,0,2.468249,4.853648,1.276574,4.611107,0.131746,2.781231,...,0.013473,0.013338,0.007813,0.007576,0.459354,0.888889,2.397895,85.490995,55,11.673611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52685,13.463538,11.148546,0,0,2.427625,4.751591,1.323825,3.587896,0.211053,1.808194,...,0.012156,0.007407,0.015625,0.000000,0.360525,0.800000,2.397895,63.766111,23,4.305556
52686,19.787059,16.067723,0,0,2.362730,4.725461,1.322928,4.512245,0.173548,2.462306,...,0.008198,0.004625,0.003886,0.004427,0.308286,1.000000,0.000000,60.385203,37,7.166667
52687,18.470305,17.053512,0,0,2.496125,4.977714,1.229863,4.076677,0.163067,2.321573,...,0.015935,0.007788,0.008086,0.005000,0.435791,1.000000,2.397895,72.565054,39,9.638889
52688,20.701015,17.413017,0,0,2.486058,4.827938,1.274848,4.135254,0.153158,2.412800,...,0.015506,0.010719,0.008047,0.006326,0.436836,0.875000,3.044522,77.729277,40,9.250000


In [None]:
# Delete descriptors whose values are all NaN
# dropna returns a new DataFrame by default and leaves the original untouched,
# so inplace=True is required for the removal to take effect on `descriptors`
descriptors.dropna(axis=1, how='all', inplace=True)
descriptors

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpMax_A,SpDiam_A,SpMAD_A,VE1_A,VE2_A,VE3_A,...,JGI7,JGI8,JGI9,JGI10,JGT10,TopoShapeIndex,SRW05,TSRW10,WPol,mZagreb1
0,16.239595,14.261387,0,0,2.543796,4.931079,1.286531,4.011757,0.200588,2.082377,...,0.015997,0.012915,0.000000,0.000000,0.392766,0.600000,3.044522,70.560173,31,5.388889
1,19.787059,15.693655,0,0,2.453043,4.796035,1.310464,4.083236,0.157048,2.362401,...,0.014058,0.011896,0.005652,0.007034,0.370310,1.000000,2.397895,74.472558,37,7.777778
2,23.282153,16.292753,0,1,2.448552,4.811904,1.353665,4.118507,0.142017,2.480202,...,0.010417,0.007918,0.003800,0.005644,0.343392,0.888889,2.397895,78.683446,45,6.750000
3,17.556349,14.148622,0,0,2.451938,4.794928,1.333776,4.008801,0.174296,2.221401,...,0.010799,0.010154,0.007788,0.008833,0.345820,1.000000,2.397895,70.969119,33,6.416667
4,27.301783,20.975227,0,0,2.468249,4.853648,1.276574,4.611107,0.131746,2.781231,...,0.013473,0.013338,0.007813,0.007576,0.459354,0.888889,2.397895,85.490995,55,11.673611
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52685,13.463538,11.148546,0,0,2.427625,4.751591,1.323825,3.587896,0.211053,1.808194,...,0.012156,0.007407,0.015625,0.000000,0.360525,0.800000,2.397895,63.766111,23,4.305556
52686,19.787059,16.067723,0,0,2.362730,4.725461,1.322928,4.512245,0.173548,2.462306,...,0.008198,0.004625,0.003886,0.004427,0.308286,1.000000,0.000000,60.385203,37,7.166667
52687,18.470305,17.053512,0,0,2.496125,4.977714,1.229863,4.076677,0.163067,2.321573,...,0.015935,0.007788,0.008086,0.005000,0.435791,1.000000,2.397895,72.565054,39,9.638889
52688,20.701015,17.413017,0,0,2.486058,4.827938,1.274848,4.135254,0.153158,2.412800,...,0.015506,0.010719,0.008047,0.006326,0.436836,0.875000,3.044522,77.729277,40,9.250000


In [None]:
# Delete columns (descriptors) in which more than 5% of the values are NaN
nan_fraction = descriptors.isna().mean()
sparse_columns = descriptors.columns[nan_fraction > 0.05]
descriptors.drop(columns=sparse_columns, inplace=True)
# info() prints its own summary; its return value is then printed as well
info_result = descriptors.info()
print(info_result)

<class 'mordred._base.pandas_module.MordredDataFrame'>
RangeIndex: 52690 entries, 0 to 52689
Columns: 793 entries, ABC to mZagreb1
dtypes: float64(660), int64(133)
memory usage: 318.8 MB
None


In [None]:
# Place the bioactivity value of each compound (column gsk3) side by side with its descriptors
bioactivity = dfB_GSK3['gsk3']
df_final = pd.concat([bioactivity, descriptors], axis=1)

In [None]:
# Write the final table (bioactivity plus descriptors) to descriptorsGSK3.csv, leaving out the index
df_final.to_csv(path_or_buf="descriptorsGSK3.csv", index=False)