# Chem data sanitization

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
from rdkit import Chem
from rdkit.Chem import SaltRemover
from mordred import Calculator, descriptors

## Load the Dataset

In [None]:
# Read the raw GSK3 dataset from disk into a pandas DataFrame
df_GSK3 = pd.read_csv(filepath_or_buffer="./datasets/GSK3_JNK3/GSK3/all.txt")

## Database curation from a chemical point of view


In [None]:
# Function to obtain the Canonical SMILES from a SMILES string
def get_canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string to canonicalize.

    Returns
    -------
    str
        Canonical SMILES as produced by ``Chem.MolToSmiles``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    # Convert the SMILES string to a molecule
    mol = Chem.MolFromSmiles(smiles)

    # MolFromSmiles reports a parse failure by returning None instead of
    # raising; fail here with the offending input rather than letting
    # MolToSmiles raise an opaque argument error.
    if mol is None:
        raise ValueError("Could not parse SMILES string: {!r}".format(smiles))

    # Convert the molecule to a canonical SMILES string
    return Chem.MolToSmiles(mol)

# Canonicalize every SMILES string found in the "smiles" column of the GSK3 DataFrame
canonical_df_GSK3 = list(map(get_canonical_smiles, df_GSK3["smiles"]))

# Store the canonical SMILES as the first column of the dataframe, named canonical_smiles
df_GSK3.insert(0, "canonical_smiles", canonical_df_GSK3)

In [None]:
# Salt and solvent removal
# Cache for the default SaltRemover so the salt definitions are loaded only once
_DEFAULT_SALT_REMOVER = None


def _get_default_salt_remover():
    """Return a shared SaltRemover built from RDKit's default salt definitions."""
    global _DEFAULT_SALT_REMOVER
    if _DEFAULT_SALT_REMOVER is None:
        # Default definitions: https://github.com/rdkit/rdkit/blob/master/Data/Salts.txt
        _DEFAULT_SALT_REMOVER = SaltRemover.SaltRemover()
    return _DEFAULT_SALT_REMOVER


def salt_solvent_remover(smiles, remover=None):
    """Strip salt/solvent fragments from a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES string of the compound to clean.
    remover : SaltRemover.SaltRemover, optional
        Remover to use. When omitted, a shared remover built from RDKit's
        default salt definitions is used, so it is not rebuilt on every call.

    Returns
    -------
    str
        SMILES of the molecule after salt stripping.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles`` into a molecule.
    """
    if remover is None:
        remover = _get_default_salt_remover()

    # Convert the SMILES string to a molecule
    mol = Chem.MolFromSmiles(smiles)

    # MolFromSmiles returns None on a parse failure; report the bad input
    if mol is None:
        raise ValueError("Could not parse SMILES string: {!r}".format(smiles))

    # Remove salts and solvents from the molecule. dontRemoveEverything=True
    # keeps a fragment when every fragment matches a salt definition, so the
    # result is never an empty molecule (which would serialize to an empty
    # SMILES string).
    stripped_mol = remover.StripMol(mol, dontRemoveEverything=True)

    # Convert the stripped molecule to the stripped SMILES
    return Chem.MolToSmiles(stripped_mol)

# Strip salts and solvents from every canonical SMILES string in the GSK3 DataFrame
stripped_df_GSK3 = list(map(salt_solvent_remover, df_GSK3["canonical_smiles"]))

# Store the stripped SMILES in a new dataframe column called stripped_smiles
df_GSK3["stripped_smiles"] = stripped_df_GSK3

In [None]:
# Duplicate analysis and removal: keep only the first row seen for each stripped SMILES
df_GSK3_without_duplicates = df_GSK3[~df_GSK3.duplicated(subset=['stripped_smiles'])]
df_GSK3 = df_GSK3_without_duplicates

In [None]:
# Write the curated DataFrame to dfA_GSK3.csv, leaving the row index out of the file
df_GSK3.to_csv(path_or_buf="dfA_GSK3.csv", index=False)

## Descriptors



In [None]:
# Read dfA_GSK3.csv back from disk into a fresh pandas DataFrame
dfA_GSK3 = pd.read_csv(filepath_or_buffer="dfA_GSK3.csv")

In [None]:
# Remove, by row label, the chemical compounds that produce warnings when their descriptors are computed
dfA_GSK3 = dfA_GSK3.drop(index=[2378, 8075, 13886, 20182, 22864, 28373, 29626, 42596, 42283, 44180, 44296, 46363, 47018, 52572])

# Write the filtered DataFrame to dfB_GSK3.csv, leaving the row index out of the file
dfA_GSK3.to_csv(path_or_buf="dfB_GSK3.csv", index=False)

In [None]:
# Selection of descriptors types to compute: 0-1-2D
# Function to calculate descriptors for an array of SMILES strings
def calculate_descriptors(smiles):
    """Compute all non-3D Mordred descriptors for a collection of SMILES strings.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings, one per compound.

    Returns
    -------
    pandas.DataFrame
        Result of ``Calculator.pandas``: one row per input molecule, in input
        order, one column per descriptor.

    Raises
    ------
    ValueError
        If any SMILES string cannot be parsed by RDKit.
    """
    # Import the descriptor package under a private alias. This notebook later
    # rebinds the module-level name `descriptors` to the resulting DataFrame,
    # so relying on that global would hand a DataFrame to Calculator on any
    # subsequent call.
    from mordred import descriptors as mordred_descriptors

    # Materialize once so the input can be iterated twice (parse + error report)
    smiles = list(smiles)

    # Obtain a list of molecules from their SMILES strings
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]

    # MolFromSmiles returns None on a parse failure; stop with the offending
    # inputs instead of failing inside the descriptor calculation. Rows are not
    # silently skipped so the output stays aligned with the input.
    invalid = [smi for smi, mol in zip(smiles, mols) if mol is None]
    if invalid:
        raise ValueError("Could not parse SMILES string(s): {!r}".format(invalid))

    # Create descriptor calculator with all descriptors except 3D ones
    calc = Calculator(mordred_descriptors, ignore_3D=True)

    # Calculate descriptors for each molecule
    return calc.pandas(mols)

In [None]:
# Read dfB_GSK3.csv back from disk into a fresh pandas DataFrame
dfB_GSK3 = pd.read_csv(filepath_or_buffer="dfB_GSK3.csv")
# Compute the dataframe of descriptors for every stripped SMILES string
descriptors = calculate_descriptors(smiles=dfB_GSK3['stripped_smiles'])

In [None]:
# Show the descriptors object as this cell's output
descriptors

In [None]:
# Coerce every descriptor column to a numeric dtype; values that cannot be converted become NaN
descriptors = descriptors.apply(lambda column: pd.to_numeric(column, errors='coerce'))
print(descriptors.info())

In [None]:
# Delete highly correlated descriptors
# Absolute correlation between every pair of descriptors
corr_matrix = descriptors.corr().abs()

# Maximum absolute correlation accepted between two descriptors
threshold = 0.98

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each pair
# is considered once; every other cell becomes NaN
upper_matrix = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))

# Columns whose correlation with at least one earlier column exceeds `threshold`.
# NaN > threshold evaluates to False, so the masked cells never select a column.
to_drop = upper_matrix.columns[(upper_matrix > threshold).any(axis=0)].tolist()

# Drop selected columns
descriptors.drop(columns=to_drop, inplace=True)
descriptors

In [None]:
# Delete constant descriptors
# A column counts as constant when every one of its values equals the value in the first row
list_constantes = [
    name
    for name, is_constant in descriptors.eq(descriptors.iloc[0]).all(axis=0).items()
    if is_constant
]
# Remove the constant columns from the dataframe in place
descriptors.drop(columns=list_constantes, inplace=True)
descriptors

In [None]:
# Delete descriptors whose values are all NaN
# dropna returns a new DataFrame unless inplace=True is given; without it the
# result is discarded and `descriptors` is left unchanged
descriptors.dropna(axis=1, how='all', inplace=True)
descriptors

In [None]:
# Remove every descriptor column whose fraction of NaN values exceeds 5%
descriptors.drop(columns=descriptors.columns[descriptors.isna().mean().gt(0.05)], inplace=True)
print(descriptors.info())

In [None]:
# Place the bioactivity value of each chemical compound next to its descriptors (column-wise join on the row index)
df_final = pd.concat(objs=[dfB_GSK3['gsk3'], descriptors], axis='columns')

In [None]:
# Write the final table to descriptorsGSK3.csv, leaving the row index out of the file
df_final.to_csv(path_or_buf="descriptorsGSK3.csv", index=False)