In [1]:
import pandas as pd
import numpy as np

from rdkit.Chem import MolFromSmiles, MolToSmiles, Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

## Important

If you only want to predict the IE and EA of new molecules with the current models, you can skip this notebook and go straight to `nb10_prediction_of_new_molecules.ipynb`.

## Canonical smiles

Two molecules might be written with different SMILES strings,

In [None]:
# First spelling of the example molecule.
smi1 = 'C=CCC'
mol1 = MolFromSmiles(smi1)
mol1  # last expression of the cell, so the notebook displays the parsed molecule

In [None]:
# Second spelling: the same characters as smi1 in a different order.
smi2 = 'CCC=C'
mol2 = MolFromSmiles(smi2)
mol2  # last expression of the cell, so the notebook displays the parsed molecule

but still be the same chemically. In other words, their descriptors are identical:

In [4]:
# Each entry of Descriptors._descList starts with the descriptor's name;
# collect those names and build one calculator that evaluates all of them.
descriptor_names = [name for name, *_ in Descriptors._descList]
calc = MoleculeDescriptors.MolecularDescriptorCalculator(descriptor_names)

In [5]:
# Run the same descriptor calculator over both parsed molecules.
features1, features2 = (calc.CalcDescriptors(parsed) for parsed in (mol1, mol2))

In [None]:
# Element-wise comparison of the two descriptor vectors, using np.allclose's default tolerances.
np.allclose(features1, features2)

If one of these molecules is in the training set and the other in the test set, this will cause train-test leakage. Therefore we will convert all SMILES to their canonical versions and check for duplicates. The way to do this is to convert

**smiles->mol->smiles**

which will return canonical smiles.

In [None]:
# Plain string comparison of the raw SMILES: 'C=CCC' vs 'CCC=C'.
smi1 == smi2

In [None]:
# Round-trip each SMILES through a Mol object (smiles -> mol -> smiles),
# then compare the resulting strings.
smi1_canonical, smi2_canonical = (
    MolToSmiles(MolFromSmiles(raw)) for raw in (smi1, smi2)
)
print(smi1_canonical, smi2_canonical)
smi1_canonical == smi2_canonical

## Standardization

https://molvs.readthedocs.io/en/latest/guide/standardize.html

Not all SMILES are in their standard form.

In [None]:
# Example SMILES written with explicit formal charges ([n+] and [N-]).
non_standard_smi = 'C[n+]1c([N-](C))cccc1'
non_standard_smi

In [None]:
from molvs import Standardizer
# One Standardizer instance, used by the following cells as `s`.
s = Standardizer()

# Parse the example, standardize the molecule, and write the result back out as SMILES.
non_standard_mol = MolFromSmiles(non_standard_smi)
standard_mol = s.standardize(non_standard_mol)
standard_smi = MolToSmiles(standard_mol)
standard_smi

Note that this is different from canonicalization. If a molecule is non-standard, then after standardization both its SMILES and its chemical properties will be different.

In [None]:
# Compute descriptors before and after standardization and compare them element-wise.
features_non_standard = calc.CalcDescriptors(non_standard_mol)
features_standard = calc.CalcDescriptors(standard_mol)
np.allclose(features_non_standard, features_standard)

If a molecule is already in its standard form, standardization will simply return its canonical form:

In [None]:
# Standardize a molecule that is already in standard form and show the SMILES that comes back.
mol = MolFromSmiles('CCC=C')
smol = s.standardize(mol)
MolToSmiles(smol)

We can check whether a SMILES is non-standard by comparing its standardized SMILES to its canonical SMILES. For a standard molecule the two are equal; for a non-standard molecule they differ.

In [None]:
example_1 = 'C[n+]1c([N-](C))cccc1'
mol = MolFromSmiles(example_1)

# SMILES of the standardized molecule
smol = s.standardize(mol)
s_smi = MolToSmiles(smol)

# SMILES of the molecule exactly as parsed (canonical form)
c_smi = MolToSmiles(mol)

# The molecule counts as standard when standardization leaves its SMILES unchanged.
print('Standard' if s_smi == c_smi else 'Non standard')

In [None]:
example_2 = 'CCC=C'
mol = MolFromSmiles(example_2)

# SMILES of the standardized molecule
smol = s.standardize(mol)
s_smi = MolToSmiles(smol)

# SMILES of the molecule exactly as parsed (canonical form)
c_smi = MolToSmiles(mol)

# The molecule counts as standard when standardization leaves its SMILES unchanged.
print('Standard' if s_smi == c_smi else 'Non standard')

## Validation

Some SMILES are not even valid. We can detect these by checking what is returned from the `MolFromSmiles` method.

In [None]:
# Example of a SMILES that cannot be parsed.
non_valid_smi = '[SiH2]1cc2ccc3ccc4-c5-nccc-c5n-c4c3c2c1'
mol = MolFromSmiles(non_valid_smi)
# Printing shows what MolFromSmiles returned; the dataset check below treats None as "non-valid".
print(mol)

In [None]:
# Alternative: run MolVS's validate_smiles on the same string and display what it reports.
from molvs import validate_smiles
validate_smiles(non_valid_smi)

## Molecule check

Let's apply the principles explained above to our dataset.

1. Find non-valid SMILES
2. Find non-standard SMILES
3. Remove non-valid and non-standard SMILES
4. Convert all SMILES to their canonical versions
5. Remove duplicate SMILES

In [None]:
# Dataset 1 is semicolon-separated. Keep the first column and the last two,
# and rename 'smiles' to 'Smiles' so it lines up with dataset 2.
source_df_1 = pd.read_csv('source_data/source_dataset_1.csv', sep=';')
source_df_1 = (
    source_df_1
    .drop(columns=source_df_1.columns[1:-2])
    .rename(columns={'smiles': 'Smiles'})
)
source_df_1

In [None]:
# Dataset 2 is comma-separated and is used as loaded.
source_df_2 = pd.read_csv('source_data/source_dataset_2.csv', sep=',')
source_df_2

In [None]:
# Stack the two datasets. ignore_index=True gives the combined frame a fresh 0..n-1 index,
# so the positional indices returned by molecule_checker below can be used as row labels.
source_df = pd.concat([source_df_1, source_df_2], ignore_index=True)
source_df

In [20]:
# Pull the SMILES column out as a plain Python list for the checker,
# then release the two per-source frames (source_df holds the combined data).
smiles = source_df['Smiles'].tolist()
del source_df_1, source_df_2

In [21]:
# NOTE(review): Standardizer is already imported and instantiated as `s` in the
# Standardization section; this cell repeats it so `s` exists for molecule_checker.
from molvs import Standardizer
s = Standardizer()

In [22]:
def molecule_checker(smiles, standardizer=None):
    """Find the positions of non-valid and non-standard SMILES.

    A SMILES is recorded as *non-valid* when it is not a string (for example
    a missing value read from a CSV file) or when ``MolFromSmiles`` returns
    ``None`` for it. A valid SMILES is recorded as *non-standard* when the
    SMILES written from its standardized molecule differs from the SMILES
    written from the molecule as parsed.

    Parameters
    ----------
    smiles : iterable
        SMILES strings to check.
    standardizer : object with a ``standardize(mol)`` method, optional
        Standardizer to use. Defaults to the notebook-level ``s``.

    Returns
    -------
    non_valid_smiles : list of int
        Positions (0-based, in iteration order) of the non-valid entries.
    non_standard_smiles : list of int
        Positions of the valid but non-standard entries.
    """
    if standardizer is None:
        # Fall back to the notebook-level Standardizer instance.
        standardizer = s

    non_valid_smiles = []
    non_standard_smiles = []
    for i, smi in enumerate(smiles):
        # Non-string entries are never handed to the parser; they are
        # recorded as non-valid so one bad cell cannot abort a long run.
        mol = MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            non_valid_smiles.append(i)
            continue

        s_smiles = MolToSmiles(standardizer.standardize(mol))
        c_smiles = MolToSmiles(mol)
        if s_smiles != c_smiles:
            non_standard_smiles.append(i)

    return non_valid_smiles, non_standard_smiles

In [None]:
# Run the validity / standardization check over every SMILES in the combined dataset.
# It took 40 minutes
non_valid_smiles, non_standard_smiles = molecule_checker(smiles)

In [None]:
# Report how many entries fell into each category.
print(f'There are {len(non_valid_smiles)} non valid smiles.')
print(f'There are {len(non_standard_smiles)} non standard smiles.')

In [25]:
import pickle
# Persist the list of non-valid row indices to the outputs folder.
with open('outputs/non_valid_smiles.pkl', 'wb') as f:
    pickle.dump(non_valid_smiles, f)

In [26]:
# The plain list was only needed for molecule_checker; source_df still holds the SMILES column.
del smiles

In [None]:
# Step 3 of the plan: remove the non-valid AND the non-standard rows.
# molecule_checker returns positions in the SMILES list, which match the row
# labels because source_df was built with ignore_index=True.
source_df = source_df.drop(non_valid_smiles + non_standard_smiles)
source_df

In [28]:
# Convert all SMILES to canonical (smiles -> mol -> smiles), one value at a time
# It took 7 minutes 11 seconds
source_df['Smiles'] = source_df['Smiles'].apply(
    lambda smi: MolToSmiles(MolFromSmiles(smi))
)

In [None]:
# Drop duplicates: rows whose 'Smiles' value has already been seen are removed,
# so the first occurrence of each SMILES is kept.
source_df = source_df.drop_duplicates(subset='Smiles')
source_df

In [30]:
# Write the processed dataset as comma-separated text, without the index column.
source_df.to_csv('outputs/processed_source_dataset.csv', sep=',', index=False)

In [31]:
# The processed frame has been written to disk; drop the in-memory reference.
del source_df

The first raw dataset contained 233 molecules and the second contained 2,310,850 molecules. These were combined into one raw dataset of 2,311,083 molecules. 9,821 non-valid and 0 non-standard SMILES were detected and removed, which left 2,301,262 molecules. These were converted to canonical SMILES, and 28,240 duplicates were detected and removed. The processed dataset contains 2,273,022 valid, standard and unique molecules. Note that all non-valid and duplicate molecules were detected in the second raw dataset.