# Model Preparation


First we import all the necessary packages and scripts

In [5]:
import numpy as np
import pandas as pd
from collections import OrderedDict
from itertools import chain
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.utils import shuffle
from matplotlib import pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from colorama import Fore, Style

#### Generate a subset of SMILES so its faster to test code

In [6]:
def extract_smiles(csv_file_path, output_csv_file, limit=1000):
    # Read the CSV file
    data = pd.read_csv(csv_file_path)

    # Extract the first 'limit' number of SMILES
    subset = data.iloc[:limit]

    # Save the subset to a new CSV file
    subset.to_csv(output_csv_file, index=False)

    print(f"Saved {limit} SMILES to {output_csv_file}")

# Usage
csv_file_path = 'ChEMBL33.csv'
output_csv_file = 'subset_smiles.csv'
extract_smiles(csv_file_path, output_csv_file)

Saved 1000 SMILES to subset_smiles.csv


### Removing unwanted data and cannonicalizing
This involves removing invalid data, duplicates, salts, stereochems. Finally it cannonicalizes the smiles.

In [7]:
def remove_unwanted_data(filename_in='', filename_out='',  invalid=True, duplicates=True, salts=True, stereochem=True, canonicalize=True):
    """Pre-processing of SMILES based on the user-defined parameters
            :param filename_in     path to the file containing the SMILES to pretreat (SMILES only) -- default = ChEMBL
            :param filename_out    path for file export -- default = ../data/
            :param invalid         if True (default), removes invalid SMILES
            :param duplicates      if True (default), removes duplicates
            :param salts           if True (default), removes salts
            :param stereochem      if True (default), removes stereochemistry
            :param canonicalize    if True (default), produces canonical SMILES
    """

    from preprocessor import Preprocessor
    p = Preprocessor(filename_in)

    print(Fore.GREEN + 'Pre-processing of "' + filename_in + '" started.')


    # user-defined pretreatment
    if invalid:
        p.remove_not_valid()
        print(Fore.GREEN + ' invalid SMILES - removed.')
        print(Style.RESET_ALL)

    if duplicates:
        p.remove_duplicates()
        print(Fore.GREEN + ' duplicate SMILES - removed.')
        print(Style.RESET_ALL)

    if salts:
        p.remove_salts()
        print(Fore.GREEN + ' salts - removed.')
        print(Style.RESET_ALL)

    if stereochem:
        p.remove_stereochem()
        print(Fore.GREEN + ' stereochemistry - removed.')
        print(Style.RESET_ALL)

    if canonicalize:
        p.canonicalize()
        print(Fore.GREEN + ' canonicalized SMILES.')
        print(Style.RESET_ALL)

    #save data to file
    p.save_data(filename_out)

    data = p.get_data()
    print(data[:5])


remove_unwanted_data(filename_in='subset_smiles', filename_out='testPreprocessed_Data.csv')


[32mPre-processing of "subset_smiles" started.
[32m invalid SMILES - removed.
[0m
[32m duplicate SMILES - removed.
[0m
[32m salts - removed.
[0m
[32m stereochemistry - removed.
[0m
[32m canonicalized SMILES.
[0m
['Brc1cccc(Nc2ncnc3ccncc23)c1NCCN1CCOCC1' 'C#CC(C)(O)CC'
 'C#CC1CCC(C#N)N1C(=O)CNC12CC3CC(CC(O)(C3)C1)C2'
 'C#CC=CCCCCCCCCCCC=CCCCCCC'
 'C#CC=CCCCCCCCCCCCCCCC=CCCCCC(O)C=CCCCC#CC(O)C#CCCCCCCC=CC(O)C#C']


### Calculate Lipinski rule of 5
Here we will calculate the descriptors that are within the rule of 5: HBD, HBA, MW and LogP. This will be used to compare the subsets of data later.

In [8]:
# Load the preprocessed data
data = pd.read_csv('preprocessed_ChEMBL33.csv')

# Function to calculate Lipinski descriptors
def lipinski_descriptors(smiles):
    """Calculates the descriptors that are within the Lipinski rule of 5 and adds them as separate columns to the data."""
    mol = Chem.MolFromSmiles(smiles)
    hbd = Descriptors.NumHDonors(mol)
    hba = Descriptors.NumHAcceptors(mol)
    mw = Descriptors.MolWt(mol)
    logp = Descriptors.MolLogP(mol)
    return hbd, hba, mw, logp

# Apply the function to each row in the DataFrame
data[['HBD', 'HBA', 'MW', 'LogP']] = data.apply(
    lambda row: lipinski_descriptors(row['standardized_smiles']), axis=1, result_type='expand')

# Save the data with descriptors
data.to_csv('ChEMBL33_with_descriptors.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: 'preprocessed_ChEMBL33.csv'

### One Hot Encoding

In [9]:
def one_hot_encode_smiles(csv_file_path):
    """Converts SMILES to one-hot encoded sequences."""
    Smiles_data = pd.read_csv(csv_file_path)
    smiles_list = Smiles_data.iloc[:, 0].tolist()
    tokenizer = Tokenizer(char_level=True)
    tokenizer.fit_on_texts(smiles_list)

    sequences = tokenizer.texts_to_sequences(smiles_list)
    one_hot = tokenizer.sequences_to_matrix(sequences, mode='binary')

    return one_hot, tokenizer

csv_file_path = 'testPreprocessed_Data.csv'
one_hot_encoded_smiles, tokenizer = one_hot_encode_smiles(csv_file_path)

one_hot_encoded_smiles[0:5]

array([[0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0.]])

### Padding sequence

In [10]:
def pad_smiles_sequences(encoded_sequences, max_length=None):
    """Ensures all encoded sequences are of the same length by padding shorter sequences with zeros. You can choose a suitable maximum length or leave max_length set to None to let the function determine it based on the longest sequence in the data."""
    if max_length is None:
        max_length = max(len(seq) for seq in encoded_sequences)

    padded_sequences = pad_sequences(encoded_sequences, maxlen=max_length, padding='post')

    return padded_sequences


padded_smiles = pad_smiles_sequences(one_hot_encoded_smiles, max_length=None)
padded_smiles[0:5]

array([[0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0]])

### Splitting into subsets

method from skinnider paper, but adapted

In [98]:
#HELE BASALE CODE, MOET NOG VERBETERD WORDEN.
train_data = np.genfromtxt('testPreprocessed_Data.csv', dtype="U")

def subset_splitting(datafile, num_subsets=6, step_size=25000): 
    sample_sizes = [50000 + (i * step_size) for i in range(num_subsets)]
    
    #remove all subset sizes > than datafile size, as we cannot generate these
    #sample_sizes = [size for size in sample_sizes if size < len(datafile)]
    return sample_sizes
    
groups = subset_splitting(train_data)

def subset_creation(datafile, group_num, groups):
    df = pd.DataFrame(datafile)
    if group_num < len(groups):
        subset_size = groups[group_num]
        dataset = df.sample(subset_size, replace=True, random_state=42)
        return dataset
    else:
        print("Invalid group_num")

random_subset = subset_creation(train_data, 1, groups) #returns dataset from bigger file containing x amount of SMILES which are randomly sampled.

### Compare subsets based on rule of 5

### Split each subset into test/train/validation

In [100]:
validation_fraction = 0.2

# Print the length of the original data set
print(f"Original Data Set Length: {len(train_data)}")

# Split the data into training and validation sets
training, val_data = train_test_split(train_data, test_size=validation_fraction, random_state=42)

# Print the length of the updated training set
print(f"Updated Training Set Length: {len(training)}")

# Print the length of the validation set
print(f"Validation Set Length: {len(val_data)}")

Original Data Set Length: 992
Updated Training Set Length: 793
Validation Set Length: 199
