This series of notebooks will guide you through the process of visualizing a large number of compounds represented as SMILES as a part of the chemical space using the folllowing steps:

1. The compounds were first be converted into Morgan fingerprints in bit format.
2. Principal Component Analysis(PCA) was performed to reduce the number of dimensions.
3. t-SNE was performed to create a 2D visualization of the chemical space.

[REINVENT](github.com/MolecularAI/Reinvent)'s generative model and Reinforcement Learning(RL) architecture was used to generate active compounds for the DRD2 receptor. A random forest regressor model trained to predict pChEMBL values for the DRD2 receptor was used as the predictor model in the RL system. ChEMBL dataset was used to represent the entire chemical space.

At the end of this notebooks, the fingerprints were saved in .csv format to be used in part 2.

In [1]:
import os
import pandas as pd
import numpy as np
import random

from rdkit import Chem, DataStructs
import rdkit.Chem as rkc
import rdkit.Chem.AllChem as rkac
import rdkit.Chem.Scaffolds.MurckoScaffold as mrks

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [5]:
data_path = 'Data/drd2_cluster.csv'

In [6]:
df = pd.read_csv(data_path)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378990 entries, 0 to 378989
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   SMILES   378990 non-null  object
 1   Dataset  378990 non-null  object
dtypes: object(2)
memory usage: 5.8+ MB


In [10]:
df.groupby(['Dataset']).count()

'''
The loaded dataset contains the following compounds:

1. DRD2_training: Dataset used to train the RFR model to predict pChEMBL values
2. REINVENT: Active Compounds generated by REINVENT's Reinforcement Learning System for the DRD2 receptor
3. ReLeaSE: Active Compounds generated by ReLeaSE's Reinforcement Learning System for the DRD2 receptor
4. ChEMBL: All compounted enlisted in the ChEMBL_22 datset

'''

"\nThe loaded dataset contains the following compounds:\n\n1. DRD2_training: Dataset used to train the RFR model to predict pChEMBL values\n2. REINVENT: Active Compounds generated by REINVENT's Reinforcement Learning System for the DRD2 receptor\n3. ReLeaSE: Active Compounds generated by ReLeaSE's Reinforcement Learning System for the DRD2 receptor\n4. ChEMBL: All compounted enlisted in the ChEMBL_22 datset\n\n"

In [11]:
# separate the rest of the compounds from the chembl dataset
df_other = df[df['Dataset'] != 'chembl']
df_other.count()

SMILES     26460
Dataset    26460
dtype: int64

In [12]:
# we will only use a subset of the compounds in the space to save computational time
df_space = df[df['Dataset'] == 'chembl']
df_space = df_space.sample(n = 150000)

In [13]:
# concatenate the rest of the groups with the samples chembl dataset to create a final dataframe
df_c = pd.concat([df_other, df_space], axis = 0)
print(df_c.count())
df_c = df_c.reset_index(drop = True)
df_c.head()

SMILES     176460
Dataset    176460
dtype: int64


Unnamed: 0,SMILES,Dataset
0,O(CCCCN1CCN(c2ccccc2OC)CC1)c1ccc2c(c1)NC(=O)CC2,REINVENT
1,c1(C(=O)CCCCN2CCC3(N(c4ccccc4)CNC3=O)CC2)ccc(F...,REINVENT
2,N(C1CCC(CCN2CCN(c3cccc(Cl)c3Cl)CC2)CC1)C(N(C)C)=O,REINVENT
3,N(C(=O)C(C)C)C1CCC(CCN2CCN(c3cccc(Cl)c3Cl)CC2)CC1,REINVENT
4,Clc1ccc(C2CCN(CCCC(=O)c3ccc(Cl)cc3)CC2)cc1,REINVENT


In [14]:
df_c.tail()

Unnamed: 0,SMILES,Dataset
176455,Cn1cc(-c2cc(C(F)(F)F)ccc2-c2cccc3c2CCN(S(=O)(=...,chembl
176456,COc1ccc(-c2ccnc(NC3CCCCCC3)n2)cc1OC1CCCC1,chembl
176457,COc1cc(N(Cc2ccccc2)Cc2ccccc2)c2ncn(C(C)C)c2c1,chembl
176458,CCOC(=O)c1ccc(C(C)=NNC(N)=S)cc1,chembl
176459,CC(C)CC(NC(=O)CCSc1ccc2ccccc2c1)C(=O)NC1CC(=O)...,chembl


In [15]:
df_c.groupby(['Dataset']).count()

Unnamed: 0_level_0,SMILES
Dataset,Unnamed: 1_level_1
DRD2_training,8359
REINVENT,9050
ReLeaSE,9051
chembl,150000


In [16]:
df_c.isnull().sum()

SMILES     0
Dataset    0
dtype: int64

### Clean the dataset to remove invalid SMILES

In [17]:
# function to identify invalid smiles

def count_invalid(smi):
    
    global i
    if smi:
        # setting sanitize to false avoids explicit valence error
        # setting it to true gives the error and can be used to count invalid mols
        mol = rkc.MolFromSmiles(smi)
        if mol is None:
            return int(1)
        else:
            pass
        return int(0)

In [20]:
# extract SMILES from the df and convert tolist
smiles_list = df_c['SMILES'].tolist()

In [21]:
len(smiles_list)

176460

In [22]:
# we will use pooling here to run the job in parallel

import multiprocess as mp

# use (n - 10) number of CPUs
pool = mp.Pool(mp.cpu_count() - 10)

RDKit ERROR: [16:55:00] Explicit valence for atom # 15 C, 6, is greater than permitted
RDKit ERROR: [16:55:00] Explicit valence for atom # 36 O, 4, is greater than permitted
RDKit ERROR: [16:55:00] Explicit valence for atom # 19 N, 5, is greater than permitted


In [23]:
# create a list to mark invalid compounds: 1 = Invalid, 0 = Valid
invalid_list = pool.map(count_invalid, smiles_list)
pool.close()

In [24]:
len(invalid_list)

176460

In [25]:
# total number of invalid SMILES

# if this is 0 skip the next 6 cells
pd.Series(invalid_list).sum()

3

In [26]:
# convert list to series
invalid_ser = pd.Series(invalid_list, name = 'Invalid')

In [27]:
# concatenate list with dataframe to drop all invalid rows
df_c = df_c.join(invalid_ser)

In [28]:
df_c.head()

Unnamed: 0,SMILES,Dataset,Invalid
0,O(CCCCN1CCN(c2ccccc2OC)CC1)c1ccc2c(c1)NC(=O)CC2,REINVENT,0
1,c1(C(=O)CCCCN2CCC3(N(c4ccccc4)CNC3=O)CC2)ccc(F...,REINVENT,0
2,N(C1CCC(CCN2CCN(c3cccc(Cl)c3Cl)CC2)CC1)C(N(C)C)=O,REINVENT,0
3,N(C(=O)C(C)C)C1CCC(CCN2CCN(c3cccc(Cl)c3Cl)CC2)CC1,REINVENT,0
4,Clc1ccc(C2CCN(CCCC(=O)c3ccc(Cl)cc3)CC2)cc1,REINVENT,0


In [29]:
# also check for null values
df_c.isnull().sum()

SMILES     0
Dataset    0
Invalid    0
dtype: int64

In [30]:
# create new df with only valid SMILES
df = df_c.drop(df_c[df_c.Invalid == 1].index)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 176457 entries, 0 to 176459
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   SMILES   176457 non-null  object
 1   Dataset  176457 non-null  object
 2   Invalid  176457 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 5.4+ MB


In [32]:
# drop 'invalid' label column
df = df.drop(['Invalid'], axis = 1)
df.head()

Unnamed: 0,SMILES,Dataset
0,O(CCCCN1CCN(c2ccccc2OC)CC1)c1ccc2c(c1)NC(=O)CC2,REINVENT
1,c1(C(=O)CCCCN2CCC3(N(c4ccccc4)CNC3=O)CC2)ccc(F...,REINVENT
2,N(C1CCC(CCN2CCN(c3cccc(Cl)c3Cl)CC2)CC1)C(N(C)C)=O,REINVENT
3,N(C(=O)C(C)C)C1CCC(CCN2CCN(c3cccc(Cl)c3Cl)CC2)CC1,REINVENT
4,Clc1ccc(C2CCN(CCCC(=O)c3ccc(Cl)cc3)CC2)cc1,REINVENT


In [49]:
df.reset_index(drop = True, inplace = True)
df.tail()

Unnamed: 0,SMILES,Dataset
176452,Cn1cc(-c2cc(C(F)(F)F)ccc2-c2cccc3c2CCN(S(=O)(=...,chembl
176453,COc1ccc(-c2ccnc(NC3CCCCCC3)n2)cc1OC1CCCC1,chembl
176454,COc1cc(N(Cc2ccccc2)Cc2ccccc2)c2ncn(C(C)C)c2c1,chembl
176455,CCOC(=O)c1ccc(C(C)=NNC(N)=S)cc1,chembl
176456,CC(C)CC(NC(=O)CCSc1ccc2ccccc2c1)C(=O)NC1CC(=O)...,chembl


### Creating Fingerprints

In [50]:

# returns morganfingerprints of a molecule in an array format
def get_fp_array(mol):
    fp = rkac.GetMorganFingerprintAsBitVect(mol, radius = 2, nBits = 2048)  
    array = np.zeros((1,), np.int)
    DataStructs.ConvertToNumpyArray(fp,array)
    return array

# gets list of fingerprints for a list of SMILES
def get_fp(smile):
    mol = Chem.MolFromSmiles(smile)
    fp = get_fp_array(mol)
    return fp

In [51]:
# extract only smiles from the df
smiles_list = df['SMILES'].tolist()

In [52]:
pool = mp.Pool(mp.cpu_count() - 10)

In [53]:
# create a list of morgan/circular fingerprints in bit format
fps_list = pool.map(get_fp,smiles_list)
pool.close()

In [54]:
len(fps_list)

176457

In [55]:
# create a series to store the datset that each fingerprint belongs to, we will use this for labeling
dataset_name = df['Dataset']

In [56]:
# use dtype = bool to save space
fps_arr = np.array(fps_list, dtype = np.bool)

In [57]:
fps_arr.shape

(176457, 2048)

In [58]:
fps_df = pd.DataFrame(fps_arr)

In [59]:
fps_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176457 entries, 0 to 176456
Columns: 2048 entries, 0 to 2047
dtypes: bool(2048)
memory usage: 344.6 MB


In [60]:
fps_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [61]:
# append with series that contains the datset that each datapoint belongs to

df_final = pd.concat([dataset_name, fps_df], axis = 1)

In [62]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176457 entries, 0 to 176456
Columns: 2049 entries, Dataset to 2047
dtypes: bool(2048), object(1)
memory usage: 346.0+ MB


In [63]:
df_final.tail()

Unnamed: 0,Dataset,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
176452,chembl,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
176453,chembl,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
176454,chembl,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
176455,chembl,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
176456,chembl,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False


In [64]:
#check for null values
df_final.isnull().sum()

Dataset    0
0          0
1          0
2          0
3          0
          ..
2043       0
2044       0
2045       0
2046       0
2047       0
Length: 2049, dtype: int64

In [65]:
df_final = df_final.dropna(axis = 0)

In [66]:
df_final.head()

Unnamed: 0,Dataset,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,REINVENT,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,REINVENT,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,REINVENT,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,REINVENT,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,REINVENT,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [67]:
# save data
df_final.to_csv('Data/drd2_cluster_fingerprints.csv', index = False)