In [1]:
# import packages

import os
import pandas as pd
import numpy as np
import random

from rdkit import Chem
import rdkit.Chem as rkc
import rdkit.Chem.AllChem as rkac
import rdkit.Chem.Scaffolds.MurckoScaffold as mrks

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# provide path to data

data_path = "Data/DRD2_RAW_SMILES.csv"

df = pd.read_csv(data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10983 entries, 0 to 10982
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Smiles         10968 non-null  object 
 1   pChEMBL Value  10983 non-null  float64
dtypes: float64(1), object(1)
memory usage: 171.7+ KB


In [3]:
df.head(20)

Unnamed: 0,Smiles,pChEMBL Value
0,CCCSc1nnc(-c2ccccc2)n1C,3.74
1,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1,4.0
2,COc1ccccc1N1CCC2(CCNCC2)CC1,4.0
3,Cl.NCCc1ccc(O)c(O)c1,4.0
4,Cn1c(SCCCN2CCCCC2)nnc1-c1ccccc1,4.01
5,O=C(CN1CCN(c2ccccn2)CC1)Nc1cncnc1,4.03
6,Cc1ccc(C2CCN(CC(=O)Nc3cccc(C)c3)CC2)cc1,4.05
7,CCc1cccc(NC(=O)CN2CCN(c3ccccn3)CC2)c1,4.05
8,O=C(CN1CCN(c2ccc(Cl)cn2)CC1)Nc1cccnc1,4.06
9,Oc1ccc(N2CCN(Cc3cnn4ccccc34)CC2)cc1,4.06


In [4]:
# provide column name containing the SMILES
smi_column = "Smiles"

# provide max SMILE length in characters
max_len = 130

In [5]:
# check for numm values and drop any of present
df.isnull().sum()

Smiles           15
pChEMBL Value     0
dtype: int64

In [6]:
df.dropna(inplace = True)

# reset index after performing drop
df.reset_index(drop = True, inplace = True)

# check again for null values
df.isnull().sum()

Smiles           0
pChEMBL Value    0
dtype: int64

In [7]:
# check the total number of smiles again
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10968 entries, 0 to 10967
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Smiles         10968 non-null  object 
 1   pChEMBL Value  10968 non-null  float64
dtypes: float64(1), object(1)
memory usage: 171.5+ KB


In [8]:
# this function removes stereochemistry from SMILES in the dataframe

def remove_stereochemistry():
    
    curated_smiles = []
    
    global df, smi_column
    smiles_ser = df[smi_column]
    smiles_list = smiles_ser.tolist()
    
    for smiles in smiles_list:
        smi_curated = smiles.replace('@','')
        curated_smiles.append(smi_curated)
    
    # drop column with old SMILES
    df.drop([smi_column], inplace = True, axis = 1)
    
    #convert list to new series
    curated_smiles_ser = pd.DataFrame(curated_smiles, columns = ['Smiles'])
    df = curated_smiles_ser.join(df, how = 'right')
    return None

### Run the cell below to remove stereochemistry

In [9]:
remove_stereochemistry()
df.tail(20)

Unnamed: 0,Smiles,pChEMBL Value
10948,CCCN(CCc1cccs1)[CH]1CCc2c(O)cccc2C1,10.22
10949,CC(C)C[CH](NC(=O)[CH]1CCCN1)C(=O)NCC(N)=O,10.22
10950,O=C1NCc2ccc(OCCCCN3CCN(c4cccc5c4CCC5)CC3)cc21,10.24
10951,O=C1NCc2ccc(OCCCCN3CCN(c4cccc5c4CCC5)CC3)cc21,10.24
10952,O=C(CCCN1[CH]2CC[CH]1CC(O)(c1ccc(Cl)cc1)C2)c1c...,10.26
10953,O=C(CCCN1[CH]2CC[CH]1CC(O)(c1ccc(Cl)cc1)C2)c1c...,10.28
10954,CCCCCCNC[CH]1CCc2ccc(O)cc2O1.O=C(O)C(=O)O,10.3
10955,NC(=O)[CH]1CS[CH]2CC[C]3(CCCN3C(=O)[CH]3CCCN3)...,10.3
10956,O=c1[nH]c2cccc(N3CCN(Cc4cccc(-c5ccccc5)c4)CC3)...,10.4
10957,CC(C)(C)[C]1(O)CCN2C[CH]3c4ccccc4CCc4cccc(c43)...,10.4


In [10]:
# this function removes invalid SMILES from the dataframe and returns the total number of invalid SMILES
# SMILES will also be marked as invalid of they are too large or repeated

def get_invalid_count():
    
    global df, smi_column
    
    invalid_count = 0
    invalid_list = []
    
    smiles_ser = df[smi_column]
    smiles_list = smiles_ser.tolist()
    
    
    for smiles in smiles_list:
        if len(str(smiles)) > max_len:
            invalid_list.append(int(1))
            invalid_count += 1
        elif smiles in invalid_list:
            invalid_list.append(int(1))
            invalid_count += 1
        else:
            # setting sanitize to false avoids explicit valence error
            # setting it to true gives the error and can be used to count invalid mols
            mol = rkc.MolFromSmiles(smiles, sanitize = True)
            if mol is None:
                invalid_list.append(int(1))
                invalid_count += 1
            else:
                invalid_list.append(int(0))
    
    # convert invalid_list to series
    invalid_ser = pd.Series(invalid_list, name ='Invalid')
    df = df.join(invalid_ser)
    
    # create a new instance with only valid entries
    df = df[df['Invalid'] == 0]
    
    # drop 'Invalid' column
    df.drop(['Invalid'], axis = 1, inplace = True)
    df.reset_index(drop = True, inplace = True)
    
    return invalid_count

### Run the cell below to remove invalid entries and get the invalid count

In [11]:
total_invalid = get_invalid_count()

# an RDKIT error will print for each invalid molecule

RDKit ERROR: [23:46:13] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Can't kekulize mol.  Unkekulized atoms: 9 11 12 13 14 15 17
RDKit ERROR: 
RDKit ERROR: [23:46:13] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit valence for atom # 20 N, 4, is greater than permitted
RDKit ERROR: [23:46:13] Explicit 

In [12]:
print("The total number of invalid SMILES are " + str(total_invalid))

The total number of invalid SMILES are 160


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10808 entries, 0 to 10807
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Smiles         10808 non-null  object 
 1   pChEMBL Value  10808 non-null  float64
dtypes: float64(1), object(1)
memory usage: 169.0+ KB


### Save Dataframe as a .csv file

In [14]:
df.to_csv('Data/DRD2_clean_data.csv')

### Obtaining Murcko Scaffolds

Now we will use RDKit's module to obtain a series of murcko scaffold for all Valid SMILES.

In [15]:
def get_murcko_scaffolds():
    
    scaffold_list = []
    
    global df, smi_column
    smiles_ser = df[smi_column]
    smiles_list = smiles_ser.tolist()
    
    for smiles in smiles_list:
        
        # get mol from SMILES
        mol = rkc.MolFromSmiles(smiles)
        
        # get murcko scaffold
        scaffold = mrks.GetScaffoldForMol(mol)
        
        # convery scaffold mol to SMILES
        sca_smiles = rkc.MolToSmiles(scaffold, isomericSmiles=False)
        
        scaffold_list.append(sca_smiles)
        
    scaffold_ser = pd.Series(scaffold_list, name = 'scaffold')
    
    df = df.join(scaffold_ser)
    
    return None

In [16]:
get_murcko_scaffolds()

In [17]:
df.isnull().sum()

Smiles           0
pChEMBL Value    0
scaffold         0
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10808 entries, 0 to 10807
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Smiles         10808 non-null  object 
 1   pChEMBL Value  10808 non-null  float64
 2   scaffold       10808 non-null  object 
dtypes: float64(1), object(2)
memory usage: 253.4+ KB


In [19]:
df.tail()

Unnamed: 0,Smiles,pChEMBL Value,scaffold
10803,NC(=O)C1CCS[CH]2C[C]3(CCCN3C(=O)[CH]3CCCN3)C(=...,10.51,O=C(C1CCCN1)N1CCCC12CC1SCCCN1C2=O
10804,O=C(NCCC(F)CN1CCN(c2cccc(Cl)c2Cl)CC1)c1cc2cccc...,10.52,O=C(NCCCCN1CCN(c2ccccc2)CC1)c1cc2ccccc2o1
10805,O=C(CCCN1CCC(n2c(O)nc3ccccc32)CC1)c1ccc(F)cc1,10.57,O=C(CCCN1CCC(n2cnc3ccccc32)CC1)c1ccccc1
10806,O=C(O)C(=O)O.Oc1ccc2c(c1)O[CH](CNCc1ccccc1)CC2,10.7,c1ccc(CNCC2CCc3ccccc3O2)cc1
10807,CCN(CC)C(=O)N[CH]1C=C2C3C=CC=C4NC=C(C[CH]2N(C)...,11.0,C1=CC2C3=CCCNC3CC3=CNC(=C1)C32


### Save scaffold file

In [20]:
df.to_csv('Data/drd2_clean_data_w_scaffolds.csv', index = False)