### In this notebook we will clean a file containing data for the DRD2 receptor. 

Compounds are represented as SMILES (Simplified Molecular Input Line Entry System). The data also contains LogP and pIC50 values for most molecules.

Motivation:

1. Removal of Stereochemistry
2. Removal of Invalid or duplicated SMILES
3. Removal of NUll entries
4. Removal of large unusual compounds (large compounds can behave like outliers and make the training process longer)

This data will be used to train a scikit-learn model to predict pChEMBL values for newly generated compounds.

In [99]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [100]:
# provide path
path = 'Data/DRD2_raw_data.csv'
df = pd.read_csv(path)

In [101]:
# get data info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10983 entries, 0 to 10982
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Smiles         10968 non-null  object 
 1   pChEMBL Value  10983 non-null  float64
dtypes: float64(1), object(1)
memory usage: 171.7+ KB


In [102]:
df.head()

Unnamed: 0,Smiles,pChEMBL Value
0,CCCSc1nnc(-c2ccccc2)n1C,3.74
1,Cc1ccc(CNCC2(F)CCN(C(=O)c3cc(Br)cs3)CC2)nc1,4.0
2,COc1ccccc1N1CCC2(CCNCC2)CC1,4.0
3,Cl.NCCc1ccc(O)c(O)c1,4.0
4,Cn1c(SCCCN2CCCCC2)nnc1-c1ccccc1,4.01


In [103]:
#rename columns
df.columns = ['SMILES', 'pChEMBL']

In [104]:
#check for null values
df.isnull().sum()

SMILES     15
pChEMBL     0
dtype: int64

In [105]:
# drop all rows with null values
df = df.dropna(axis = 'rows')

In [106]:
df.isnull().sum()

SMILES     0
pChEMBL    0
dtype: int64

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10968 entries, 0 to 10982
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SMILES   10968 non-null  object 
 1   pChEMBL  10968 non-null  float64
dtypes: float64(1), object(1)
memory usage: 257.1+ KB


In [108]:
# reset indices after performing 'drop'
df.reset_index(drop=True, inplace=True)

In [109]:
smiles = df['SMILES']
activity = df['pChEMBL']

## Stereochemistry Removal

In [110]:
# Import RDkit packages

from rdkit import Chem
import rdkit.Chem as rkc
import rdkit.Chem.AllChem as rkac
import rdkit.Chem.Scaffolds.MurckoScaffold as mrks

In [111]:
# removing '@' from a SMILE will rmove stereochemistry without changing positioning of the atoms

def remove_stereochemistry(smi):
    can_smi = str(smi).replace('@','')
    return can_smi

In [112]:
smiles_list = smiles.tolist()

In [113]:
canonical_smiles = list(map(remove_stereochemistry, smiles_list))

In [114]:
# convert to a series
canonical_smiles = pd.Series(canonical_smiles, name = 'canonical')
print(canonical_smiles.size)

10968


In [115]:
# create another dataframe with canonical SMILES
clean_df = pd.concat([canonical_smiles,
                     activity], axis=1)
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10968 entries, 0 to 10967
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   canonical  10968 non-null  object 
 1   pChEMBL    10968 non-null  float64
dtypes: float64(1), object(1)
memory usage: 171.5+ KB


In [116]:
# check df tail to make sure df size and index match up
clean_df.tail()

Unnamed: 0,canonical,pChEMBL
10963,NC(=O)C1CCS[CH]2C[C]3(CCCN3C(=O)[CH]3CCCN3)C(=...,10.51
10964,O=C(NCCC(F)CN1CCN(c2cccc(Cl)c2Cl)CC1)c1cc2cccc...,10.52
10965,O=C(CCCN1CCC(n2c(O)nc3ccccc32)CC1)c1ccc(F)cc1,10.57
10966,O=C(O)C(=O)O.Oc1ccc2c(c1)O[CH](CNCc1ccccc1)CC2,10.7
10967,CCN(CC)C(=O)N[CH]1C=C2C3C=CC=C4NC=C(C[CH]2N(C)...,11.0


## Removal of Large and Invalid SMILES

In [117]:
# for this case we also declare compounds that are too large or too small as invalid

def is_invalid(smi):

    global i
    if smi:
        # setting sanitize to false avoids explicit valence error
        # setting it to true gives the error and can be used to count invalid mols
        mol = rkc.MolFromSmiles(smi, sanitize = True)
        if mol is None:
            return int(1)
        elif (len(smi)  > 120) or (len(smi) < 20):
            return int(1)
        else:
            pass
        return int(0)

In [118]:
# create a series of valid/invalid values. 0 = valid, 1 = invalid
invalid_list = list(map(is_invalid, clean_df['canonical']))

RDKit ERROR: [15:23:02] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 17 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 21 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 12 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Can't kekulize mol.  Unkekulized atoms: 9 11 12 13 14 15 17
RDKit ERROR: 
RDKit ERROR: [15:23:02] Explicit valence for atom # 14 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 15 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 13 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 5 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit valence for atom # 20 N, 4, is greater than permitted
RDKit ERROR: [15:23:02] Explicit 

In [119]:
print('The number of invalid SMILES in the dataframe is: ' + str(sum(invalid_list)))

The number of invalid SMILES in the dataframe is: 253


In [120]:
invalid_series = pd.Series(invalid_list, name = 'Invalid')

clean_df = pd.concat([clean_df, invalid_series], axis = 1)

In [121]:
clean_df.tail()

Unnamed: 0,canonical,pChEMBL,Invalid
10963,NC(=O)C1CCS[CH]2C[C]3(CCCN3C(=O)[CH]3CCCN3)C(=...,10.51,0
10964,O=C(NCCC(F)CN1CCN(c2cccc(Cl)c2Cl)CC1)c1cc2cccc...,10.52,0
10965,O=C(CCCN1CCC(n2c(O)nc3ccccc32)CC1)c1ccc(F)cc1,10.57,0
10966,O=C(O)C(=O)O.Oc1ccc2c(c1)O[CH](CNCc1ccccc1)CC2,10.7,0
10967,CCN(CC)C(=O)N[CH]1C=C2C3C=CC=C4NC=C(C[CH]2N(C)...,11.0,0


In [122]:
# create anew df where invalid == 0
clean_valid_df = clean_df[clean_df['Invalid'] == 0]
clean_valid_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10715 entries, 0 to 10967
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   canonical  10715 non-null  object 
 1   pChEMBL    10715 non-null  float64
 2   Invalid    10715 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 334.8+ KB


In [123]:
# reset index since 'clean_valid_df' is a subset of 'clean_df'
clean_valid_df.reset_index(drop=True, inplace=True)

#drop 'invalid' column
clean_valid_df = clean_valid_df.drop(['Invalid'], axis = 1)

clean_valid_df.tail()

Unnamed: 0,canonical,pChEMBL
10710,NC(=O)C1CCS[CH]2C[C]3(CCCN3C(=O)[CH]3CCCN3)C(=...,10.51
10711,O=C(NCCC(F)CN1CCN(c2cccc(Cl)c2Cl)CC1)c1cc2cccc...,10.52
10712,O=C(CCCN1CCC(n2c(O)nc3ccccc32)CC1)c1ccc(F)cc1,10.57
10713,O=C(O)C(=O)O.Oc1ccc2c(c1)O[CH](CNCc1ccccc1)CC2,10.7
10714,CCN(CC)C(=O)N[CH]1C=C2C3C=CC=C4NC=C(C[CH]2N(C)...,11.0


In [124]:
# remove duplicates
clean_valid_df.drop_duplicates(subset = ['canonical'], inplace = True)

#reset index, check info
clean_valid_df.reset_index(drop=True, inplace=True)
clean_valid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6770 entries, 0 to 6769
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   canonical  6770 non-null   object 
 1   pChEMBL    6770 non-null   float64
dtypes: float64(1), object(1)
memory usage: 105.9+ KB


In [125]:
# check df tail
clean_valid_df.tail()

Unnamed: 0,canonical,pChEMBL
6765,CS(=O)(=O)c1ccc2c(c1)N(CCCN1CCC(C(N)=O)CC1)c1c...,10.15
6766,O=C1NCc2ccc(OCCCCN3CCN(c4cccc5c4CCC5)CC3)cc21,10.24
6767,O=C(CCCN1[CH]2CC[CH]1CC(O)(c1ccc(Cl)cc1)C2)c1c...,10.26
6768,O=C(CCCN1CCC(n2c(O)nc3ccccc32)CC1)c1ccc(F)cc1,10.57
6769,CCN(CC)C(=O)N[CH]1C=C2C3C=CC=C4NC=C(C[CH]2N(C)...,11.0


In [126]:
# save file
clean_valid_df.to_csv('Data/DRD2_clean_data.csv', index = False)