# Setup

First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20.

In [5]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import pandas as pd
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [6]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem

from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit import DataStructs
from rdkit.DataStructs import ConvertToNumpyArray

from rdkit.Chem import PandasTools

In [7]:
# Import PyTorch and its modules
import torch
import torchvision
from torch import nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F

# Load the data

In [8]:
#
#datapath = os.path.join(".", "tox21.csv")
tox21 = pd.read_csv("data/raw/tox21.csv")
tox21.head()

Unnamed: 0,cano_smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,group
0,s1c2cc(OCC)ccc2nc1S(=O)(=O)N,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,train
1,O[C@]1(CC[C@H]2[C@H]3[C@@H]([C@@H]4C(CC3)=CCCC...,,,,,,,,0.0,,0.0,,,train
2,O=C(Nc1c(cccc1C)C)C(N(CCC)CC)CC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,train
3,P(O)(O)(=O)C(P(O)(O)=O)(O)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
4,O(OC(C)(C)C)C(CCC(OOC(C)(C)C)(C)C)(C)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,train


## Data preprocessing

In [9]:
tox21.describe()

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
count,7245.0,6740.0,6529.0,5804.0,6175.0,6936.0,6432.0,5816.0,7054.0,6447.0,5795.0,6756.0
mean,0.04265,0.035015,0.117323,0.051516,0.128421,0.050317,0.028762,0.161279,0.037284,0.057236,0.15824,0.062463
std,0.202081,0.183831,0.321829,0.221067,0.334585,0.218614,0.167151,0.36782,0.18947,0.232311,0.364997,0.242012
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
#Generate data exploration
tox21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7811 entries, 0 to 7810
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cano_smiles    7811 non-null   object 
 1   NR-AR          7245 non-null   float64
 2   NR-AR-LBD      6740 non-null   float64
 3   NR-AhR         6529 non-null   float64
 4   NR-Aromatase   5804 non-null   float64
 5   NR-ER          6175 non-null   float64
 6   NR-ER-LBD      6936 non-null   float64
 7   NR-PPAR-gamma  6432 non-null   float64
 8   SR-ARE         5816 non-null   float64
 9   SR-ATAD5       7054 non-null   float64
 10  SR-HSE         6447 non-null   float64
 11  SR-MMP         5795 non-null   float64
 12  SR-p53         6756 non-null   float64
 13  group          7811 non-null   object 
dtypes: float64(12), object(2)
memory usage: 854.5+ KB


In [11]:
sample_incomplete_rows = tox21[tox21.isnull().any(axis=1)]
sample_incomplete_rows.head()

Unnamed: 0,cano_smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,group
0,s1c2cc(OCC)ccc2nc1S(=O)(=O)N,0.0,0.0,1.0,,,0.0,0.0,1.0,0.0,0.0,0.0,0.0,train
1,O[C@]1(CC[C@H]2[C@H]3[C@@H]([C@@H]4C(CC3)=CCCC...,,,,,,,,0.0,,0.0,,,train
2,O=C(Nc1c(cccc1C)C)C(N(CCC)CC)CC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,,0.0,0.0,train
4,O(OC(C)(C)C)C(CCC(OOC(C)(C)C)(C)C)(C)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,train
6,Ic1cc(cc(I)c1Oc1cc(I)c(O)cc1)CC(O)=O,0.0,,0.0,,1.0,,,1.0,0.0,1.0,0.0,1.0,train


In [12]:
len(sample_incomplete_rows)

4742

In [13]:
tox21.shape

(7811, 14)

In [14]:
tox21_dataset = tox21.dropna()

In [15]:
tox21_dataset.shape

(3069, 14)

In [16]:
tox21_dataset

Unnamed: 0,cano_smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,group
3,P(O)(O)(=O)C(P(O)(O)=O)(O)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
5,ClS(=O)(=O)c1ccccc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
11,O(C(=O)C(C)C)CC(C)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
13,O=C([O-])Cc1c2c(ccc1)cccc2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,train
20,OCC#C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7802,SC(CC)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,valid
7803,c12c(cccc1C)cccc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,valid
7806,O=C1N(CC(O)=O)C(=O)c2c3c1cccc3ccc2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,valid
7807,Clc1cc(OC(F)(F)C(F)C(F)(F)F)c(Cl)cc1NC(=O)NC(=...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,valid


In [17]:
#Returns number of unique values in the specifed column to make sure that no duplications there
tox21_dataset['cano_smiles'].nunique()

3068

In [18]:
#Returns number of total rows
tox21_dataset['cano_smiles'].count()

3069

In [19]:
#Drops the duplicated ones
tox21_dataset = tox21_dataset.drop_duplicates(['cano_smiles'])

In [20]:
tox21_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3068 entries, 3 to 7810
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cano_smiles    3068 non-null   object 
 1   NR-AR          3068 non-null   float64
 2   NR-AR-LBD      3068 non-null   float64
 3   NR-AhR         3068 non-null   float64
 4   NR-Aromatase   3068 non-null   float64
 5   NR-ER          3068 non-null   float64
 6   NR-ER-LBD      3068 non-null   float64
 7   NR-PPAR-gamma  3068 non-null   float64
 8   SR-ARE         3068 non-null   float64
 9   SR-ATAD5       3068 non-null   float64
 10  SR-HSE         3068 non-null   float64
 11  SR-MMP         3068 non-null   float64
 12  SR-p53         3068 non-null   float64
 13  group          3068 non-null   object 
dtypes: float64(12), object(2)
memory usage: 359.5+ KB


## Molecule genaration

In [21]:
# Create Mol rdkit objects for each smile
PandasTools.AddMoleculeColumnToFrame(frame=tox21_dataset, smilesCol='cano_smiles', molCol='Molecule')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(Chem.MolFromSmiles)


There are a couple of SMILES that could be parsed by RDKit. For wrong SMILES, rdkit would return 'None'.

In [22]:
sum(tox21_dataset['Molecule'].map(lambda x: x is None))

0

In [23]:
tox21_dataset.head()

Unnamed: 0,cano_smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,group,Molecule
3,P(O)(O)(=O)C(P(O)(O)=O)(O)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,
5,ClS(=O)(=O)c1ccccc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,
11,O(C(=O)C(C)C)CC(C)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,
13,O=C([O-])Cc1c2c(ccc1)cccc2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,train,
20,OCC#C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,


In [24]:
#Generate data exploration
tox21_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3068 entries, 3 to 7810
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cano_smiles    3068 non-null   object 
 1   NR-AR          3068 non-null   float64
 2   NR-AR-LBD      3068 non-null   float64
 3   NR-AhR         3068 non-null   float64
 4   NR-Aromatase   3068 non-null   float64
 5   NR-ER          3068 non-null   float64
 6   NR-ER-LBD      3068 non-null   float64
 7   NR-PPAR-gamma  3068 non-null   float64
 8   SR-ARE         3068 non-null   float64
 9   SR-ATAD5       3068 non-null   float64
 10  SR-HSE         3068 non-null   float64
 11  SR-MMP         3068 non-null   float64
 12  SR-p53         3068 non-null   float64
 13  group          3068 non-null   object 
 14  Molecule       3068 non-null   object 
dtypes: float64(12), object(3)
memory usage: 383.5+ KB


### Morgan Fingerprint generation
A fingerprint is generated for each compound in the "Molecule" column with a radius of 2 and a bit length of 2048.

In [25]:
#create a column for Morgan fingerprints
n_Bits = 2048
tox21_dataset["MorgFP"] = [AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits = n_Bits, useFeatures=True) for m in tox21_dataset['Molecule']] ## Molecule genaration

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [26]:
tox21_dataset["MorgFP"].head()

3     [1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5     [1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
11    [1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
13    [1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
20    [1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: MorgFP, dtype: object

In [27]:
type(tox21_dataset["MorgFP"])

pandas.core.series.Series

In [28]:
tox21_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3068 entries, 3 to 7810
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cano_smiles    3068 non-null   object 
 1   NR-AR          3068 non-null   float64
 2   NR-AR-LBD      3068 non-null   float64
 3   NR-AhR         3068 non-null   float64
 4   NR-Aromatase   3068 non-null   float64
 5   NR-ER          3068 non-null   float64
 6   NR-ER-LBD      3068 non-null   float64
 7   NR-PPAR-gamma  3068 non-null   float64
 8   SR-ARE         3068 non-null   float64
 9   SR-ATAD5       3068 non-null   float64
 10  SR-HSE         3068 non-null   float64
 11  SR-MMP         3068 non-null   float64
 12  SR-p53         3068 non-null   float64
 13  group          3068 non-null   object 
 14  Molecule       3068 non-null   object 
 15  MorgFP         3068 non-null   object 
dtypes: float64(12), object(4)
memory usage: 407.5+ KB


## Obtain descriptors

**- DescriptorCalculator(): Various bits and pieces for calculating descriptors.**
**- CalcDescriptors(): 

In [29]:
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
descriptors = list(np.array(Descriptors._descList)[:,0])
print(np.array(Descriptors._descList)[:,0])

['MaxEStateIndex' 'MinEStateIndex' 'MaxAbsEStateIndex' 'MinAbsEStateIndex'
 'qed' 'MolWt' 'HeavyAtomMolWt' 'ExactMolWt' 'NumValenceElectrons'
 'NumRadicalElectrons' 'MaxPartialCharge' 'MinPartialCharge'
 'MaxAbsPartialCharge' 'MinAbsPartialCharge' 'FpDensityMorgan1'
 'FpDensityMorgan2' 'FpDensityMorgan3' 'BalabanJ' 'BertzCT' 'Chi0' 'Chi0n'
 'Chi0v' 'Chi1' 'Chi1n' 'Chi1v' 'Chi2n' 'Chi2v' 'Chi3n' 'Chi3v' 'Chi4n'
 'Chi4v' 'HallKierAlpha' 'Ipc' 'Kappa1' 'Kappa2' 'Kappa3' 'LabuteASA'
 'PEOE_VSA1' 'PEOE_VSA10' 'PEOE_VSA11' 'PEOE_VSA12' 'PEOE_VSA13'
 'PEOE_VSA14' 'PEOE_VSA2' 'PEOE_VSA3' 'PEOE_VSA4' 'PEOE_VSA5' 'PEOE_VSA6'
 'PEOE_VSA7' 'PEOE_VSA8' 'PEOE_VSA9' 'SMR_VSA1' 'SMR_VSA10' 'SMR_VSA2'
 'SMR_VSA3' 'SMR_VSA4' 'SMR_VSA5' 'SMR_VSA6' 'SMR_VSA7' 'SMR_VSA8'
 'SMR_VSA9' 'SlogP_VSA1' 'SlogP_VSA10' 'SlogP_VSA11' 'SlogP_VSA12'
 'SlogP_VSA2' 'SlogP_VSA3' 'SlogP_VSA4' 'SlogP_VSA5' 'SlogP_VSA6'
 'SlogP_VSA7' 'SlogP_VSA8' 'SlogP_VSA9' 'TPSA' 'EState_VSA1'
 'EState_VSA10' 'EState_VSA11' 'EState_VSA2' 

In [30]:
calculator = MoleculeDescriptors.MolecularDescriptorCalculator(descriptors)
# Define a custom function to calculate molecular descriptors
def computeDescriptors(mol, calculator):
    res = list(calculator.CalcDescriptors(mol))
    if not np.all(np.isfinite(res)):
        return None  
# Make it's easier to identify problematic molecules (e.g.: infinty descriptor values) later 
    return res

In [31]:
# Compute the descriptors
tox21_dataset['Descriptors'] = tox21_dataset['Molecule'].map(lambda x: computeDescriptors(x,calculator))
# Remove rows with non-finite descriptor values (seems to be only 1 by comparing the counts)
tox21_dataset = tox21_dataset[tox21_dataset['Descriptors'].map(lambda x: x is not None)]
tox21_dataset.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,cano_smiles,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53,group,Molecule,MorgFP,Descriptors
3,P(O)(O)(=O)C(P(O)(O)=O)(O)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,,"[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10.251875, -5.1977237654321, 10.251875, 0.383..."
5,ClS(=O)(=O)c1ccccc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,,"[1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[10.598302469135803, -3.5306018518518507, 10.5..."
11,O(C(=O)C(C)C)CC(C)C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10.825046296296296, -0.10305555555555523, 10...."
13,O=C([O-])Cc1c2c(ccc1)cccc2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,train,,"[1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10.50128448601663, -1.0404138321995455, 10.50..."
20,OCC#C,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,,"[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[7.638888888888889, -0.15277777777777768, 7.63..."


In [32]:
df_descriptors = pd.DataFrame(descriptors, columns=['desc_name'])
df_descriptors

Unnamed: 0,desc_name
0,MaxEStateIndex
1,MinEStateIndex
2,MaxAbsEStateIndex
3,MinAbsEStateIndex
4,qed
...,...
195,fr_thiazole
196,fr_thiocyan
197,fr_thiophene
198,fr_unbrch_alkane


In [33]:
desc_name = [df_descriptors['desc_name'][i] for i in range(len(df_descriptors))]
desc_val = [list(l) for l in tox21_dataset['Descriptors']]
df_desc = pd.DataFrame(desc_val, index = tox21_dataset.index, columns=desc_name)
df_desc.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
3,10.251875,-5.197724,10.251875,0.383488,0.365583,206.027,197.963,205.974526,68,0,...,0,0,0,0,0,0,0,0,0,0
5,10.598302,-3.530602,10.598302,0.135802,0.609724,176.624,171.584,175.969878,54,0,...,0,0,0,0,0,0,0,0,0,0
11,10.825046,-0.103056,10.825046,0.001157,0.564836,144.214,128.086,144.11503,60,0,...,0,0,0,0,0,0,0,0,0,0
13,10.501284,-1.040414,10.501284,0.026435,0.703273,185.202,176.13,185.060803,70,0,...,0,0,0,0,0,0,0,0,0,0
20,7.638889,-0.152778,7.638889,0.152778,0.37469,56.064,52.032,56.026215,22,0,...,0,0,0,1,0,0,0,0,0,0


In [34]:
df_desc['group'] = tox21_dataset[['group']].copy()

In [35]:
df_desc.head()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,group
3,10.251875,-5.197724,10.251875,0.383488,0.365583,206.027,197.963,205.974526,68,0,...,0,0,0,0,0,0,0,0,0,train
5,10.598302,-3.530602,10.598302,0.135802,0.609724,176.624,171.584,175.969878,54,0,...,0,0,0,0,0,0,0,0,0,train
11,10.825046,-0.103056,10.825046,0.001157,0.564836,144.214,128.086,144.11503,60,0,...,0,0,0,0,0,0,0,0,0,train
13,10.501284,-1.040414,10.501284,0.026435,0.703273,185.202,176.13,185.060803,70,0,...,0,0,0,0,0,0,0,0,0,train
20,7.638889,-0.152778,7.638889,0.152778,0.37469,56.064,52.032,56.026215,22,0,...,0,0,1,0,0,0,0,0,0,train


In [36]:
#Finds Null data in any row if any
sample_incomplete_rows = df_desc[df_desc.isnull().any(axis=1)]
sample_incomplete_rows.head(len(sample_incomplete_rows))

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,group


In [37]:
df_desc.describe()

Unnamed: 0,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,NumRadicalElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
count,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,...,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0,3050.0
mean,9.387625,-0.691471,9.387625,0.38434,0.545164,217.881249,202.765545,217.585525,81.083934,0.003934,...,0.030164,0.037377,0.007213,0.004918,0.001639,0.00918,0.000984,0.005574,0.694098,0.030164
std,2.933705,1.526462,2.933705,0.477248,0.165186,124.981533,118.77075,124.841556,44.881837,0.111569,...,0.18927,0.212544,0.092062,0.069967,0.040462,0.098767,0.031352,0.078743,2.214298,0.190995
min,1.5,-9.243265,1.5,0.0,0.012914,32.042,28.01,32.026215,14.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8.285236,-1.068652,8.285236,0.086817,0.43799,139.1055,128.086,139.024252,52.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,10.326422,-0.350665,10.326422,0.229167,0.540883,186.339,174.091,186.16198,70.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,11.353222,0.253149,11.353222,0.532809,0.659961,266.341,248.225,266.164283,100.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,15.645307,4.0,15.645307,8.02672,0.944994,1950.681,1904.313,1949.500417,592.0,4.0,...,3.0,2.0,2.0,1.0,1.0,2.0,1.0,2.0,24.0,4.0


### Splitting and Visualizing the data

# Splite the data


In [38]:
#Returns the training set
X_train = df_desc[df_desc["group"]=="train"]
y_train = tox21_dataset[tox21_dataset["group"]=="train"]

In [39]:
#Returns the test set
X_test = df_desc[df_desc["group"]=="test"]
y_test = tox21_dataset[tox21_dataset["group"]=="test"]

In [40]:
#Returns the valid set
X_valid = df_desc[df_desc["group"]=="valid"]
y_valid = tox21_dataset[tox21_dataset["group"]=="valid"]

In [41]:
# Specifing the columns for dependent and independent variables (Training dataset)
X_train = X_train.drop(['group'], axis=1)

In [42]:
y_train = y_train.drop(['group', 'cano_smiles', 'Molecule', 'Descriptors', 'MorgFP'], axis=1)

In [43]:
# Specifing the columns for dependent and independent variables (Training dataset)
X_valid = X_valid.drop(['group'], axis=1)

In [44]:
y_valid = y_valid.drop(['group', 'cano_smiles', 'Molecule', 'Descriptors', 'MorgFP'], axis=1)

In [45]:
# Specifing the columns for dependent and independent variables (Training dataset)
X_test = X_test.drop(['group'], axis=1)

In [46]:
y_test = y_test.drop(['group', 'cano_smiles', 'Molecule', 'Descriptors', 'MorgFP'], axis=1)

**Convert the RDKit explicit vectors into numpy arrays**

In [47]:
X_train = np.array(X_train)
X_test = np.array(X_test)
X_valid = np.array(X_valid)
y_train = np.array(y_train)
y_test = np.array(y_test)
y_valid = np.array(y_valid)

In [48]:
print(type(X_train))
print(type(y_train))
print(type(X_test))
print(type(y_test))
print(type(X_valid))
print(type(y_valid))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [49]:
print(X_train.shape, y_train.shape)

(2417, 200) (2417, 12)


In [50]:
print(X_valid.shape, y_valid.shape)

(318, 200) (318, 12)


In [51]:
print(X_test.shape, y_test.shape)

(315, 200) (315, 12)


## Feature Scaling

In [52]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_valid = sc.transform(X_valid)
X_test = sc.transform(X_test)

In [53]:
from torch.utils.data import TensorDataset

X_train = torch.as_tensor(X_train).float()
y_train = torch.as_tensor(y_train).float()
X_valid = torch.as_tensor(X_valid).float()
y_valid = torch.as_tensor(y_valid).float()
X_test = torch.as_tensor(X_test).float()
y_test = torch.as_tensor(y_test).float()


training_data =  TensorDataset(X_train, y_train)


In [54]:
train_loader = torch.utils.data.DataLoader(training_data,batch_size=32)

In [55]:
# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using {} device".format(device))


Using cuda device


## To make an NN model 

In [81]:
class nn_classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.models = nn.ModuleList(
       [ nn.Linear(X_train.shape[-1],32),
        nn.BatchNorm1d(32),
        nn.ReLU(),
        nn.Linear(32, 12*2)]
        )
        
    def forward(self, data):
        for m in self.models:
            data = m(data)
        # Here each molecule has 12 tasks, each task has 2 classes (0 or 1)
        # see https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html for why I did this
        out = data.view(-1, 2, 12)
        return out

model = nn_classifier()
model = model.to(device)

### to define an optimizer, training and evaluation procedure.

In [82]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
ce_loss = nn.CrossEntropyLoss()

def train():
    for X, y in train_loader:
        X = X.to(device)
        y = y.to(device).long()
        pred = model(X)
        loss = ce_loss(pred, y).mean()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def CE_Loss(X, y):
    X = X.to(device)
    y = y.to(device).long()
    with torch.no_grad():
        pred = model(X)
        loss = ce_loss(pred, y).mean()
        return loss, pred

In [83]:
for epoch in range(100):
    MAE_train, pred_train =  CE_Loss(X_train, y_train)
    MAE_valid, pred_valid =  CE_Loss(X_valid, y_valid)
    MAE_test, pred_test =  CE_Loss(X_test, y_test)

    print(f"epoch {epoch:3d}: Training CE  {MAE_train.item():>0.2f}; Validation CE  {MAE_valid.item():>0.2f}; Test CE  {MAE_test.item():>0.2f} ")
    train()


epoch   0: Training CE  0.69; Validation CE  0.69; Test CE  0.69 
epoch   1: Training CE  0.20; Validation CE  0.21; Test CE  0.21 
epoch   2: Training CE  0.12; Validation CE  0.13; Test CE  0.14 
epoch   3: Training CE  0.10; Validation CE  0.11; Test CE  0.13 
epoch   4: Training CE  0.10; Validation CE  0.11; Test CE  0.12 
epoch   5: Training CE  0.09; Validation CE  0.11; Test CE  0.12 
epoch   6: Training CE  0.09; Validation CE  0.11; Test CE  0.12 
epoch   7: Training CE  0.09; Validation CE  0.10; Test CE  0.12 
epoch   8: Training CE  0.09; Validation CE  0.10; Test CE  0.12 
epoch   9: Training CE  0.08; Validation CE  0.10; Test CE  0.12 
epoch  10: Training CE  0.08; Validation CE  0.10; Test CE  0.12 
epoch  11: Training CE  0.08; Validation CE  0.11; Test CE  0.12 
epoch  12: Training CE  0.08; Validation CE  0.11; Test CE  0.12 
epoch  13: Training CE  0.08; Validation CE  0.11; Test CE  0.13 
epoch  14: Training CE  0.08; Validation CE  0.11; Test CE  0.13 
epoch  15:

In [89]:
target_names = tox21_dataset.columns.to_list()[1:-4]
target_names

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [93]:
def accuracy(X, y):
    X = X.to(device)
    y = y.to(device).long()
    pred = model(X)
    pred_label = torch.argmax(pred, dim=1)
    for i, name in enumerate(target_names):
        compare = (pred_label[:, i] == y[:, i])
        print("{} accuracy: {:.2f}%".format(name, 100.*compare.sum()/compare.shape[0]))
    
accuracy(X_test, y_test)

NR-AR accuracy: 98.10%
NR-AR-LBD accuracy: 98.73%
NR-AhR accuracy: 94.29%
NR-Aromatase accuracy: 97.14%
NR-ER accuracy: 86.35%
NR-ER-LBD accuracy: 97.14%
NR-PPAR-gamma accuracy: 98.73%
SR-ARE accuracy: 92.70%
SR-ATAD5 accuracy: 100.00%
SR-HSE accuracy: 97.46%
SR-MMP accuracy: 94.60%
SR-p53 accuracy: 99.37%


In [102]:
from sklearn.metrics import roc_auc_score

def roc_auc(X, y):
    X = X.to(device)
    y = y.to(device).long()
    pred = model(X)
    for i, name in enumerate(target_names):
        try:
            score = roc_auc_score(y[:, i].cpu().detach().numpy(), pred[:, 0, i].cpu().detach().numpy())
            print("{} roc_auc: {:.2f}".format(name, score))
        except ValueError:
            print("{} roc_auc not defined...".format(name))
    
roc_auc(X_test, y_test)

NR-AR roc_auc: 0.36
NR-AR-LBD roc_auc: 0.28
NR-AhR roc_auc: 0.29
NR-Aromatase roc_auc: 0.63
NR-ER roc_auc: 0.42
NR-ER-LBD roc_auc: 0.36
NR-PPAR-gamma roc_auc: 0.16
SR-ARE roc_auc: 0.35
SR-ATAD5 roc_auc not defined...
SR-HSE roc_auc: 0.43
SR-MMP roc_auc: 0.26
SR-p53 roc_auc: 0.10
