In [6]:
import argparse
import sys

import numpy as np

from utils import *
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

from sklearn.metrics import accuracy_score, roc_auc_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import seaborn as sns

# An argparse namespace is used as a plain mutable settings object: the parser
# declares no flags and the empty string parses to an empty namespace.
parser = argparse.ArgumentParser()
args = parser.parse_args("")

# Global settings. `args` is later handed to make_dataset() and partition(),
# and `args.shuffle` is also passed to the DataLoaders built further down.
# NOTE(review): n_bits / n_splits / test_size / val_size are presumably read
# inside utils (fingerprint length and split fractions) -- confirm there.
vars(args).update(
    seed=123,
    n_bits=2048,
    n_splits=5,
    test_size=0.15,
    val_size=0.15,
    shuffle=True,
)

# Seed both RNGs so that runs are repeatable.
np.random.seed(args.seed)
torch.manual_seed(args.seed)

#torch.set_default_dtype(torch.float)

# Names of the toxicity tasks; each one is prefixed with 'tox21/' when the
# corresponding dataset is loaded.
tox_types = [
    'nr-ahr',
    'nr-ar-lbd',
    'nr-ar',
    'nr-aromatase',
    'nr-er-lbd',
    'nr-er',
    'nr-ppar-gamma',
    'sr-are',
    'sr-atad5',
    'sr-hse',
    'sr-mmp',
    'sr-p53',
]

In [10]:
# Build one {'train', 'val', 'test'} split per toxicity task.
dict_partition = {}
for tox_type in tox_types:
    # make_dataset() is called while args.tox_type carries the 'tox21/' prefix.
    args.tox_type = f'tox21/{tox_type}'
    smiles, toxs = make_dataset(args)

    # Switch back to the bare task name before splitting, so the resulting
    # dictionary is keyed by 'nr-ahr', 'nr-ar-lbd', ... (no prefix).
    args.tox_type = tox_type
    dict_partition[args.tox_type] = partition(smiles, toxs, args)
    print(dict_partition[args.tox_type])

{'train': <utils.ToxDataset object at 0x1a239200b8>, 'val': <utils.ToxDataset object at 0x1a28d66eb8>, 'test': <utils.ToxDataset object at 0x1a28d665f8>}
{'train': <utils.ToxDataset object at 0x1a28d54390>, 'val': <utils.ToxDataset object at 0x10a560748>, 'test': <utils.ToxDataset object at 0x1a28d66630>}
{'train': <utils.ToxDataset object at 0x1a28d66710>, 'val': <utils.ToxDataset object at 0x1a28d666d8>, 'test': <utils.ToxDataset object at 0x1a28d66668>}
{'train': <utils.ToxDataset object at 0x1a28d666a0>, 'val': <utils.ToxDataset object at 0x1a28e88048>, 'test': <utils.ToxDataset object at 0x1a28e88080>}
{'train': <utils.ToxDataset object at 0x1a28e88198>, 'val': <utils.ToxDataset object at 0x1a28e88208>, 'test': <utils.ToxDataset object at 0x1a28e881d0>}
{'train': <utils.ToxDataset object at 0x1a28e88320>, 'val': <utils.ToxDataset object at 0x1a28e88358>, 'test': <utils.ToxDataset object at 0x1a28e88390>}
{'train': <utils.ToxDataset object at 0x1a28e88518>, 'val': <utils.ToxDataset

In [9]:
# Results container; later cells add one entry per experiment name.
exp_result = {}
# Bare expression: displays the per-task splits as the cell output.
dict_partition

{'tox21/nr-ahr': {'train': <utils.ToxDataset at 0x1a23794908>,
  'val': <utils.ToxDataset at 0x10a4aecf8>,
  'test': <utils.ToxDataset at 0x10a554a20>},
 'tox21/nr-ar-lbd': {'train': <utils.ToxDataset at 0x1a23889ef0>,
  'val': <utils.ToxDataset at 0x1a23889f60>,
  'test': <utils.ToxDataset at 0x1a23889eb8>},
 'tox21/nr-ar': {'train': <utils.ToxDataset at 0x1a23889f98>,
  'val': <utils.ToxDataset at 0x1a23889f28>,
  'test': <utils.ToxDataset at 0x1a23a28048>},
 'tox21/nr-aromatase': {'train': <utils.ToxDataset at 0x1a23a28198>,
  'val': <utils.ToxDataset at 0x1a23a281d0>,
  'test': <utils.ToxDataset at 0x1a23a28208>},
 'tox21/nr-er-lbd': {'train': <utils.ToxDataset at 0x1a23a282e8>,
  'val': <utils.ToxDataset at 0x1a23a28358>,
  'test': <utils.ToxDataset at 0x1a23a28320>},
 'tox21/nr-er': {'train': <utils.ToxDataset at 0x1a23a28470>,
  'val': <utils.ToxDataset at 0x1a23a284a8>,
  'test': <utils.ToxDataset at 0x1a23a284e0>},
 'tox21/nr-ppar-gamma': {'train': <utils.ToxDataset at 0x1a23a

In [4]:
def construct_model(args):
    """Assemble a fully connected network ending in a sigmoid.

    Layout: ``Linear(input_dim, hidden_dim) -> ReLU``, followed by
    ``n_layer - 2`` repetitions of ``Linear(hidden_dim, hidden_dim) -> ReLU``
    (each followed by ``Dropout(dropout_rate)`` when ``dropout_rate > 0``),
    and finally ``Linear(hidden_dim, output_dim) -> Sigmoid``.

    Args:
        args: object exposing ``input_dim``, ``hidden_dim``, ``output_dim``,
            ``n_layer`` and ``dropout_rate``.

    Returns:
        nn.Sequential holding the modules in the order described above.
    """
    def hidden_block():
        # One hidden stage; dropout is only added for a positive rate.
        stage = [nn.Linear(args.hidden_dim, args.hidden_dim), nn.ReLU()]
        if args.dropout_rate > 0:
            stage.append(nn.Dropout(args.dropout_rate))
        return stage

    # Modules are created strictly front to back so that parameter
    # initialisation draws from the RNG in the same order every time.
    modules = [nn.Linear(args.input_dim, args.hidden_dim), nn.ReLU()]
    for _ in range(args.n_layer - 2):
        modules.extend(hidden_block())
    modules += [nn.Linear(args.hidden_dim, args.output_dim), nn.Sigmoid()]

    return nn.Sequential(*modules)

In [15]:
def train(model, partition, optimizer, criterion, args):
    """Train ``model`` on ``partition['train']`` and log one loss per epoch.

    Args:
        model: module called on the inputs of each batch.
        partition: mapping whose ``'train'`` entry is a dataset accepted by
            ``DataLoader``; every batch is read as ``batch[0]`` (inputs) and
            ``batch[1]`` (targets).
        optimizer: optimizer already bound to ``model``'s parameters.
        criterion: callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.
        args: object exposing ``batch_size``, ``shuffle``, ``epoch`` and
            ``device``.

    Returns:
        tuple ``(model, losses)`` where ``losses`` is a ``pd.Series`` holding
        the mean per-batch training loss of every epoch.
    """
    dict_train_loss = dict()

    data_iter = DataLoader(partition['train'],
                           batch_size=args.batch_size,
                           shuffle=args.shuffle)
    n_batches = len(data_iter)

    for epoch in range(args.epoch):
        # Training mode only needs to be set once per epoch, not per batch.
        model.train()
        train_loss_epoch = 0.0
        for batch in data_iter:
            # The loader already yields tensors; as_tensor only moves them to
            # the target device instead of re-copying them (torch.tensor on a
            # tensor also triggers a UserWarning).
            fps = torch.as_tensor(batch[0], device=args.device)
            toxs = torch.as_tensor(batch[1], device=args.device)

            optimizer.zero_grad()

            # The former `pred_toxs.require_grad = False` was a misspelling of
            # `requires_grad` and merely set an unused attribute; the
            # predictions must stay in the graph for backward() anyway.
            pred_toxs = model(fps)

            train_loss = criterion(pred_toxs, toxs)
            train_loss_epoch += train_loss.item()
            train_loss.backward()
            optimizer.step()

        # Record the epoch average rather than the loss of whichever batch
        # happened to come last.
        # NOTE(review): the index formula is kept unchanged for compatibility;
        # it is neither an epoch number nor a sample count -- confirm intent.
        dict_train_loss[(epoch+1)*args.batch_size/n_batches] = train_loss_epoch / n_batches

    return model, pd.Series(dict_train_loss)

In [11]:
def validate(model, parition, criterion, args):
    dict_

tensor([[0, 0],
        [1, 1]], dtype=torch.int8)

In [19]:
def experiment(args):
    """Train, validate and test one model configured by ``args``.

    Args:
        args: object exposing
            - the attributes read by ``construct_model`` (``input_dim``,
              ``hidden_dim``, ``output_dim``, ``n_layer``, ``dropout_rate``),
            - ``device``, ``optim`` (optimizer class), ``lr``, ``l2_coef``,
            - ``criterion`` (loss callable), ``epoch``, ``batch_size``,
              ``shuffle``,
            - ``partition``: mapping with ``'train'``, ``'val'`` and ``'test'``
              datasets whose batches are read as ``batch[0]`` (inputs) and
              ``batch[1]`` (targets).

    Returns:
        dict with
            - ``'model'``: the trained model,
            - ``'train_loss'`` / ``'val_loss'``: ``pd.Series`` indexed by epoch
              with the mean per-batch loss,
            - ``'pred'``: 1-D numpy array of 0/1 test predictions (threshold
              0.5),
            - ``'true'``: 1-D numpy array of the matching test targets.
    """
    dict_train_loss = dict()
    dict_val_loss = dict()

    # The loss comes from the configuration; a bare `criterion` name is not
    # defined anywhere in this function.
    criterion = args.criterion

    model = construct_model(args)
    model.to(args.device)

    optimizer = args.optim(model.parameters(),
                           lr=args.lr,
                           weight_decay=args.l2_coef)

    data_train = DataLoader(args.partition['train'],
                            batch_size=args.batch_size,
                            shuffle=args.shuffle)

    # Evaluation does not benefit from shuffling; a fixed order also keeps the
    # returned predictions aligned with the dataset order.
    data_val = DataLoader(args.partition['val'],
                          batch_size=args.batch_size,
                          shuffle=False)

    data_test = DataLoader(args.partition['test'],
                           batch_size=args.batch_size,
                           shuffle=False)

    for epoch in range(args.epoch):
        # ---- training ----
        model.train()
        epoch_train_loss = 0.0
        for batch in data_train:
            # Batches are already collated; just place them on the device.
            fps = torch.as_tensor(batch[0], device=args.device)
            toxs = torch.as_tensor(batch[1], device=args.device)

            optimizer.zero_grad()

            # (The old `pred_toxs.require_grad = False` was a misspelled no-op.)
            pred_toxs = model(fps)

            train_loss = criterion(pred_toxs, toxs)
            epoch_train_loss += train_loss.item()
            train_loss.backward()
            optimizer.step()

        # Mean over batches, so train and validation curves are comparable
        # even though the loaders have different lengths.
        dict_train_loss[epoch] = epoch_train_loss / max(len(data_train), 1)

        # ---- validation ----
        model.eval()
        epoch_val_loss = 0.0
        with torch.no_grad():
            for batch in data_val:
                fps = torch.as_tensor(batch[0], device=args.device)
                toxs = torch.as_tensor(batch[1], device=args.device)

                pred_toxs = model(fps)

                val_loss = criterion(pred_toxs, toxs)
                epoch_val_loss += val_loss.item()

        dict_val_loss[epoch] = epoch_val_loss / max(len(data_val), 1)

    # ---- test ----
    model.eval()
    preds = list()
    labels = list()
    with torch.no_grad():
        for batch in data_test:
            fps = torch.as_tensor(batch[0], device=args.device)
            toxs = torch.as_tensor(batch[1], device=args.device)

            pred_toxs = model(fps)

            # Class 1 when the predicted probability reaches 0.5, else 0.
            preds.append((pred_toxs >= 0.5).long().reshape(-1).cpu().numpy())
            labels.append(toxs.reshape(-1).cpu().numpy())

    # np.concatenate rejects an empty list, so handle an empty test split.
    output = np.concatenate(preds, axis=0) if preds else np.empty(0, dtype=np.int64)
    target = np.concatenate(labels, axis=0) if labels else np.empty(0)

    return {
        'model': model,
        'train_loss': pd.Series(dict_train_loss),
        'val_loss': pd.Series(dict_val_loss),
        'pred': output,
        'true': target,
    }
      

In [16]:
# Experiment: sweep the hidden layer width while everything else stays fixed.
args.exp_name = 'exp_hidden_dim'
args.input_dim = 2048
args.output_dim = 1
args.n_layer = 3
args.dropout_rate = 0.0
args.lr = 0.001
args.l2_coef = 0
args.optim = optim.Adam
args.criterion = nn.BCELoss()
args.epoch = 100
args.batch_size = 128
args.device = 'cpu'

exp_result[args.exp_name] = list()
hidden_dims = [64, 128, 256, 1024]

# Only the first task is swept for now (tox_types[:1]).
for tox_type in tox_types[:1]:
    for hidden_dim in hidden_dims:
        args.tox_type = tox_type
        args.hidden_dim = hidden_dim
        args.partition = dict_partition[args.tox_type]
        # Was misspelled `experiemnt`, which raises NameError.
        result = experiment(args)
        # Keep whatever experiment() returns together with the settings that
        # produced it, so the list created above is actually filled.
        exp_result[args.exp_name].append({
            'tox_type': tox_type,
            'hidden_dim': hidden_dim,
            'result': result,
        })

SyntaxError: invalid syntax (<ipython-input-16-994a6ef30cb0>, line 1)

In [18]:
# Scratch check of the 0.5 decision threshold on three sample probabilities.
pred_toxs = [0.1, 0.1, 0.8]
output = list(map(lambda prob: 0 if prob < 0.5 else 1, pred_toxs))
output

[0, 0, 1]

In [8]:
# Single inline run (first task, hidden_dim=128) of the train / validate /
# test procedure.
args.exp_name = 'exp_hidden_dim'
args.input_dim = 2048
args.output_dim = 1
args.n_layer = 3
args.dropout_rate = 0.0
args.lr = 0.001
args.l2_coef = 0
args.optim = optim.Adam
args.criterion = nn.BCELoss()
args.epoch = 100
args.batch_size = 128
args.device = 'cpu'

args.tox_type = tox_types[0]
args.hidden_dim = 128

dict_train_loss = dict()
dict_val_loss = dict()

# The loops below refer to `criterion`; bind it to the configured loss so the
# name is defined.
criterion = args.criterion

model = construct_model(args)
model.to(args.device)

optimizer = args.optim(model.parameters(), 
                       lr=args.lr, 
                       weight_decay=args.l2_coef)

# dict_partition is keyed by the bare task name (e.g. 'nr-ahr'), without the
# 'tox21/' prefix.
data_train = DataLoader(dict_partition[args.tox_type]['train'],
                        batch_size=args.batch_size,
                        shuffle=args.shuffle)

# Evaluation loaders keep a fixed order: shuffling adds nothing here and a
# stable order keeps predictions aligned with the dataset.
data_val = DataLoader(dict_partition[args.tox_type]['val'],
                      batch_size=args.batch_size,
                      shuffle=False)

data_test = DataLoader(dict_partition[args.tox_type]['test'],
                       batch_size=args.batch_size,
                       shuffle=False)

print(len(data_train))

for epoch in range(args.epoch):
    # ---- training ----
    model.train()
    epoch_train_loss = 0.0
    for i, batch in enumerate(data_train):
        # Batches are already collated; just place them on the device.
        fps = torch.as_tensor(batch[0], device=args.device)
        toxs = torch.as_tensor(batch[1], device=args.device)

        optimizer.zero_grad()

        # (The old `pred_toxs.require_grad = False` was a misspelled no-op.)
        pred_toxs = model(fps)

        train_loss = criterion(pred_toxs, toxs)
        epoch_train_loss += train_loss.item()
        train_loss.backward()
        optimizer.step()

    # Mean over batches so the train and validation values are comparable
    # despite the loaders having different lengths.
    dict_train_loss[epoch] = epoch_train_loss / max(len(data_train), 1)

    # ---- validation ----
    model.eval()
    epoch_val_loss = 0.0
    with torch.no_grad():
        for i, batch in enumerate(data_val):
            fps = torch.as_tensor(batch[0], device=args.device)
            toxs = torch.as_tensor(batch[1], device=args.device)

            pred_toxs = model(fps)

            val_loss = criterion(pred_toxs, toxs)
            epoch_val_loss += val_loss.item()

    dict_val_loss[epoch] = epoch_val_loss / max(len(data_val), 1)

    # One summary line every 10 epochs instead of one line per batch.
    if epoch % 10 == 0 or epoch == args.epoch - 1:
        print("Epoch: ", epoch,
              "\t train loss: ", dict_train_loss[epoch],
              "\t val loss: ", dict_val_loss[epoch])

# ---- test ----
model.eval()
output = list()
test_toxs = list()
with torch.no_grad():
    for i, batch in enumerate(data_test):
        fps = torch.as_tensor(batch[0], device=args.device)
        toxs = torch.as_tensor(batch[1], device=args.device)

        pred_toxs = model(fps)

        # Class 1 when the predicted probability reaches 0.5, else 0.
        output.append((pred_toxs >= 0.5).long().reshape(-1).cpu().numpy())
        test_toxs.append(toxs.reshape(-1).cpu().numpy())

    # np.concatenate rejects an empty list, so handle an empty test split.
    output = np.concatenate(output, axis=0) if output else np.empty(0, dtype=np.int64)
    test_toxs = np.concatenate(test_toxs, axis=0) if test_toxs else np.empty(0)

KeyError: 'nr-ahr'