In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from tox21_project import data_preprocessing as dp
from tox21_project import deeplearning as dl

# import warnings
# warnings.filterwarnings('ignore')

# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

## 1. 数据

In [2]:
# Load the four input tables; in each file the first CSV column becomes the row index.
source_data, descriptors, fingerprints, similarity = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        'data/data_dups_removed.csv',
        'data/molecular_descriptors.csv',
        'data/morgan_fingerprints_10.csv',
        'data/similarity_scaffold_known_toxic.csv',
    )
)

### 1.1 数据分割

分割原始数据，并获取mask和分割的index

In [3]:
# Columns kept from the source table, and the 12 target columns to predict.
source_features = ['FW', 'SMILES']
targets = [
    'SR-HSE', 'NR-AR', 'SR-ARE', 'NR-Aromatase', 'NR-ER-LBD', 'NR-AhR',
    'SR-MMP', 'NR-ER', 'NR-PPAR-gamma', 'SR-p53', 'SR-ATAD5', 'NR-AR-LBD',
]

raw_X = source_data[source_features]
raw_y = source_data[targets]

# Mask is 1 where a label is present and 0 where it is missing.
null_mask = raw_y.notnull().values.astype(int)
mask_df = pd.DataFrame(
    null_mask,
    index=raw_y.index,
    columns=[f'{target}_mask' for target in targets],
)

# Missing labels become 0.0; the mask columns are appended to the labels so the
# split below keeps both row-aligned.
raw_y = pd.concat([raw_y.fillna(0.0), mask_df], axis=1)

test_size = 0.2
train_X_source, test_X_source, train_y, test_y = train_test_split(
    raw_X, raw_y, test_size=test_size, random_state=42
)

# Row labels of each split, reused later to slice the other feature tables.
train_index, test_index = train_X_source.index, test_X_source.index

确保所需文件的index都相同，方便下面数据分割用index来索引

In [4]:
# One flag per feature table: True when its index matches source_data's index element-wise.
[(table.index == source_data.index).all() for table in (descriptors, fingerprints, similarity)]

[True, True, True]

检查缺失值：若存在缺失值，下面的标准化操作将无法进行

In [5]:
# One flag per feature table: True when any column contains at least one missing value.
[table.isna().sum().any() for table in (descriptors, fingerprints, similarity)]

[False, False, False]

根据index来分割descriptors, fingerprints和similarity

In [6]:
# descriptors: drop the first two columns, then pick rows by split index
# NOTE(review): the two skipped columns are presumably non-feature columns — confirm against the CSV
des_col = list(descriptors.columns)[2:]
train_X_des = descriptors[des_col].loc[train_index]
test_X_des = descriptors[des_col].loc[test_index]

# fingerprints: same two-column skip, converted to NumPy arrays
fp_col = list(fingerprints.columns)[2:]
train_X_fp = fingerprints[fp_col].loc[train_index].values
test_X_fp = fingerprints[fp_col].loc[test_index].values

# similarity: every column is used
si_col = list(similarity.columns)
train_X_si = similarity[si_col].loc[train_index]
test_X_si = similarity[si_col].loc[test_index]

# bag of words: built from the training SMILES, then applied to the test SMILES
smiles = train_X_source['SMILES']
bow = dp.BagOfWords(smiles)
train_X_bow = bow.fit()
test_X_bow = bow.transform(test_X_source['SMILES'])
# prepend FW as column 0 of each bag-of-words matrix
train_X_bow = np.insert(train_X_bow, 0, train_X_source['FW'], axis=1)
test_X_bow = np.insert(test_X_bow, 0, test_X_source['FW'], axis=1)

# labels and masks: taken from raw_y by split index rather than from train_y/test_y
# themselves, so re-running this cell does not fail on already-removed mask columns
train_y, train_mask = raw_y.loc[train_index, targets], raw_y.loc[train_index, mask_df.columns]
test_y, test_mask = raw_y.loc[test_index, targets], raw_y.loc[test_index, mask_df.columns]

### 1.2 数据工程

fit、transform和fit_transform的区别点击[这里](https://cloud.tencent.com/developer/article/1770568)

In [7]:
def transform(train, test, apply):
    """Fit a transformer on the training data and apply it to both splits.

    Args:
        train: Training data; passed to ``apply.fit_transform``.
        test: Test data; passed to ``apply.transform`` after the fit.
        apply: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        Tuple ``(transformed_train, transformed_test)``.
    """
    # Tuple elements are evaluated left to right, so the fit happens before the test transform.
    return apply.fit_transform(train), apply.transform(test)

数据标准化

In [8]:
# Standardise each feature family with its own scaler, fitted on the training split only.
# descriptors
train_X_des, test_X_des = transform(train=train_X_des, test=test_X_des, apply=StandardScaler())
# similarity
train_X_si, test_X_si = transform(train=train_X_si, test=test_X_si, apply=StandardScaler())
# bag of words
train_X_bow, test_X_bow = transform(train=train_X_bow, test=test_X_bow, apply=StandardScaler())

## 2. 模型

### 2.1 数据集的获取

In [9]:
def prepare_data(with_pca, train_X, train_y, train_mask, test_X, test_y, test_mask, batch_size):
    """Optionally halve the feature dimension with PCA, then build the datasets.

    Args:
        with_pca: When truthy, a PCA keeping ``train_X.shape[1] // 2`` components
            is fitted on ``train_X`` and applied to both feature matrices.
        train_X, test_X: Feature matrices for the two splits.
        train_y, test_y: Label tables for the two splits.
        train_mask, test_mask: Label-mask tables for the two splits.
        batch_size: Forwarded unchanged to ``dl.get_data``.

    Returns:
        Whatever ``dl.get_data`` returns for the (possibly reduced) features.
    """
    if not with_pca:
        return dl.get_data(train_X, train_y, train_mask, test_X, test_y, test_mask, batch_size)

    n_components = train_X.shape[1] // 2
    reducer = PCA(n_components)
    # Fit on the training split only, then project the test split with the same components.
    reduced_train = reducer.fit_transform(train_X)
    reduced_test = reducer.transform(test_X)
    return dl.get_data(reduced_train, train_y, train_mask, reduced_test, test_y, test_mask, batch_size)

In [10]:
# Parallel lists: position i holds the name, training features and test features
# of the i-th feature family.
types_names = ['descriptors', 'fingerprint', 'similarity', 'bow']
train_X_types = [
    train_X_des,
    train_X_fp,
    train_X_si,
    train_X_bow,
]
test_X_types = [
    test_X_des,
    test_X_fp,
    test_X_si,
    test_X_bow,
]

### 2.2 训练

In [11]:
def run(epochs, layer, type_n, with_pca):
    """Train one network configuration and save its training history as CSV.

    Reads the notebook-level names ``train_X_types``, ``test_X_types``,
    ``types_names``, ``train_y``, ``train_mask``, ``test_y``, ``test_mask``,
    ``batch_size``, ``output_size`` and ``device``.

    Args:
        epochs: Epoch count handed to ``dl.Trainer``.
        layer: List of layer sizes passed to ``dl.net``; also joined with ``_``
            into the model name.
        type_n: Index selecting the feature family from ``train_X_types``,
            ``test_X_types`` and ``types_names``.
        with_pca: Forwarded to ``prepare_data``; when truthy the model name
            gets a ``_pca`` suffix.

    Side effects:
        Prints a "running" banner and writes
        ``data/dnn_data/<model_name>.csv`` (the directory is created if missing).
    """
    # Create the results directory before training so a missing folder cannot
    # discard a finished run at the final to_csv call.
    out_dir = 'data/dnn_data/'
    os.makedirs(out_dir, exist_ok=True)

    # Data
    train_X, test_X = train_X_types[type_n], test_X_types[type_n]
    train_set, valid_set, train_loader = prepare_data(
        with_pca, train_X, train_y, train_mask, test_X, test_y, test_mask, batch_size
    )
    # Length of the first field of the first training item
    # (presumably the feature vector — confirm in dl.get_data).
    input_size = len(train_set[0][0])

    # Model, optimiser, loss, LR schedule and early stopping
    net = dl.net(input_size, output_size, layer).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=4e-5, weight_decay=1e-5)
    criterion = nn.BCELoss()
    # `verbose` is intentionally not passed: it is deprecated in recent PyTorch
    # releases, and leaving it out keeps the scheduler silent as before.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
    early_stop = dl.EarlyStopping()
    trainer = dl.Trainer(net, optimizer, criterion, epochs, device, scheduler, early_stop)

    # e.g. "descriptors_1024_2048" or "descriptors_1024_2048_pca"
    model_name = types_names[type_n] + '_' + '_'.join(str(size) for size in layer)
    if with_pca:
        model_name += '_pca'

    # Train
    print(f'\nrunning: {layer}_{types_names[type_n]}_pca_{with_pca}')
    res = trainer.train_model(train_loader, valid_set, model_name)

    # Save the results returned by the trainer
    res.to_csv(out_dir + model_name + '.csv')

In [12]:
# Settings read by `run` and by the grid-search loop.
epochs = 10
batch_size = 128
output_size = 12  # same count as the entries in `targets`

# Layer-size lists to try; each one is passed to `run` as `layer`.
# NOTE(review): 4196 may be a typo for 4096 — confirm before changing, since the
# value also ends up in the saved model names.
layers = [
    [1024],
    [1024, 2048],
    [1024, 2048, 4196],
]

# NOTE(review): `run` builds its own local model_name, so this value looks unused
# in the visible cells — confirm before removing.
model_name = 'test_model'

In [13]:
# Grid search: every feature family x every layer layout x PCA off/on.
for i in range(len(train_X_types)):
    for layer in layers:
        for with_pca in (False, True):
            run(epochs=epochs, layer=layer, type_n=i, with_pca=with_pca)


running: [1024]_descriptors_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.673275  val loss: 39.209000  AUPRC: 0.202  AUCROC: 0.703

running: [1024]_descriptors_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.657938  val loss: 39.029694  AUPRC: 0.192  AUCROC: 0.715

running: [1024, 2048]_descriptors_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.521158  val loss: 38.539165  AUPRC: 0.274  AUCROC: 0.766

running: [1024, 2048]_descriptors_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.507130  val loss: 38.359886  AUPRC: 0.279  AUCROC: 0.773

running: [1024, 2048, 4196]_descriptors_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.534277  val loss: 36.835808  AUPRC: 0.290  AUCROC: 0.782

running: [1024, 2048, 4196]_descriptors_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.695755  val loss: 35.258804  AUPRC: 0.280  AUCROC: 0.778

running: [1024]_fingerprint_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.039106  val loss: 40.452213  AUPRC: 0.184  AUCROC: 0.677

running: [1024]_fingerprint_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.216347  val loss: 40.165344  AUPRC: 0.132  AUCROC: 0.625

running: [1024, 2048]_fingerprint_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.125540  val loss: 37.221294  AUPRC: 0.194  AUCROC: 0.691

running: [1024, 2048]_fingerprint_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.182045  val loss: 37.828281  AUPRC: 0.169  AUCROC: 0.676

running: [1024, 2048, 4196]_fingerprint_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.003901  val loss: 33.987999  AUPRC: 0.169  AUCROC: 0.697

running: [1024, 2048, 4196]_fingerprint_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.012756  val loss: 33.263004  AUPRC: 0.207  AUCROC: 0.691

running: [1024]_similarity_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.607906  val loss: 39.952042  AUPRC: 0.152  AUCROC: 0.649

running: [1024]_similarity_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.797607  val loss: 40.333721  AUPRC: 0.158  AUCROC: 0.661

running: [1024, 2048]_similarity_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.036999  val loss: 39.774479  AUPRC: 0.181  AUCROC: 0.673

running: [1024, 2048]_similarity_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.035490  val loss: 38.689140  AUPRC: 0.168  AUCROC: 0.672

running: [1024, 2048, 4196]_similarity_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.441854  val loss: 38.809132  AUPRC: 0.176  AUCROC: 0.677

running: [1024, 2048, 4196]_similarity_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.437428  val loss: 38.390938  AUPRC: 0.176  AUCROC: 0.655

running: [1024]_bow_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 31.052158  val loss: 39.966751  AUPRC: 0.149  AUCROC: 0.675

running: [1024]_bow_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.631819  val loss: 39.849686  AUPRC: 0.161  AUCROC: 0.699

running: [1024, 2048]_bow_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.484863  val loss: 38.941853  AUPRC: 0.229  AUCROC: 0.756

running: [1024, 2048]_bow_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.512138  val loss: 38.306313  AUPRC: 0.217  AUCROC: 0.735

running: [1024, 2048, 4196]_bow_pca_False


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.515987  val loss: 36.134499  AUPRC: 0.252  AUCROC: 0.757

running: [1024, 2048, 4196]_bow_pca_True


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch [1/10]----train loss: 30.604494  val loss: 37.883144  AUPRC: 0.232  AUCROC: 0.739
