In [2]:
# -*- coding: utf-8 -*-

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from sklearn.metrics import auc as calculate_auc

import matplotlib.pyplot as plt
import seaborn as sns

from aggmap import AggMap, AggMapNet, loadmap

# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded in this cell; whether the training code
# below also needs its own framework-level seed for reproducible runs is not
# visible here — confirm.
np.random.seed(666)

def prc_auc_score(y_true, y_score):
    """Return the area under the precision-recall curve.

    ``y_true`` and ``y_score`` are forwarded unchanged to
    ``precision_recall_curve``; the resulting (recall, precision) points are
    then integrated with ``sklearn.metrics.auc`` (imported in this notebook as
    ``calculate_auc``).
    """
    pr_curve = precision_recall_curve(y_true, y_score)  # PRC_AUC
    precisions, recalls = pr_curve[0], pr_curve[1]
    return calculate_auc(recalls, precisions)

In [2]:
import tensorflow as tf
import os

# Expose only the physical GPU with index 5 to this process.
# NOTE(review): this is set after `import tensorflow`; it presumably still
# takes effect because no device has been queried yet — confirm, or move the
# assignment above the import.
os.environ["CUDA_VISIBLE_DEVICES"] = "5"
physical_gpus = tf.config.experimental.list_physical_devices('GPU')
print("physical_gpus:")
print(physical_gpus)
if physical_gpus:
    # Enable memory growth on the first visible GPU.
    tf.config.experimental.set_memory_growth(physical_gpus[0], True)
else:
    # Previously `physical_gpus[0]` raised IndexError on machines where the
    # requested GPU is not available; report it and carry on instead.
    print("No GPU visible to TensorFlow; continuing without GPU memory-growth setup.")

physical_gpus:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [3]:
# Read the full dataset CSV into a DataFrame; `data_df` is sliced into
# features and labels by later cells.
dataset = './dataset/dataset_1647_all.csv'
data_df = pd.read_csv(dataset)

In [4]:
# Display the names of the last 42 columns of `data_df`.
# NOTE(review): the displayed list contains '-0.071712206' and '#NAME?' as
# column names, which look like spreadsheet-export artifacts rather than real
# group names — verify the CSV header.
data_df.columns[-42:].tolist()

['alkane',
 '-CH-',
 '-CH2-',
 '-0.071712206',
 '-CH(CH3)2',
 '-C(CH3)3',
 'alkene',
 '#NAME?',
 '-CH=CH-',
 '-C=CH2',
 '-C=CH-',
 '-C=C-',
 '-CH=C=CH2',
 'alkyne',
 'aromatics',
 'alkyl halides',
 'alcohols',
 'primary alcohols',
 'secondary alcohols',
 'tertiary alcohols',
 'phenol',
 'esters',
 'ketones',
 'aldehydes',
 'carboxylic acids',
 'ether',
 'acyl halides',
 'amines',
 'primary amines',
 'secondary amines',
 'tertiary amines',
 'amides',
 'primary amides',
 'secondary amides',
 'tertiary amides',
 'nitriles',
 'nitro',
 '-N=C=O',
 '-N=C=S',
 'ortho',
 'meta',
 'para']

In [1]:
# Names of the label columns used as prediction targets (one binary target
# per functional group / substitution pattern).
func_grps = [
    'alkane',
    'alkene',
    'alkyne',
    'aromatics',
    'alkyl halides',
    'alcohols',
    'esters',
    'ketones',
    'aldehydes',
    'carboxylic acids',
    'ether',
    'acyl halides',
    'amines',
    'amides',
    'nitriles',
    'nitro',
    '-N=C=O',
    '-N=C=S',
    'ortho',
    'meta',
    'para',
]

# Alternative target sets kept for reference:
#   func_grps = dfy.columns.tolist()
#   func_grps = ['primary alcohols', 'secondary alcohols', 'tertiary alcohols', 'phenol']   # alcohols

print(len(func_grps))

21


In [5]:
# Features: every column except the first one and the trailing 42 columns.
feature_columns = data_df.columns[1:-42]
dfx = data_df[feature_columns]

# Labels: only the columns named in `func_grps`.
dfy = data_df[func_grps]

# Report both shapes, one per line.
print(dfx.shape, dfy.shape, sep='\n')

(8272, 1647)
(8272, 21)


In [7]:
# Disabled cell: fits an AggMap on `dfx` with the 'correlation' metric for each
# channel count in `channels` and saves the fitted map. Kept for reference only.
# NOTE(review): the save path below contains 'alcohols', which does not match
# the map file loaded later in this notebook — confirm which file is intended
# before re-enabling.
# channels = [10]
# mp = AggMap(dfx, metric='correlation')
# for c in channels:
#     mp = mp.fit(cluster_channels=c, verbose=0)
#     mp.save('./model/1647_IR_alcohols_aggmap_correlation_c{}.mp'.format(c))

In [8]:
# Convert the feature and label frames to NumPy arrays; labels are cast to float.
X = dfx.values
Y = dfy.values.astype(float)

In [9]:
# Channel count; it is interpolated into the map filename below.
channels = 10
# Disabled: fit a new AggMap on `dfx` and save it.
# NOTE(review): the commented-out save path ('..._aggmap_alkane_c{}') differs
# from the path loaded below ('..._aggmap_correlation_c{}') — confirm that the
# loaded file is the one produced by this recipe.
# mp = AggMap(dfx, metric='correlation')
# mp.fit(cluster_channels=channels, verbose=0)
# mp.save('./model/1647_IR_aggmap_alkane_c{}.mp'.format(channels))

# Load a previously saved map object from disk.
mp = loadmap('./model/1647_IR_aggmap_correlation_c{}.mp'.format(channels))

In [12]:
# Cross-validated training and evaluation of the multi-label estimator.
#
# Result containers, keyed by label name from `func_grps`:
#   results[func]     -> one training-history dict per fold (used for plotting)
#   results_num[func] -> one dict of test-set metrics per fold (used for tables)
results = {func: [] for func in func_grps}
results_num = {func: [] for func in func_grps}

for random_seed in [128]:   # random seed used to split the dataset
    outer = KFold(n_splits=5, shuffle=True, random_state=random_seed)

    for fold_idx, (train_idx, test_idx) in enumerate(outer.split(X)):   # 5-fold cross-validation
        train_X, test_X = X[train_idx], X[test_idx]
        trainY, testY = Y[train_idx], Y[test_idx]

        # Transform the raw feature rows with the loaded map object.
        trainX = mp.batch_transform(train_X, scale_method='standard')
        testX = mp.batch_transform(test_X, scale_method='standard')
        print("trainX shape is: " + str(trainX.shape))
        print("testX shape is: " + str(testX.shape))

        clf = AggMapNet.MultiLabelEstimator(epochs=300, batch_size=4, dense_layers=[256, 128], dropout=0.1, batch_norm=True, verbose=-2)
        clf.fit(trainX, trainY)

        # clf.save_model('./model/1647_model_IR_MultiLabel_c10_fold{}.h5'.format(fold_idx))

        print('Training finished.')
        y_true = testY
        y_pred = clf.predict(testX)
        y_score = clf.predict_proba(testX)   # only needed by the disabled AUC metrics below

        # res: training history of this fold, recorded for plotting.
        # A shallow copy is taken so the bookkeeping keys added below do not
        # modify the estimator's own `history` dict. The same record is shared
        # by every label because the model is trained once per fold.
        res = dict(clf.history)
        res['fold'] = fold_idx
        res['channel'] = channels
        res['random_seed'] = random_seed

        fold_num = "fold_%s" % str(fold_idx).zfill(2)

        for i, func in enumerate(func_grps):
            # `labels=[0, 1]` forces a 2x2 matrix. Without it, a fold in which
            # this label's truth and predictions contain a single class yields
            # a 1x1 matrix and the four-way unpack raises ValueError.
            # Assumes labels and predictions are encoded as 0/1 — TODO confirm.
            tn, fp, fn, tp = confusion_matrix(
                np.asarray(y_true)[:, i].astype(int),
                np.asarray(y_pred)[:, i].astype(int),
                labels=[0, 1],
            ).ravel()

            # The counts are NumPy integers, so a zero denominator (e.g. no
            # positive samples or no positive predictions in this fold) gives
            # nan rather than raising.
            acc = (tp + tn) / sum([tn, fp, fn, tp])
            sensitivity = tp / sum([tp, fn])
            specificity = tn / sum([tn, fp])

            # Disabled AUC metrics; note they must index the label column
            # ([:, i]), not the sample row ([i]).
            # prc_auc = prc_auc_score(y_true[:, i], y_score[:, i])
            # roc_auc = roc_auc_score(y_true[:, i], y_score[:, i])

            precision = tp / sum([tp, fp])
            recall = sensitivity
            F1 = 2 * precision * sensitivity / (precision + sensitivity)

            # res_num: numeric test-set results of this fold for this label.
            res_num = {'fold': fold_num,
                  'random_seed': random_seed,
                  'accuracy': acc,
                  #'prc_auc': prc_auc,
                  #'roc_auc': roc_auc,
                  'sensitivity': sensitivity,
                  'specificity': specificity,
                  'precision': precision,
                  'recall': recall,
                  'F1': F1}

            results[func].append(res)
            results_num[func].append(res_num)

  0%|          | 0/6617 [00:00<?, ?it/s]

100%|##########| 6617/6617 [04:02<00:00, 27.31it/s]
100%|##########| 1655/1655 [00:30<00:00, 54.21it/s]


trainX shape is: (6617, 41, 41, 10)
testX shape is: (1655, 41, 41, 10)
MultiLabelEstimator(batch_norm=True, batch_size=4, dense_layers=[256, 128],
                    dropout=0.1, epochs=300, gpuid='0',
                    name='AggMap MultiLabels Estimator', verbose=-2)
X type is ndarray


2024-02-07 01:31:05.785833: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-02-07 01:31:06.746413: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30987 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:86:00.0, compute capability: 7.0


Epoch 1/300


2024-02-07 01:31:09.985224: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8101


1655/1655 - 26s - loss: 0.2668 - val_loss: 0.1928 - 26s/epoch - 16ms/step
Epoch 2/300
1655/1655 - 21s - loss: 0.1970 - val_loss: 0.1587 - 21s/epoch - 13ms/step
Epoch 3/300
1655/1655 - 22s - loss: 0.1681 - val_loss: 0.1388 - 22s/epoch - 13ms/step
Epoch 4/300
1655/1655 - 21s - loss: 0.1499 - val_loss: 0.1200 - 21s/epoch - 13ms/step
Epoch 5/300
1655/1655 - 21s - loss: 0.1364 - val_loss: 0.1087 - 21s/epoch - 13ms/step
Epoch 6/300
1655/1655 - 22s - loss: 0.1250 - val_loss: 0.1036 - 22s/epoch - 13ms/step
Epoch 7/300
1655/1655 - 22s - loss: 0.1164 - val_loss: 0.0938 - 22s/epoch - 13ms/step
Epoch 8/300
1655/1655 - 21s - loss: 0.1084 - val_loss: 0.0917 - 21s/epoch - 13ms/step
Epoch 9/300
1655/1655 - 22s - loss: 0.1001 - val_loss: 0.0794 - 22s/epoch - 13ms/step
Epoch 10/300
1655/1655 - 21s - loss: 0.0935 - val_loss: 0.0754 - 21s/epoch - 13ms/step
Epoch 11/300
1655/1655 - 21s - loss: 0.0882 - val_loss: 0.0747 - 21s/epoch - 13ms/step
Epoch 12/300
1655/1655 - 21s - loss: 0.0819 - val_loss: 0.0641 -

In [None]:
# Output results: write one CSV of per-fold metrics for each label, print the
# mean metrics, and write a summary CSV with one row per label.

result_path = './result/aggmapnet_1647_all_resnet.csv'
per_label_dir = './result/result_1647_all_resnet'
metric_cols = ['accuracy', 'precision', 'recall', 'sensitivity', 'specificity', 'F1']
res_num_all = []

# `to_csv` does not create missing directories, so make sure they exist.
os.makedirs(per_label_dir, exist_ok=True)
os.makedirs(os.path.dirname(result_path) or '.', exist_ok=True)

for func in func_grps:
    path = './result/result_1647_all_resnet/result_1647_{}_resnet.csv'.format(func)

    res_num_df = pd.DataFrame(results_num[func])
    res_num_df.to_csv(path)

    # Average over folds within each random seed, then over seeds. Only the
    # numeric metric columns are aggregated: the frame also holds the string
    # column 'fold', and reducing the whole frame through `apply(np.mean)`
    # depends on the installed NumPy/pandas versions.
    per_seed = res_num_df.groupby('random_seed')[metric_cols]
    res_mean = per_seed.mean().mean().round(3)
    # Population standard deviation (ddof=0), matching np.std's default.
    res_std = per_seed.std(ddof=0).mean().round(3)
    print('--------------{}---------------'.format(func))
    print('mean:')
    print(res_mean[metric_cols])
    res_num_all.append([func] + res_mean[metric_cols].tolist())
    # print('std:')
    # print(res_std)
res_num_all = pd.DataFrame(res_num_all, columns=['func'] + metric_cols)
res_num_all.to_csv(result_path, index=False)

--------------alkane---------------
mean:
accuracy       0.950
precision      0.971
recall         0.970
sensitivity    0.970
specificity    0.840
F1             0.970
dtype: float64
--------------alkene---------------
mean:
accuracy       0.959
precision      0.907
recall         0.770
sensitivity    0.770
specificity    0.988
F1             0.832
dtype: float64
--------------alkyne---------------
mean:
accuracy       0.994
precision      0.915
recall         0.842
sensitivity    0.842
specificity    0.998
F1             0.876
dtype: float64
--------------aromatics---------------
mean:
accuracy       0.978
precision      0.979
recall         0.983
sensitivity    0.983
specificity    0.971
F1             0.981
dtype: float64
--------------alkyl halides---------------
mean:
accuracy       0.912
precision      0.866
recall         0.811
sensitivity    0.811
specificity    0.951
F1             0.836
dtype: float64
--------------alcohols---------------
mean:
accuracy       0.974
precision 

In [None]:
# Build a six-colour palette from the "rainbow_r" colormap (used by the curve
# plots in the next cell) and preview it.
color = sns.color_palette("rainbow_r", 6) # alternative palette name noted by the author: PiYG
sns.palplot(color)

In [None]:
# 绘制训练过程的accuracy和loss曲线
res_df = pd.DataFrame(results[func_grps[0]])

sns.set(style='white', font_scale=2)

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(18,16), sharex=False, sharey=False)
ax0, ax1, ax2, ax3 = axes.ravel()

col = 'roc_auc'
acc_mean = res_df.groupby(['channel']).agg({col: lambda x:x.tolist()})[col].apply(lambda x:np.array(x).mean(axis=0)).apply(pd.Series).T
acc_mean.plot(ax=ax0, lw=4, color=color)
ax0.set_xlabel('Epochs')
ax0.set_ylabel('Train Roc_Accuracy')

col = 'loss'
acc_mean = res_df.groupby(['channel']).agg({col: lambda x:x.tolist()})[col].apply(lambda x:np.array(x).mean(axis=0)).apply(pd.Series).T
acc_mean.plot(ax=ax1, lw=4, color=color)
ax1.set_xlabel('Epochs')
ax1.set_ylabel('Train Loss')

col = 'val_roc_auc'
acc_mean = res_df.groupby(['channel']).agg({col: lambda x:x.tolist()})[col].apply(lambda x:np.array(x).mean(axis=0)).apply(pd.Series).T
acc_mean.plot(ax=ax2, lw=4, color=color)
ax2.set_xlabel('Epochs')
ax2.set_ylabel('Validation Roc_Accuracy')

col = 'val_loss'
acc_mean = res_df.groupby(['channel']).agg({col: lambda x:x.tolist()})[col].apply(lambda x:np.array(x).mean(axis=0)).apply(pd.Series).T
acc_mean.plot(ax=ax3, lw=4, color=color)
ax3.set_xlabel('Epochs')
ax3.set_ylabel('Validation Loss')

fig.tight_layout()
plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0.3)
# plt.savefig('./result/450_%s_5FCV_correlation_valid.png' % func_grp, bbox_inches='tight', dpi=400)