In [None]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from keras.models import load_model
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
import random
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
from imblearn.over_sampling import SMOTE, ADASYN

def auc_11(fileplace, models=('DNN',), features=None):
    """Assemble per-feature prediction scores into one sample-by-feature matrix.

    For every ``(model, feature)`` pair the file
    ``<fileplace>/<model>/<feature>/<feature>_y_label&score.csv`` is read and
    its ``score`` column is aligned, by ``pepname``, to the row order of the
    first file (first model, first feature).

    Parameters
    ----------
    fileplace : str
        Root directory holding one sub-directory per model.
    models : sequence of str, optional
        Model sub-directories to read. Defaults to ``('DNN',)``.
    features : sequence of str, optional
        Feature sub-directories to read, in output column order. Defaults to
        the eleven feature sets used throughout this notebook.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_peptides, len(models) * len(features))``. A
        peptide absent from one of the files gets ``NaN`` in that column.

    Notes
    -----
    ``pepname`` is the join key, so it should be unique within each file;
    repeated names multiply rows in the merge and the result would no longer
    line up with labels read elsewhere.
    """
    if features is None:
        features = ['ACF', 'ASA', 'AAINDEX', 'BTA', 'CKSAAP', 'GPS', 'OBC',
                    'PSEAAC', 'PSSM', 'SS', 'transformer']

    def score_file(mod, feature):
        # One CSV per (model, feature) pair, named after the feature.
        return os.path.join(fileplace, mod, feature, feature + '_y_label&score.csv')

    # The first file only contributes the peptide names; it fixes the row order.
    all_scores = pd.read_csv(score_file(models[0], features[0]))[['pepname']]

    score_columns = []
    for mod in models:
        for feature in features:
            # Name every score column explicitly instead of relying on merge
            # suffixes, so columns stay unique even with several models.
            column = f'{mod}_{feature}'
            df = pd.read_csv(score_file(mod, feature))[['pepname', 'score']]
            df = df.rename(columns={'score': column})
            # The left join keeps exactly the reference peptides, in order.
            all_scores = all_scores.merge(df, on='pepname', how='left')
            score_columns.append(column)

    return all_scores[score_columns].to_numpy()

In [136]:
### This is the version that was actually used.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

def trans_training_DNN(pepID, x, y):
    """Train the integrated DNN on the transfer data with 5-fold stratified CV.

    In each fold the training part is oversampled with ADASYN and shuffled, a
    fresh network from ``get_model`` is fitted, and the held-out part is
    scored. Every artefact of this run is written below
    ``./transfer/models/integrated_DNN``: one ``iDNN_<fold>.h5`` per fold
    (checkpointed during training, then saved again after fitting) and the
    pooled out-of-fold predictions in ``iDNN_y_label&score.csv``.

    Parameters
    ----------
    pepID : array-like
        Peptide identifiers; must support indexing with the integer index
        arrays produced by the splitter (NumPy arrays in this notebook).
    x : array-like
        Feature matrix; ``x.shape[1]`` is used as the network input width.
    y : array-like
        Labels aligned with the rows of ``x``.

    Returns
    -------
    float
        ROC-AUC computed over the out-of-fold predictions of all folds.
    """
    print('\n')
    print('———————————— Training small sample integrated DNN model ————————————')
    from itertools import chain
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import StratifiedKFold
    import gc

    def reset_keras():
        # Drop the Keras graph/session state so folds do not accumulate memory.
        from tensorflow.keras.backend import clear_session
        clear_session()
        gc.collect()

    # Single output location for checkpoints, final models and the score CSV.
    # The final save previously went to './models/integrated_DNN', which
    # overwrote the models written by the pre-training run.
    out_dir = './transfer/models/integrated_DNN'
    os.makedirs(out_dir, exist_ok=True)

    feature_size = x.shape[1]

    # 5-fold stratified cross-validation with a fixed split.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    y_label = []
    y_score = []
    peplist = []

    for count, (train_index, test_index) in enumerate(skf.split(x, y), start=1):
        x_test, y_test = x[test_index], y[test_index]

        # Oversample the training fold only; the held-out fold stays untouched.
        x_train, y_train = ADASYN().fit_resample(x[train_index], y[train_index])
        # Shuffle rows and labels together (unseeded, as before).
        order = np.random.permutation(len(y_train))
        x_train, y_train = x_train[order], y_train[order]

        model_path = f'{out_dir}/iDNN_{count}.h5'
        dnn = get_model(feature_size)
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ModelCheckpoint(model_path, save_best_only=True),
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)
        ]

        # NOTE(review): the held-out fold doubles as validation data, so early
        # stopping and checkpoint selection see the data the AUC is reported
        # on; the reported scores may be optimistic. Confirm this is intended.
        dnn.fit(
            x_train, y_train,
            epochs=100,
            batch_size=64,
            validation_data=(x_test, y_test),
            callbacks=callbacks,
            verbose=1
        )

        y_label.append(list(y_test))
        y_test_score = dnn.predict(x_test).flatten()
        y_score.append(list(y_test_score))
        dnn.save(model_path)
        peplist.append(list(pepID[test_index]))

        auc_score = roc_auc_score(y_test, y_test_score)
        print(f'第 {count} 个模型的AUC分数为：{auc_score}')
        reset_keras()

    # Pool the out-of-fold predictions of all folds into one table.
    df_y = pd.concat([
        pd.DataFrame(list(chain.from_iterable(peplist)), columns=['pepname']),
        pd.DataFrame(list(chain.from_iterable(y_label)), columns=['label']),
        pd.DataFrame(list(chain.from_iterable(y_score)), columns=['score'])
    ], axis=1)
    df_y.to_csv(f'{out_dir}/iDNN_y_label&score.csv', index=False)

    AUC_score = roc_auc_score(list(chain.from_iterable(y_label)), list(chain.from_iterable(y_score)))
    print(f'———————————— 模型的最终AUC分数为：{AUC_score} ————————————')
    return AUC_score

In [119]:
def get_name_and_label(fileplace, feature='ASA'):
    """Read peptide names and labels from one per-feature DNN result file.

    The file read is
    ``<fileplace>/models/DNN/<feature>/<feature>_y_label&score.csv``.
    ``fileplace`` may be given with or without a trailing path separator.

    Parameters
    ----------
    fileplace : str
        Directory that contains the ``models`` folder.
    feature : str, optional
        Feature sub-directory whose result file is read. Defaults to ``'ASA'``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(name, label)``: the ``pepname`` and ``label`` columns, in file order.

    Notes
    -----
    NOTE(review): the score matrix built by ``auc_11`` follows the row order
    of its first feature file (ACF by default), while this function reads ASA
    by default. Confirm both files list the peptides in the same order, or
    pass ``feature='ACF'``.
    """
    csv_path = os.path.join(fileplace, 'models', 'DNN', feature,
                            feature + '_y_label&score.csv')
    df = pd.read_csv(csv_path)
    return df['pepname'].values, df['label'].values


def get_model(size):
    """Build and compile the fully connected classifier used for integration.

    Layer stack: a linear 11-unit ``Input`` layer (biases initialised to one),
    two ReLU hidden layers of 128 and 64 units, each followed by 50 % dropout,
    and a single sigmoid ``Output`` unit. The network is compiled with the
    Adam optimizer, binary cross-entropy loss and accuracy as metric.

    Parameters
    ----------
    size : int
        Number of input features per sample.

    Returns
    -------
    A compiled Keras ``Sequential`` model.
    """
    from tensorflow.keras import Sequential

    stack = [
        layers.Dense(11, input_shape=(size,), bias_initializer='ones', name='Input'),
        layers.Dense(128, activation='relu', name='Hidden1'),
        layers.Dropout(0.5, name='Dropout1'),
        layers.Dense(64, activation='relu', name='Hidden2'),
        layers.Dropout(0.5, name='Dropout2'),
        layers.Dense(1, activation='sigmoid', name='Output'),
    ]
    network = Sequential(stack)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return network

In [None]:
### This is the version that was actually used.
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

def training_DNN(pepID, x, y):
    """Train the integrated DNN with 5-fold stratified cross-validation.

    For each fold a fresh network from ``get_model`` is fitted on the training
    part and scored on the held-out part. The fold model is written to
    ``./models/integrated_DNN/iDNN_<fold>.h5`` and the pooled out-of-fold
    predictions to ``./models/integrated_DNN/iDNN_y_label&score.csv``.

    Parameters
    ----------
    pepID : array-like
        Peptide identifiers; indexed with the splitter's integer index arrays.
    x : array-like
        Feature matrix; ``x.shape[1]`` is used as the network input width.
    y : array-like
        Labels aligned with the rows of ``x``.

    Returns
    -------
    float
        ROC-AUC computed over the out-of-fold predictions of all folds.
    """
    print('\n')
    print('———————————— Training integrated DNN model ————————————')
    import gc
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import StratifiedKFold
    from tensorflow.keras.backend import clear_session

    n_inputs = x.shape[1]
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Flat accumulators for the out-of-fold results of every fold.
    pooled_names = []
    pooled_labels = []
    pooled_scores = []

    for fold, (fit_idx, held_idx) in enumerate(splitter.split(x, y), start=1):
        x_fit, y_fit = x[fit_idx], y[fit_idx]
        x_held, y_held = x[held_idx], y[held_idx]

        model = get_model(n_inputs)
        stop_early = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        keep_best = ModelCheckpoint(f'./models/integrated_DNN/iDNN_{fold}.h5', save_best_only=True)
        slow_down = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

        # The held-out fold is also what the callbacks monitor (val_loss).
        model.fit(
            x_fit, y_fit,
            epochs=100,
            batch_size=64,
            validation_data=(x_held, y_held),
            callbacks=[stop_early, keep_best, slow_down],
            verbose=1
        )

        pooled_labels.extend(y_held)
        held_scores = model.predict(x_held).flatten()
        pooled_scores.extend(held_scores)
        model.save(f'./models/integrated_DNN/iDNN_{fold}.h5')
        pooled_names.extend(pepID[held_idx])

        fold_auc = roc_auc_score(y_held, held_scores)
        print(f'第 {fold} 个模型的AUC分数为：{fold_auc}')

        # Release Keras state before the next fold builds a new model.
        clear_session()
        gc.collect()

    out_of_fold = pd.concat([
        pd.DataFrame(pooled_names, columns=['pepname']),
        pd.DataFrame(pooled_labels, columns=['label']),
        pd.DataFrame(pooled_scores, columns=['score'])
    ], axis=1)
    out_of_fold.to_csv('./models/integrated_DNN/iDNN_y_label&score.csv', index=False)

    AUC_score = roc_auc_score(pooled_labels, pooled_scores)
    print(f'———————————— 模型的最终AUC分数为：{AUC_score} ————————————')
    return AUC_score

In [None]:
# Stage 1 - pre-training: build the score matrix from the per-feature DNN
# results under ./models and train the integrated DNN on it.
# The #''' lines below are comment markers kept from the original cell.
#'''
dataset = auc_11('./models')
peplist, labels = get_name_and_label('./')
DNN_score = training_DNN(peplist, dataset, labels)
#'''

# Stage 2 - trans-training: same pipeline on the data under ./transfer.
# dataset, peplist, labels and DNN_score are reassigned here, so the
# pre-training values are no longer available after this cell.
dataset = auc_11('./transfer/models')
peplist, labels = get_name_and_label('./transfer/')
DNN_score = trans_training_DNN(peplist, dataset, labels)
# The value printed is the pooled cross-validation AUC returned by
# trans_training_DNN.
print('最高分数为：' + str(DNN_score))


In [139]:
# Collect the ROC-AUC of each of the 11 single-feature DNNs and of the
# integrated DNN on the transfer data, then write them to scores.csv.
df_iDNN = pd.read_csv('./transfer/models/integrated_DNN/iDNN_y_label&score.csv')
label = df_iDNN['label']
score = df_iDNN['score']
df_scores = []
feature_list = ['ACF', 'ASA', 'AAINDEX', 'BTA', 'CKSAAP', 'GPS', 'OBC', 'PSEAAC', 'PSSM', 'SS', 'transformer']
modelname = 'DNN'
for featurename in feature_list:
    # Plain %-templates: the previous f prefix had no placeholders to fill.
    df = pd.read_csv('./transfer/models/%s/%s/%s_y_label&score.csv' % (modelname, featurename, featurename))
    label_array = df['label']
    score_array = df['score']
    auc_score_array = roc_auc_score(label_array, score_array)
    # Alternative metric (area under the precision-recall curve), disabled:
    #precision, recall, _ = precision_recall_curve(label_array, score_array)
    #auc_score_array = auc(recall, precision)
    df_scores.append(auc_score_array)
    print('%s特征的%s模型的rocauc分数为：%s' % (featurename, modelname, str(auc_score_array)))
# Last entry: the integrated DNN. Rows of scores.csv follow feature_list
# order with the integrated model at the end; no names are stored.
df_scores.append(roc_auc_score(label, score))
pd.DataFrame(df_scores).to_csv('./transfer/models/scores.csv', index=False)

ACF特征的DNN模型的rocauc分数为：0.5667040192573356
ASA特征的DNN模型的rocauc分数为：0.5682729762899984
AAINDEX特征的DNN模型的rocauc分数为：0.6062577447335811
BTA特征的DNN模型的rocauc分数为：0.5753143843047972
CKSAAP特征的DNN模型的rocauc分数为：0.7250790036937562
GPS特征的DNN模型的rocauc分数为：0.6885787042801326
OBC特征的DNN模型的rocauc分数为：0.632040666891968
PSEAAC特征的DNN模型的rocauc分数为：0.5833451912986249
PSSM特征的DNN模型的rocauc分数为：0.6399773512862929
SS特征的DNN模型的rocauc分数为：0.5784337702993544
transformer特征的DNN模型的rocauc分数为：0.6970300608137714
