# In [None]:
# ADMET性质预测
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import random
import os
import math
import lightgbm as lgb
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit, GridSearchCV
from sklearn.manifold import TSNE
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score,precision_score, recall_score, f1_score, confusion_matrix,roc_curve, auc
from scipy import stats
import itertools
sns.set_style('ticks')

# # 1 Load data
# Labelled data comes from the default (first) sheet of each workbook; the
# rows to be predicted come from the sheet named 'test' of the same workbooks.
ADMET = pd.read_excel('./数据/ADMET.xlsx')
data = pd.read_excel('./数据/Molecular_Descriptor.xlsx')
test_x = pd.read_excel('./数据/Molecular_Descriptor.xlsx',sheet_name='test')
ADMET_test = pd.read_excel('./数据/ADMET.xlsx',sheet_name='test')

# For every endpoint keep the first 20 entries of the 'features' column of its
# ranking workbook.
# NOTE(review): presumably the rows are sorted by importance, so this is the
# top 20 -- confirm against how the ranking files were produced.
X1_fea = list(pd.read_excel('./数据/Caco-2-51个特征排序.xlsx')['features'][:20])
X2_fea = list(pd.read_excel('./数据/CYP3A4特征排序.xlsx')['features'][:20])
X3_fea = list(pd.read_excel('./数据/hERG特征排序.xlsx')['features'][:20])
X4_fea = list(pd.read_excel('./数据/HOB特征排序.xlsx')['features'][:20])
X5_fea = list(pd.read_excel('./数据/MN特征排序.xlsx')['features'][:20])

# Per-endpoint feature frames and label columns.
X1, X2, X3, X4, X5 = data[X1_fea], data[X2_fea], data[X3_fea], data[X4_fea], data[X5_fea]
y1,y2,y3,y4,y5 = ADMET['Caco-2'], ADMET['CYP3A4'], ADMET['hERG'], ADMET['HOB'], ADMET['MN']

# Seed for the 80/20 hold-out splits. Without it every run draws a different
# split, so the metrics and figures produced further down cannot be reproduced
# even though the models themselves are seeded.
SPLIT_SEED = 5

X1_train,X1_test,y1_train,y1_test=train_test_split(
    np.array(X1),np.array(y1),test_size=0.2,shuffle=True,random_state=SPLIT_SEED)
X2_train,X2_test,y2_train,y2_test=train_test_split(
    np.array(X2),np.array(y2),test_size=0.2,shuffle=True,random_state=SPLIT_SEED)
X3_train,X3_test,y3_train,y3_test=train_test_split(
    np.array(X3),np.array(y3),test_size=0.2,shuffle=True,random_state=SPLIT_SEED)
X4_train,X4_test,y4_train,y4_test=train_test_split(
    np.array(X4),np.array(y4),test_size=0.2,shuffle=True,random_state=SPLIT_SEED)
X5_train,X5_test,y5_train,y5_test=train_test_split(
    np.array(X5),np.array(y5),test_size=0.2,shuffle=True,random_state=SPLIT_SEED)

# The same per-endpoint feature subsets, taken from the 'test' sheet.
test_x1, test_x2, test_x3, test_x4, test_x5 = test_x[X1_fea], test_x[X2_fea], test_x[X3_fea], test_x[X4_fea], test_x[X5_fea]

# # 2 模型训练
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Render a confusion matrix as an annotated heat map via pyplot.

    Args:
        cm: 2-D array of counts; it is indexed as ``cm[row, col]`` and must
            provide ``.max()`` and ``.shape``.
        classes: Tick labels used on both axes.
        title: Title drawn above the plot.
        cmap: Colormap handed to ``plt.imshow``.
    """
    bold = {'fontweight': 'bold'}
    plt.title(title, fontsize=18, **bold)

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=0, fontsize=16, **bold)
    plt.yticks(positions, classes, fontsize=16, **bold)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.colorbar()

    # Cells holding more than half of the largest count are labelled in white,
    # all other cells in black.
    cutoff = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            if count > cutoff:
                text_color = "white"
            else:
                text_color = "black"
            plt.text(col, row, count,
                     horizontalalignment="center",
                     color=text_color, fontsize=14)

def _fit_and_evaluate(model, X_train, y_train, X_test, y_test, use_decision_function=False):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Unfitted estimator exposing ``fit``, ``predict`` and either
            ``predict_proba`` or ``decision_function``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        use_decision_function: Rank test samples with ``decision_function``
            instead of column 1 of ``predict_proba``.

    Returns:
        dict with the ROC curve points (``'fpr'``, ``'tpr'``), the area under
        that curve (``'roc_auc'``), the hard predictions (``'pred'``) and the
        ``'auc'``, ``'precision'``, ``'recall'`` and ``'f1'`` scores.
    """
    model.fit(X_train, y_train)
    if use_decision_function:
        score = model.decision_function(X_test)
    else:
        score = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, score)
    pred = model.predict(X_test)
    return {
        'fpr': fpr,
        'tpr': tpr,
        'roc_auc': auc(fpr, tpr),
        'pred': pred,
        # AUC is a ranking metric: it has to be computed from the continuous
        # score, not from the thresholded predictions, otherwise it disagrees
        # with the area shown in the ROC plot.
        'auc': roc_auc_score(y_test, score),
        'precision': precision_score(y_test, pred),
        'recall': recall_score(y_test, pred),
        'f1': f1_score(y_test, pred),
    }


def _save_current_figure(path):
    """Save the current pyplot figure at 100 dpi, unless ``path`` is None."""
    if path is not None:
        plt.savefig(path, dpi=100)


def train_model(X_train,y_train,X_test,y_test,save_path1=None,save_path2=None,save_path3 =None):
    """Train six classifiers, plot their test results and return their metrics.

    The models are logistic regression, a linear SVM, random forest, XGBoost,
    gradient boosting and LightGBM. Three figures are shown: a 2x3 grid of
    confusion matrices, the ROC curves of all six models, and a second ROC
    figure restricted to fixed index windows of the four tree-based models.

    Args:
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels. The confusion-matrix
            tick labels are hard-coded to ``[0, 1]`` and the recall print-out
            reads row 1 of a 2x2 matrix, so labels are expected to be 0/1.
        save_path1: Where to save the confusion-matrix figure; None skips saving.
        save_path2: Where to save the full ROC figure; None skips saving.
        save_path3: Where to save the windowed ROC figure; None skips saving.

    Returns:
        Tuple ``(auc_score, pre_score, rec_score, f_score)`` of dicts keyed by
        ``'LR'``, ``'SVC'``, ``'RF'``, ``'XGBoost'``, ``'GBDT'``, ``'LightGBM'``.
    """
    # (result key, estimator, rank with decision_function?). The order here
    # fixes the key order of the returned dicts.
    classifiers = [
        ('LR', LogisticRegression(solver='newton-cg'), False),
        # SVC is scored through decision_function rather than predict_proba.
        ('SVC', SVC(kernel='linear'), True),
        ('RF', RandomForestClassifier(random_state=5), False),
        ('XGBoost', XGBClassifier(learning_rate=0.1,n_estimators=1000,max_depth=5, min_child_weight=1,gamma=0,subsample=0.6,random_state=5,colsample_bytree=0.8,objective= 'binary:logistic',scale_pos_weight=1,seed=27), False),
        ('GBDT', GradientBoostingClassifier(), False),
        ('LightGBM', lgb.LGBMClassifier(num_leaves=30, learning_rate=0.05, n_estimators=200), False),
    ]
    results = {}
    for key, model, use_decision_function in classifiers:
        results[key] = _fit_and_evaluate(
            model, X_train, y_train, X_test, y_test, use_decision_function)

    auc_score = {key: res['auc'] for key, res in results.items()}
    pre_score = {key: res['precision'] for key, res in results.items()}
    rec_score = {key: res['recall'] for key, res in results.items()}
    f_score = {key: res['f1'] for key, res in results.items()}

    # --- Figure 1: confusion matrices -------------------------------------
    # (subplot position, result key, panel title, draw y label?, draw x label?)
    # Only the left column gets a y label and only the bottom row an x label.
    panels = [
        (231, 'LightGBM', 'LightGBM', True, False),
        (232, 'XGBoost', 'XGBoost', False, False),
        (233, 'RF', 'RandomForest', False, False),
        (234, 'SVC', 'SVM', True, True),
        (235, 'GBDT', 'GBDT', False, True),
        (236, 'LR', 'Logistic Regression', False, True),
    ]
    class_names = [0,1]
    plt.figure(figsize=(20,12))
    for position, key, title, with_ylabel, with_xlabel in panels:
        plt.subplot(position)
        matrix = confusion_matrix(y_test, results[key]['pred'])
        # Row 1 holds the samples whose true label is the second class:
        # recall = TP / (FN + TP).
        print("Recall metric in the testing dataset: ",matrix[1,1]/(matrix[1,0]+matrix[1,1]))
        plot_confusion_matrix(matrix,classes=class_names,title=title)
        if with_ylabel:
            plt.ylabel('True label',fontsize=18,fontweight='bold')
        if with_xlabel:
            plt.xlabel('Predicted label',fontsize=18,fontweight='bold')

    plt.tight_layout()
    _save_current_figure(save_path1)
    plt.show()

    # --- Figure 2: ROC curves of all six models ---------------------------
    lw = 3
    # (result key, line colour, name used in the legend)
    roc_layout = [
        ('SVC', 'green', 'SVM'),
        ('GBDT', 'deeppink', 'GBDT'),
        ('LR', 'gray', 'Logistic regression'),
        ('LightGBM', 'red', 'LightGBM'),
        ('XGBoost', 'gold', 'XGBoost'),
        ('RF', 'royalblue', 'RandomForest'),
    ]
    plt.figure(figsize=(10,8))
    for key, colour, display in roc_layout:
        res = results[key]
        plt.plot(res['fpr'], res['tpr'], color=colour, lw=lw,
                 label='%s roc curve (area = %0.2f)' % (display, res['roc_auc']))

    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='black', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate',fontsize=18,fontweight='bold')
    plt.ylabel('True Positive Rate',fontsize=18,fontweight='bold')
    plt.title('ROC_AUC',fontsize=18,fontweight='bold')
    plt.legend(loc=4,fontsize=12)
    _save_current_figure(save_path2)
    plt.show()

    # --- Figure 3: windowed ROC curves of the tree-based models -----------
    # The windows are fixed index ranges into the fpr/tpr arrays, so which
    # part of each curve they cover depends on how many points roc_curve
    # returned for the given data.
    # NOTE(review): presumably tuned by eye for one dataset -- confirm before
    # reusing this figure for another endpoint.
    zoom_layout = [
        ('LightGBM', 'red', 'LightGBM', slice(45, 73)),
        ('XGBoost', 'gold', 'XGBoost', slice(45, 70)),
        ('RF', 'royalblue', 'RandomForest', slice(35, 60)),
        ('GBDT', 'deeppink', 'GBDT', slice(50, 84)),
    ]
    plt.figure(figsize=(10,8))
    for key, colour, display, window in zoom_layout:
        res = results[key]
        plt.plot(res['fpr'][window], res['tpr'][window], color=colour, lw=lw,
                 label='%s roc curve (area = %0.2f)' % (display, res['roc_auc']))
    plt.xlabel('False Positive Rate',fontsize=18,fontweight='bold')
    plt.ylabel('True Positive Rate',fontsize=18,fontweight='bold')
    plt.title('ROC_AUC',fontsize=18,fontweight='bold')
    _save_current_figure(save_path3)
    plt.show()
    return auc_score,pre_score,rec_score,f_score

if __name__ == '__main__':
    # Compare the six classifiers on the HOB split. The three paths are, in
    # order: confusion-matrix grid, full ROC figure, windowed ROC figure.
    hob_figure_paths = ('./HOB.tif', './HOB_roc1.tif', './HOB_roc2.tif')
    save_path1, save_path2, save_path3 = hob_figure_paths
    x4_auc, x4_pre, x4_rec, x4_f1 = train_model(
        X4_train, y4_train, X4_test, y4_test,
        save_path1, save_path2, save_path3,
    )

# Fit one final model per endpoint on the full labelled data, predict the rows
# of the 'test' sheet and write the filled-in label table to disk.

# Settings shared by the three XGBoost models (CYP3A4, hERG, MN).
# NOTE(review): both random_state and seed are passed; which one XGBoost
# honours may depend on the installed version -- confirm.
xgb_settings = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.6,
    random_state=5,
    colsample_bytree=0.8,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=27,
)

# fit() returns the estimator itself, so construction and fitting are chained.
coca = RandomForestClassifier(random_state=5).fit(X1, y1)        # Caco-2
copy_model = XGBClassifier(**xgb_settings).fit(X2, y2)           # CYP3A4
herg_model = XGBClassifier(**xgb_settings).fit(X3, y3)           # hERG
hob_model = lgb.LGBMClassifier(
    num_leaves=30, learning_rate=0.05, n_estimators=200,
).fit(X4, y4)                                                    # HOB
mn = XGBClassifier(**xgb_settings).fit(X5, y5)                   # MN

# Predicted labels for the 'test' sheet, one array per endpoint.
caco_pred, copy_pred, herg_pred, hob_pred, mn_pred = (
    coca.predict(test_x1),
    copy_model.predict(test_x2),
    herg_model.predict(test_x3),
    hob_model.predict(test_x4),
    mn.predict(test_x5),
)

# Overwrite the label columns of the test table with the predictions.
for _endpoint, _labels in zip(
    ('Caco-2', 'CYP3A4', 'hERG', 'HOB', 'MN'),
    (caco_pred, copy_pred, herg_pred, hob_pred, mn_pred),
):
    ADMET_test[_endpoint] = _labels

ADMET_test.to_excel('./prediction2.xlsx', index=False)


